Compare commits

..

1 Commits

Author SHA1 Message Date
Simone Scarduzio f99f6c20d8 style: Apply code formatting with ruff format
- Formatted core service implementation
- Formatted CLI main module
- Formatted test file with proper line breaks and indentation

All formatting, linting, and type checks now pass.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-29 16:16:31 +02:00
13 changed files with 122 additions and 1094 deletions
+26
View File
@@ -3,6 +3,7 @@ name: CI
on:
push:
branches: [main, develop]
tags: ["v*"]
pull_request:
branches: [main]
@@ -142,3 +143,28 @@ jobs:
run: |
uv run pytest tests/e2e -v --tb=short
pypi-publish:
  # Build and upload the distribution once every quality gate has passed.
  needs: [lint, typecheck, test, e2e-test]
  runs-on: ubuntu-latest
  # Only tag pushes publish; branch pushes and pull requests skip this job.
  if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
  steps:
    - uses: actions/checkout@v4
      with:
        # The package version is derived from git tags by setuptools-scm,
        # so fetch full history and tags instead of the default shallow clone.
        fetch-depth: 0
    - name: Install UV
      run: |
        curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
        echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ env.PYTHON_VERSION }}
    - name: Build package
      run: |
        uv build
    - name: Publish to PyPI
      uses: pypa/gh-action-pypi-publish@release/v1
      with:
        password: ${{ secrets.PYPI_API_TOKEN }}
-249
View File
@@ -1,249 +0,0 @@
name: Manual Release (Simple)
on:
workflow_dispatch:
inputs:
version:
description: 'Version to release (e.g., 0.3.2) - make sure tag v0.3.2 exists!'
required: true
type: string
pypi_environment:
description: 'PyPI environment'
required: true
type: choice
options:
- 'pypi'
- 'testpypi'
default: 'pypi'
env:
UV_VERSION: "0.5.13"
PYTHON_VERSION: "3.12"
jobs:
validate:
  # Checks the requested version string and confirms the matching tag exists;
  # exposes the tag name to downstream jobs via the `tag_name` output.
  runs-on: ubuntu-latest
  outputs:
    tag_name: ${{ steps.validate_tag.outputs.tag }}
  env:
    # The user-supplied input is handed to the scripts through the environment
    # instead of being interpolated into the script text, so a crafted value
    # cannot inject shell commands.
    VERSION: ${{ github.event.inputs.version }}
  steps:
    - uses: actions/checkout@v4
      with:
        # Fetch all history and tags; the tag lookup below relies on them.
        fetch-depth: 0
    - name: Validate version format
      run: |
        if ! echo "$VERSION" | grep -E '^[0-9]+\.[0-9]+\.[0-9]+(-[a-zA-Z0-9]+)?$'; then
          echo "Error: Version must be in format X.Y.Z or X.Y.Z-suffix"
          exit 1
        fi
    - name: Check if tag exists
      id: validate_tag
      run: |
        TAG="v$VERSION"
        if ! git rev-parse "$TAG" >/dev/null 2>&1; then
          echo "Error: Tag $TAG does not exist!"
          echo "Please create it first with:"
          echo " git tag $TAG"
          echo " git push origin $TAG"
          exit 1
        fi
        echo "tag=$TAG" >> "$GITHUB_OUTPUT"
lint:
needs: validate
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
ref: ${{ needs.validate.outputs.tag_name }}
- name: Install UV
run: |
curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install dependencies
run: |
uv pip install --system -e ".[dev]"
- name: Run ruff check
run: |
uv run ruff check src tests
- name: Run ruff format check
run: |
uv run ruff format --check src tests
typecheck:
needs: validate
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
ref: ${{ needs.validate.outputs.tag_name }}
- name: Install UV
run: |
curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install dependencies
run: |
uv pip install --system -e ".[dev]"
- name: Run mypy
run: |
uv run mypy src
test:
needs: validate
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
ref: ${{ needs.validate.outputs.tag_name }}
- name: Install UV
run: |
curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install xdelta3
run: |
sudo apt-get update
sudo apt-get install -y xdelta3
- name: Install dependencies
run: |
uv pip install --system -e ".[dev]"
- name: Run unit tests
run: |
uv run pytest tests/unit -v --tb=short
- name: Run integration tests
run: |
uv run pytest tests/integration -v --tb=short
e2e-test:
needs: validate
runs-on: ubuntu-latest
services:
localstack:
image: localstack/localstack:latest
ports:
- 4566:4566
env:
SERVICES: s3
DEBUG: 0
DATA_DIR: /tmp/localstack/data
options: >-
--health-cmd "curl -f http://localhost:4566/_localstack/health"
--health-interval 10s
--health-timeout 5s
--health-retries 5
steps:
- uses: actions/checkout@v4
with:
ref: ${{ needs.validate.outputs.tag_name }}
- name: Install UV
run: |
curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install xdelta3
run: |
sudo apt-get update
sudo apt-get install -y xdelta3
- name: Install dependencies
run: |
uv pip install --system -e ".[dev]"
- name: Run E2E tests
env:
AWS_ACCESS_KEY_ID: test
AWS_SECRET_ACCESS_KEY: test
AWS_DEFAULT_REGION: us-east-1
AWS_ENDPOINT_URL: http://localhost:4566
run: |
uv run pytest tests/e2e -v --tb=short
publish:
needs: [validate, lint, typecheck, test, e2e-test]
runs-on: ubuntu-latest
environment: ${{ github.event.inputs.pypi_environment }}
steps:
- uses: actions/checkout@v4
with:
ref: ${{ needs.validate.outputs.tag_name }}
fetch-depth: 0 # Important for setuptools-scm
- name: Install UV
run: |
curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Build package
run: |
uv build
- name: Publish to TestPyPI
if: github.event.inputs.pypi_environment == 'testpypi'
uses: pypa/gh-action-pypi-publish@release/v1
with:
repository-url: https://test.pypi.org/legacy/
password: ${{ secrets.TEST_PYPI_API_TOKEN }}
- name: Publish to PyPI
if: github.event.inputs.pypi_environment == 'pypi'
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_API_TOKEN }}
- name: Create GitHub Release
uses: softprops/action-gh-release@v1
with:
tag_name: ${{ needs.validate.outputs.tag_name }}
name: Release v${{ github.event.inputs.version }}
body: |
## DeltaGlider v${{ github.event.inputs.version }}
Published to ${{ github.event.inputs.pypi_environment == 'pypi' && 'PyPI' || 'TestPyPI' }}
### Installation
```bash
pip install deltaglider==${{ github.event.inputs.version }}
```
draft: false
prerelease: ${{ contains(github.event.inputs.version, '-') }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-253
View File
@@ -1,253 +0,0 @@
name: Manual Release
on:
workflow_dispatch:
inputs:
version:
description: 'Version to release (e.g., 0.3.2)'
required: true
type: string
pypi_environment:
description: 'PyPI environment'
required: true
type: choice
options:
- 'pypi'
- 'testpypi'
default: 'pypi'
env:
UV_VERSION: "0.5.13"
PYTHON_VERSION: "3.12"
jobs:
validate-and-tag:
  # Checks the requested version string, refuses to reuse an existing tag,
  # then creates and pushes the annotated release tag. The tag name is exposed
  # to downstream jobs via the `tag_name` output.
  runs-on: ubuntu-latest
  outputs:
    tag_name: ${{ steps.create_tag.outputs.tag }}
  env:
    # The user-supplied input is handed to the scripts through the environment
    # instead of being interpolated into the script text, so a crafted value
    # cannot inject shell commands into a job that holds push credentials.
    VERSION: ${{ github.event.inputs.version }}
  steps:
    - uses: actions/checkout@v4
      with:
        # Fetch all history and tags; the existing-tag check relies on them.
        fetch-depth: 0
        # NOTE(review): presumably a PAT is used so the pushed tag is
        # authenticated with broader rights than the default token - confirm.
        token: ${{ secrets.PAT_TOKEN }}
    - name: Validate version format
      run: |
        if ! echo "$VERSION" | grep -E '^[0-9]+\.[0-9]+\.[0-9]+(-[a-zA-Z0-9]+)?$'; then
          echo "Error: Version must be in format X.Y.Z or X.Y.Z-suffix"
          exit 1
        fi
    - name: Check if tag already exists
      run: |
        if git rev-parse "v$VERSION" >/dev/null 2>&1; then
          echo "Error: Tag v$VERSION already exists"
          exit 1
        fi
    - name: Create and push tag
      id: create_tag
      run: |
        git config --global user.name "github-actions[bot]"
        git config --global user.email "github-actions[bot]@users.noreply.github.com"
        git tag -a "v$VERSION" -m "Release v$VERSION"
        git push origin "v$VERSION"
        echo "tag=v$VERSION" >> "$GITHUB_OUTPUT"
lint:
needs: validate-and-tag
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
ref: ${{ needs.validate-and-tag.outputs.tag_name }}
- name: Install UV
run: |
curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install dependencies
run: |
uv pip install --system -e ".[dev]"
- name: Run ruff check
run: |
uv run ruff check src tests
- name: Run ruff format check
run: |
uv run ruff format --check src tests
typecheck:
needs: validate-and-tag
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
ref: ${{ needs.validate-and-tag.outputs.tag_name }}
- name: Install UV
run: |
curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install dependencies
run: |
uv pip install --system -e ".[dev]"
- name: Run mypy
run: |
uv run mypy src
test:
needs: validate-and-tag
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
ref: ${{ needs.validate-and-tag.outputs.tag_name }}
- name: Install UV
run: |
curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install xdelta3
run: |
sudo apt-get update
sudo apt-get install -y xdelta3
- name: Install dependencies
run: |
uv pip install --system -e ".[dev]"
- name: Run unit tests
run: |
uv run pytest tests/unit -v --tb=short
- name: Run integration tests
run: |
uv run pytest tests/integration -v --tb=short
e2e-test:
needs: validate-and-tag
runs-on: ubuntu-latest
services:
localstack:
image: localstack/localstack:latest
ports:
- 4566:4566
env:
SERVICES: s3
DEBUG: 0
DATA_DIR: /tmp/localstack/data
options: >-
--health-cmd "curl -f http://localhost:4566/_localstack/health"
--health-interval 10s
--health-timeout 5s
--health-retries 5
steps:
- uses: actions/checkout@v4
with:
ref: ${{ needs.validate-and-tag.outputs.tag_name }}
- name: Install UV
run: |
curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install xdelta3
run: |
sudo apt-get update
sudo apt-get install -y xdelta3
- name: Install dependencies
run: |
uv pip install --system -e ".[dev]"
- name: Run E2E tests
env:
AWS_ACCESS_KEY_ID: test
AWS_SECRET_ACCESS_KEY: test
AWS_DEFAULT_REGION: us-east-1
AWS_ENDPOINT_URL: http://localhost:4566
run: |
uv run pytest tests/e2e -v --tb=short
publish:
needs: [validate-and-tag, lint, typecheck, test, e2e-test]
runs-on: ubuntu-latest
environment: ${{ github.event.inputs.pypi_environment }}
steps:
- uses: actions/checkout@v4
with:
ref: ${{ needs.validate-and-tag.outputs.tag_name }}
fetch-depth: 0 # Important for setuptools-scm
- name: Install UV
run: |
curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Build package
run: |
uv build
- name: Publish to TestPyPI
if: github.event.inputs.pypi_environment == 'testpypi'
uses: pypa/gh-action-pypi-publish@release/v1
with:
repository-url: https://test.pypi.org/legacy/
password: ${{ secrets.TEST_PYPI_API_TOKEN }}
- name: Publish to PyPI
if: github.event.inputs.pypi_environment == 'pypi'
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_API_TOKEN }}
- name: Create GitHub Release
uses: softprops/action-gh-release@v1
with:
tag_name: ${{ needs.validate-and-tag.outputs.tag_name }}
name: Release v${{ github.event.inputs.version }}
body: |
## DeltaGlider v${{ github.event.inputs.version }}
Published to ${{ github.event.inputs.pypi_environment == 'pypi' && 'PyPI' || 'TestPyPI' }}
### Installation
```bash
pip install deltaglider==${{ github.event.inputs.version }}
```
draft: false
prerelease: ${{ contains(github.event.inputs.version, '-') }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+2 -17
View File
@@ -220,23 +220,8 @@ response = client.get_object(Bucket='releases', Key='v2.0.0/my-app.zip')
with open('downloaded.zip', 'wb') as f:
f.write(response['Body'].read())
# Smart list_objects with optimized performance (NEW!)
# Fast listing (default) - no metadata fetching, ~50ms for 1000 objects
response = client.list_objects(Bucket='releases', Prefix='v2.0.0/')
# Paginated listing for large buckets
response = client.list_objects(Bucket='releases', MaxKeys=100)
while response.is_truncated:
response = client.list_objects(
Bucket='releases',
MaxKeys=100,
ContinuationToken=response.next_continuation_token
)
# Get bucket statistics with smart defaults
stats = client.get_bucket_stats('releases') # Quick stats (50ms)
stats = client.get_bucket_stats('releases', detailed_stats=True) # With compression metrics
# All boto3 S3 methods supported
client.list_objects(Bucket='releases', Prefix='v2.0.0/')
client.delete_object(Bucket='releases', Key='old-version.zip')
client.head_object(Bucket='releases', Key='v2.0.0/my-app.zip')
```
-8
View File
@@ -1,8 +0,0 @@
export AWS_ENDPOINT_URL=http://localhost:9000
export AWS_ACCESS_KEY_ID=deltadmin
export AWS_SECRET_ACCESS_KEY=deltasecret
ror-data-importer \
--source-bucket=dg-demo \
--dest-bucket=new-buck \
--yes
-44
View File
@@ -1,44 +0,0 @@
fix: Optimize list_objects performance by eliminating N+1 query problem
BREAKING CHANGE: list_objects and get_bucket_stats signatures updated
## Problem
The list_objects method was making a separate HEAD request for every object
in the bucket to fetch metadata, causing severe performance degradation:
- 100 objects = 101 API calls (1 LIST + 100 HEAD)
- Response time: ~2.6 seconds for 1000 objects
## Solution
Implemented smart metadata fetching with intelligent defaults:
- Added FetchMetadata parameter (default: False) to list_objects
- Added detailed_stats parameter (default: False) to get_bucket_stats
- NEVER fetch metadata for non-delta files (they don't need it)
- Only fetch metadata for delta files when explicitly requested
## Performance Impact
- Before: ~2.6 seconds for 1000 objects (N+1 API calls)
- After: ~50ms for 1000 objects (1 API call)
- Improvement: ~50x faster for typical operations
## API Changes
- list_objects(..., FetchMetadata=False) - Smart performance default
- get_bucket_stats(..., detailed_stats=False) - Quick stats by default
- Full pagination support with ContinuationToken
- Existing call sites keep working (the new parameters are optional), but metadata is no longer fetched unless explicitly requested
## Implementation Details
- Eliminated unnecessary HEAD requests for metadata
- Smart detection: only delta files can benefit from metadata
- Preserved boto3 compatibility while adding performance optimizations
- Updated documentation with performance notes and examples
## Testing
- All existing tests pass
- Added test coverage for new parameters
- Linting (ruff) passes
- Type checking (mypy) passes
- 61 tests passing (18 unit + 43 integration)
Fixes #[issue-number] - Web UI /buckets/ endpoint 2.6s latency
Co-authored-by: Claude <noreply@anthropic.com>
+1 -16
View File
@@ -33,22 +33,7 @@ client = create_client()
# Standard boto3 S3 methods - just work!
client.put_object(Bucket='releases', Key='v1.0.0/app.zip', Body=data)
response = client.get_object(Bucket='releases', Key='v1.0.0/app.zip')
# Optimized list_objects with smart performance defaults (NEW!)
# Fast by default - no unnecessary metadata fetching
response = client.list_objects(Bucket='releases', Prefix='v1.0.0/')
# Pagination for large buckets
response = client.list_objects(Bucket='releases', MaxKeys=100,
ContinuationToken=response.next_continuation_token)
# Get detailed compression stats only when needed
response = client.list_objects(Bucket='releases', FetchMetadata=True) # Slower but detailed
# Quick bucket statistics
stats = client.get_bucket_stats('releases') # Fast overview
stats = client.get_bucket_stats('releases', detailed_stats=True) # With compression metrics
client.list_objects(Bucket='releases', Prefix='v1.0.0/')
client.delete_object(Bucket='releases', Key='old-version.zip')
```
+1 -141
View File
@@ -75,147 +75,7 @@ class DeltaGliderClient:
**Note**: Use `create_client()` instead of instantiating directly.
### boto3-Compatible Methods (Recommended)
These methods provide 100% compatibility with boto3's S3 client, making DeltaGlider a drop-in replacement.
#### `list_objects`
List objects in a bucket with smart performance optimizations.
```python
def list_objects(
self,
Bucket: str,
Prefix: str = "",
Delimiter: str = "",
MaxKeys: int = 1000,
ContinuationToken: Optional[str] = None,
StartAfter: Optional[str] = None,
FetchMetadata: bool = False,
**kwargs
) -> ListObjectsResponse
```
##### Parameters
- **Bucket** (`str`): S3 bucket name.
- **Prefix** (`str`): Filter results to keys beginning with prefix.
- **Delimiter** (`str`): Delimiter for grouping keys (e.g., '/' for folders).
- **MaxKeys** (`int`): Maximum number of keys to return (for pagination). Default: 1000.
- **ContinuationToken** (`Optional[str]`): Token from previous response for pagination.
- **StartAfter** (`Optional[str]`): Start listing after this key (alternative pagination).
- **FetchMetadata** (`bool`): If True, fetch compression metadata for delta files only. Default: False.
- **IMPORTANT**: Non-delta files NEVER trigger metadata fetching (no performance impact).
- With `FetchMetadata=False`: ~50ms for 1000 objects (1 API call)
- With `FetchMetadata=True`: ~2-3s for 1000 objects (1 + N delta files API calls)
##### Performance Optimization
The method intelligently optimizes performance by:
1. **Never** fetching metadata for non-delta files (they don't need it)
2. Only fetching metadata for delta files when explicitly requested
3. Supporting efficient pagination for large buckets
##### Examples
```python
# Fast listing for UI display (no metadata fetching)
response = client.list_objects(Bucket='releases')
# Paginated listing for large buckets
response = client.list_objects(Bucket='releases', MaxKeys=100)
while response.is_truncated:
response = client.list_objects(
Bucket='releases',
MaxKeys=100,
ContinuationToken=response.next_continuation_token
)
# Get detailed compression stats (slower, only for analytics)
response = client.list_objects(
Bucket='releases',
FetchMetadata=True # Only fetches for delta files
)
```
#### `get_bucket_stats`
Get statistics for a bucket with optional detailed compression metrics.
```python
def get_bucket_stats(
self,
bucket: str,
detailed_stats: bool = False
) -> BucketStats
```
##### Parameters
- **bucket** (`str`): S3 bucket name.
- **detailed_stats** (`bool`): If True, fetch accurate compression ratios for delta files. Default: False.
- With `detailed_stats=False`: ~50ms for any bucket size (LIST calls only)
- With `detailed_stats=True`: ~2-3s per 1000 objects (adds HEAD calls for delta files)
##### Examples
```python
# Quick stats for dashboard display
stats = client.get_bucket_stats('releases')
print(f"Objects: {stats.object_count}, Size: {stats.total_size}")
# Detailed stats for analytics (slower but accurate)
stats = client.get_bucket_stats('releases', detailed_stats=True)
print(f"Compression ratio: {stats.average_compression_ratio:.1%}")
```
#### `put_object`
Upload an object to S3 with automatic delta compression (boto3-compatible).
```python
def put_object(
self,
Bucket: str,
Key: str,
Body: bytes | str | Path | None = None,
Metadata: Optional[Dict[str, str]] = None,
ContentType: Optional[str] = None,
**kwargs
) -> Dict[str, Any]
```
##### Parameters
- **Bucket** (`str`): S3 bucket name.
- **Key** (`str`): Object key (path in bucket).
- **Body** (`bytes | str | Path`): Object data.
- **Metadata** (`Optional[Dict[str, str]]`): Custom metadata.
- **ContentType** (`Optional[str]`): MIME type (for compatibility).
##### Returns
Dict with ETag and DeltaGlider compression info.
#### `get_object`
Download an object from S3 with automatic delta reconstruction (boto3-compatible).
```python
def get_object(
self,
Bucket: str,
Key: str,
**kwargs
) -> Dict[str, Any]
```
##### Returns
Dict with Body stream and metadata (identical to boto3).
### Simple API Methods
### Methods
#### `upload`
+8 -199
View File
@@ -4,205 +4,14 @@ Real-world examples and patterns for using DeltaGlider in production application
## Table of Contents
1. [Performance-Optimized Bucket Listing](#performance-optimized-bucket-listing)
2. [Software Release Management](#software-release-management)
3. [Database Backup System](#database-backup-system)
4. [CI/CD Pipeline Integration](#cicd-pipeline-integration)
5. [Container Registry Storage](#container-registry-storage)
6. [Machine Learning Model Versioning](#machine-learning-model-versioning)
7. [Game Asset Distribution](#game-asset-distribution)
8. [Log Archive Management](#log-archive-management)
9. [Multi-Region Replication](#multi-region-replication)
## Performance-Optimized Bucket Listing
DeltaGlider's smart `list_objects` method eliminates the N+1 query problem by intelligently managing metadata fetching.
### Fast Web UI Listing (No Metadata)
```python
from deltaglider import create_client
import time
client = create_client()
def fast_bucket_listing(bucket: str):
"""Ultra-fast listing for web UI display (~50ms for 1000 objects)."""
start = time.time()
# Default: FetchMetadata=False - no HEAD requests
response = client.list_objects(
Bucket=bucket,
MaxKeys=100 # Pagination for UI
)
# Process objects for display
items = []
for obj in response.contents:
items.append({
"key": obj.key,
"size": obj.size,
"last_modified": obj.last_modified,
"is_delta": obj.is_delta, # Determined from filename
# No compression_ratio - would require HEAD request
})
elapsed = time.time() - start
print(f"Listed {len(items)} objects in {elapsed*1000:.0f}ms")
return items, response.next_continuation_token
# Example: List first page
items, next_token = fast_bucket_listing('releases')
```
### Paginated Listing for Large Buckets
```python
def paginated_listing(bucket: str, page_size: int = 50):
"""Efficiently paginate through large buckets."""
all_objects = []
continuation_token = None
while True:
response = client.list_objects(
Bucket=bucket,
MaxKeys=page_size,
ContinuationToken=continuation_token,
FetchMetadata=False # Keep it fast
)
all_objects.extend(response.contents)
if not response.is_truncated:
break
continuation_token = response.next_continuation_token
print(f"Fetched {len(all_objects)} objects so far...")
return all_objects
# Example: List all objects efficiently
all_objects = paginated_listing('releases', page_size=100)
print(f"Total objects: {len(all_objects)}")
```
### Analytics Dashboard with Compression Stats
```python
def dashboard_with_stats(bucket: str):
"""Dashboard view with optional detailed stats."""
# Quick overview (fast - no metadata)
stats = client.get_bucket_stats(bucket, detailed_stats=False)
print(f"Quick Stats for {bucket}:")
print(f" Total Objects: {stats.object_count}")
print(f" Delta Files: {stats.delta_objects}")
print(f" Regular Files: {stats.direct_objects}")
print(f" Total Size: {stats.total_size / (1024**3):.2f} GB")
print(f" Stored Size: {stats.compressed_size / (1024**3):.2f} GB")
# Detailed compression analysis (slower - fetches metadata for deltas only)
if stats.delta_objects > 0:
detailed_stats = client.get_bucket_stats(bucket, detailed_stats=True)
print(f"\nDetailed Compression Stats:")
print(f" Average Compression: {detailed_stats.average_compression_ratio:.1%}")
print(f" Space Saved: {detailed_stats.space_saved / (1024**3):.2f} GB")
# Example usage
dashboard_with_stats('releases')
```
### Smart Metadata Fetching for Analytics
```python
def compression_analysis(bucket: str, prefix: str = ""):
"""Analyze compression effectiveness with selective metadata fetching."""
# Only fetch metadata when we need compression stats
response = client.list_objects(
Bucket=bucket,
Prefix=prefix,
FetchMetadata=True # Fetches metadata ONLY for .delta files
)
# Analyze compression effectiveness
delta_files = [obj for obj in response.contents if obj.is_delta]
if delta_files:
total_original = sum(obj.original_size for obj in delta_files)
total_compressed = sum(obj.compressed_size for obj in delta_files)
avg_ratio = (total_original - total_compressed) / total_original
print(f"Compression Analysis for {prefix or 'all files'}:")
print(f" Delta Files: {len(delta_files)}")
print(f" Original Size: {total_original / (1024**2):.1f} MB")
print(f" Compressed Size: {total_compressed / (1024**2):.1f} MB")
print(f" Average Compression: {avg_ratio:.1%}")
# Find best and worst compression
best = max(delta_files, key=lambda x: x.compression_ratio or 0)
worst = min(delta_files, key=lambda x: x.compression_ratio or 1)
print(f" Best Compression: {best.key} ({best.compression_ratio:.1%})")
print(f" Worst Compression: {worst.key} ({worst.compression_ratio:.1%})")
# Example: Analyze v2.0 releases
compression_analysis('releases', 'v2.0/')
```
### Performance Comparison
```python
def performance_comparison(bucket: str):
"""Compare performance with and without metadata fetching."""
import time
# Test 1: Fast listing (no metadata)
start = time.time()
response_fast = client.list_objects(
Bucket=bucket,
MaxKeys=100,
FetchMetadata=False # Default
)
time_fast = (time.time() - start) * 1000
# Test 2: Detailed listing (with metadata for deltas)
start = time.time()
response_detailed = client.list_objects(
Bucket=bucket,
MaxKeys=100,
FetchMetadata=True # Fetches for delta files only
)
time_detailed = (time.time() - start) * 1000
delta_count = sum(1 for obj in response_fast.contents if obj.is_delta)
print(f"Performance Comparison for {bucket}:")
print(f" Fast Listing: {time_fast:.0f}ms (1 API call)")
print(f" Detailed Listing: {time_detailed:.0f}ms (1 + {delta_count} API calls)")
print(f" Speed Improvement: {time_detailed/time_fast:.1f}x slower with metadata")
print(f"\nRecommendation: Use FetchMetadata=True only when you need:")
print(" - Exact original file sizes for delta files")
print(" - Accurate compression ratios")
print(" - Reference key information")
# Example: Compare performance
performance_comparison('releases')
```
### Best Practices
1. **Default to Fast Mode**: Always use `FetchMetadata=False` (default) unless you specifically need compression stats.
2. **Never Fetch for Non-Deltas**: The SDK automatically skips metadata fetching for non-delta files even when `FetchMetadata=True`.
3. **Use Pagination**: For large buckets, use `MaxKeys` and `ContinuationToken` to paginate results.
4. **Cache Results**: If you need metadata frequently, consider caching the results to avoid repeated HEAD requests.
5. **Batch Analytics**: When doing analytics, fetch metadata once and process the results rather than making multiple calls.
1. [Software Release Management](#software-release-management)
2. [Database Backup System](#database-backup-system)
3. [CI/CD Pipeline Integration](#cicd-pipeline-integration)
4. [Container Registry Storage](#container-registry-storage)
5. [Machine Learning Model Versioning](#machine-learning-model-versioning)
6. [Game Asset Distribution](#game-asset-distribution)
7. [Log Archive Management](#log-archive-management)
8. [Multi-Region Replication](#multi-region-replication)
## Software Release Management
+3 -1
View File
@@ -13,7 +13,7 @@ maintainers = [
{name = "Beshu Tech Team", email = "info@beshu.tech"},
]
readme = "README.md"
license = "MIT"
license = {text = "MIT"}
requires-python = ">=3.11"
keywords = [
"s3",
@@ -35,6 +35,7 @@ classifiers = [
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
"Intended Audience :: System Administrators",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.11",
@@ -114,6 +115,7 @@ dev-dependencies = [
[tool.setuptools_scm]
# Automatically determine version from git tags
write_to = "src/deltaglider/_version.py"
version_scheme = "release-branch-semver"
local_scheme = "no-local-version"
[tool.ruff]
+3 -3
View File
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID
__version__ = version = '0.3.2.dev0'
__version_tuple__ = version_tuple = (0, 3, 2, 'dev0')
__version__ = version = '0.2.0.dev0'
__version_tuple__ = version_tuple = (0, 2, 0, 'dev0')
__commit_id__ = commit_id = 'g23357e240'
__commit_id__ = commit_id = 'g432ddd89c'
+77 -151
View File
@@ -129,97 +129,86 @@ class DeltaGliderClient:
Tagging: str | None = None,
**kwargs: Any,
) -> dict[str, Any]:
"""Upload an object to S3 with delta compression (boto3-compatible).
This method uses DeltaGlider's delta compression for archive files.
Files will be stored as .delta when appropriate (subsequent similar files).
The GET operation transparently reconstructs the original file.
"""Upload an object to S3 (boto3-compatible).
Args:
Bucket: S3 bucket name
Key: Object key (specifies the deltaspace and filename)
Key: Object key
Body: Object data (bytes, string, or file path)
Metadata: Object metadata
ContentType: MIME type (currently unused but kept for compatibility)
Tagging: Object tags as URL-encoded string (currently unused)
ContentType: MIME type
Tagging: Object tags as URL-encoded string
**kwargs: Additional S3 parameters (for compatibility)
Returns:
Response dict with ETag and compression info
Response dict with ETag and version info
"""
import tempfile
# Handle Body parameter
if Body is None:
raise ValueError("Body parameter is required")
# Write body to a temporary file for DeltaService.put()
with tempfile.NamedTemporaryFile(delete=False, suffix=Path(Key).suffix) as tmp_file:
tmp_path = Path(tmp_file.name)
# Create temp file if Body is bytes or string
cleanup_temp = False
if isinstance(Body, (bytes, str)):
# Create temp file with the actual key name to ensure proper naming
temp_dir = Path(tempfile.gettempdir())
tmp_path = temp_dir / Path(Key).name
# Write Body to temp file
if isinstance(Body, bytes):
tmp_file.write(Body)
elif isinstance(Body, str):
tmp_file.write(Body.encode("utf-8"))
elif isinstance(Body, Path):
tmp_file.write(Body.read_bytes())
# If file exists, add unique suffix
if tmp_path.exists():
import uuid
tmp_path = temp_dir / f"{uuid.uuid4()}_{Path(Key).name}"
if isinstance(Body, str):
tmp_path.write_text(Body)
else:
# Handle any other type by converting to string path
path_str = str(Body)
try:
tmp_file.write(Path(path_str).read_bytes())
except Exception as e:
raise ValueError(
f"Invalid Body parameter: cannot read from {path_str}: {e}"
) from e
tmp_path.write_bytes(Body)
cleanup_temp = True
elif isinstance(Body, Path):
tmp_path = Body
else:
tmp_path = Path(str(Body))
try:
# Extract deltaspace prefix from Key
# If Key has path separators, use parent as prefix
key_path = Path(Key)
if "/" in Key:
# Use the parent directories as the deltaspace prefix
prefix = str(key_path.parent)
# Copy temp file with original filename for proper extension detection
named_tmp = tmp_path.parent / key_path.name
tmp_path.rename(named_tmp)
tmp_path = named_tmp
# For boto3 compatibility, we need to handle the key differently
# The base upload method expects a prefix and appends the filename
# But put_object should store exactly at the specified key
# Extract the directory part of the key
key_parts = Key.rsplit("/", 1)
if len(key_parts) > 1:
# Key has a path component
prefix = key_parts[0]
s3_url = f"s3://{Bucket}/{prefix}/"
else:
# No path, use empty prefix
prefix = ""
# Rename temp file to have the proper filename
named_tmp = tmp_path.parent / Key
tmp_path.rename(named_tmp)
tmp_path = named_tmp
# Key is just a filename
s3_url = f"s3://{Bucket}/"
# Create DeltaSpace and use DeltaService for compression
delta_space = DeltaSpace(bucket=Bucket, prefix=prefix)
# Use our upload method
result = self.upload(
file_path=tmp_path,
s3_url=s3_url,
tags=self._parse_tagging(Tagging) if Tagging else None,
)
# Use the service to put the file (handles delta compression automatically)
summary = self.service.put(tmp_path, delta_space, max_ratio=0.5)
# Calculate ETag from file content
sha256_hash = self.service.hasher.sha256(tmp_path)
# Return boto3-compatible response with delta info
# Return boto3-compatible response
return {
"ETag": f'"{sha256_hash}"',
"ETag": f'"{self.service.hasher.sha256(tmp_path)}"',
"ResponseMetadata": {
"HTTPStatusCode": 200,
},
# DeltaGlider extensions
"DeltaGlider": {
"original_size": summary.file_size,
"stored_size": summary.delta_size or summary.file_size,
"is_delta": summary.delta_size is not None,
"compression_ratio": summary.delta_ratio or 1.0,
"stored_as": summary.key,
"operation": summary.operation,
"original_size": result.original_size,
"stored_size": result.stored_size,
"is_delta": result.is_delta,
"compression_ratio": result.delta_ratio,
},
}
finally:
# Clean up temp file
if tmp_path.exists():
if cleanup_temp and tmp_path.exists():
tmp_path.unlink()
def get_object(
@@ -274,83 +263,59 @@ class DeltaGliderClient:
MaxKeys: int = 1000,
ContinuationToken: str | None = None,
StartAfter: str | None = None,
FetchMetadata: bool = False,
**kwargs: Any,
) -> ListObjectsResponse:
"""List objects in bucket with smart metadata fetching.
This method optimizes performance by:
- Never fetching metadata for non-delta files (they don't need it)
- Only fetching metadata for delta files when explicitly requested
- Supporting efficient pagination for large buckets
"""List objects in bucket (boto3-compatible).
Args:
Bucket: S3 bucket name
Prefix: Filter results to keys beginning with prefix
Delimiter: Delimiter for grouping keys (e.g., '/' for folders)
MaxKeys: Maximum number of keys to return (for pagination)
ContinuationToken: Token from previous response for pagination
StartAfter: Start listing after this key (for pagination)
FetchMetadata: If True, fetch metadata ONLY for delta files (default: False)
MaxKeys: Maximum number of keys to return
ContinuationToken: Token for pagination
StartAfter: Start listing after this key
**kwargs: Additional parameters for compatibility
Returns:
ListObjectsResponse with objects and pagination info
Performance Notes:
- With FetchMetadata=False: ~50ms for 1000 objects (1 S3 API call)
- With FetchMetadata=True: ~2-3s for 1000 objects (1 + N delta files API calls)
- Non-delta files NEVER trigger HEAD requests (no metadata needed)
Example:
# Fast listing for UI display (no metadata)
response = client.list_objects(Bucket='releases', MaxKeys=100)
# Paginated listing
response = client.list_objects(
Bucket='releases',
MaxKeys=50,
ContinuationToken=response.next_continuation_token
)
# Detailed listing with compression stats (slower, only for analytics)
response = client.list_objects(
Bucket='releases',
FetchMetadata=True # Only fetches for delta files
)
ListObjectsResponse with objects and common prefixes
"""
# Use storage adapter's list_objects method
# Use storage adapter's list_objects method if available
if hasattr(self.service.storage, "list_objects"):
# Use list_objects method if available
result = self.service.storage.list_objects(
bucket=Bucket,
prefix=Prefix,
delimiter=Delimiter,
max_keys=MaxKeys,
start_after=StartAfter or ContinuationToken, # Support both pagination methods
start_after=StartAfter,
)
elif isinstance(self.service.storage, S3StorageAdapter):
# Fallback to S3StorageAdapter specific implementation
result = self.service.storage.list_objects(
bucket=Bucket,
prefix=Prefix,
delimiter=Delimiter,
max_keys=MaxKeys,
start_after=StartAfter or ContinuationToken,
start_after=StartAfter,
)
else:
# Fallback
# Last resort fallback - should rarely be needed
result = {
"objects": [],
"common_prefixes": [],
"is_truncated": False,
}
# Convert to ObjectInfo objects with smart metadata fetching
# Convert to ObjectInfo objects
contents = []
for obj in result.get("objects", []):
# Determine file type
# Check if it's a delta file or direct upload
is_delta = obj["key"].endswith(".delta")
# Create object info with basic data (no HEAD request)
# Get metadata if available
obj_head = self.service.storage.head(f"{Bucket}/{obj['key']}")
metadata = obj_head.metadata if obj_head else {}
info = ObjectInfo(
key=obj["key"],
size=obj["size"],
@@ -358,32 +323,15 @@ class DeltaGliderClient:
etag=obj.get("etag"),
storage_class=obj.get("storage_class", "STANDARD"),
# DeltaGlider fields
original_size=obj["size"], # For non-delta, original = stored
original_size=int(metadata.get("file_size", obj["size"])),
compressed_size=obj["size"],
is_delta=is_delta,
compression_ratio=0.0 if not is_delta else None,
reference_key=None,
compression_ratio=float(metadata.get("compression_ratio", 0.0)),
reference_key=metadata.get("ref_key"),
)
# SMART METADATA FETCHING:
# 1. NEVER fetch metadata for non-delta files (no point)
# 2. Only fetch for delta files when explicitly requested
if FetchMetadata and is_delta:
try:
obj_head = self.service.storage.head(f"{Bucket}/{obj['key']}")
if obj_head and obj_head.metadata:
metadata = obj_head.metadata
# Update with actual compression stats
info.original_size = int(metadata.get("file_size", obj["size"]))
info.compression_ratio = float(metadata.get("compression_ratio", 0.0))
info.reference_key = metadata.get("ref_key")
except Exception as e:
# Log but don't fail the listing
self.service.logger.debug(f"Failed to fetch metadata for {obj['key']}: {e}")
contents.append(info)
# Build response with pagination support
# Build response
response = ListObjectsResponse(
name=Bucket,
prefix=Prefix,
@@ -953,12 +901,11 @@ class DeltaGliderClient:
Returns:
List of similar files with scores
"""
# List objects in the prefix (no metadata needed for similarity check)
# List objects in the prefix
response = self.list_objects(
Bucket=bucket,
Prefix=prefix,
MaxKeys=1000,
FetchMetadata=False, # Don't need metadata for similarity
)
similar: list[dict[str, Any]] = []
@@ -1042,34 +989,16 @@ class DeltaGliderClient:
reference_key=metadata.get("ref_key"),
)
def get_bucket_stats(self, bucket: str, detailed_stats: bool = False) -> BucketStats:
"""Get statistics for a bucket with optional detailed compression metrics.
This method provides two modes:
- Quick stats (default): Fast overview using LIST only (~50ms)
- Detailed stats: Accurate compression metrics with HEAD requests (slower)
def get_bucket_stats(self, bucket: str) -> BucketStats:
"""Get statistics for a bucket.
Args:
bucket: S3 bucket name
detailed_stats: If True, fetch accurate compression ratios for delta files (default: False)
Returns:
BucketStats with compression and space savings info
Performance:
- With detailed_stats=False: ~50ms for any bucket size (1 LIST call per 1000 objects)
- With detailed_stats=True: ~2-3s per 1000 objects (adds HEAD calls for delta files only)
Example:
# Quick stats for dashboard display
stats = client.get_bucket_stats('releases')
print(f"Objects: {stats.object_count}, Size: {stats.total_size}")
# Detailed stats for analytics (slower but accurate)
stats = client.get_bucket_stats('releases', detailed_stats=True)
print(f"Compression ratio: {stats.average_compression_ratio:.1%}")
"""
# List all objects with smart metadata fetching
# List all objects
all_objects = []
continuation_token = None
@@ -1078,7 +1007,6 @@ class DeltaGliderClient:
Bucket=bucket,
MaxKeys=1000,
ContinuationToken=continuation_token,
FetchMetadata=detailed_stats, # Only fetch metadata if detailed stats requested
)
all_objects.extend(response.contents)
@@ -1088,7 +1016,7 @@ class DeltaGliderClient:
continuation_token = response.next_continuation_token
# Calculate statistics
# Calculate stats
total_size = 0
compressed_size = 0
delta_count = 0
@@ -1099,11 +1027,9 @@ class DeltaGliderClient:
if obj.is_delta:
delta_count += 1
# Use actual original size if we have it, otherwise estimate
total_size += obj.original_size or obj.size
else:
direct_count += 1
# For non-delta files, original equals compressed
total_size += obj.size
space_saved = total_size - compressed_size
+1 -12
View File
@@ -198,18 +198,13 @@ class TestBoto3Compatibility:
def test_list_objects(self, client):
"""Test list_objects with various options."""
# List all objects (default: FetchMetadata=False)
# List all objects
response = client.list_objects(Bucket="test-bucket")
assert isinstance(response, ListObjectsResponse)
assert response.key_count > 0
assert len(response.contents) > 0
# Test with FetchMetadata=True (should only affect delta files)
response_with_metadata = client.list_objects(Bucket="test-bucket", FetchMetadata=True)
assert isinstance(response_with_metadata, ListObjectsResponse)
assert response_with_metadata.key_count > 0
def test_list_objects_with_delimiter(self, client):
"""Test list_objects with delimiter for folder simulation."""
response = client.list_objects(Bucket="test-bucket", Prefix="", Delimiter="/")
@@ -330,7 +325,6 @@ class TestDeltaGliderFeatures:
def test_get_bucket_stats(self, client):
"""Test getting bucket statistics."""
# Test quick stats (default: detailed_stats=False)
stats = client.get_bucket_stats("test-bucket")
assert isinstance(stats, BucketStats)
@@ -338,11 +332,6 @@ class TestDeltaGliderFeatures:
assert stats.total_size > 0
assert stats.delta_objects >= 1 # We have archive.zip.delta
# Test with detailed_stats=True
detailed_stats = client.get_bucket_stats("test-bucket", detailed_stats=True)
assert isinstance(detailed_stats, BucketStats)
assert detailed_stats.object_count == stats.object_count
def test_upload_chunked(self, client, tmp_path):
"""Test chunked upload with progress callback."""
# Create a test file