Mirror of https://github.com/beshu-tech/deltaglider.git (synced 2026-04-30 12:14:32 +02:00)

Compare commits: 61 commits
| SHA1 |
|---|
| 5e333254ba |
| 04cc984d4a |
| ac7d4e067f |
| e8fb926fd6 |
| 626e28eaf6 |
| 90a342dc33 |
| f9f2b036e3 |
| 778d7f0148 |
| 37ea2f138c |
| 5e3b76791e |
| fb2877bfd3 |
| 88fd1f51cd |
| 0857e02edd |
| 689cf00d02 |
| 743d52e783 |
| 8bc0a0eaf3 |
| 4cf25e4681 |
| 69ed9056d2 |
| 38134f28f5 |
| fa1f8b85a9 |
| a06cc2939c |
| 5b8477ed61 |
| e706ddebdd |
| 50db9bbb27 |
| c25568e315 |
| ca1186a3f6 |
| 4217535e8c |
| 0064d7e74b |
| 9c1659a1f1 |
| 34c871b0d7 |
| db0662c175 |
| 2efa760785 |
| 74207f4ee4 |
| 4668b10c3f |
| 8cea5a3527 |
| 07f630d855 |
| 09c0893244 |
| ac2e2b5a0a |
| b760890a61 |
| 03106b76a8 |
| dd39595c67 |
| 12c71c1d6e |
| cf10a689cc |
| b6ea6d734a |
| 673e87e5b8 |
| c9103cfd4b |
| 23357e240b |
| 13fcc8738c |
| 4a633802b7 |
| 9f839cc8b7 |
| 4852f373f1 |
| a7ec85b064 |
| 09a5899a56 |
| 6faffc1ea8 |
| e0b8bac859 |
| 0699283ca2 |
| 3074b2cff1 |
| 0c1d0373a9 |
| 02120a764e |
| f1cdc10fd5 |
| 3b580a4070 |
.github/workflows/ci.yml (vendored, 26 lines changed)

@@ -3,7 +3,6 @@ name: CI
on:
  push:
    branches: [main, develop]
    tags: ["v*"]
  pull_request:
    branches: [main]

@@ -143,28 +142,3 @@ jobs:
        run: |
          uv run pytest tests/e2e -v --tb=short

  pypi-publish:
    needs: [lint, typecheck, test, e2e-test]
    runs-on: ubuntu-latest
    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
    steps:
      - uses: actions/checkout@v4

      - name: Install UV
        run: |
          curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
          echo "$HOME/.cargo/bin" >> $GITHUB_PATH

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Build package
        run: |
          uv build

      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}

.github/workflows/release-manual.yml (vendored, new file, 250 lines)

@@ -0,0 +1,250 @@
name: Manual Release (Simple)

on:
  workflow_dispatch:
    inputs:
      version:
        description: 'Version to release (e.g., 0.3.2) - make sure tag v0.3.2 exists!'
        required: true
        type: string
      pypi_environment:
        description: 'PyPI environment'
        required: true
        type: choice
        options:
          - 'pypi'
          - 'testpypi'
        default: 'pypi'

env:
  UV_VERSION: "0.5.13"
  PYTHON_VERSION: "3.12"

jobs:
  validate:
    runs-on: ubuntu-latest
    outputs:
      tag_name: ${{ steps.validate_tag.outputs.tag }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Validate version format
        run: |
          if ! echo "${{ github.event.inputs.version }}" | grep -E '^[0-9]+\.[0-9]+\.[0-9]+(-[a-zA-Z0-9]+)?$'; then
            echo "Error: Version must be in format X.Y.Z or X.Y.Z-suffix"
            exit 1
          fi

      - name: Check if tag exists
        id: validate_tag
        run: |
          TAG="v${{ github.event.inputs.version }}"
          if ! git rev-parse "$TAG" >/dev/null 2>&1; then
            echo "Error: Tag $TAG does not exist!"
            echo "Please create it first with:"
            echo "  git tag $TAG"
            echo "  git push origin $TAG"
            exit 1
          fi
          echo "tag=$TAG" >> $GITHUB_OUTPUT

  lint:
    needs: validate
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ needs.validate.outputs.tag_name }}

      - name: Install UV
        run: |
          curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
          echo "$HOME/.cargo/bin" >> $GITHUB_PATH

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          uv pip install --system -e ".[dev]"

      - name: Run ruff check
        run: |
          uv run ruff check src tests

      - name: Run ruff format check
        run: |
          uv run ruff format --check src tests

  typecheck:
    needs: validate
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ needs.validate.outputs.tag_name }}

      - name: Install UV
        run: |
          curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
          echo "$HOME/.cargo/bin" >> $GITHUB_PATH

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          uv pip install --system -e ".[dev]"

      - name: Run mypy
        run: |
          uv run mypy src

  test:
    needs: validate
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ needs.validate.outputs.tag_name }}

      - name: Install UV
        run: |
          curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
          echo "$HOME/.cargo/bin" >> $GITHUB_PATH

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install xdelta3
        run: |
          sudo apt-get update
          sudo apt-get install -y xdelta3

      - name: Install dependencies
        run: |
          uv pip install --system -e ".[dev]"

      - name: Run unit tests
        run: |
          uv run pytest tests/unit -v --tb=short

      - name: Run integration tests
        run: |
          uv run pytest tests/integration -v --tb=short

  e2e-test:
    needs: validate
    runs-on: ubuntu-latest
    services:
      localstack:
        image: localstack/localstack:latest
        ports:
          - 4566:4566
        env:
          SERVICES: s3
          DEBUG: 0
          DATA_DIR: /tmp/localstack/data
        options: >-
          --health-cmd "curl -f http://localhost:4566/_localstack/health"
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5

    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ needs.validate.outputs.tag_name }}

      - name: Install UV
        run: |
          curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
          echo "$HOME/.cargo/bin" >> $GITHUB_PATH

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install xdelta3
        run: |
          sudo apt-get update
          sudo apt-get install -y xdelta3

      - name: Install dependencies
        run: |
          uv pip install --system -e ".[dev]"

      - name: Run E2E tests
        env:
          AWS_ACCESS_KEY_ID: test
          AWS_SECRET_ACCESS_KEY: test
          AWS_DEFAULT_REGION: us-east-1
          AWS_ENDPOINT_URL: http://localhost:4566
        run: |
          uv run pytest tests/e2e -v --tb=short

  publish:
    needs: [validate, lint, typecheck, test, e2e-test]
    runs-on: ubuntu-latest
    environment: ${{ github.event.inputs.pypi_environment }}
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ needs.validate.outputs.tag_name }}
          fetch-depth: 0 # Important for setuptools-scm

      - name: Install UV
        run: |
          curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
          echo "$HOME/.cargo/bin" >> $GITHUB_PATH

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Build package
        run: |
          uv build

      - name: Publish to TestPyPI
        if: github.event.inputs.pypi_environment == 'testpypi'
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          repository-url: https://test.pypi.org/legacy/
          password: ${{ secrets.TEST_PYPI_API_TOKEN }}

      - name: Publish to PyPI
        if: github.event.inputs.pypi_environment == 'pypi'
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}

      - name: Create GitHub Release
        uses: softprops/action-gh-release@v1
        continue-on-error: true # Don't fail if GitHub release creation fails
        with:
          tag_name: ${{ needs.validate.outputs.tag_name }}
          name: Release v${{ github.event.inputs.version }}
          body: |
            ## DeltaGlider v${{ github.event.inputs.version }}

            Published to ${{ github.event.inputs.pypi_environment == 'pypi' && 'PyPI' || 'TestPyPI' }}

            ### Installation
            ```bash
            pip install deltaglider==${{ github.event.inputs.version }}
            ```
          draft: false
          prerelease: ${{ contains(github.event.inputs.version, '-') }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/release.yml (vendored, new file, 254 lines)

@@ -0,0 +1,254 @@
name: Manual Release

on:
  workflow_dispatch:
    inputs:
      version:
        description: 'Version to release (e.g., 0.3.2)'
        required: true
        type: string
      pypi_environment:
        description: 'PyPI environment'
        required: true
        type: choice
        options:
          - 'pypi'
          - 'testpypi'
        default: 'pypi'

env:
  UV_VERSION: "0.5.13"
  PYTHON_VERSION: "3.12"

jobs:
  validate-and-tag:
    runs-on: ubuntu-latest
    outputs:
      tag_name: ${{ steps.create_tag.outputs.tag }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
          token: ${{ secrets.PAT_TOKEN }}

      - name: Validate version format
        run: |
          if ! echo "${{ github.event.inputs.version }}" | grep -E '^[0-9]+\.[0-9]+\.[0-9]+(-[a-zA-Z0-9]+)?$'; then
            echo "Error: Version must be in format X.Y.Z or X.Y.Z-suffix"
            exit 1
          fi

      - name: Check if tag already exists
        run: |
          if git rev-parse "v${{ github.event.inputs.version }}" >/dev/null 2>&1; then
            echo "Error: Tag v${{ github.event.inputs.version }} already exists"
            exit 1
          fi

      - name: Create and push tag
        id: create_tag
        run: |
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git tag -a "v${{ github.event.inputs.version }}" -m "Release v${{ github.event.inputs.version }}"
          git push origin "v${{ github.event.inputs.version }}"
          echo "tag=v${{ github.event.inputs.version }}" >> $GITHUB_OUTPUT

  lint:
    needs: validate-and-tag
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ needs.validate-and-tag.outputs.tag_name }}

      - name: Install UV
        run: |
          curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
          echo "$HOME/.cargo/bin" >> $GITHUB_PATH

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          uv pip install --system -e ".[dev]"

      - name: Run ruff check
        run: |
          uv run ruff check src tests

      - name: Run ruff format check
        run: |
          uv run ruff format --check src tests

  typecheck:
    needs: validate-and-tag
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ needs.validate-and-tag.outputs.tag_name }}

      - name: Install UV
        run: |
          curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
          echo "$HOME/.cargo/bin" >> $GITHUB_PATH

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          uv pip install --system -e ".[dev]"

      - name: Run mypy
        run: |
          uv run mypy src

  test:
    needs: validate-and-tag
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ needs.validate-and-tag.outputs.tag_name }}

      - name: Install UV
        run: |
          curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
          echo "$HOME/.cargo/bin" >> $GITHUB_PATH

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install xdelta3
        run: |
          sudo apt-get update
          sudo apt-get install -y xdelta3

      - name: Install dependencies
        run: |
          uv pip install --system -e ".[dev]"

      - name: Run unit tests
        run: |
          uv run pytest tests/unit -v --tb=short

      - name: Run integration tests
        run: |
          uv run pytest tests/integration -v --tb=short

  e2e-test:
    needs: validate-and-tag
    runs-on: ubuntu-latest
    services:
      localstack:
        image: localstack/localstack:latest
        ports:
          - 4566:4566
        env:
          SERVICES: s3
          DEBUG: 0
          DATA_DIR: /tmp/localstack/data
        options: >-
          --health-cmd "curl -f http://localhost:4566/_localstack/health"
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5

    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ needs.validate-and-tag.outputs.tag_name }}

      - name: Install UV
        run: |
          curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
          echo "$HOME/.cargo/bin" >> $GITHUB_PATH

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install xdelta3
        run: |
          sudo apt-get update
          sudo apt-get install -y xdelta3

      - name: Install dependencies
        run: |
          uv pip install --system -e ".[dev]"

      - name: Run E2E tests
        env:
          AWS_ACCESS_KEY_ID: test
          AWS_SECRET_ACCESS_KEY: test
          AWS_DEFAULT_REGION: us-east-1
          AWS_ENDPOINT_URL: http://localhost:4566
        run: |
          uv run pytest tests/e2e -v --tb=short

  publish:
    needs: [validate-and-tag, lint, typecheck, test, e2e-test]
    runs-on: ubuntu-latest
    environment: ${{ github.event.inputs.pypi_environment }}
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ needs.validate-and-tag.outputs.tag_name }}
          fetch-depth: 0 # Important for setuptools-scm

      - name: Install UV
        run: |
          curl -LsSf https://astral.sh/uv/${{ env.UV_VERSION }}/install.sh | sh
          echo "$HOME/.cargo/bin" >> $GITHUB_PATH

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Build package
        run: |
          uv build

      - name: Publish to TestPyPI
        if: github.event.inputs.pypi_environment == 'testpypi'
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          repository-url: https://test.pypi.org/legacy/
          password: ${{ secrets.TEST_PYPI_API_TOKEN }}

      - name: Publish to PyPI
        if: github.event.inputs.pypi_environment == 'pypi'
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}

      - name: Create GitHub Release
        uses: softprops/action-gh-release@v1
        continue-on-error: true # Don't fail if GitHub release creation fails
        with:
          tag_name: ${{ needs.validate-and-tag.outputs.tag_name }}
          name: Release v${{ github.event.inputs.version }}
          body: |
            ## DeltaGlider v${{ github.event.inputs.version }}

            Published to ${{ github.event.inputs.pypi_environment == 'pypi' && 'PyPI' || 'TestPyPI' }}

            ### Installation
            ```bash
            pip install deltaglider==${{ github.event.inputs.version }}
            ```
          draft: false
          prerelease: ${{ contains(github.event.inputs.version, '-') }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.gitignore (vendored, 2 lines changed)

@@ -1,4 +1,5 @@
# Python
ror-data-importer/
__pycache__/
*.py[cod]
*$py.class

@@ -85,3 +86,4 @@ docs/_templates/

# Temporary downloads
temp_downloads/
src/deltaglider/_version.py

BOTO3_COMPATIBILITY.md (new file, 225 lines)

@@ -0,0 +1,225 @@
# boto3 S3 Client Compatibility

DeltaGlider implements a **subset** of boto3's S3 client API, focusing on the most commonly used operations. This is **not** a 100% drop-in replacement, but covers the core functionality needed for most use cases.

## ✅ Implemented Methods (21 core methods)

### Object Operations
- ✅ `put_object()` - Upload objects (with automatic delta compression)
- ✅ `get_object()` - Download objects (with automatic delta reconstruction)
- ✅ `delete_object()` - Delete single object
- ✅ `delete_objects()` - Delete multiple objects
- ✅ `head_object()` - Get object metadata
- ✅ `list_objects()` - List objects (list_objects_v2 compatible)
- ✅ `copy_object()` - Copy objects between locations

### Bucket Operations
- ✅ `create_bucket()` - Create buckets
- ✅ `delete_bucket()` - Delete empty buckets
- ✅ `list_buckets()` - List all buckets

### Presigned URLs
- ✅ `generate_presigned_url()` - Generate presigned URLs
- ✅ `generate_presigned_post()` - Generate presigned POST data

### DeltaGlider Extensions
- ✅ `upload()` - Simple upload with S3 URL
- ✅ `download()` - Simple download with S3 URL
- ✅ `verify()` - Verify object integrity
- ✅ `upload_chunked()` - Upload with progress callback
- ✅ `upload_batch()` - Batch upload multiple files
- ✅ `download_batch()` - Batch download multiple files
- ✅ `estimate_compression()` - Estimate compression ratio
- ✅ `find_similar_files()` - Find similar files for delta reference
- ✅ `get_object_info()` - Get detailed object info with compression stats
- ✅ `get_bucket_stats()` - Get bucket statistics
- ✅ `delete_objects_recursive()` - Recursively delete objects
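
For orientation, here is a minimal usage sketch of the implemented subset. It is illustrative only: the bucket and key names are invented, the `Prefix` argument and the `Body`-stream shape of the `get_object` response are assumed to follow the boto3 conventions the methods above mirror, and it is not lifted from the DeltaGlider codebase.

```python
from deltaglider import create_client

# Credentials come from the environment or ~/.aws/credentials, as with boto3.
client = create_client()

# Core CRUD via the boto3-style interface (hypothetical bucket/key names).
client.create_bucket(Bucket="releases")
with open("app.zip", "rb") as f:
    client.put_object(Bucket="releases", Key="v1.0.0/app.zip", Body=f.read())

# Assumed boto3-like response shape: metadata dict plus a readable Body.
obj = client.get_object(Bucket="releases", Key="v1.0.0/app.zip")
data = obj["Body"].read()

# Listing is list_objects_v2-compatible, so results live under "Contents".
for entry in client.list_objects(Bucket="releases", Prefix="v1.0.0/")["Contents"]:
    print(entry["Key"])
```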

## ❌ Not Implemented (80+ methods)

### Multipart Upload
- ❌ `create_multipart_upload()`
- ❌ `upload_part()`
- ❌ `complete_multipart_upload()`
- ❌ `abort_multipart_upload()`
- ❌ `list_multipart_uploads()`
- ❌ `list_parts()`

### Access Control (ACL)
- ❌ `get_bucket_acl()`
- ❌ `put_bucket_acl()`
- ❌ `get_object_acl()`
- ❌ `put_object_acl()`
- ❌ `get_public_access_block()`
- ❌ `put_public_access_block()`
- ❌ `delete_public_access_block()`

### Bucket Configuration
- ❌ `get_bucket_location()`
- ❌ `get_bucket_versioning()`
- ❌ `put_bucket_versioning()`
- ❌ `get_bucket_logging()`
- ❌ `put_bucket_logging()`
- ❌ `get_bucket_website()`
- ❌ `put_bucket_website()`
- ❌ `delete_bucket_website()`
- ❌ `get_bucket_cors()`
- ❌ `put_bucket_cors()`
- ❌ `delete_bucket_cors()`
- ❌ `get_bucket_lifecycle_configuration()`
- ❌ `put_bucket_lifecycle_configuration()`
- ❌ `delete_bucket_lifecycle()`
- ❌ `get_bucket_policy()`
- ❌ `put_bucket_policy()`
- ❌ `delete_bucket_policy()`
- ❌ `get_bucket_encryption()`
- ❌ `put_bucket_encryption()`
- ❌ `delete_bucket_encryption()`
- ❌ `get_bucket_notification_configuration()`
- ❌ `put_bucket_notification_configuration()`
- ❌ `get_bucket_accelerate_configuration()`
- ❌ `put_bucket_accelerate_configuration()`
- ❌ `get_bucket_request_payment()`
- ❌ `put_bucket_request_payment()`
- ❌ `get_bucket_replication()`
- ❌ `put_bucket_replication()`
- ❌ `delete_bucket_replication()`

### Tagging & Metadata
- ❌ `get_object_tagging()`
- ❌ `put_object_tagging()`
- ❌ `delete_object_tagging()`
- ❌ `get_bucket_tagging()`
- ❌ `put_bucket_tagging()`
- ❌ `delete_bucket_tagging()`

### Advanced Features
- ❌ `restore_object()` - Glacier restore
- ❌ `select_object_content()` - S3 Select
- ❌ `get_object_torrent()` - BitTorrent
- ❌ `get_object_legal_hold()` - Object Lock
- ❌ `put_object_legal_hold()`
- ❌ `get_object_retention()`
- ❌ `put_object_retention()`
- ❌ `get_bucket_analytics_configuration()`
- ❌ `put_bucket_analytics_configuration()`
- ❌ `delete_bucket_analytics_configuration()`
- ❌ `list_bucket_analytics_configurations()`
- ❌ `get_bucket_metrics_configuration()`
- ❌ `put_bucket_metrics_configuration()`
- ❌ `delete_bucket_metrics_configuration()`
- ❌ `list_bucket_metrics_configurations()`
- ❌ `get_bucket_inventory_configuration()`
- ❌ `put_bucket_inventory_configuration()`
- ❌ `delete_bucket_inventory_configuration()`
- ❌ `list_bucket_inventory_configurations()`
- ❌ `get_bucket_intelligent_tiering_configuration()`
- ❌ `put_bucket_intelligent_tiering_configuration()`
- ❌ `delete_bucket_intelligent_tiering_configuration()`
- ❌ `list_bucket_intelligent_tiering_configurations()`

### Helper Methods
- ❌ `download_file()` - High-level download
- ❌ `upload_file()` - High-level upload
- ❌ `download_fileobj()` - Download to file object
- ❌ `upload_fileobj()` - Upload from file object

### Other
- ❌ `get_bucket_ownership_controls()`
- ❌ `put_bucket_ownership_controls()`
- ❌ `delete_bucket_ownership_controls()`
- ❌ `get_bucket_policy_status()`
- ❌ `list_object_versions()`
- ❌ `create_session()` - S3 Express
- And 20+ more metadata/configuration methods...

## Coverage Analysis

**Implemented:** ~21 methods
**Total boto3 S3 methods:** ~100+ methods
**Coverage:** ~20%

## What's Covered

DeltaGlider focuses on:
1. ✅ **Core CRUD operations** - put, get, delete, list
2. ✅ **Bucket management** - create, delete, list buckets
3. ✅ **Basic metadata** - head_object
4. ✅ **Presigned URLs** - generate_presigned_url/post
5. ✅ **Delta compression** - automatic for archive files
6. ✅ **Batch operations** - upload_batch, download_batch
7. ✅ **Compression stats** - get_bucket_stats, estimate_compression

## What's NOT Covered

❌ **Advanced bucket configuration** (versioning, lifecycle, logging, etc.)
❌ **Access control** (ACLs, bucket policies)
❌ **Multipart uploads** (for >5GB files)
❌ **Advanced features** (S3 Select, Glacier, Object Lock)
❌ **Tagging APIs** (object/bucket tags)
❌ **High-level transfer utilities** (upload_file, download_file)

## Use Cases

### ✅ DeltaGlider is PERFECT for:
- Storing versioned releases/builds
- Backup storage with deduplication
- CI/CD artifact storage
- Docker layer storage
- Archive file storage (zip, tar, etc.)
- Simple S3 storage needs

### ❌ Use boto3 directly for:
- Complex bucket policies
- Versioning/lifecycle management
- Multipart uploads (>5GB files)
- S3 Select queries
- Glacier deep archive
- Object Lock/Legal Hold
- Advanced ACL management

## Migration Strategy

If you need both boto3 and DeltaGlider:

```python
from deltaglider import create_client
import boto3

# Use DeltaGlider for objects (with compression!)
dg_client = create_client()
dg_client.put_object(Bucket='releases', Key='app.zip', Body=data)

# Use boto3 for advanced features
s3_client = boto3.client('s3')
s3_client.put_bucket_versioning(
    Bucket='releases',
    VersioningConfiguration={'Status': 'Enabled'}
)
```

## Future Additions

Likely to be added:
- `upload_file()` / `download_file()` - High-level helpers
- `copy_object()` - Object copying
- Basic tagging support
- Multipart upload (for large files)

Unlikely to be added:
- Advanced bucket configuration
- ACL management
- S3 Select
- Glacier operations

## Conclusion

**DeltaGlider is NOT a 100% drop-in boto3 replacement.**

It implements the **20% of boto3 methods that cover 80% of use cases**, with a focus on:
- Core object operations
- Bucket management
- Delta compression for storage savings
- Simple, clean API

For advanced S3 features, use boto3 directly or in combination with DeltaGlider.

CHANGELOG.md (new file, 163 lines)

@@ -0,0 +1,163 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

## [5.0.3] - 2025-10-10

### Security
- **BREAKING**: Removed all legacy shared cache code for security
- **BREAKING**: Encryption is now ALWAYS ON (cannot be disabled)
- Ephemeral process-isolated cache is now the ONLY mode (no opt-out)
- **Content-Addressed Storage (CAS)**: Implemented SHA256-based cache storage
  - Zero collision risk (SHA256 namespace guarantees uniqueness)
  - Automatic deduplication (same content = same filename)
  - Tampering protection (changing content changes SHA, breaks lookup)
  - Two-level directory structure for filesystem optimization
- **Encrypted Cache**: All cache data encrypted at rest using Fernet (AES-128-CBC + HMAC)
  - Ephemeral encryption keys per process (forward secrecy)
  - Optional persistent keys via `DG_CACHE_ENCRYPTION_KEY` for shared filesystems
  - Automatic cleanup of corrupted cache files on decryption failures
- Fixed TOCTOU vulnerabilities with atomic SHA validation at use-time
- Added `get_validated_ref()` method to prevent cache poisoning
- Eliminated multi-user data exposure through mandatory cache isolation
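
For context on the Fernet primitive mentioned above, this is roughly the encrypt-at-rest round trip it provides. The sketch uses the `cryptography` package's public API and a made-up payload; it is not DeltaGlider's actual cache code.

```python
from cryptography.fernet import Fernet, InvalidToken

# Ephemeral key, generated once per process (forward secrecy); nothing persists it.
key = Fernet.generate_key()
fernet = Fernet(key)

# Encrypt a cache payload before it is written to disk...
ciphertext = fernet.encrypt(b"reference file bytes")

# ...and decrypt on read. A tampered or corrupted file raises InvalidToken,
# which a cache layer can treat as "delete the entry and re-fetch".
try:
    plaintext = fernet.decrypt(ciphertext)
except InvalidToken:
    plaintext = None  # corrupted entry: discard and rebuild
assert plaintext == b"reference file bytes"
```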

### Removed
- **BREAKING**: Removed `DG_UNSAFE_SHARED_CACHE` environment variable
- **BREAKING**: Removed `DG_CACHE_DIR` environment variable
- **BREAKING**: Removed `DG_CACHE_ENCRYPTION` environment variable (encryption always on)
- **BREAKING**: Removed `cache_dir` parameter from `create_client()`

### Changed
- Cache is now auto-created in `/tmp/deltaglider-*` and cleaned on exit
- All cache operations use file locking (Unix) and SHA validation
- Added `CacheMissError` and `CacheCorruptionError` exceptions

### Added
- New `ContentAddressedCache` adapter in `adapters/cache_cas.py`
- New `EncryptedCache` wrapper in `adapters/cache_encrypted.py`
- New `MemoryCache` adapter in `adapters/cache_memory.py` with LRU eviction
- Self-describing cache structure with SHA256-based filenames
- Configurable cache backends via `DG_CACHE_BACKEND` (filesystem or memory)
- Memory cache size limit via `DG_CACHE_MEMORY_SIZE_MB` (default: 100MB)

### Internal
- Updated all tests to use Content-Addressed Storage and encryption
- All 119 tests passing with zero errors (99 original + 20 new cache tests)
- Type checking: 0 errors (mypy)
- Linting: All checks passed (ruff)
- Completed Phase 1, 2, and 7 of SECURITY_FIX_ROADMAP.md
- Added comprehensive test suites for encryption (13 tests) and memory cache (10 tests)

## [5.0.1] - 2025-01-10

### Changed
- **Code Organization**: Refactored client.py from 1560 to 1154 lines (26% reduction)
- Extracted client operations into modular `client_operations/` package:
  - `bucket.py` - S3 bucket management operations
  - `presigned.py` - Presigned URL generation
  - `batch.py` - Batch upload/download operations
  - `stats.py` - Analytics and statistics operations
- Improved code maintainability with logical separation of concerns
- Better developer experience with cleaner module structure

### Internal
- Full type safety maintained with mypy (0 errors)
- All 99 tests passing
- Code quality checks passing (ruff)
- No breaking changes - all public APIs remain unchanged

## [5.0.0] - 2025-01-10

### Added
- boto3-compatible TypedDict types for S3 responses (no boto3 import needed)
- Complete boto3 compatibility vision document
- Type-safe response builders using TypedDict patterns

### Changed
- **BREAKING**: `list_objects()` now returns boto3-compatible dict instead of custom dataclass
  - Use `response['Contents']` instead of `response.contents`
  - Use `response.get('IsTruncated')` instead of `response.is_truncated`
  - Use `response.get('NextContinuationToken')` instead of `response.next_continuation_token`
  - DeltaGlider metadata now in `Metadata` field of each object
- Internal response building now uses TypedDict for compile-time type safety
- All S3 responses are dicts at runtime (TypedDict is a dict!)
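
A minimal before/after sketch of this `list_objects()` change, using the field names listed above (the bucket name is illustrative, not taken from the project):

```python
from deltaglider import create_client

client = create_client()
response = client.list_objects(Bucket="releases")

# 4.x (custom dataclass):  for obj in response.contents: ...
# 5.x (boto3-compatible dict):
for obj in response["Contents"]:
    print(obj["Key"], obj.get("Metadata", {}))

# Pagination fields are plain dict keys now.
if response.get("IsTruncated"):
    token = response.get("NextContinuationToken")
```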

### Fixed
- Updated all documentation examples to use dict-based responses
- Fixed pagination examples in README and API docs
- Corrected SDK documentation with accurate method signatures

## [4.2.4] - 2025-01-10

### Fixed
- Show only filename in `ls` output instead of full path for cleaner display
- Correct `ls` command path handling and prefix display logic

## [4.2.3] - 2025-01-07

### Added
- Comprehensive test coverage for `delete_objects_recursive()` method with 19 thorough tests
- Tests cover delta suffix handling, error/warning aggregation, statistics tracking, and edge cases
- Better code organization with separate `client_models.py` and `client_delete_helpers.py` modules

### Fixed
- Fixed all mypy type errors using proper `cast()` for type safety
- Improved type hints for dictionary operations in client code

### Changed
- Refactored client code into logical modules for better maintainability
- Enhanced code quality with comprehensive linting and type checking
- All 99 integration/unit tests passing with zero type errors

### Internal
- Better separation of concerns in client module
- Improved developer experience with clearer code structure

## [4.2.2] - 2024-10-06

### Fixed
- Add .delta suffix fallback for `delete_object()` method
- Handle regular S3 objects without DeltaGlider metadata
- Update mypy type ignore comment for compatibility

## [4.2.1] - 2024-10-06

### Fixed
- Make GitHub release creation non-blocking in workflows

## [4.2.0] - 2024-10-03

### Added
- AWS credential parameters to `create_client()` function
- Support for custom endpoint URLs
- Enhanced boto3 compatibility

## [4.1.0] - 2024-09-29

### Added
- boto3-compatible client API
- Bucket management methods
- Comprehensive SDK documentation

## [4.0.0] - 2024-09-21

### Added
- Initial public release
- CLI with AWS S3 compatibility
- Delta compression for versioned artifacts
- 99%+ compression for similar files

[5.0.1]: https://github.com/beshu-tech/deltaglider/compare/v5.0.0...v5.0.1
[5.0.0]: https://github.com/beshu-tech/deltaglider/compare/v4.2.4...v5.0.0
[4.2.4]: https://github.com/beshu-tech/deltaglider/compare/v4.2.3...v4.2.4
[4.2.3]: https://github.com/beshu-tech/deltaglider/compare/v4.2.2...v4.2.3
[4.2.2]: https://github.com/beshu-tech/deltaglider/compare/v4.2.1...v4.2.2
[4.2.1]: https://github.com/beshu-tech/deltaglider/compare/v4.2.0...v4.2.1
[4.2.0]: https://github.com/beshu-tech/deltaglider/compare/v4.1.0...v4.2.0
[4.1.0]: https://github.com/beshu-tech/deltaglider/compare/v4.0.0...v4.1.0
[4.0.0]: https://github.com/beshu-tech/deltaglider/releases/tag/v4.0.0

CLAUDE.md (50 lines changed)

@@ -97,13 +97,15 @@ src/deltaglider/
│   ├── logger.py          # LoggerPort protocol for logging
│   └── metrics.py         # MetricsPort protocol for observability
├── adapters/              # Concrete implementations
│   ├── storage_s3.py      # S3StorageAdapter using boto3
│   ├── diff_xdelta.py     # XdeltaAdapter using xdelta3 binary
│   ├── hash_sha256.py     # Sha256Adapter for checksums
│   ├── cache_fs.py        # FsCacheAdapter for file system cache
│   ├── clock_utc.py       # UtcClockAdapter for UTC timestamps
│   ├── logger_std.py      # StdLoggerAdapter for console output
│   └── metrics_noop.py    # NoopMetricsAdapter (placeholder)
│   ├── storage_s3.py      # S3StorageAdapter using boto3
│   ├── diff_xdelta.py     # XdeltaAdapter using xdelta3 binary
│   ├── hash_sha256.py     # Sha256Adapter for checksums
│   ├── cache_cas.py       # ContentAddressedCache (SHA256-based storage)
│   ├── cache_encrypted.py # EncryptedCache (Fernet encryption wrapper)
│   ├── cache_memory.py    # MemoryCache (LRU in-memory cache)
│   ├── clock_utc.py       # UtcClockAdapter for UTC timestamps
│   ├── logger_std.py      # StdLoggerAdapter for console output
│   └── metrics_noop.py    # NoopMetricsAdapter (placeholder)
└── app/
    └── cli/               # Click-based CLI application
        ├── main.py        # Main CLI entry point with AWS S3 commands

@@ -129,7 +131,6 @@ src/deltaglider/
4. **AWS S3 CLI Compatibility**:
   - Commands (`cp`, `ls`, `rm`, `sync`) mirror AWS CLI syntax exactly
   - Located in `app/cli/main.py` with helpers in `aws_compat.py`
   - Maintains backward compatibility with original `put`/`get` commands

### Key Algorithms

@@ -141,7 +142,13 @@ src/deltaglider/
2. **Reference Management** (`core/service.py`):
   - Reference stored at `{deltaspace.prefix}/reference.bin`
   - SHA256 verification on every read/write
   - Local cache in `/tmp/.deltaglider/reference_cache` for performance
   - **Content-Addressed Storage (CAS)** cache in `/tmp/deltaglider-*` (ephemeral)
   - Cache uses SHA256 as filename with two-level directory structure (ab/cd/abcdef...)
   - Automatic deduplication: same content = same SHA = same cache file
   - Zero collision risk: SHA256 namespace guarantees uniqueness
   - **Encryption**: Optional Fernet (AES-128-CBC + HMAC) encryption at rest (enabled by default)
   - Ephemeral encryption keys per process for forward secrecy
   - **Cache Backends**: Configurable filesystem or in-memory cache with LRU eviction
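
As an illustration of the two-level layout described above, a hypothetical helper that maps content to its cache path (names and layout are illustrative; this is not the actual DeltaGlider implementation):

```python
import hashlib
from pathlib import Path

def cas_path(cache_root: Path, content: bytes) -> Path:
    """Illustrative content-addressed layout: <root>/ab/cd/abcdef... (SHA256 hex)."""
    sha = hashlib.sha256(content).hexdigest()
    return cache_root / sha[:2] / sha[2:4] / sha

root = Path("/tmp/deltaglider-example")
path = cas_path(root, b"reference file bytes")
path.parent.mkdir(parents=True, exist_ok=True)
path.write_bytes(b"reference file bytes")  # identical content always maps to the same path
```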

3. **Sync Algorithm** (`app/cli/sync.py`):
   - Compares local vs S3 using size and modification time

@@ -182,13 +189,22 @@ Core delta logic is in `src/deltaglider/core/service.py`:
## Environment Variables

- `DG_LOG_LEVEL`: Logging level (default: "INFO")
- `DG_CACHE_DIR`: Local reference cache directory (default: "/tmp/.deltaglider/reference_cache")
- `DG_MAX_RATIO`: Maximum acceptable delta/file ratio (default: "0.5")
- `DG_CACHE_BACKEND`: Cache backend type - "filesystem" (default) or "memory"
- `DG_CACHE_MEMORY_SIZE_MB`: Memory cache size limit in MB (default: "100")
- `DG_CACHE_ENCRYPTION_KEY`: Optional base64-encoded Fernet key for persistent encryption (ephemeral by default)
- `AWS_ENDPOINT_URL`: Override S3 endpoint for MinIO/LocalStack
- `AWS_ACCESS_KEY_ID`: AWS credentials
- `AWS_SECRET_ACCESS_KEY`: AWS credentials
- `AWS_DEFAULT_REGION`: AWS region

**Security Notes**:
- **Encryption Always On**: Cache data is ALWAYS encrypted (cannot be disabled)
- **Ephemeral Keys**: Encryption keys auto-generated per process for maximum security
- **Auto-Cleanup**: Corrupted cache files automatically deleted on decryption failures
- **Process Isolation**: Each process gets isolated cache in `/tmp/deltaglider-*`, cleaned up on exit
- **Persistent Keys**: Set `DG_CACHE_ENCRYPTION_KEY` only if you need cross-process cache sharing (e.g., shared filesystems)

## Important Implementation Details

1. **xdelta3 Binary Dependency**: The system requires xdelta3 binary installed on the system. The `XdeltaAdapter` uses subprocess to call it.

@@ -203,7 +219,11 @@ Core delta logic is in `src/deltaglider/core/service.py`:

## Performance Considerations

- Local reference caching dramatically improves performance for repeated operations
- **Content-Addressed Storage**: SHA256-based deduplication eliminates redundant storage
- **Cache Backends**:
  - Filesystem cache (default): persistent across processes, good for shared workflows
  - Memory cache: faster, zero I/O, perfect for ephemeral CI/CD pipelines
- **Encryption Overhead**: ~10-15% performance impact, provides security at rest
- Delta compression is CPU-intensive; consider parallelization for bulk uploads
- The default max_ratio of 0.5 prevents storing inefficient deltas
- For files <1MB, delta overhead may exceed benefits

@@ -213,4 +233,10 @@ Core delta logic is in `src/deltaglider/core/service.py`:
- Never store AWS credentials in code
- Use IAM roles when possible
- All S3 operations respect bucket policies and encryption settings
- SHA256 checksums prevent tampering and corruption
- SHA256 checksums prevent tampering and corruption
- **Encryption Always On**: Cache data is ALWAYS encrypted using Fernet (AES-128-CBC + HMAC) - cannot be disabled
- **Ephemeral Keys**: Encryption keys auto-generated per process for forward secrecy and process isolation
- **Auto-Cleanup**: Corrupted or tampered cache files automatically deleted on decryption failures
- **Persistent Keys**: Set `DG_CACHE_ENCRYPTION_KEY` only for cross-process cache sharing (use secrets management)
- **Content-Addressed Storage**: SHA256-based filenames prevent collision attacks
- **Zero-Trust Cache**: All cache operations include cryptographic validation

Dockerfile (33 lines changed)

@@ -30,7 +30,16 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Runtime stage - minimal image
FROM python:${PYTHON_VERSION}

# Install xdelta3
# Skip man pages and docs to speed up builds
RUN mkdir -p /etc/dpkg/dpkg.cfg.d && \
    echo 'path-exclude /usr/share/doc/*' > /etc/dpkg/dpkg.cfg.d/01_nodoc && \
    echo 'path-exclude /usr/share/man/*' >> /etc/dpkg/dpkg.cfg.d/01_nodoc && \
    echo 'path-exclude /usr/share/groff/*' >> /etc/dpkg/dpkg.cfg.d/01_nodoc && \
    echo 'path-exclude /usr/share/info/*' >> /etc/dpkg/dpkg.cfg.d/01_nodoc && \
    echo 'path-exclude /usr/share/lintian/*' >> /etc/dpkg/dpkg.cfg.d/01_nodoc && \
    echo 'path-exclude /usr/share/linda/*' >> /etc/dpkg/dpkg.cfg.d/01_nodoc

# Install xdelta3 (now much faster without man pages)
RUN apt-get update && \
    apt-get install -y --no-install-recommends xdelta3 && \
    apt-get clean && \

@@ -57,10 +66,28 @@ USER deltaglider
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD deltaglider --help || exit 1

# Environment variables (all optional, can be overridden at runtime)
# Logging
ENV DG_LOG_LEVEL=INFO

# Performance & Compression
ENV DG_MAX_RATIO=0.5

# Cache Configuration
ENV DG_CACHE_BACKEND=filesystem
ENV DG_CACHE_MEMORY_SIZE_MB=100
# ENV DG_CACHE_ENCRYPTION_KEY=<base64-key> # Optional: Set for cross-process cache sharing

# AWS Configuration (override at runtime)
# ENV AWS_ENDPOINT_URL=https://s3.amazonaws.com
# ENV AWS_ACCESS_KEY_ID=<your-key>
# ENV AWS_SECRET_ACCESS_KEY=<your-secret>
# ENV AWS_DEFAULT_REGION=us-east-1

# Labels
LABEL org.opencontainers.image.title="DeltaGlider" \
      org.opencontainers.image.description="Delta-aware S3 file storage wrapper" \
      org.opencontainers.image.version="0.1.0" \
      org.opencontainers.image.description="Delta-aware S3 file storage wrapper with encryption" \
      org.opencontainers.image.version="5.0.3" \
      org.opencontainers.image.authors="Beshu Limited" \
      org.opencontainers.image.source="https://github.com/beshu-tech/deltaglider"

PYPI_RELEASE.md (deleted, 122 lines)

@@ -1,122 +0,0 @@
# Publishing DeltaGlider to PyPI

## Prerequisites

1. Create PyPI account at https://pypi.org
2. Create API token at https://pypi.org/manage/account/token/
3. Install build tools:
   ```bash
   pip install build twine
   ```

## Build the Package

```bash
# Clean previous builds
rm -rf dist/ build/ *.egg-info/

# Build source distribution and wheel
python -m build

# This creates:
# - dist/deltaglider-0.1.0.tar.gz (source distribution)
# - dist/deltaglider-0.1.0-py3-none-any.whl (wheel)
```

## Test with TestPyPI (Optional but Recommended)

1. Upload to TestPyPI:
   ```bash
   python -m twine upload --repository testpypi dist/*
   ```

2. Test installation:
   ```bash
   pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ deltaglider
   ```

## Upload to PyPI

```bash
# Upload to PyPI
python -m twine upload dist/*

# You'll be prompted for:
# - username: __token__
# - password: <your-pypi-api-token>
```

## Verify Installation

```bash
# Install from PyPI
pip install deltaglider

# Test it works
deltaglider --help
```

## GitHub Release

After PyPI release, create a GitHub release:

```bash
git tag -a v0.1.0 -m "Release version 0.1.0"
git push origin v0.1.0
```

Then create a release on GitHub:
1. Go to https://github.com/beshu-tech/deltaglider/releases
2. Click "Create a new release"
3. Select the tag v0.1.0
4. Add release notes from CHANGELOG
5. Attach the wheel and source distribution from dist/
6. Publish release

## Version Bumping

For next release:
1. Update version in `pyproject.toml`
2. Update CHANGELOG
3. Commit changes
4. Follow steps above

## Automated Release (GitHub Actions)

Consider adding `.github/workflows/publish.yml`:

```yaml
name: Publish to PyPI

on:
  release:
    types: [published]

jobs:
  publish:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: |
          pip install build twine
      - name: Build package
        run: python -m build
      - name: Publish to PyPI
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
        run: |
          twine upload dist/*
```

## Marketing After Release

1. **Hacker News**: Post with compelling title focusing on the 99.9% compression
2. **Reddit**: r/Python, r/devops, r/aws
3. **Twitter/X**: Tag AWS, Python, and DevOps influencers
4. **Dev.to / Medium**: Write technical article about the architecture
5. **PyPI Description**: Ensure it's compelling and includes the case study link

README.md (645 lines changed)

@@ -7,16 +7,16 @@
[](https://github.com/jmacd/xdelta)

<div align="center">
<img src="https://github.com/sscarduzio/deltaglider/raw/main/docs/deltaglider.png" alt="DeltaGlider Logo" width="500"/>
<img src="https://github.com/beshu-tech/deltaglider/raw/main/docs/deltaglider.png" alt="DeltaGlider Logo" width="500"/>
</div>

**Store 4TB of similar files in 5GB. No, that's not a typo.**

DeltaGlider is a drop-in S3 replacement that achieves 99.9% compression for versioned artifacts, backups, and release archives through intelligent binary delta compression.
DeltaGlider is a drop-in S3 replacement that may achieve 99.9% size reduction for versioned compressed artifacts, backups, and release archives through intelligent binary delta compression (via xdelta3).

## The Problem We Solved

You're storing hundreds of versions of your releases. Each 100MB build differs by <1% from the previous version. You're paying to store 100GB of what's essentially 100MB of unique data.
You're storing hundreds of versions of your software releases. Each 100MB build differs by <1% from the previous version. You're paying to store 100GB of what's essentially 100MB of unique data.

Sound familiar?

@@ -28,7 +28,99 @@ From our [ReadOnlyREST case study](docs/case-study-readonlyrest.md):
- **Compression**: 99.9% (not a typo)
- **Integration time**: 5 minutes

## How It Works
## Quick Start

The quickest way to get started is with the GUI:
* https://github.com/sscarduzio/dg_commander/

### CLI Installation

```bash
# Via pip (Python 3.11+)
pip install deltaglider

# Via uv (faster)
uv pip install deltaglider

# Via Docker
docker run -v ~/.aws:/root/.aws deltaglider/deltaglider --help
```

### Docker Usage

DeltaGlider provides a secure, production-ready Docker image with encryption always enabled:

```bash
# Basic usage with AWS credentials from environment
docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
  deltaglider/deltaglider ls s3://my-bucket/

# Mount AWS credentials
docker run -v ~/.aws:/root/.aws:ro \
  deltaglider/deltaglider cp file.zip s3://releases/

# Use memory cache for ephemeral CI/CD pipelines (faster)
docker run -e DG_CACHE_BACKEND=memory \
  -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
  deltaglider/deltaglider sync ./dist/ s3://releases/v1.0.0/

# Configure memory cache size (default: 100MB)
docker run -e DG_CACHE_BACKEND=memory \
  -e DG_CACHE_MEMORY_SIZE_MB=500 \
  -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
  deltaglider/deltaglider cp large-file.zip s3://releases/

# Use MinIO or custom S3 endpoint
docker run -e AWS_ENDPOINT_URL=http://minio:9000 \
  -e AWS_ACCESS_KEY_ID=minioadmin \
  -e AWS_SECRET_ACCESS_KEY=minioadmin \
  deltaglider/deltaglider ls s3://test-bucket/

# Persistent encryption key for cross-container cache sharing
# (Only needed if sharing cache across containers via volume mount)
docker run -v /shared-cache:/tmp/.deltaglider \
  -e DG_CACHE_ENCRYPTION_KEY=$(openssl rand -base64 32) \
  deltaglider/deltaglider cp file.zip s3://releases/
```

**Environment Variables**:
- `DG_LOG_LEVEL`: Logging level (default: `INFO`, options: `DEBUG`, `INFO`, `WARNING`, `ERROR`)
- `DG_MAX_RATIO`: Maximum delta/file ratio (default: `0.5`, range: `0.0-1.0`)
- `DG_CACHE_BACKEND`: Cache backend (default: `filesystem`, options: `filesystem`, `memory`)
- `DG_CACHE_MEMORY_SIZE_MB`: Memory cache size in MB (default: `100`)
- `DG_CACHE_ENCRYPTION_KEY`: Optional base64-encoded encryption key for cross-process cache sharing
- `AWS_ENDPOINT_URL`: S3 endpoint URL (default: AWS S3)
- `AWS_ACCESS_KEY_ID`: AWS access key
- `AWS_SECRET_ACCESS_KEY`: AWS secret key
- `AWS_DEFAULT_REGION`: AWS region (default: `us-east-1`)

**Security Notes**:
- Encryption is **always enabled** (cannot be disabled)
- Each container gets ephemeral encryption keys for maximum security
- Corrupted cache files are automatically deleted
- Use `DG_CACHE_ENCRYPTION_KEY` only for persistent cache sharing (store securely)

### Basic Usage

```bash
# Upload a file (automatic delta compression)
deltaglider cp my-app-v1.0.0.zip s3://releases/

# Download a file (automatic delta reconstruction)
deltaglider cp s3://releases/my-app-v1.0.0.zip ./downloaded.zip

# List objects
deltaglider ls s3://releases/

# Sync directories
deltaglider sync ./dist/ s3://releases/v1.0.0/
```

**That's it!** DeltaGlider automatically detects similar files and can apply 99%+ compression to them. For more commands and options, see [CLI Reference](#cli-reference).

## Core Concepts

### How It Works

```
Traditional S3:
@@ -42,24 +134,32 @@ With DeltaGlider:
v1.0.2.zip (100MB) → S3: 97KB delta (100.3MB total)
```

## Quick Start
DeltaGlider stores the first file as a reference and subsequent similar files as tiny deltas (differences). When you download, it reconstructs the original file perfectly using the reference + delta.
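
Under the hood this is plain xdelta3 (the adapter shells out to the binary). A hedged sketch of the same reference-plus-delta round trip, with illustrative file names and the standard xdelta3 flags (`-e` encode, `-d` decode, `-s` reference); it is not DeltaGlider's actual adapter code:

```python
import subprocess

# Encode: produce a tiny delta of new.zip against reference.zip.
subprocess.run(
    ["xdelta3", "-e", "-s", "reference.zip", "new.zip", "new.zip.delta"],
    check=True,
)

# Decode: rebuild the original bytes from reference + delta.
subprocess.run(
    ["xdelta3", "-d", "-s", "reference.zip", "new.zip.delta", "rebuilt.zip"],
    check=True,
)
```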
|
||||
|
||||
### Installation
|
||||
### Intelligent File Type Detection
|
||||
|
||||
```bash
|
||||
# Via pip (Python 3.11+)
|
||||
pip install deltaglider
|
||||
DeltaGlider automatically detects file types and applies the optimal strategy:
|
||||
|
||||
# Via uv (faster)
|
||||
uv pip install deltaglider
|
||||
| File Type | Strategy | Typical Compression | Why It Works |
|
||||
|-----------|----------|---------------------|--------------|
|
||||
| `.zip`, `.tar`, `.gz` | Binary delta | 99%+ for similar versions | Archive structure remains consistent between versions |
|
||||
| `.dmg`, `.deb`, `.rpm` | Binary delta | 95%+ for similar versions | Package formats with predictable structure |
|
||||
| `.jar`, `.war`, `.ear` | Binary delta | 90%+ for similar builds | Java archives with mostly unchanged classes |
|
||||
| `.exe`, `.dll`, `.so` | Direct upload | 0% (no delta benefit) | Compiled code changes unpredictably |
|
||||
| `.txt`, `.json`, `.xml` | Direct upload | 0% (use gzip instead) | Text files benefit more from standard compression |
|
||||
| `.sha1`, `.sha512`, `.md5` | Direct upload | 0% (already minimal) | Hash files are unique by design |
|
||||
|
||||
# Via Docker
|
||||
docker run -v ~/.aws:/root/.aws deltaglider/deltaglider --help
|
||||
```
|
||||
### Key Features
|
||||
|
||||
### AWS S3 Compatible Commands
|
||||
- **AWS CLI Replacement**: Same commands as `aws s3` with automatic compression
|
||||
- **boto3-Compatible SDK**: Works with existing boto3 code with minimal changes
|
||||
- **Zero Configuration**: No databases, no manifest files, no complex setup
|
||||
- **Data Integrity**: SHA256 verification on every operation
|
||||
- **S3 Compatible**: Works with AWS S3, MinIO, Cloudflare R2, and any S3-compatible storage
|
||||
|
||||
DeltaGlider is a **drop-in replacement** for AWS S3 CLI with automatic delta compression:
|
||||
## CLI Reference
|
||||
|
||||
### All Commands
|
||||
|
||||
```bash
|
||||
# Copy files to/from S3 (automatic delta compression for archives)
|
||||
@@ -91,213 +191,7 @@ deltaglider sync --exclude "*.log" ./src/ s3://backup/ # Exclude patterns
|
||||
deltaglider cp file.zip s3://bucket/ --endpoint-url http://localhost:9000
|
||||
```
|
||||
|
||||
### Legacy Commands (still supported)
|
||||
|
||||
```bash
|
||||
# Original DeltaGlider commands
|
||||
deltaglider put my-app-v1.0.0.zip s3://releases/
|
||||
deltaglider get s3://releases/my-app-v1.0.1.zip
|
||||
deltaglider verify s3://releases/my-app-v1.0.1.zip.delta
|
||||
```
|
||||
|
||||
## Why xdelta3 Excels at Archive Compression
|
||||
|
||||
Traditional diff algorithms (like `diff` or `git diff`) work line-by-line on text files. Binary diff tools like `bsdiff` or `courgette` are optimized for executables. But **xdelta3** is uniquely suited for compressed archives because:
|
||||
|
||||
1. **Block-level matching**: xdelta3 uses a rolling hash algorithm to find matching byte sequences at any offset, not just line boundaries. This is crucial for archives where small file changes can shift all subsequent byte positions.
|
||||
|
||||
2. **Large window support**: xdelta3 can use reference windows up to 2GB, allowing it to find matches even when content has moved significantly within the archive. Other delta algorithms typically use much smaller windows (64KB-1MB).
|
||||
|
||||
3. **Compression-aware**: When you update one file in a ZIP/TAR archive, the archive format itself remains largely identical - same compression dictionary, same structure. xdelta3 preserves these similarities while other algorithms might miss them.
|
||||
|
||||
4. **Format agnostic**: Unlike specialized tools (e.g., `courgette` for Chrome updates), xdelta3 works on raw bytes without understanding the file format, making it perfect for any archive type.
|
||||
|
||||
### Real-World Example
|
||||
When you rebuild a JAR file with one class changed:
|
||||
- **Text diff**: 100% different (it's binary data!)
|
||||
- **bsdiff**: ~30-40% of original size (optimized for executables, not archives)
|
||||
- **xdelta3**: ~0.1-1% of original size (finds the unchanged parts regardless of position)
|
||||
|
||||
This is why DeltaGlider achieves 99%+ compression on versioned archives - xdelta3 can identify that 99% of the archive structure and content remains identical between versions.
|
||||
|
||||
## Intelligent File Type Detection

DeltaGlider automatically detects file types and applies the optimal strategy:

| File Type | Strategy | Typical Compression | Why It Works |
|-----------|----------|-------------------|--------------|
| `.zip`, `.tar`, `.gz` | Binary delta | 99%+ for similar versions | Archive structure remains consistent between versions |
| `.dmg`, `.deb`, `.rpm` | Binary delta | 95%+ for similar versions | Package formats with predictable structure |
| `.jar`, `.war`, `.ear` | Binary delta | 90%+ for similar builds | Java archives with mostly unchanged classes |
| `.exe`, `.dll`, `.so` | Direct upload | 0% (no delta benefit) | Compiled code changes unpredictably |
| `.txt`, `.json`, `.xml` | Direct upload | 0% (use gzip instead) | Text files benefit more from standard compression |
| `.sha1`, `.sha512`, `.md5` | Direct upload | 0% (already minimal) | Hash files are unique by design |
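
Conceptually, this is just extension-based dispatch. A minimal sketch of the idea (the extension set and function name are illustrative, not DeltaGlider's actual internals):

```python
from pathlib import Path

DELTA_EXTENSIONS = {".zip", ".tar", ".gz", ".dmg", ".deb", ".rpm", ".jar", ".war", ".ear"}

def choose_strategy(filename: str) -> str:
    """Pick an upload strategy from the file extension."""
    suffix = Path(filename).suffix.lower()
    if suffix in DELTA_EXTENSIONS:
        return "delta"   # try xdelta3 against a reference in the same prefix
    return "direct"      # upload as-is; no delta benefit expected

print(choose_strategy("my-app-v1.0.1.zip"))  # delta
print(choose_strategy("release.sha512"))     # direct
```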
|
||||
|
||||
## Performance Benchmarks

Testing with real software releases:

```
# 513 Elasticsearch plugin releases (82.5MB each)
Original size: 42.3 GB
DeltaGlider size: 115 MB
Compression: 99.7%
Upload speed: 3-4 files/second
Download speed: <100ms reconstruction
```
|
||||
|
||||
## Integration Examples
|
||||
|
||||
### Drop-in AWS CLI Replacement
|
||||
|
||||
```bash
|
||||
# Before (aws-cli)
|
||||
aws s3 cp release-v2.0.0.zip s3://releases/
|
||||
aws s3 cp --recursive ./build/ s3://releases/v2.0.0/
|
||||
aws s3 ls s3://releases/
|
||||
aws s3 rm s3://releases/old-version.zip
|
||||
|
||||
# After (deltaglider) - Same commands, 99% less storage!
|
||||
deltaglider cp release-v2.0.0.zip s3://releases/
|
||||
deltaglider cp -r ./build/ s3://releases/v2.0.0/
|
||||
deltaglider ls s3://releases/
|
||||
deltaglider rm s3://releases/old-version.zip
|
||||
```
|
||||
|
||||
### CI/CD Pipeline (GitHub Actions)
|
||||
|
||||
```yaml
|
||||
- name: Upload Release with 99% compression
|
||||
run: |
|
||||
pip install deltaglider
|
||||
# Use AWS S3 compatible syntax
|
||||
deltaglider cp dist/*.zip s3://releases/${{ github.ref_name }}/
|
||||
|
||||
# Or use recursive for entire directories
|
||||
deltaglider cp -r dist/ s3://releases/${{ github.ref_name }}/
|
||||
```
|
||||
|
||||
### Backup Script
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# Daily backup with automatic deduplication
|
||||
tar -czf backup-$(date +%Y%m%d).tar.gz /data
|
||||
deltaglider cp backup-*.tar.gz s3://backups/
|
||||
# Only changes are stored, not full backup
|
||||
|
||||
# List backups with human-readable sizes
|
||||
deltaglider ls -h s3://backups/
|
||||
|
||||
# Clean up old backups
|
||||
deltaglider rm -r s3://backups/2023/
|
||||
```
|
||||
|
||||
### Python SDK
|
||||
|
||||
**[📚 Full SDK Documentation](docs/sdk/README.md)** | **[API Reference](docs/sdk/api.md)** | **[Examples](docs/sdk/examples.md)**
|
||||
|
||||
#### Quick Start
|
||||
|
||||
```python
|
||||
from pathlib import Path
|
||||
from deltaglider import create_client
|
||||
|
||||
# Uses AWS credentials from environment or ~/.aws/credentials
|
||||
client = create_client()
|
||||
|
||||
# Upload a file (auto-detects if delta compression should be used)
|
||||
summary = client.upload("my-app-v2.0.0.zip", "s3://releases/v2.0.0/")
|
||||
print(f"Compressed from {summary.original_size_mb:.1f}MB to {summary.stored_size_mb:.1f}MB")
|
||||
print(f"Saved {summary.savings_percent:.0f}% storage space")
|
||||
|
||||
# Download a file (auto-handles delta reconstruction)
|
||||
client.download("s3://releases/v2.0.0/my-app-v2.0.0.zip", "local-app.zip")
|
||||
```
|
||||
|
||||
#### Real-World Example: Software Release Storage
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Upload multiple versions of your software
|
||||
versions = ["v1.0.0", "v1.0.1", "v1.0.2", "v1.1.0"]
|
||||
for version in versions:
|
||||
file = f"dist/my-app-{version}.zip"
|
||||
summary = client.upload(file, f"s3://releases/{version}/")
|
||||
|
||||
if summary.is_delta:
|
||||
print(f"{version}: Stored as {summary.stored_size_mb:.1f}MB delta "
|
||||
f"(saved {summary.savings_percent:.0f}%)")
|
||||
else:
|
||||
print(f"{version}: Stored as reference ({summary.original_size_mb:.1f}MB)")
|
||||
|
||||
# Result:
|
||||
# v1.0.0: Stored as reference (100.0MB)
|
||||
# v1.0.1: Stored as 0.2MB delta (saved 99.8%)
|
||||
# v1.0.2: Stored as 0.3MB delta (saved 99.7%)
|
||||
# v1.1.0: Stored as 5.2MB delta (saved 94.8%)
|
||||
```
|
||||
|
||||
#### Advanced Example: Automated Backup System
|
||||
|
||||
```python
|
||||
from datetime import datetime
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client(
|
||||
endpoint_url="http://minio.internal:9000", # Works with MinIO/R2/etc
|
||||
log_level="INFO"
|
||||
)
|
||||
|
||||
def backup_database():
|
||||
"""Daily database backup with automatic deduplication."""
|
||||
date = datetime.now().strftime("%Y%m%d")
|
||||
|
||||
# Create database dump
|
||||
dump_file = f"backup-{date}.sql.gz"
|
||||
|
||||
# Upload with delta compression
|
||||
summary = client.upload(
|
||||
dump_file,
|
||||
f"s3://backups/postgres/{date}/",
|
||||
tags={"type": "daily", "database": "production"}
|
||||
)
|
||||
|
||||
# Monitor compression effectiveness
|
||||
if summary.delta_ratio > 0.1: # If delta is >10% of original
|
||||
print(f"Warning: Low compression ({summary.savings_percent:.0f}%), "
|
||||
"database might have significant changes")
|
||||
|
||||
# Keep last 30 days, archive older
|
||||
client.lifecycle_policy("s3://backups/postgres/",
|
||||
days_before_archive=30,
|
||||
days_before_delete=90)
|
||||
|
||||
return summary
|
||||
|
||||
# Run backup
|
||||
result = backup_database()
|
||||
print(f"Backup complete: {result.stored_size_mb:.1f}MB stored")
|
||||
```
|
||||
|
||||
For more examples and detailed API documentation, see the [SDK Documentation](docs/sdk/README.md).
|
||||
|
||||
## Migration from AWS CLI
|
||||
|
||||
Migrating from `aws s3` to `deltaglider` is as simple as changing the command name:
|
||||
|
||||
| AWS CLI | DeltaGlider | Compression Benefit |
|
||||
|---------|------------|-------------------|
|
||||
| `aws s3 cp file.zip s3://bucket/` | `deltaglider cp file.zip s3://bucket/` | ✅ 99% for similar files |
|
||||
| `aws s3 cp -r dir/ s3://bucket/` | `deltaglider cp -r dir/ s3://bucket/` | ✅ 99% for archives |
|
||||
| `aws s3 ls s3://bucket/` | `deltaglider ls s3://bucket/` | - |
|
||||
| `aws s3 rm s3://bucket/file` | `deltaglider rm s3://bucket/file` | - |
|
||||
| `aws s3 sync dir/ s3://bucket/` | `deltaglider sync dir/ s3://bucket/` | ✅ 99% incremental |
|
||||
|
||||
### Compatibility Flags
|
||||
### Command Flags
|
||||
|
||||
```bash
# All standard AWS flags work
@@ -312,7 +206,260 @@ deltaglider cp file.zip s3://bucket/ \
--max-ratio 0.8 # Only use delta if compression > 20%
```
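
The `--max-ratio` check is a simple size comparison: the delta is kept only when it is small enough relative to the original. A minimal sketch of that acceptance rule (the function is illustrative, not DeltaGlider's internal API):

```python
def should_store_as_delta(delta_size: int, original_size: int, max_ratio: float = 0.8) -> bool:
    """Keep the delta only when it is at most `max_ratio` of the original size.

    With max_ratio=0.8 a delta must be <= 80% of the original,
    i.e. compression must save more than 20%.
    """
    if original_size <= 0:
        return False
    return delta_size / original_size <= max_ratio

# A 100MB archive whose delta is 0.3MB easily passes the check.
print(should_store_as_delta(delta_size=300_000, original_size=100_000_000))  # True
```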
|
||||
|
||||
## Architecture
|
||||
### CI/CD Integration
|
||||
|
||||
#### GitHub Actions
|
||||
|
||||
```yaml
|
||||
- name: Upload Release with 99% compression
|
||||
run: |
|
||||
pip install deltaglider
|
||||
deltaglider cp dist/*.zip s3://releases/${{ github.ref_name }}/
|
||||
# Or recursive for entire directories
|
||||
deltaglider cp -r dist/ s3://releases/${{ github.ref_name }}/
|
||||
```
|
||||
|
||||
#### Daily Backup Script
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# Daily backup with automatic deduplication
|
||||
tar -czf backup-$(date +%Y%m%d).tar.gz /data
|
||||
deltaglider cp backup-*.tar.gz s3://backups/
|
||||
# Only changes are stored, not full backup
|
||||
|
||||
# Clean up old backups
|
||||
deltaglider rm -r s3://backups/2023/
|
||||
```
|
||||
|
||||
## Python SDK
|
||||
|
||||
**[📚 Full SDK Documentation](docs/sdk/README.md)** | **[API Reference](docs/sdk/api.md)** | **[Examples](docs/sdk/examples.md)** | **[boto3 Compatibility Guide](BOTO3_COMPATIBILITY.md)**
|
||||
|
||||
### boto3-Compatible API (Recommended)
|
||||
|
||||
DeltaGlider provides a **boto3-compatible API** for core S3 operations (21 methods covering 80% of use cases):
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
# Drop-in replacement for boto3.client('s3')
|
||||
client = create_client() # Uses AWS credentials automatically
|
||||
|
||||
# Identical to boto3 S3 API - just works with 99% compression!
|
||||
response = client.put_object(
|
||||
Bucket='releases',
|
||||
Key='v2.0.0/my-app.zip',
|
||||
Body=open('my-app-v2.0.0.zip', 'rb')
|
||||
)
|
||||
print(f"Stored with ETag: {response['ETag']}")
|
||||
|
||||
# Standard boto3 get_object - handles delta reconstruction automatically
|
||||
response = client.get_object(Bucket='releases', Key='v2.0.0/my-app.zip')
|
||||
with open('downloaded.zip', 'wb') as f:
|
||||
f.write(response['Body'].read())
|
||||
|
||||
# Smart list_objects with optimized performance
|
||||
response = client.list_objects(Bucket='releases', Prefix='v2.0.0/')
|
||||
for obj in response['Contents']:
|
||||
print(f"{obj['Key']}: {obj['Size']} bytes")
|
||||
|
||||
# Paginated listing for large buckets
|
||||
response = client.list_objects(Bucket='releases', MaxKeys=100)
|
||||
while response.get('IsTruncated'):
|
||||
for obj in response['Contents']:
|
||||
print(obj['Key'])
|
||||
response = client.list_objects(
|
||||
Bucket='releases',
|
||||
MaxKeys=100,
|
||||
ContinuationToken=response.get('NextContinuationToken')
|
||||
)
|
||||
|
||||
# Delete and inspect objects
|
||||
client.delete_object(Bucket='releases', Key='old-version.zip')
|
||||
client.head_object(Bucket='releases', Key='v2.0.0/my-app.zip')
|
||||
```
|
||||
|
||||
### Bucket Management
|
||||
|
||||
**No boto3 required!** DeltaGlider provides complete bucket management:
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Create buckets
|
||||
client.create_bucket(Bucket='my-releases')
|
||||
|
||||
# Create bucket in specific region (AWS only)
|
||||
client.create_bucket(
|
||||
Bucket='my-regional-bucket',
|
||||
CreateBucketConfiguration={'LocationConstraint': 'us-west-2'}
|
||||
)
|
||||
|
||||
# List all buckets
|
||||
response = client.list_buckets()
|
||||
for bucket in response['Buckets']:
|
||||
print(f"{bucket['Name']} - {bucket['CreationDate']}")
|
||||
|
||||
# Delete bucket (must be empty)
|
||||
client.delete_bucket(Bucket='my-old-bucket')
|
||||
```
|
||||
|
||||
See [examples/bucket_management.py](examples/bucket_management.py) for complete example.
|
||||
|
||||
### Simple API (Alternative)
|
||||
|
||||
For simpler use cases, DeltaGlider also provides a streamlined API:
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Simple upload with automatic compression detection
|
||||
summary = client.upload("my-app-v2.0.0.zip", "s3://releases/v2.0.0/")
|
||||
print(f"Compressed from {summary.original_size_mb:.1f}MB to {summary.stored_size_mb:.1f}MB")
|
||||
print(f"Saved {summary.savings_percent:.0f}% storage space")
|
||||
|
||||
# Simple download with automatic delta reconstruction
|
||||
client.download("s3://releases/v2.0.0/my-app-v2.0.0.zip", "local-app.zip")
|
||||
```
|
||||
|
||||
### Real-World Examples
|
||||
|
||||
#### Software Release Storage
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Upload multiple versions
|
||||
versions = ["v1.0.0", "v1.0.1", "v1.0.2", "v1.1.0"]
|
||||
for version in versions:
|
||||
with open(f"dist/my-app-{version}.zip", 'rb') as f:
|
||||
response = client.put_object(
|
||||
Bucket='releases',
|
||||
Key=f'{version}/my-app-{version}.zip',
|
||||
Body=f,
|
||||
Metadata={'version': version, 'build': 'production'}
|
||||
)
|
||||
|
||||
# Check compression stats (DeltaGlider extension)
|
||||
if 'DeltaGliderInfo' in response:
|
||||
info = response['DeltaGliderInfo']
|
||||
if info.get('IsDelta'):
|
||||
print(f"{version}: Stored as {info['StoredSizeMB']:.1f}MB delta "
|
||||
f"(saved {info['SavingsPercent']:.0f}%)")
|
||||
else:
|
||||
print(f"{version}: Stored as reference ({info['OriginalSizeMB']:.1f}MB)")
|
||||
|
||||
# Result:
|
||||
# v1.0.0: Stored as reference (100.0MB)
|
||||
# v1.0.1: Stored as 0.2MB delta (saved 99.8%)
|
||||
# v1.0.2: Stored as 0.3MB delta (saved 99.7%)
|
||||
# v1.1.0: Stored as 5.2MB delta (saved 94.8%)
|
||||
```
|
||||
|
||||
#### Automated Database Backup
|
||||
|
||||
```python
|
||||
from datetime import datetime
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client(endpoint_url="http://minio.internal:9000")
|
||||
|
||||
def backup_database():
|
||||
"""Daily database backup with automatic deduplication."""
|
||||
date = datetime.now().strftime("%Y%m%d")
|
||||
dump_file = f"backup-{date}.sql.gz"
|
||||
|
||||
# Upload using boto3-compatible API
|
||||
with open(dump_file, 'rb') as f:
|
||||
response = client.put_object(
|
||||
Bucket='backups',
|
||||
Key=f'postgres/{date}/{dump_file}',
|
||||
Body=f,
|
||||
Tagging='type=daily&database=production',
|
||||
Metadata={'date': date, 'source': 'production'}
|
||||
)
|
||||
|
||||
# Check compression effectiveness
|
||||
if 'DeltaGliderInfo' in response:
|
||||
info = response['DeltaGliderInfo']
|
||||
if info['DeltaRatio'] > 0.1:
|
||||
print(f"Warning: Low compression ({info['SavingsPercent']:.0f}%), "
|
||||
"database might have significant changes")
|
||||
print(f"Backup stored: {info['StoredSizeMB']:.1f}MB "
|
||||
f"(compressed from {info['OriginalSizeMB']:.1f}MB)")
|
||||
|
||||
backup_database()
|
||||
```
|
||||
|
||||
For more examples and detailed API documentation, see the [SDK Documentation](docs/sdk/README.md).
|
||||
|
||||
## Performance & Benchmarks
|
||||
|
||||
### Real-World Results
|
||||
|
||||
Testing with 513 Elasticsearch plugin releases (82.5MB each):
|
||||
|
||||
```
|
||||
Original size: 42.3 GB
|
||||
DeltaGlider size: 115 MB
|
||||
Compression: 99.7%
|
||||
Upload speed: 3-4 files/second
|
||||
Download speed: <100ms reconstruction
|
||||
```
|
||||
|
||||
### The Math

For `N` versions of an `S` MB file with `D%` difference between versions:

**Traditional S3**: `N × S` MB
**DeltaGlider**: `S + (N-1) × S × D%` MB

Example: 100 versions of 100MB files with 1% difference:
- **Traditional**: 10,000 MB
- **DeltaGlider**: 199 MB
- **Savings**: 98%
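
A quick sanity check of these numbers, assuming the simplified model above (one full reference plus a uniform-sized delta per additional version):

```python
def storage_mb(n_versions: int, size_mb: float, diff_fraction: float) -> tuple[float, float]:
    """Return (traditional, deltaglider) storage in MB under the simplified model."""
    traditional = n_versions * size_mb
    deltaglider = size_mb + (n_versions - 1) * size_mb * diff_fraction
    return traditional, deltaglider

traditional, deltaglider = storage_mb(n_versions=100, size_mb=100, diff_fraction=0.01)
print(traditional)                               # 10000.0
print(deltaglider)                               # 199.0
print(f"{1 - deltaglider / traditional:.0%}")    # 98%
```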
|
||||
|
||||
### Comparison
|
||||
|
||||
| Solution | Compression | Speed | Integration | Cost |
|
||||
|----------|------------|-------|-------------|------|
|
||||
| **DeltaGlider** | 99%+ | Fast | Drop-in | Open source |
|
||||
| S3 Versioning | 0% | Native | Built-in | $$ per version |
|
||||
| Deduplication | 30-50% | Slow | Complex | Enterprise $$$ |
|
||||
| Git LFS | Good | Slow | Git-only | $ per GB |
|
||||
| Restic/Borg | 80-90% | Medium | Backup-only | Open source |
|
||||
|
||||
## Architecture & Technical Deep Dive
|
||||
|
||||
### Why xdelta3 Excels at Archive Compression
|
||||
|
||||
Traditional diff algorithms (like `diff` or `git diff`) work line-by-line on text files. Binary diff tools like `bsdiff` or `courgette` are optimized for executables. But **xdelta3** is uniquely suited for compressed archives because:
|
||||
|
||||
1. **Block-level matching**: xdelta3 uses a rolling hash algorithm to find matching byte sequences at any offset, not just line boundaries. This is crucial for archives where small file changes can shift all subsequent byte positions.
|
||||
|
||||
2. **Large window support**: xdelta3 can use reference windows up to 2GB, allowing it to find matches even when content has moved significantly within the archive. Other delta algorithms typically use much smaller windows (64KB-1MB).
|
||||
|
||||
3. **Compression-aware**: When you update one file in a ZIP/TAR archive, the archive format itself remains largely identical - same compression dictionary, same structure. xdelta3 preserves these similarities while other algorithms might miss them.
|
||||
|
||||
4. **Format agnostic**: Unlike specialized tools (e.g., `courgette` for Chrome updates), xdelta3 works on raw bytes without understanding the file format, making it perfect for any archive type.
|
||||
|
||||
#### Real-World Example
|
||||
|
||||
When you rebuild a JAR file with one class changed:
|
||||
- **Text diff**: 100% different (it's binary data!)
|
||||
- **bsdiff**: ~30-40% of original size (optimized for executables, not archives)
|
||||
- **xdelta3**: ~0.1-1% of original size (finds the unchanged parts regardless of position)
|
||||
|
||||
This is why DeltaGlider achieves 99%+ compression on versioned archives - xdelta3 can identify that 99% of the archive structure and content remains identical between versions.
|
||||
|
||||
### System Architecture

DeltaGlider uses a clean hexagonal architecture:

@@ -335,7 +482,7 @@ DeltaGlider uses a clean hexagonal architecture:
- **Local caching**: Fast repeated operations
- **Zero dependencies**: No database, no manifest files
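
"Hexagonal" here means the core service only talks to small port interfaces, while concrete adapters (S3 storage, local cache, xdelta3) plug into those ports. A minimal sketch of that shape; the port and method names below are illustrative, not DeltaGlider's actual interfaces:

```python
from pathlib import Path
from typing import Protocol

class StoragePort(Protocol):
    """How the core talks to any S3-compatible backend."""
    def put(self, key: str, body: bytes) -> None: ...
    def get(self, key: str) -> bytes: ...

class DiffPort(Protocol):
    """Delta encode/decode; an xdelta3 adapter would implement this."""
    def encode(self, reference: Path, target: Path, delta_out: Path) -> None: ...
    def decode(self, reference: Path, delta: Path, target_out: Path) -> None: ...

class UploadService:
    """Core logic depends only on ports, never on boto3 or xdelta3 directly."""
    def __init__(self, storage: StoragePort, diff: DiffPort) -> None:
        self.storage = storage
        self.diff = diff
```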
|
||||
|
||||
## When to Use DeltaGlider
|
||||
### When to Use DeltaGlider
|
||||
|
||||
✅ **Perfect for:**
|
||||
- Software releases and versioned artifacts
|
||||
@@ -346,20 +493,22 @@ DeltaGlider uses a clean hexagonal architecture:
|
||||
- Any versioned binary data
|
||||
|
||||
❌ **Not ideal for:**
|
||||
- Already compressed unique files
|
||||
- Streaming media files
|
||||
- Already compressed **unique** files
|
||||
- Streaming or multimedia files
|
||||
- Frequently changing unstructured data
|
||||
- Files smaller than 1MB
|
||||
|
||||
## Comparison
|
||||
## Migration from AWS CLI
|
||||
|
||||
| Solution | Compression | Speed | Integration | Cost |
|
||||
|----------|------------|-------|-------------|------|
|
||||
| **DeltaGlider** | 99%+ | Fast | Drop-in | Open source |
|
||||
| S3 Versioning | 0% | Native | Built-in | $$ per version |
|
||||
| Deduplication | 30-50% | Slow | Complex | Enterprise $$$ |
|
||||
| Git LFS | Good | Slow | Git-only | $ per GB |
|
||||
| Restic/Borg | 80-90% | Medium | Backup-only | Open source |
|
||||
Migrating from `aws s3` to `deltaglider` is as simple as changing the command name:
|
||||
|
||||
| AWS CLI | DeltaGlider | Compression Benefit |
|
||||
|---------|------------|---------------------|
|
||||
| `aws s3 cp file.zip s3://bucket/` | `deltaglider cp file.zip s3://bucket/` | ✅ 99% for similar files |
|
||||
| `aws s3 cp -r dir/ s3://bucket/` | `deltaglider cp -r dir/ s3://bucket/` | ✅ 99% for archives |
|
||||
| `aws s3 ls s3://bucket/` | `deltaglider ls s3://bucket/` | - |
|
||||
| `aws s3 rm s3://bucket/file` | `deltaglider rm s3://bucket/file` | - |
|
||||
| `aws s3 sync dir/ s3://bucket/` | `deltaglider sync dir/ s3://bucket/` | ✅ 99% incremental |
|
||||
|
||||
## Production Ready
|
||||
|
||||
@@ -368,7 +517,9 @@ DeltaGlider uses a clean hexagonal architecture:
|
||||
- ✅ **S3 compatible**: Works with AWS, MinIO, Cloudflare R2, etc.
|
||||
- ✅ **Atomic operations**: No partial states
|
||||
- ✅ **Concurrent safe**: Multiple clients supported
|
||||
- ✅ **Well tested**: 95%+ code coverage
|
||||
- ✅ **Thoroughly tested**: 99 integration/unit tests, comprehensive test coverage
|
||||
- ✅ **Type safe**: Full mypy type checking, zero type errors
|
||||
- ✅ **Code quality**: Automated linting with ruff, clean codebase
|
||||
|
||||
## Development
|
||||
|
||||
@@ -380,13 +531,17 @@ cd deltaglider
|
||||
# Install with dev dependencies
|
||||
uv pip install -e ".[dev]"
|
||||
|
||||
# Run tests
|
||||
# Run tests (99 integration/unit tests)
|
||||
uv run pytest
|
||||
|
||||
# Run quality checks
|
||||
uv run ruff check src/ # Linting
|
||||
uv run mypy src/ # Type checking
|
||||
|
||||
# Run with local MinIO
|
||||
docker-compose up -d
|
||||
export AWS_ENDPOINT_URL=http://localhost:9000
|
||||
deltaglider put test.zip s3://test/
|
||||
deltaglider cp test.zip s3://test/
|
||||
```
|
||||
|
||||
## FAQ
|
||||
@@ -406,18 +561,6 @@ A: Zero. Files without similarity are uploaded directly.
|
||||
**Q: Is this compatible with S3 encryption?**
|
||||
A: Yes, DeltaGlider respects all S3 settings including SSE, KMS, and bucket policies.
|
||||
|
||||
## The Math
|
||||
|
||||
For `N` versions of a `S` MB file with `D%` difference between versions:
|
||||
|
||||
**Traditional S3**: `N × S` MB
|
||||
**DeltaGlider**: `S + (N-1) × S × D%` MB
|
||||
|
||||
Example: 100 versions of 100MB files with 1% difference:
|
||||
- **Traditional**: 10,000 MB
|
||||
- **DeltaGlider**: 199 MB
|
||||
- **Savings**: 98%
|
||||
|
||||
## Contributing
|
||||
|
||||
We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
||||
@@ -454,4 +597,4 @@ deltaglider analyze s3://your-bucket/
|
||||
# Output: "Potential savings: 95.2% (4.8TB → 237GB)"
|
||||
```
|
||||
|
||||
Built with ❤️ by engineers who were tired of paying to store the same bytes over and over.

630 SECURITY_FIX_ROADMAP.md Normal file
@@ -0,0 +1,630 @@
|
||||
# 🛡️ DeltaGlider Security Fix Roadmap
|
||||
|
||||
## Executive Summary
|
||||
Critical security vulnerabilities have been identified in DeltaGlider's cache system that enable multi-user attacks, data exposure, and cache poisoning. This document provides a **chronological, actionable roadmap** to eliminate these threats through bold architectural changes.
|
||||
|
||||
**Key Innovation**: Instead of patching individual issues, we propose a **"Zero-Trust Cache Architecture"** that eliminates entire classes of vulnerabilities.
|
||||
|
||||
---
|
||||
|
||||
## 🚀 The Bold Solution: Ephemeral Signed Cache
|
||||
|
||||
### Core Concept
|
||||
Replace filesystem cache with **ephemeral, cryptographically-signed, user-isolated cache** that eliminates:
|
||||
- TOCTOU vulnerabilities (no shared filesystem)
|
||||
- Multi-user interference (process isolation)
|
||||
- Cache poisoning (cryptographic signatures)
|
||||
- Information disclosure (encrypted metadata)
|
||||
- Cross-endpoint collision (content-addressed storage)
|
||||
|
||||
**Note**: DeltaGlider is designed as a standalone CLI/SDK application. All solutions maintain this architecture without requiring external services.
|
||||
|
||||
---
|
||||
|
||||
## 📋 Implementation Roadmap
|
||||
|
||||
### **DAY 1-2: Emergency Hotfix** (v5.0.3) ✅ COMPLETED
|
||||
*Stop the bleeding - minimal changes for immediate deployment*
|
||||
|
||||
#### 1. **Ephemeral Process-Isolated Cache** (2 hours) ✅ COMPLETED
|
||||
```python
|
||||
# src/deltaglider/app/cli/main.py
|
||||
import tempfile
|
||||
import atexit
import shutil
from pathlib import Path
|
||||
|
||||
# SECURITY: Always use ephemeral process-isolated cache
|
||||
cache_dir = Path(tempfile.mkdtemp(prefix="deltaglider-", dir="/tmp"))
|
||||
atexit.register(lambda: shutil.rmtree(cache_dir, ignore_errors=True))
|
||||
```
|
||||
|
||||
**Impact**: Each process gets isolated cache, auto-cleaned on exit. Eliminates multi-user attacks.
|
||||
**Implementation**: All legacy shared cache code removed. Ephemeral cache is now the ONLY mode.
|
||||
|
||||
#### 2. **Add SHA Validation at Use-Time** (2 hours) ✅ COMPLETED
|
||||
```python
|
||||
# src/deltaglider/ports/cache.py
|
||||
class CachePort(Protocol):
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get reference with atomic SHA validation - MUST use this for all operations."""
|
||||
...
|
||||
|
||||
# src/deltaglider/adapters/cache_fs.py
import hashlib
import sys

if sys.platform != "win32":
    import fcntl  # POSIX-only advisory file locking
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
path = self.ref_path(bucket, prefix)
|
||||
if not path.exists():
|
||||
raise CacheMissError(f"Cache miss for {bucket}/{prefix}")
|
||||
|
||||
# Lock file for atomic read (Unix only)
|
||||
with open(path, 'rb') as f:
|
||||
if sys.platform != "win32":
|
||||
fcntl.flock(f.fileno(), fcntl.LOCK_SH)
|
||||
content = f.read()
|
||||
actual_sha = hashlib.sha256(content).hexdigest()
|
||||
|
||||
if actual_sha != expected_sha:
|
||||
path.unlink() # Remove corrupted cache
|
||||
raise CacheCorruptionError(f"SHA mismatch: cache corrupted")
|
||||
|
||||
return path
|
||||
```
|
||||
|
||||
#### 3. **Update All Usage Points** (1 hour) ✅ COMPLETED
|
||||
```python
|
||||
# src/deltaglider/core/service.py
|
||||
# Replaced ALL instances in two locations:
|
||||
# - Line 234 (get method for decoding)
|
||||
# - Line 415 (_create_delta method for encoding)
|
||||
|
||||
ref_path = self.cache.get_validated_ref(
|
||||
delta_space.bucket,
|
||||
delta_space.prefix,
|
||||
ref_sha256 # Pass expected SHA
|
||||
)
|
||||
```
|
||||
|
||||
**Test & Deploy**: ✅ All 99 tests passing + ready for release
|
||||
|
||||
---
|
||||
|
||||
### **DAY 3-5: Quick Wins** (v5.0.3) ✅ COMPLETED
|
||||
*Low-risk improvements with high security impact*
|
||||
|
||||
#### 4. **Implement Content-Addressed Storage** (4 hours) ✅ COMPLETED
|
||||
```python
|
||||
# src/deltaglider/adapters/cache_cas.py
|
||||
class ContentAddressedCache(CachePort):
|
||||
"""Cache using SHA as filename - eliminates collisions"""
|
||||
|
||||
def ref_path(self, bucket: str, prefix: str, sha256: str) -> Path:
|
||||
# Use SHA as filename - guaranteed unique
|
||||
return self.base_dir / sha256[:2] / sha256[2:4] / sha256
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path, sha256: str) -> Path:
|
||||
path = self.ref_path(bucket, prefix, sha256)
|
||||
|
||||
# If file with this SHA exists, we're done (deduplication!)
|
||||
if path.exists():
|
||||
return path
|
||||
|
||||
# Atomic write
|
||||
path.parent.mkdir(parents=True, mode=0o700, exist_ok=True)
|
||||
tmp = path.with_suffix('.tmp')
|
||||
shutil.copy2(src, tmp)
|
||||
os.chmod(tmp, 0o600)
|
||||
|
||||
# Verify content before committing
|
||||
actual_sha = self.hasher.sha256(tmp)
|
||||
if actual_sha != sha256:
|
||||
tmp.unlink()
|
||||
raise ValueError("File corruption during cache write")
|
||||
|
||||
os.replace(tmp, path) # Atomic
|
||||
return path
|
||||
```
|
||||
|
||||
**Benefits**: ✅ ACHIEVED
|
||||
- Same file cached once regardless of bucket/prefix (automatic deduplication)
|
||||
- No collision possible (SHA256 uniqueness guarantees)
|
||||
- Natural cache validation (filename IS the checksum)
|
||||
- Two-level directory structure (ab/cd/abcdef...) for filesystem optimization
|
||||
|
||||
**Implementation**: Complete in `src/deltaglider/adapters/cache_cas.py` with:
|
||||
- `_cas_path()` method for SHA256-based path computation
|
||||
- `get_validated_ref()` with atomic validation and locking
|
||||
- `write_ref()` with atomic temp-file + rename pattern
|
||||
- Ephemeral deltaspace-to-SHA mapping for compatibility
|
||||
|
||||
#### 5. **Add Secure Directory Creation** (2 hours)
|
||||
```python
|
||||
# src/deltaglider/utils/secure_fs.py
|
||||
import os
|
||||
import stat
|
||||
|
||||
def secure_makedirs(path: Path, mode: int = 0o700) -> None:
|
||||
"""Create directory with secure permissions atomically."""
|
||||
try:
|
||||
path.mkdir(parents=True, mode=mode, exist_ok=False)
|
||||
except FileExistsError:
|
||||
# Verify it's ours and has correct permissions
|
||||
st = path.stat()
|
||||
if st.st_uid != os.getuid():
|
||||
raise SecurityError(f"Directory {path} owned by different user")
|
||||
if stat.S_IMODE(st.st_mode) != mode:
|
||||
os.chmod(path, mode) # Fix permissions
|
||||
```
|
||||
|
||||
#### 6. **Unify Cache Configuration** (1 hour)
|
||||
```python
|
||||
# src/deltaglider/config.py
|
||||
import os
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
def get_cache_dir() -> Path | None:
|
||||
"""Single source of truth for cache directory."""
|
||||
if os.environ.get("DG_NO_CACHE") == "true":
|
||||
return None # Feature flag to disable cache
|
||||
|
||||
if os.environ.get("DG_EPHEMERAL_CACHE") == "true":
|
||||
return Path(tempfile.mkdtemp(prefix="dg-cache-"))
|
||||
|
||||
# User-specific cache by default
|
||||
cache_base = os.environ.get("DG_CACHE_DIR",
|
||||
os.path.expanduser("~/.cache/deltaglider"))
|
||||
return Path(cache_base) / "v2" # Version cache format
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### **DAY 6-10: Architecture Redesign** (v5.0.3) ✅ COMPLETED
|
||||
*The bold solution that eliminates entire vulnerability classes*
|
||||
|
||||
#### 7. **Implement Memory Cache with Encryption** (8 hours) ✅ COMPLETED
|
||||
```python
|
||||
# src/deltaglider/adapters/cache_memory.py
|
||||
class MemoryCache(CachePort):
|
||||
"""In-memory cache with LRU eviction and configurable size limits."""
|
||||
|
||||
def __init__(self, hasher: HashPort, max_size_mb: int = 100, temp_dir: Path | None = None):
|
||||
self.hasher = hasher
|
||||
self.max_size_bytes = max_size_mb * 1024 * 1024
|
||||
self._current_size = 0
|
||||
self._cache: dict[tuple[str, str], tuple[bytes, str]] = {} # (bucket, prefix) -> (content, SHA)
|
||||
self._access_order: list[tuple[str, str]] = [] # LRU tracking
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Write reference to in-memory cache with LRU eviction."""
|
||||
# Read content and compute SHA
|
||||
content = src.read_bytes()
|
||||
sha256 = self.hasher.sha256_bytes(content)
|
||||
|
||||
# Check if file fits in cache
|
||||
needed_bytes = len(content)
|
||||
if needed_bytes > self.max_size_bytes:
|
||||
raise CacheCorruptionError(f"File too large for cache: {needed_bytes} > {self.max_size_bytes}")
|
||||
|
||||
# Evict LRU if needed
|
||||
self._evict_lru(needed_bytes)
|
||||
|
||||
# Store in memory
|
||||
key = (bucket, prefix)
|
||||
self._cache[key] = (content, sha256)
|
||||
self._current_size += needed_bytes
|
||||
self._access_order.append(key)
|
||||
|
||||
return src # Return original path for compatibility
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference with validation."""
|
||||
key = (bucket, prefix)
|
||||
if key not in self._cache:
|
||||
raise CacheMissError(f"Cache miss for {bucket}/{prefix}")
|
||||
|
||||
content, stored_sha = self._cache[key]
|
||||
|
||||
# Validate SHA matches
|
||||
if stored_sha != expected_sha:
|
||||
raise CacheCorruptionError(f"SHA mismatch for {bucket}/{prefix}")
|
||||
|
||||
# Update LRU order
|
||||
self._access_order.remove(key)
|
||||
self._access_order.append(key)
|
||||
|
||||
# Write to temp file for compatibility
|
||||
temp_path = self.temp_dir / f"{expected_sha}.bin"
|
||||
temp_path.write_bytes(content)
|
||||
return temp_path
|
||||
```
|
||||
|
||||
# src/deltaglider/adapters/cache_encrypted.py
|
||||
class EncryptedCache(CachePort):
|
||||
"""Encrypted cache wrapper using Fernet symmetric encryption."""
|
||||
|
||||
def __init__(self, backend: CachePort, encryption_key: bytes | None = None):
|
||||
self.backend = backend
|
||||
|
||||
# Key management: ephemeral (default) or provided
|
||||
if encryption_key is None:
|
||||
self._key = Fernet.generate_key() # Ephemeral per process
|
||||
self._ephemeral = True
|
||||
else:
|
||||
self._key = encryption_key
|
||||
self._ephemeral = False
|
||||
|
||||
self._cipher = Fernet(self._key)
|
||||
# Track plaintext SHA since encrypted content has different SHA
|
||||
self._plaintext_sha_map: dict[tuple[str, str], str] = {}
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Encrypt and cache reference file."""
|
||||
# Read plaintext and compute SHA
|
||||
plaintext_data = src.read_bytes()
|
||||
plaintext_sha = hashlib.sha256(plaintext_data).hexdigest()
|
||||
|
||||
# Encrypt data
|
||||
encrypted_data = self._cipher.encrypt(plaintext_data)
|
||||
|
||||
# Write encrypted data to temp file
|
||||
temp_encrypted = src.with_suffix(".encrypted.tmp")
|
||||
temp_encrypted.write_bytes(encrypted_data)
|
||||
|
||||
try:
|
||||
# Store encrypted file via backend
|
||||
result_path = self.backend.write_ref(bucket, prefix, temp_encrypted)
|
||||
|
||||
# Store plaintext SHA mapping
|
||||
key = (bucket, prefix)
|
||||
self._plaintext_sha_map[key] = plaintext_sha
|
||||
|
||||
return result_path
|
||||
finally:
|
||||
temp_encrypted.unlink(missing_ok=True)
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference with decryption and validation."""
|
||||
# Verify we have the plaintext SHA mapped
|
||||
key = (bucket, prefix)
|
||||
if key not in self._plaintext_sha_map:
|
||||
raise CacheMissError(f"Cache miss for {bucket}/{prefix}")
|
||||
|
||||
if self._plaintext_sha_map[key] != expected_sha:
|
||||
raise CacheCorruptionError(f"SHA mismatch for {bucket}/{prefix}")
|
||||
|
||||
# Get encrypted file from backend
|
||||
encrypted_path = self.backend.ref_path(bucket, prefix)
|
||||
if not encrypted_path.exists():
|
||||
raise CacheMissError(f"Encrypted cache file not found")
|
||||
|
||||
# Decrypt content
|
||||
encrypted_data = encrypted_path.read_bytes()
|
||||
try:
|
||||
decrypted_data = self._cipher.decrypt(encrypted_data)
|
||||
except Exception as e:
|
||||
raise CacheCorruptionError(f"Decryption failed: {e}") from e
|
||||
|
||||
# Validate plaintext SHA
|
||||
actual_sha = hashlib.sha256(decrypted_data).hexdigest()
|
||||
if actual_sha != expected_sha:
|
||||
raise CacheCorruptionError(f"Decrypted content SHA mismatch")
|
||||
|
||||
# Write decrypted content to temp file
|
||||
decrypted_path = encrypted_path.with_suffix(".decrypted")
|
||||
decrypted_path.write_bytes(decrypted_data)
|
||||
return decrypted_path
|
||||
```
|
||||
|
||||
**Implementation**: ✅ COMPLETED
- **MemoryCache**: In-memory cache with LRU eviction, configurable size limits, zero filesystem I/O
- **EncryptedCache**: Fernet (AES-128-CBC + HMAC) encryption wrapper, ephemeral keys by default
- **Configuration**: `DG_CACHE_BACKEND` (filesystem/memory), `DG_CACHE_ENCRYPTION` (true/false)
- **Environment Variables**: `DG_CACHE_MEMORY_SIZE_MB`, `DG_CACHE_ENCRYPTION_KEY`

**Benefits**: ✅ ACHIEVED
- No filesystem access for memory cache = no permission issues
- Encrypted at rest = secure cache storage
- Per-process ephemeral keys = forward secrecy and process isolation
- LRU eviction = prevents memory exhaustion
- Zero TOCTOU window = memory operations are atomic
- Configurable backends = flexibility for different use cases
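
A minimal sketch of opting into these backends from application code, using the environment variables listed above (it assumes they are read when the client is created, as the configuration description implies):

```python
import os
from deltaglider import create_client

# Select the in-memory, encrypted cache backend for this process
os.environ["DG_CACHE_BACKEND"] = "memory"
os.environ["DG_CACHE_ENCRYPTION"] = "true"
os.environ["DG_CACHE_MEMORY_SIZE_MB"] = "256"

client = create_client()
client.upload("my-app-v2.0.0.zip", "s3://releases/v2.0.0/")
```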
|
||||
|
||||
#### 8. **Implement Signed Cache Entries** (6 hours)
|
||||
```python
|
||||
# src/deltaglider/adapters/cache_signed.py
|
||||
import hmac
|
||||
import json
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
class SignedCache(CachePort):
|
||||
"""Cache with cryptographic signatures and expiry."""
|
||||
|
||||
def __init__(self, base_dir: Path, secret_key: bytes = None):
|
||||
self.base_dir = base_dir
|
||||
# Per-session key if not provided
|
||||
self.secret = secret_key or os.urandom(32)
|
||||
|
||||
def _sign_metadata(self, metadata: dict) -> str:
|
||||
"""Create HMAC signature for metadata."""
|
||||
json_meta = json.dumps(metadata, sort_keys=True)
|
||||
signature = hmac.new(
|
||||
self.secret,
|
||||
json_meta.encode(),
|
||||
hashlib.sha256
|
||||
).hexdigest()
|
||||
return signature
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path, sha256: str) -> Path:
|
||||
# Create signed metadata
|
||||
metadata = {
|
||||
"sha256": sha256,
|
||||
"bucket": bucket,
|
||||
"prefix": prefix,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"expires": (datetime.utcnow() + timedelta(hours=24)).isoformat(),
|
||||
"pid": os.getpid(),
|
||||
"uid": os.getuid(),
|
||||
}
|
||||
signature = self._sign_metadata(metadata)
|
||||
|
||||
# Store data + metadata
|
||||
cache_dir = self.base_dir / signature[:8] # Use signature prefix as namespace
|
||||
cache_dir.mkdir(parents=True, mode=0o700, exist_ok=True)
|
||||
|
||||
data_path = cache_dir / f"{sha256}.bin"
|
||||
meta_path = cache_dir / f"{sha256}.meta"
|
||||
|
||||
# Atomic writes
|
||||
shutil.copy2(src, data_path)
|
||||
os.chmod(data_path, 0o600)
|
||||
|
||||
with open(meta_path, 'w') as f:
|
||||
json.dump({"metadata": metadata, "signature": signature}, f)
|
||||
os.chmod(meta_path, 0o600)
|
||||
|
||||
return data_path
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, sha256: str) -> Path:
|
||||
# Find and validate signed entry
|
||||
pattern = self.base_dir / "*" / f"{sha256}.meta"
|
||||
matches = list(Path(self.base_dir).glob(f"*/{sha256}.meta"))
|
||||
|
||||
for meta_path in matches:
|
||||
with open(meta_path) as f:
|
||||
entry = json.load(f)
|
||||
|
||||
# Verify signature
|
||||
expected_sig = self._sign_metadata(entry["metadata"])
|
||||
if not hmac.compare_digest(entry["signature"], expected_sig):
|
||||
meta_path.unlink() # Remove tampered entry
|
||||
continue
|
||||
|
||||
# Check expiry
|
||||
expires = datetime.fromisoformat(entry["metadata"]["expires"])
|
||||
if datetime.utcnow() > expires:
|
||||
meta_path.unlink()
|
||||
continue
|
||||
|
||||
# Validate data integrity
|
||||
data_path = meta_path.with_suffix('.bin')
|
||||
actual_sha = self.hasher.sha256(data_path)
|
||||
if actual_sha != sha256:
|
||||
data_path.unlink()
|
||||
meta_path.unlink()
|
||||
continue
|
||||
|
||||
return data_path
|
||||
|
||||
raise CacheMissError(f"No valid cache entry for {sha256}")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### **DAY 11-15: Advanced Security** (v6.0.0)
|
||||
*Next-generation features for standalone security*
|
||||
|
||||
#### 9. **Add Integrity Monitoring** (4 hours)
|
||||
```python
|
||||
# src/deltaglider/security/monitor.py
|
||||
import inotify
|
||||
import logging
|
||||
|
||||
class CacheIntegrityMonitor:
|
||||
"""Detect and alert on cache tampering attempts."""
|
||||
|
||||
def __init__(self, cache_dir: Path):
|
||||
self.cache_dir = cache_dir
|
||||
self.notifier = inotify.INotify()
|
||||
self.watch_desc = self.notifier.add_watch(
|
||||
str(cache_dir),
|
||||
inotify.IN_MODIFY | inotify.IN_DELETE | inotify.IN_ATTRIB
|
||||
)
|
||||
self.logger = logging.getLogger("security")
|
||||
|
||||
async def monitor(self):
|
||||
"""Monitor for unauthorized cache modifications."""
|
||||
async for event in self.notifier:
|
||||
if event.mask & inotify.IN_MODIFY:
|
||||
# File modified - verify it was by our process
|
||||
if not self._is_our_modification(event):
|
||||
self.logger.critical(
|
||||
f"SECURITY: Unauthorized cache modification detected: {event.path}"
|
||||
)
|
||||
# Immediately invalidate affected cache
|
||||
Path(event.path).unlink(missing_ok=True)
|
||||
|
||||
elif event.mask & inotify.IN_ATTRIB:
|
||||
# Permission change - always suspicious
|
||||
self.logger.warning(
|
||||
f"SECURITY: Cache permission change: {event.path}"
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### **DAY 16-20: Testing & Rollout** (v6.0.0 release)
|
||||
|
||||
#### 10. **Security Test Suite** (8 hours)
|
||||
```python
|
||||
# tests/security/test_cache_attacks.py
|
||||
import pytest
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
|
||||
class TestCacheSecurity:
|
||||
"""Test all known attack vectors."""
|
||||
|
||||
def test_toctou_attack_prevented(self, cache):
|
||||
"""Verify TOCTOU window is eliminated."""
|
||||
sha = "abc123"
|
||||
cache.write_ref("bucket", "prefix", test_file, sha)
|
||||
|
||||
# Attacker thread tries to replace file during read
|
||||
def attacker():
|
||||
time.sleep(0.0001) # Try to hit the TOCTOU window
|
||||
cache_path = cache.ref_path("bucket", "prefix", sha)
|
||||
cache_path.write_bytes(b"malicious")
|
||||
|
||||
thread = threading.Thread(target=attacker)
|
||||
thread.start()
|
||||
|
||||
# Should detect tampering
|
||||
with pytest.raises(CacheCorruptionError):
|
||||
cache.get_validated_ref("bucket", "prefix", sha)
|
||||
|
||||
def test_multi_user_isolation(self, cache):
|
||||
"""Verify users can't access each other's cache."""
|
||||
# Create cache as user A
|
||||
cache_a = SignedCache(Path("/tmp/cache"), secret=b"key_a")
|
||||
cache_a.write_ref("bucket", "prefix", test_file, "sha_a")
|
||||
|
||||
# Try to read as user B with different key
|
||||
cache_b = SignedCache(Path("/tmp/cache"), secret=b"key_b")
|
||||
|
||||
with pytest.raises(CacheMissError):
|
||||
cache_b.get_validated_ref("bucket", "prefix", "sha_a")
|
||||
|
||||
def test_cache_poisoning_prevented(self, cache):
|
||||
"""Verify corrupted cache is detected."""
|
||||
sha = "abc123"
|
||||
cache.write_ref("bucket", "prefix", test_file, sha)
|
||||
|
||||
# Corrupt the cache file
|
||||
cache_path = cache.ref_path("bucket", "prefix", sha)
|
||||
with open(cache_path, 'ab') as f:
|
||||
f.write(b"corrupted")
|
||||
|
||||
# Should detect corruption
|
||||
with pytest.raises(CacheCorruptionError):
|
||||
cache.get_validated_ref("bucket", "prefix", sha)
|
||||
```
|
||||
|
||||
#### 11. **Migration Guide** (4 hours)
|
||||
```python
|
||||
# src/deltaglider/migration/v5_to_v6.py
|
||||
def migrate_cache():
|
||||
"""Migrate from v5 shared cache to v6 secure cache."""
|
||||
old_cache = Path("/tmp/.deltaglider/cache")
|
||||
|
||||
if old_cache.exists():
|
||||
print("WARNING: Old insecure cache detected at", old_cache)
|
||||
print("This cache had security vulnerabilities and will not be migrated.")
|
||||
|
||||
response = input("Delete old cache? [y/N]: ")
|
||||
if response.lower() == 'y':
|
||||
shutil.rmtree(old_cache)
|
||||
print("Old cache deleted. New secure cache will be created on demand.")
|
||||
else:
|
||||
print("Old cache retained at", old_cache)
|
||||
print("Set DG_CACHE_DIR to use a different location.")
|
||||
```
|
||||
|
||||
#### 12. **Performance Benchmarks** (4 hours)
|
||||
```python
|
||||
# benchmarks/cache_performance.py
|
||||
def benchmark_cache_implementations():
|
||||
"""Compare performance of cache implementations."""
|
||||
|
||||
implementations = [
|
||||
("Filesystem (v5)", FsCacheAdapter),
|
||||
("Content-Addressed", ContentAddressedCache),
|
||||
("Memory", MemoryCache),
|
||||
("Signed", SignedCache),
|
||||
]
|
||||
|
||||
for name, cache_class in implementations:
|
||||
cache = cache_class(test_dir)
|
||||
|
||||
# Measure write performance
|
||||
start = time.perf_counter()
|
||||
for i in range(1000):
|
||||
cache.write_ref("bucket", f"prefix{i}", test_file, f"sha{i}")
|
||||
write_time = time.perf_counter() - start
|
||||
|
||||
# Measure read performance
|
||||
start = time.perf_counter()
|
||||
for i in range(1000):
|
||||
cache.get_validated_ref("bucket", f"prefix{i}", f"sha{i}")
|
||||
read_time = time.perf_counter() - start
|
||||
|
||||
print(f"{name}: Write={write_time:.3f}s Read={read_time:.3f}s")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📊 Decision Matrix
|
||||
|
||||
| Solution | Security | Performance | Complexity | Breaking Change |
|
||||
|----------|----------|-------------|------------|-----------------|
|
||||
| Hotfix (Day 1-2) | ⭐⭐⭐ | ⭐⭐ | ⭐ | No |
|
||||
| Content-Addressed | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐ | No |
|
||||
| Memory Cache | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐ | No |
|
||||
| Signed Cache | ⭐⭐⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐ | No |
|
||||
|
||||
---
|
||||
|
||||
## 🎯 Recommended Approach
|
||||
|
||||
### For Immediate Production (Next 48 hours)
|
||||
Deploy **Hotfix v5.0.3** with ephemeral cache + SHA validation
|
||||
|
||||
### For Next Release (1 week)
|
||||
Implement **Content-Addressed Storage** (v5.1.0) - best balance of security and simplicity
|
||||
|
||||
### For Enterprise (1 month)
|
||||
Deploy **Signed Cache** (v6.0.0) for maximum security with built-in TTL and integrity
|
||||
|
||||
---
|
||||
|
||||
## 🚦 Success Metrics
|
||||
|
||||
After implementation, verify:
|
||||
|
||||
1. **Security Tests Pass**: All attack vectors prevented
|
||||
2. **Performance Maintained**: <10% degradation vs v5
|
||||
3. **Zero CVEs**: No security vulnerabilities in cache
|
||||
4. **User Isolation**: Multi-user systems work safely
|
||||
5. **Backward Compatible**: Existing workflows unaffected
|
||||
|
||||
---
|
||||
|
||||
## 📞 Support
|
||||
|
||||
For questions or security concerns:
|
||||
- Security Team: security@deltaglider.io
|
||||
- Lead Developer: @architect
|
||||
- Immediate Issues: Create SECURITY labeled issue
|
||||
|
||||
---
|
||||
|
||||
## ⚠️ Disclosure Timeline
|
||||
|
||||
- **Day 0**: Vulnerabilities discovered
|
||||
- **Day 1**: Hotfix released (v5.0.3)
|
||||
- **Day 7**: Improved version released (v5.1.0)
|
||||
- **Day 30**: Full disclosure published
|
||||
- **Day 45**: v6.0.0 with complete redesign
|
||||
|
||||
---
|
||||
|
||||
*Document Version: 1.0*
|
||||
*Classification: SENSITIVE - INTERNAL USE ONLY*
|
||||
*Last Updated: 2024-10-09*
|
||||
8 command.sh Executable file
@@ -0,0 +1,8 @@
|
||||
export AWS_ENDPOINT_URL=http://localhost:9000
|
||||
export AWS_ACCESS_KEY_ID=deltadmin
|
||||
export AWS_SECRET_ACCESS_KEY=deltasecret
|
||||
|
||||
ror-data-importer \
|
||||
--source-bucket=dg-demo \
|
||||
--dest-bucket=new-buck \
|
||||
--yes
|
||||
44 commit_message.txt Normal file
@@ -0,0 +1,44 @@
|
||||
fix: Optimize list_objects performance by eliminating N+1 query problem
|
||||
|
||||
BREAKING CHANGE: list_objects and get_bucket_stats signatures updated
|
||||
|
||||
## Problem
|
||||
The list_objects method was making a separate HEAD request for every object
|
||||
in the bucket to fetch metadata, causing severe performance degradation:
|
||||
- 100 objects = 101 API calls (1 LIST + 100 HEAD)
|
||||
- Response time: ~2.6 seconds for 1000 objects
|
||||
|
||||
## Solution
|
||||
Implemented smart metadata fetching with intelligent defaults:
|
||||
- Added FetchMetadata parameter (default: False) to list_objects
|
||||
- Added detailed_stats parameter (default: False) to get_bucket_stats
|
||||
- NEVER fetch metadata for non-delta files (they don't need it)
|
||||
- Only fetch metadata for delta files when explicitly requested
|
||||
|
||||
## Performance Impact
|
||||
- Before: ~2.6 seconds for 1000 objects (N+1 API calls)
|
||||
- After: ~50ms for 1000 objects (1 API call)
|
||||
- Improvement: ~5x faster for typical operations
|
||||
|
||||
## API Changes
|
||||
- list_objects(..., FetchMetadata=False) - Smart performance default
|
||||
- get_bucket_stats(..., detailed_stats=False) - Quick stats by default
|
||||
- Full pagination support with ContinuationToken
|
||||
- Backwards compatible with existing code
|
||||
|
||||
## Implementation Details
|
||||
- Eliminated unnecessary HEAD requests for metadata
|
||||
- Smart detection: only delta files can benefit from metadata
|
||||
- Preserved boto3 compatibility while adding performance optimizations
|
||||
- Updated documentation with performance notes and examples
|
||||
|
||||
## Testing
|
||||
- All existing tests pass
|
||||
- Added test coverage for new parameters
|
||||
- Linting (ruff) passes
|
||||
- Type checking (mypy) passes
|
||||
- 61 tests passing (18 unit + 43 integration)
|
||||
|
||||
Fixes #[issue-number] - Web UI /buckets/ endpoint 2.6s latency
|
||||
|
||||
Co-authored-by: Claude <noreply@anthropic.com>
|
||||
316 docs/BOTO3_COMPATIBILITY_VISION.md Normal file
@@ -0,0 +1,316 @@
|
||||
# boto3 Compatibility Vision
|
||||
|
||||
## Current State (v4.2.3)
|
||||
|
||||
DeltaGlider currently uses custom dataclasses for responses:
|
||||
|
||||
```python
|
||||
from deltaglider import create_client, ListObjectsResponse, ObjectInfo
|
||||
|
||||
client = create_client()
|
||||
response: ListObjectsResponse = client.list_objects(Bucket='my-bucket')
|
||||
|
||||
for obj in response.contents: # Custom field name
|
||||
print(f"{obj.key}: {obj.size}") # Custom ObjectInfo dataclass
|
||||
```
|
||||
|
||||
**Problems:**
|
||||
- ❌ Not a true drop-in replacement for boto3
|
||||
- ❌ Users need to learn DeltaGlider-specific types
|
||||
- ❌ Can't use with tools expecting boto3 responses
|
||||
- ❌ Different API surface (`.contents` vs `['Contents']`)
|
||||
|
||||
## Target State (v5.0.0)
|
||||
|
||||
DeltaGlider should return native boto3-compatible dicts with TypedDict type hints:
|
||||
|
||||
```python
|
||||
from deltaglider import create_client, ListObjectsV2Response
|
||||
|
||||
client = create_client()
|
||||
response: ListObjectsV2Response = client.list_objects(Bucket='my-bucket')
|
||||
|
||||
for obj in response['Contents']: # boto3-compatible!
|
||||
print(f"{obj['Key']}: {obj['Size']}") # Works exactly like boto3
|
||||
```
|
||||
|
||||
**Benefits:**
|
||||
- ✅ **True drop-in replacement** - swap `boto3.client('s3')` with `create_client()`
|
||||
- ✅ **No learning curve** - if you know boto3, you know DeltaGlider
|
||||
- ✅ **Tool compatibility** - works with any library expecting boto3 types
|
||||
- ✅ **Type safety** - TypedDict provides IDE autocomplete without boto3 import
|
||||
- ✅ **Zero runtime overhead** - TypedDict compiles to plain dict
|
||||
|
||||
## Implementation Plan
|
||||
|
||||
### Phase 1: Type Definitions ✅ (DONE)
|
||||
|
||||
Created `deltaglider/types.py` with comprehensive TypedDict definitions:
|
||||
|
||||
```python
|
||||
from typing import TypedDict, NotRequired
|
||||
from datetime import datetime
|
||||
|
||||
class S3Object(TypedDict):
|
||||
Key: str
|
||||
Size: int
|
||||
LastModified: datetime
|
||||
ETag: NotRequired[str]
|
||||
StorageClass: NotRequired[str]
|
||||
|
||||
class ListObjectsV2Response(TypedDict):
|
||||
Contents: list[S3Object]
|
||||
CommonPrefixes: NotRequired[list[dict[str, str]]]
|
||||
IsTruncated: NotRequired[bool]
|
||||
NextContinuationToken: NotRequired[str]
|
||||
```
|
||||
|
||||
**Key insight:** TypedDict provides type safety at development time but compiles to plain `dict` at runtime!
|
||||
|
||||
### Phase 2: Refactor Client Methods (TODO)
|
||||
|
||||
Update all client methods to return boto3-compatible dicts:
|
||||
|
||||
#### `list_objects()`
|
||||
|
||||
**Before:**
|
||||
```python
|
||||
def list_objects(...) -> ListObjectsResponse: # Custom dataclass
|
||||
return ListObjectsResponse(
|
||||
name=bucket,
|
||||
contents=[ObjectInfo(...), ...] # Custom dataclass
|
||||
)
|
||||
```
|
||||
|
||||
**After:**
|
||||
```python
|
||||
def list_objects(...) -> ListObjectsV2Response: # TypedDict
|
||||
return {
|
||||
'Contents': [
|
||||
{
|
||||
'Key': 'file.zip', # .delta suffix already stripped
|
||||
'Size': 1024,
|
||||
'LastModified': datetime(...),
|
||||
'ETag': '"abc123"',
|
||||
}
|
||||
],
|
||||
'CommonPrefixes': [{'Prefix': 'dir/'}],
|
||||
'IsTruncated': False,
|
||||
}
|
||||
```
|
||||
|
||||
**Key changes:**
|
||||
1. Return plain dict instead of custom dataclass
|
||||
2. Use boto3 field names: `Contents` not `contents`, `Key` not `key`
|
||||
3. Strip `.delta` suffix transparently (already done)
|
||||
4. Hide `reference.bin` files (already done)
|
||||
|
||||
#### `put_object()`
|
||||
|
||||
**Before:**
|
||||
```python
|
||||
def put_object(...) -> dict[str, Any]:
|
||||
return {
|
||||
"ETag": etag,
|
||||
"VersionId": None,
|
||||
"DeltaGliderInfo": {...} # Custom field
|
||||
}
|
||||
```
|
||||
|
||||
**After:**
|
||||
```python
|
||||
def put_object(...) -> PutObjectResponse: # TypedDict
|
||||
return {
|
||||
'ETag': etag,
|
||||
'ResponseMetadata': {'HTTPStatusCode': 200},
|
||||
# DeltaGlider metadata goes in Metadata field
|
||||
'Metadata': {
|
||||
'deltaglider-is-delta': 'true',
|
||||
'deltaglider-compression-ratio': '0.99'
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### `get_object()`
|
||||
|
||||
**Before:**
|
||||
```python
|
||||
def get_object(...) -> dict[str, Any]:
|
||||
return {
|
||||
"Body": data,
|
||||
"ContentLength": len(data),
|
||||
"DeltaGliderInfo": {...} # Custom field
|
||||
}
|
||||
```
|
||||
|
||||
**After:**
|
||||
```python
|
||||
def get_object(...) -> GetObjectResponse: # TypedDict
|
||||
return {
|
||||
'Body': data, # bytes, not StreamingBody (simpler!)
|
||||
'ContentLength': len(data),
|
||||
'LastModified': datetime(...),
|
||||
'ETag': '"abc123"',
|
||||
'Metadata': { # DeltaGlider metadata here
|
||||
'deltaglider-is-delta': 'true'
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### `delete_object()`, `delete_objects()`, `head_object()`, etc.
|
||||
|
||||
All follow the same pattern: return boto3-compatible dicts with TypedDict hints.
|
||||
|
||||
### Phase 3: Backward Compatibility (TODO)
|
||||
|
||||
Keep old dataclasses for 1-2 versions with deprecation warnings:
|
||||
|
||||
```python
|
||||
class ListObjectsResponse:
|
||||
"""DEPRECATED: Use dict responses with ListObjectsV2Response type hint.
|
||||
|
||||
This will be removed in v6.0.0. Update your code:
|
||||
|
||||
Before:
|
||||
response.contents[0].key
|
||||
|
||||
After:
|
||||
response['Contents'][0]['Key']
|
||||
"""
|
||||
def __init__(self, data: dict):
|
||||
warnings.warn(
|
||||
"ListObjectsResponse dataclass is deprecated. "
|
||||
"Use dict responses with ListObjectsV2Response type hint.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2
|
||||
)
|
||||
self._data = data
|
||||
|
||||
@property
|
||||
def contents(self):
|
||||
return [ObjectInfo(obj) for obj in self._data.get('Contents', [])]
|
||||
```
|
||||
|
||||
### Phase 4: Update Documentation (TODO)
|
||||
|
||||
1. Update all examples to use dict responses
|
||||
2. Add migration guide from v4.x to v5.0
|
||||
3. Update BOTO3_COMPATIBILITY.md
|
||||
4. Add "Drop-in Replacement" marketing language
|
||||
|
||||
### Phase 5: Update Tests (TODO)
|
||||
|
||||
Convert all tests from:
|
||||
```python
|
||||
assert response.contents[0].key == "file.zip"
|
||||
```
|
||||
|
||||
To:
|
||||
```python
|
||||
assert response['Contents'][0]['Key'] == "file.zip"
|
||||
```
|
||||
|
||||
## Migration Guide (for users)
|
||||
|
||||
### v4.x → v5.0
|
||||
|
||||
**Old code (v4.x):**
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
response = client.list_objects(Bucket='my-bucket')
|
||||
|
||||
for obj in response.contents: # Dataclass attribute
|
||||
print(f"{obj.key}: {obj.size}") # Dataclass attributes
|
||||
```
|
||||
|
||||
**New code (v5.0):**
|
||||
```python
|
||||
from deltaglider import create_client, ListObjectsV2Response
|
||||
|
||||
client = create_client()
|
||||
response: ListObjectsV2Response = client.list_objects(Bucket='my-bucket')
|
||||
|
||||
for obj in response['Contents']: # Dict key (boto3-compatible)
|
||||
print(f"{obj['Key']}: {obj['Size']}") # Dict keys (boto3-compatible)
|
||||
```
|
||||
|
||||
**Or even simpler - no type hint needed:**
|
||||
```python
|
||||
client = create_client()
|
||||
response = client.list_objects(Bucket='my-bucket')
|
||||
|
||||
for obj in response['Contents']:
|
||||
print(f"{obj['Key']}: {obj['Size']}")
|
||||
```
|
||||
|
||||
## Benefits Summary
|
||||
|
||||
### For Users
|
||||
- **Zero learning curve** - if you know boto3, you're done
|
||||
- **Drop-in replacement** - literally change one line (client creation)
|
||||
- **Type safety** - TypedDict provides autocomplete without boto3 dependency
|
||||
- **Tool compatibility** - works with all boto3-compatible libraries
|
||||
|
||||
### For DeltaGlider
|
||||
- **Simpler codebase** - no custom dataclasses to maintain
|
||||
- **Better marketing** - true "drop-in replacement" claim
|
||||
- **Easier testing** - test against boto3 behavior directly
|
||||
- **Future-proof** - if boto3 adds fields, users can access them immediately
|
||||
|
||||
## Technical Details
|
||||
|
||||
### How TypedDict Works
|
||||
|
||||
```python
|
||||
from typing import TypedDict
|
||||
|
||||
class MyResponse(TypedDict):
|
||||
Key: str
|
||||
Size: int
|
||||
|
||||
# At runtime, this is just a dict!
|
||||
response: MyResponse = {'Key': 'file.zip', 'Size': 1024}
|
||||
print(type(response)) # <class 'dict'>
|
||||
|
||||
# But mypy and IDEs understand the structure
|
||||
response['Key'] # ✅ Autocomplete works!
|
||||
response['Nonexistent'] # ❌ Mypy error: Key 'Nonexistent' not found
|
||||
```
|
||||
|
||||
### DeltaGlider-Specific Metadata

Store in standard boto3 `Metadata` field:

```python
{
    'Key': 'file.zip',
    'Size': 1024,
    'Metadata': {
        # DeltaGlider-specific fields (prefixed for safety)
        'deltaglider-is-delta': 'true',
        'deltaglider-compression-ratio': '0.99',
        'deltaglider-original-size': '100000',
        'deltaglider-reference-key': 'releases/v1.0.0/reference.bin',
    }
}
```

This is:
- ✅ boto3-compatible (Metadata is a standard field)
- ✅ Namespaced (deltaglider- prefix prevents conflicts)
- ✅ Optional (tools can ignore it)
- ✅ Type-safe (Metadata: NotRequired[dict[str, str]])
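
A short sketch of how a caller could read these fields back, assuming they are surfaced through the standard `Metadata` dict of a `head_object` response as described above:

```python
from deltaglider import create_client

client = create_client()
head = client.head_object(Bucket="releases", Key="v2.0.0/my-app.zip")

meta = head.get("Metadata", {})
if meta.get("deltaglider-is-delta") == "true":
    ratio = meta.get("deltaglider-compression-ratio", "unknown")
    original = meta.get("deltaglider-original-size", "unknown")
    print(f"Delta-compressed object: ratio={ratio}, original size={original} bytes")
else:
    print("Stored directly (no delta)")
```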

## Status

- ✅ **Phase 1:** TypedDict definitions created
- ✅ **Phase 2:** `list_objects()` refactored to return boto3-compatible dict
- ⏳ **Phase 3:** Refactor remaining methods (`put_object`, `get_object`, etc.) (TODO)
- ⏳ **Phase 4:** Backward compatibility with deprecation warnings (TODO)
- ⏳ **Phase 5:** Documentation updates (TODO)
- ⏳ **Phase 6:** Full test coverage updates (PARTIAL - list_objects tests done)

**Current:** v4.2.3+ (Phase 2 complete - `list_objects()` boto3-compatible)

**Target:** v5.0.0 release (all phases complete)
@@ -1,21 +1,23 @@
|
||||
# AWS S3 CLI Compatibility Plan for DeltaGlider
|
||||
# AWS S3 CLI Compatibility for DeltaGlider
|
||||
|
||||
## Current State
|
||||
|
||||
DeltaGlider currently provides a custom CLI with the following commands:
|
||||
DeltaGlider provides AWS S3 CLI compatible commands with automatic delta compression:
|
||||
|
||||
### Existing Commands
|
||||
- `deltaglider put <file> <s3_url>` - Upload file with delta compression
|
||||
- `deltaglider get <s3_url> [-o output]` - Download and reconstruct file
|
||||
### Commands
|
||||
- `deltaglider cp <source> <destination>` - Copy files with delta compression
|
||||
- `deltaglider ls [s3_url]` - List buckets and objects
|
||||
- `deltaglider rm <s3_url>` - Remove objects
|
||||
- `deltaglider sync <source> <destination>` - Synchronize directories
|
||||
- `deltaglider verify <s3_url>` - Verify file integrity
|
||||
|
||||
### Current Usage Examples
|
||||
```bash
|
||||
# Upload a file
|
||||
deltaglider put myfile.zip s3://bucket/path/to/file.zip
|
||||
deltaglider cp myfile.zip s3://bucket/path/to/file.zip
|
||||
|
||||
# Download a file (auto-detects .delta)
|
||||
deltaglider get s3://bucket/path/to/file.zip
|
||||
# Download a file
|
||||
deltaglider cp s3://bucket/path/to/file.zip .
|
||||
|
||||
# Verify integrity
|
||||
deltaglider verify s3://bucket/path/to/file.zip.delta
|
||||
@@ -168,18 +170,7 @@ Additional flags specific to DeltaGlider's delta compression:
|
||||
3. Create migration guide from aws-cli
|
||||
4. Performance benchmarks comparing to aws-cli
|
||||
|
||||
## Migration Path for Existing Users
|
||||
|
||||
### Alias Support During Transition
|
||||
```bash
|
||||
# Old command -> New command mapping
|
||||
deltaglider put FILE S3_URL -> deltaglider cp FILE S3_URL
|
||||
deltaglider get S3_URL -> deltaglider cp S3_URL .
|
||||
deltaglider verify S3_URL -> deltaglider ls --verify S3_URL
|
||||
```
|
||||
|
||||
### Environment Variables
|
||||
- `DELTAGLIDER_LEGACY_MODE=1` - Use old command syntax
|
||||
## Environment Variables
|
||||
- `DELTAGLIDER_AWS_COMPAT=1` - Strict AWS S3 CLI compatibility mode
|
||||
|
||||
## Success Criteria
|
||||
|
||||
@@ -57,7 +57,7 @@ aws s3 cp readonlyrest-1.66.1_es8.0.0.zip s3://releases/
|
||||
# Size on S3: 82.5MB
|
||||
|
||||
# With DeltaGlider
|
||||
deltaglider put readonlyrest-1.66.1_es8.0.0.zip s3://releases/
|
||||
deltaglider cp readonlyrest-1.66.1_es8.0.0.zip s3://releases/
|
||||
# Size on S3: 65KB (99.92% smaller!)
|
||||
```
|
||||
|
||||
@@ -186,7 +186,7 @@ This intelligence meant our 127,455 checksum files were uploaded directly, avoid
|
||||
```bash
|
||||
# Simple integration into our CI/CD
|
||||
- aws s3 cp $FILE s3://releases/
|
||||
+ deltaglider put $FILE s3://releases/
|
||||
+ deltaglider cp $FILE s3://releases/
|
||||
```
|
||||
|
||||
### Week 4: Full Migration
|
||||
@@ -253,10 +253,10 @@ Storage costs scale linearly with data growth. Without DeltaGlider:
|
||||
pip install deltaglider
|
||||
|
||||
# Upload a file (automatic compression)
|
||||
deltaglider put my-release-v1.0.0.zip s3://releases/
|
||||
deltaglider cp my-release-v1.0.0.zip s3://releases/
|
||||
|
||||
# Download (automatic reconstruction)
|
||||
deltaglider get s3://releases/my-release-v1.0.0.zip
|
||||
deltaglider cp s3://releases/my-release-v1.0.0.zip .
|
||||
|
||||
# It's that simple.
|
||||
```
|
||||
@@ -277,12 +277,12 @@ completely_different: 0% # No compression (uploaded as-is)
|
||||
**GitHub Actions**:
|
||||
```yaml
|
||||
- name: Upload Release
|
||||
run: deltaglider put dist/*.zip s3://releases/${{ github.ref_name }}/
|
||||
run: deltaglider cp dist/*.zip s3://releases/${{ github.ref_name }}/
|
||||
```
|
||||
|
||||
**Jenkins Pipeline**:
|
||||
```groovy
|
||||
sh "deltaglider put ${WORKSPACE}/target/*.jar s3://artifacts/"
|
||||
sh "deltaglider cp ${WORKSPACE}/target/*.jar s3://artifacts/"
|
||||
```
|
||||
|
||||
**Python Script**:
|
||||
@@ -327,7 +327,7 @@ python calculate_savings.py --path /your/releases
|
||||
# Try it yourself
|
||||
docker run -p 9000:9000 minio/minio # Local S3
|
||||
pip install deltaglider
|
||||
deltaglider put your-file.zip s3://test/
|
||||
deltaglider cp your-file.zip s3://test/
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
@@ -1,6 +1,14 @@
|
||||
# DeltaGlider Python SDK Documentation
|
||||
|
||||
The DeltaGlider Python SDK provides a simple, intuitive interface for integrating delta compression into your Python applications. Whether you're managing software releases, database backups, or any versioned binary data, DeltaGlider can reduce your storage costs by up to 99%.
|
||||
The DeltaGlider Python SDK provides a **boto3-compatible API for core S3 operations** (~20% of methods covering 80% of use cases), while achieving 99%+ compression for versioned artifacts through intelligent binary delta compression.
|
||||
|
||||
## 🎯 Key Highlights
|
||||
|
||||
- **boto3-Compatible Core API**: 21 essential S3 methods that work exactly like boto3
|
||||
- **99%+ Compression**: Automatically for versioned files and archives
|
||||
- **Familiar API**: If you know boto3, you already know DeltaGlider's core methods
|
||||
- **Full S3 Compatibility**: Works with AWS S3, MinIO, Cloudflare R2, and all S3-compatible storage
|
||||
- **See [BOTO3_COMPATIBILITY.md](../../BOTO3_COMPATIBILITY.md)**: For complete method coverage details
|
||||
|
||||
## Quick Links
|
||||
|
||||
@@ -11,33 +19,101 @@ The DeltaGlider Python SDK provides a simple, intuitive interface for integratin
|
||||
|
||||
## Overview
|
||||
|
||||
DeltaGlider provides two ways to interact with your S3 storage:
|
||||
DeltaGlider provides three ways to interact with your S3 storage:
|
||||
|
||||
### 1. boto3-Compatible API (Recommended) 🌟
|
||||
|
||||
Core boto3 S3 methods with automatic compression (see [BOTO3_COMPATIBILITY.md](../../BOTO3_COMPATIBILITY.md) for full list):
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
# Core boto3 S3 methods work exactly the same, with 99% compression!
|
||||
client = create_client()
|
||||
|
||||
# Standard boto3 S3 methods - just work!
|
||||
client.put_object(Bucket='releases', Key='v1.0.0/app.zip', Body=data)
|
||||
response = client.get_object(Bucket='releases', Key='v1.0.0/app.zip')
|
||||
|
||||
# Optimized list_objects with smart performance defaults (NEW!)
|
||||
# Fast by default - no unnecessary metadata fetching
|
||||
response = client.list_objects(Bucket='releases', Prefix='v1.0.0/')
|
||||
for obj in response['Contents']:
|
||||
print(f"{obj['Key']}: {obj['Size']} bytes")
|
||||
|
||||
# Pagination for large buckets
response = client.list_objects(Bucket='releases', MaxKeys=100)
while True:
    # Process current page
    for obj in response['Contents']:
        print(obj['Key'])
    if not response.get('IsTruncated'):
        break
    # Get next page
    response = client.list_objects(
        Bucket='releases',
        MaxKeys=100,
        ContinuationToken=response.get('NextContinuationToken')
    )
|
||||
|
||||
# Get detailed compression stats only when needed
|
||||
response = client.list_objects(Bucket='releases', FetchMetadata=True) # Slower but detailed
|
||||
|
||||
# Quick bucket statistics
|
||||
stats = client.get_bucket_stats('releases') # Fast overview
|
||||
stats = client.get_bucket_stats('releases', detailed_stats=True) # With compression metrics
|
||||
|
||||
client.delete_object(Bucket='releases', Key='old-version.zip')
|
||||
```
|
||||
|
||||
### 2. Simple API
|
||||
|
||||
For straightforward use cases:
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
summary = client.upload("my-app-v1.0.0.zip", "s3://releases/v1.0.0/")
|
||||
client.download("s3://releases/v1.0.0/my-app-v1.0.0.zip", "local.zip")
|
||||
```
|
||||
|
||||
### 3. CLI (Command Line Interface)
|
||||
|
||||
Drop-in replacement for AWS S3 CLI:
|
||||
|
||||
### 1. CLI (Command Line Interface)
|
||||
Drop-in replacement for AWS S3 CLI with automatic delta compression:
|
||||
```bash
|
||||
deltaglider cp my-app-v1.0.0.zip s3://releases/
|
||||
deltaglider ls s3://releases/
|
||||
deltaglider sync ./builds/ s3://releases/
|
||||
```
|
||||
|
||||
### 2. Python SDK
|
||||
Programmatic interface for Python applications:
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
## Migration from boto3
|
||||
|
||||
For core S3 operations, migrating is as simple as changing your import:
|
||||
|
||||
```python
|
||||
# Before (boto3)
|
||||
import boto3
|
||||
client = boto3.client('s3')
|
||||
client.put_object(Bucket='mybucket', Key='myfile.zip', Body=data)
|
||||
|
||||
# After (DeltaGlider) - Core methods work the same, with 99% compression!
|
||||
from deltaglider import create_client
|
||||
client = create_client()
|
||||
summary = client.upload("my-app-v1.0.0.zip", "s3://releases/v1.0.0/")
|
||||
print(f"Compressed from {summary.original_size_mb:.1f}MB to {summary.stored_size_mb:.1f}MB")
|
||||
client.put_object(Bucket='mybucket', Key='myfile.zip', Body=data)
|
||||
```
|
||||
|
||||
**Note**: DeltaGlider implements ~21 core S3 methods. For advanced features (versioning, ACLs, multipart uploads >5GB), use boto3 directly. See [BOTO3_COMPATIBILITY.md](../../BOTO3_COMPATIBILITY.md) for details.
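
A minimal sketch of that split (bucket name illustrative): keep DeltaGlider for object traffic and fall back to boto3 only for the advanced call you need.

```python
import boto3

from deltaglider import create_client

dg = create_client()        # compressed put/get/list/delete
s3 = boto3.client('s3')     # advanced features DeltaGlider does not wrap

dg.put_object(Bucket='releases', Key='v1.0.0/app.zip', Body=b'...')
s3.put_bucket_versioning(
    Bucket='releases',
    VersioningConfiguration={'Status': 'Enabled'},
)
```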
|
||||
|
||||
## Key Features
|
||||
|
||||
- **Core boto3 Compatibility**: 21 essential S3 methods work exactly as expected (~20% coverage, 80% use cases)
|
||||
- **99%+ Compression**: For versioned artifacts and similar files
|
||||
- **Drop-in Replacement**: Works with existing AWS S3 workflows
|
||||
- **Intelligent Detection**: Automatically determines when to use delta compression
|
||||
- **Data Integrity**: SHA256 verification on every operation
|
||||
- **S3 Compatible**: Works with AWS, MinIO, Cloudflare R2, and other S3-compatible storage
|
||||
- **Transparent**: Works with existing tools and workflows
|
||||
- **Production Ready**: Battle-tested with 200K+ files
|
||||
- **Thoroughly Tested**: 99 integration/unit tests with comprehensive coverage
|
||||
- **Type Safe**: Full mypy type checking, zero type errors
|
||||
|
||||
## When to Use DeltaGlider
|
||||
|
||||
@@ -69,7 +145,43 @@ export AWS_ENDPOINT_URL=http://localhost:9000
|
||||
|
||||
## Basic Usage
|
||||
|
||||
### Simple Upload/Download
|
||||
### boto3-Compatible Usage (Recommended)
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
# Create client (uses AWS credentials automatically)
|
||||
client = create_client()
|
||||
|
||||
# Upload using boto3 API
|
||||
with open('release-v2.0.0.zip', 'rb') as f:
|
||||
response = client.put_object(
|
||||
Bucket='releases',
|
||||
Key='v2.0.0/release.zip',
|
||||
Body=f,
|
||||
Metadata={'version': '2.0.0'}
|
||||
)
|
||||
|
||||
# Check compression stats (DeltaGlider extension)
|
||||
if 'DeltaGliderInfo' in response:
|
||||
info = response['DeltaGliderInfo']
|
||||
print(f"Saved {info['SavingsPercent']:.0f}% storage space")
|
||||
|
||||
# Download using boto3 API
|
||||
response = client.get_object(Bucket='releases', Key='v2.0.0/release.zip')
|
||||
with open('local-copy.zip', 'wb') as f:
|
||||
f.write(response['Body'].read())
|
||||
|
||||
# List objects
|
||||
response = client.list_objects(Bucket='releases', Prefix='v2.0.0/')
|
||||
for obj in response.get('Contents', []):
|
||||
print(f"{obj['Key']}: {obj['Size']} bytes")
|
||||
|
||||
# Delete object
|
||||
client.delete_object(Bucket='releases', Key='old-version.zip')
|
||||
```
|
||||
|
||||
### Simple API Usage
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
@@ -97,12 +209,44 @@ client = create_client(
|
||||
)
|
||||
```
|
||||
|
||||
## Real-World Example
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
# Core boto3 methods work exactly like boto3!
|
||||
client = create_client()
|
||||
|
||||
# Upload multiple software versions
|
||||
versions = ["v1.0.0", "v1.0.1", "v1.0.2", "v1.1.0"]
|
||||
for version in versions:
|
||||
with open(f"dist/my-app-{version}.zip", 'rb') as f:
|
||||
response = client.put_object(
|
||||
Bucket='releases',
|
||||
Key=f'{version}/my-app.zip',
|
||||
Body=f
|
||||
)
|
||||
|
||||
# DeltaGlider provides compression stats
|
||||
if 'DeltaGliderInfo' in response:
|
||||
info = response['DeltaGliderInfo']
|
||||
print(f"{version}: {info['StoredSizeMB']:.1f}MB "
|
||||
f"(saved {info['SavingsPercent']:.0f}%)")
|
||||
|
||||
# Result:
|
||||
# v1.0.0: 100.0MB (saved 0%) <- First file becomes reference
|
||||
# v1.0.1: 0.2MB (saved 99.8%) <- Only differences stored
|
||||
# v1.0.2: 0.3MB (saved 99.7%) <- Delta from reference
|
||||
# v1.1.0: 5.2MB (saved 94.8%) <- Larger changes, still huge savings
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
1. **First Upload**: The first file uploaded to a prefix becomes the reference
|
||||
2. **Delta Compression**: Subsequent similar files are compared using xdelta3
|
||||
3. **Smart Storage**: Only the differences (deltas) are stored
|
||||
4. **Transparent Reconstruction**: Files are automatically reconstructed on download
|
||||
5. **Core boto3 Compatibility**: Essential operations maintain full boto3 API compatibility
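
To picture steps 2-4, this is roughly the round-trip that the `xdelta3` tool performs (illustration only, with made-up filenames; DeltaGlider drives the equivalent internally and adds SHA256 verification on top):

```python
import subprocess

# Encode: store only the differences between the reference and the new build.
subprocess.run(
    ["xdelta3", "-e", "-s", "reference.zip", "new-build.zip", "new-build.zip.delta"],
    check=True,
)

# Decode: rebuild the full file from the reference plus the delta.
subprocess.run(
    ["xdelta3", "-d", "-s", "reference.zip", "new-build.zip.delta", "restored.zip"],
    check=True,
)
```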
|
||||
|
||||
## Performance
|
||||
|
||||
@@ -112,6 +256,41 @@ Based on real-world usage:
|
||||
- **Download Speed**: <100ms reconstruction
|
||||
- **Storage Savings**: 4TB → 5GB (ReadOnlyREST case study)
|
||||
|
||||
## Advanced Features
|
||||
|
||||
### Multipart Upload Support
|
||||
|
||||
```python
|
||||
# Large file uploads work automatically
|
||||
with open('large-file.zip', 'rb') as f:
|
||||
client.put_object(
|
||||
Bucket='backups',
|
||||
Key='database/backup.zip',
|
||||
Body=f # Handles multipart automatically for large files
|
||||
)
|
||||
```
|
||||
|
||||
### Batch Operations
|
||||
|
||||
```python
|
||||
# Upload multiple files efficiently
|
||||
files = ['app.zip', 'docs.zip', 'assets.zip']
|
||||
for file in files:
|
||||
with open(file, 'rb') as f:
|
||||
client.put_object(Bucket='releases', Key=file, Body=f)
|
||||
```
|
||||
|
||||
### Presigned URLs
|
||||
|
||||
```python
|
||||
# Generate presigned URLs for secure sharing
|
||||
url = client.generate_presigned_url(
|
||||
'get_object',
|
||||
Params={'Bucket': 'releases', 'Key': 'v1.0.0/app.zip'},
|
||||
ExpiresIn=3600
|
||||
)
|
||||
```
|
||||
|
||||
## Support
|
||||
|
||||
- GitHub Issues: [github.com/beshu-tech/deltaglider/issues](https://github.com/beshu-tech/deltaglider/issues)
|
||||
|
||||
273
docs/sdk/api.md
@@ -21,7 +21,6 @@ Factory function to create a configured DeltaGlider client with sensible default
|
||||
def create_client(
|
||||
endpoint_url: Optional[str] = None,
|
||||
log_level: str = "INFO",
|
||||
cache_dir: str = "/tmp/.deltaglider/cache",
|
||||
**kwargs
|
||||
) -> DeltaGliderClient
|
||||
```
|
||||
@@ -30,11 +29,12 @@ def create_client(
|
||||
|
||||
- **endpoint_url** (`Optional[str]`): S3 endpoint URL for MinIO, R2, or other S3-compatible storage. If None, uses AWS S3.
|
||||
- **log_level** (`str`): Logging verbosity level. Options: "DEBUG", "INFO", "WARNING", "ERROR". Default: "INFO".
|
||||
- **cache_dir** (`str`): Directory for local reference cache. Default: "/tmp/.deltaglider/cache".
|
||||
- **kwargs**: Additional arguments passed to `DeltaService`:
|
||||
- **tool_version** (`str`): Version string for metadata. Default: "deltaglider/0.1.0"
|
||||
- **max_ratio** (`float`): Maximum acceptable delta/file ratio. Default: 0.5
|
||||
|
||||
**Security Note**: DeltaGlider automatically uses ephemeral, process-isolated cache (`/tmp/deltaglider-*`) that is cleaned up on exit. No configuration needed.
|
||||
|
||||
#### Returns
|
||||
|
||||
`DeltaGliderClient`: Configured client instance ready for use.
|
||||
@@ -48,11 +48,8 @@ client = create_client()
|
||||
# Custom endpoint for MinIO
|
||||
client = create_client(endpoint_url="http://localhost:9000")
|
||||
|
||||
# Debug mode with custom cache
|
||||
client = create_client(
|
||||
log_level="DEBUG",
|
||||
cache_dir="/var/cache/deltaglider"
|
||||
)
|
||||
# Debug mode
|
||||
client = create_client(log_level="DEBUG")
|
||||
|
||||
# Custom delta ratio threshold
|
||||
client = create_client(max_ratio=0.3) # Only use delta if <30% of original
|
||||
@@ -75,7 +72,261 @@ class DeltaGliderClient:
|
||||
|
||||
**Note**: Use `create_client()` instead of instantiating directly.
|
||||
|
||||
### Methods
|
||||
### boto3-Compatible Methods (Recommended)
|
||||
|
||||
These methods provide compatibility with boto3's core S3 client operations. DeltaGlider implements 21 essential S3 methods covering ~80% of common use cases. See [BOTO3_COMPATIBILITY.md](../../BOTO3_COMPATIBILITY.md) for complete coverage details.
|
||||
|
||||
#### `list_objects`
|
||||
|
||||
List objects in a bucket with smart performance optimizations.
|
||||
|
||||
```python
|
||||
def list_objects(
|
||||
self,
|
||||
Bucket: str,
|
||||
Prefix: str = "",
|
||||
Delimiter: str = "",
|
||||
MaxKeys: int = 1000,
|
||||
ContinuationToken: Optional[str] = None,
|
||||
StartAfter: Optional[str] = None,
|
||||
FetchMetadata: bool = False,
|
||||
**kwargs
|
||||
) -> dict[str, Any]
|
||||
```
|
||||
|
||||
##### Parameters
|
||||
|
||||
- **Bucket** (`str`): S3 bucket name.
|
||||
- **Prefix** (`str`): Filter results to keys beginning with prefix.
|
||||
- **Delimiter** (`str`): Delimiter for grouping keys (e.g., '/' for folders).
|
||||
- **MaxKeys** (`int`): Maximum number of keys to return (for pagination). Default: 1000.
|
||||
- **ContinuationToken** (`Optional[str]`): Token from previous response for pagination.
|
||||
- **StartAfter** (`Optional[str]`): Start listing after this key (alternative pagination).
|
||||
- **FetchMetadata** (`bool`): If True, fetch compression metadata for delta files only. Default: False.
|
||||
- **IMPORTANT**: Non-delta files NEVER trigger metadata fetching (no performance impact).
|
||||
- With `FetchMetadata=False`: ~50ms for 1000 objects (1 API call)
|
||||
- With `FetchMetadata=True`: ~2-3s for 1000 objects (1 + N delta files API calls)
|
||||
|
||||
##### Performance Optimization
|
||||
|
||||
The method intelligently optimizes performance by:
|
||||
1. **Never** fetching metadata for non-delta files (they don't need it)
|
||||
2. Only fetching metadata for delta files when explicitly requested
|
||||
3. Supporting efficient pagination for large buckets
|
||||
|
||||
##### Returns
|
||||
|
||||
boto3-compatible dict with:
|
||||
- **Contents** (`list[dict]`): List of S3Object dicts with Key, Size, LastModified, Metadata
|
||||
- **CommonPrefixes** (`list[dict]`): Optional list of common prefixes (folders)
|
||||
- **IsTruncated** (`bool`): Whether more results are available
|
||||
- **NextContinuationToken** (`str`): Token for next page
|
||||
- **KeyCount** (`int`): Number of keys returned
|
||||
|
||||
##### Examples
|
||||
|
||||
```python
|
||||
# Fast listing for UI display (no metadata fetching)
|
||||
response = client.list_objects(Bucket='releases')
|
||||
for obj in response['Contents']:
|
||||
print(f"{obj['Key']}: {obj['Size']} bytes")
|
||||
|
||||
# Paginated listing for large buckets
response = client.list_objects(Bucket='releases', MaxKeys=100)
while True:
    for obj in response['Contents']:
        print(obj['Key'])
    if not response.get('IsTruncated'):
        break
    response = client.list_objects(
        Bucket='releases',
        MaxKeys=100,
        ContinuationToken=response.get('NextContinuationToken')
    )
|
||||
|
||||
# Get detailed compression stats (slower, only for analytics)
|
||||
response = client.list_objects(
|
||||
Bucket='releases',
|
||||
FetchMetadata=True # Only fetches for delta files
|
||||
)
|
||||
for obj in response['Contents']:
|
||||
metadata = obj.get('Metadata', {})
|
||||
if metadata.get('deltaglider-is-delta') == 'true':
|
||||
compression = metadata.get('deltaglider-compression-ratio', 'unknown')
|
||||
print(f"{obj['Key']}: {compression} compression")
|
||||
```
|
||||
|
||||
#### `get_bucket_stats`
|
||||
|
||||
Get statistics for a bucket with optional detailed compression metrics.
|
||||
|
||||
```python
|
||||
def get_bucket_stats(
|
||||
self,
|
||||
bucket: str,
|
||||
detailed_stats: bool = False
|
||||
) -> BucketStats
|
||||
```
|
||||
|
||||
##### Parameters
|
||||
|
||||
- **bucket** (`str`): S3 bucket name.
|
||||
- **detailed_stats** (`bool`): If True, fetch accurate compression ratios for delta files. Default: False.
|
||||
- With `detailed_stats=False`: ~50ms for any bucket size (LIST calls only)
|
||||
- With `detailed_stats=True`: ~2-3s per 1000 objects (adds HEAD calls for delta files)
|
||||
|
||||
##### Examples
|
||||
|
||||
```python
|
||||
# Quick stats for dashboard display
|
||||
stats = client.get_bucket_stats('releases')
|
||||
print(f"Objects: {stats.object_count}, Size: {stats.total_size}")
|
||||
|
||||
# Detailed stats for analytics (slower but accurate)
|
||||
stats = client.get_bucket_stats('releases', detailed_stats=True)
|
||||
print(f"Compression ratio: {stats.average_compression_ratio:.1%}")
|
||||
```
|
||||
|
||||
#### `put_object`
|
||||
|
||||
Upload an object to S3 with automatic delta compression (boto3-compatible).
|
||||
|
||||
```python
|
||||
def put_object(
|
||||
self,
|
||||
Bucket: str,
|
||||
Key: str,
|
||||
Body: bytes | str | Path | None = None,
|
||||
Metadata: Optional[Dict[str, str]] = None,
|
||||
ContentType: Optional[str] = None,
|
||||
**kwargs
|
||||
) -> Dict[str, Any]
|
||||
```
|
||||
|
||||
##### Parameters
|
||||
|
||||
- **Bucket** (`str`): S3 bucket name.
|
||||
- **Key** (`str`): Object key (path in bucket).
|
||||
- **Body** (`bytes | str | Path`): Object data.
|
||||
- **Metadata** (`Optional[Dict[str, str]]`): Custom metadata.
|
||||
- **ContentType** (`Optional[str]`): MIME type (for compatibility).
|
||||
|
||||
##### Returns
|
||||
|
||||
Dict with ETag and DeltaGlider compression info.
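
##### Example

A minimal usage sketch; the bucket and key names are illustrative, and `DeltaGliderInfo` is only present when compression details are available (as shown in the SDK overview).

```python
with open('release-v2.0.0.zip', 'rb') as f:
    response = client.put_object(
        Bucket='releases',
        Key='v2.0.0/release.zip',
        Body=f,
        Metadata={'version': '2.0.0'},
    )

print(response['ETag'])
info = response.get('DeltaGliderInfo', {})
if info:
    print(f"Saved {info['SavingsPercent']:.0f}% storage space")
```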
|
||||
|
||||
#### `get_object`
|
||||
|
||||
Download an object from S3 with automatic delta reconstruction (boto3-compatible).
|
||||
|
||||
```python
|
||||
def get_object(
|
||||
self,
|
||||
Bucket: str,
|
||||
Key: str,
|
||||
**kwargs
|
||||
) -> Dict[str, Any]
|
||||
```
|
||||
|
||||
##### Returns
|
||||
|
||||
Dict with Body stream and metadata (identical to boto3).
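
##### Example

Usage sketch (names illustrative): read the `Body` stream exactly as you would with boto3.

```python
response = client.get_object(Bucket='releases', Key='v2.0.0/release.zip')
with open('local-copy.zip', 'wb') as f:
    f.write(response['Body'].read())

print(response.get('Metadata', {}))
```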
|
||||
|
||||
#### `create_bucket`
|
||||
|
||||
Create an S3 bucket (boto3-compatible).
|
||||
|
||||
```python
|
||||
def create_bucket(
|
||||
self,
|
||||
Bucket: str,
|
||||
CreateBucketConfiguration: Optional[Dict[str, str]] = None,
|
||||
**kwargs
|
||||
) -> Dict[str, Any]
|
||||
```
|
||||
|
||||
##### Parameters
|
||||
|
||||
- **Bucket** (`str`): Name of the bucket to create.
|
||||
- **CreateBucketConfiguration** (`Optional[Dict[str, str]]`): Bucket configuration with optional LocationConstraint.
|
||||
|
||||
##### Returns
|
||||
|
||||
Dict with Location of created bucket.
|
||||
|
||||
##### Notes
|
||||
|
||||
- Idempotent: Creating an existing bucket returns success
|
||||
- Use for basic bucket creation without advanced S3 features
|
||||
|
||||
##### Examples
|
||||
|
||||
```python
|
||||
# Create bucket in default region
|
||||
client.create_bucket(Bucket='my-releases')
|
||||
|
||||
# Create bucket in specific region
|
||||
client.create_bucket(
|
||||
Bucket='my-backups',
|
||||
CreateBucketConfiguration={'LocationConstraint': 'eu-west-1'}
|
||||
)
|
||||
```
|
||||
|
||||
#### `delete_bucket`
|
||||
|
||||
Delete an S3 bucket (boto3-compatible).
|
||||
|
||||
```python
|
||||
def delete_bucket(
|
||||
self,
|
||||
Bucket: str,
|
||||
**kwargs
|
||||
) -> Dict[str, Any]
|
||||
```
|
||||
|
||||
##### Parameters
|
||||
|
||||
- **Bucket** (`str`): Name of the bucket to delete.
|
||||
|
||||
##### Returns
|
||||
|
||||
Dict confirming deletion.
|
||||
|
||||
##### Notes
|
||||
|
||||
- Idempotent: Deleting a non-existent bucket returns success
|
||||
- Bucket must be empty before deletion
|
||||
|
||||
##### Examples
|
||||
|
||||
```python
|
||||
# Delete empty bucket
|
||||
client.delete_bucket(Bucket='old-releases')
|
||||
```
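
If the bucket may still hold objects, one pattern is to empty it first with the documented `list_objects`/`delete_object` calls (sketch; bucket name illustrative, and large buckets would need pagination):

```python
bucket = 'old-releases'

# Delete every object, then the bucket itself.
response = client.list_objects(Bucket=bucket)
for obj in response.get('Contents', []):
    client.delete_object(Bucket=bucket, Key=obj['Key'])

client.delete_bucket(Bucket=bucket)
```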
|
||||
|
||||
#### `list_buckets`
|
||||
|
||||
List all S3 buckets (boto3-compatible).
|
||||
|
||||
```python
|
||||
def list_buckets(
|
||||
self,
|
||||
**kwargs
|
||||
) -> Dict[str, Any]
|
||||
```
|
||||
|
||||
##### Returns
|
||||
|
||||
Dict with list of buckets and owner information (identical to boto3).
|
||||
|
||||
##### Examples
|
||||
|
||||
```python
|
||||
# List all buckets
|
||||
response = client.list_buckets()
|
||||
for bucket in response['Buckets']:
|
||||
print(f"{bucket['Name']} - Created: {bucket['CreationDate']}")
|
||||
```
|
||||
|
||||
### Simple API Methods
|
||||
|
||||
#### `upload`
|
||||
|
||||
@@ -472,9 +723,10 @@ DeltaGlider respects these environment variables:
|
||||
### DeltaGlider Configuration
|
||||
|
||||
- **DG_LOG_LEVEL**: Logging level (DEBUG, INFO, WARNING, ERROR)
|
||||
- **DG_CACHE_DIR**: Local cache directory
|
||||
- **DG_MAX_RATIO**: Default maximum delta ratio
|
||||
|
||||
**Note**: Cache is automatically managed (ephemeral, process-isolated) and requires no configuration.
|
||||
|
||||
### Example
|
||||
|
||||
```bash
|
||||
@@ -485,10 +737,9 @@ export AWS_SECRET_ACCESS_KEY=minioadmin
|
||||
|
||||
# Configure DeltaGlider
|
||||
export DG_LOG_LEVEL=DEBUG
|
||||
export DG_CACHE_DIR=/var/cache/deltaglider
|
||||
export DG_MAX_RATIO=0.3
|
||||
|
||||
# Now use normally
|
||||
# Now use normally (cache managed automatically)
|
||||
python my_script.py
|
||||
```
|
||||
|
||||
|
||||
@@ -4,14 +4,294 @@ Real-world examples and patterns for using DeltaGlider in production application
|
||||
|
||||
## Table of Contents
|
||||
|
||||
1. [Software Release Management](#software-release-management)
|
||||
2. [Database Backup System](#database-backup-system)
|
||||
3. [CI/CD Pipeline Integration](#cicd-pipeline-integration)
|
||||
4. [Container Registry Storage](#container-registry-storage)
|
||||
5. [Machine Learning Model Versioning](#machine-learning-model-versioning)
|
||||
6. [Game Asset Distribution](#game-asset-distribution)
|
||||
7. [Log Archive Management](#log-archive-management)
|
||||
8. [Multi-Region Replication](#multi-region-replication)
|
||||
1. [Performance-Optimized Bucket Listing](#performance-optimized-bucket-listing)
|
||||
2. [Bucket Management](#bucket-management)
|
||||
3. [Software Release Management](#software-release-management)
|
||||
4. [Database Backup System](#database-backup-system)
|
||||
5. [CI/CD Pipeline Integration](#cicd-pipeline-integration)
|
||||
6. [Container Registry Storage](#container-registry-storage)
|
||||
7. [Machine Learning Model Versioning](#machine-learning-model-versioning)
|
||||
8. [Game Asset Distribution](#game-asset-distribution)
|
||||
9. [Log Archive Management](#log-archive-management)
|
||||
10. [Multi-Region Replication](#multi-region-replication)
|
||||
|
||||
## Performance-Optimized Bucket Listing
|
||||
|
||||
DeltaGlider's smart `list_objects` method eliminates the N+1 query problem by intelligently managing metadata fetching.
|
||||
|
||||
### Fast Web UI Listing (No Metadata)
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
import time
|
||||
|
||||
client = create_client()
|
||||
|
||||
def fast_bucket_listing(bucket: str):
|
||||
"""Ultra-fast listing for web UI display (~50ms for 1000 objects)."""
|
||||
start = time.time()
|
||||
|
||||
# Default: FetchMetadata=False - no HEAD requests
|
||||
response = client.list_objects(
|
||||
Bucket=bucket,
|
||||
MaxKeys=100 # Pagination for UI
|
||||
)
|
||||
|
||||
# Process objects for display
|
||||
items = []
|
||||
for obj in response.contents:
|
||||
items.append({
|
||||
"key": obj.key,
|
||||
"size": obj.size,
|
||||
"last_modified": obj.last_modified,
|
||||
"is_delta": obj.is_delta, # Determined from filename
|
||||
# No compression_ratio - would require HEAD request
|
||||
})
|
||||
|
||||
elapsed = time.time() - start
|
||||
print(f"Listed {len(items)} objects in {elapsed*1000:.0f}ms")
|
||||
|
||||
return items, response.next_continuation_token
|
||||
|
||||
# Example: List first page
|
||||
items, next_token = fast_bucket_listing('releases')
|
||||
```
|
||||
|
||||
### Paginated Listing for Large Buckets
|
||||
|
||||
```python
|
||||
def paginated_listing(bucket: str, page_size: int = 50):
|
||||
"""Efficiently paginate through large buckets."""
|
||||
all_objects = []
|
||||
continuation_token = None
|
||||
|
||||
while True:
|
||||
response = client.list_objects(
|
||||
Bucket=bucket,
|
||||
MaxKeys=page_size,
|
||||
ContinuationToken=continuation_token,
|
||||
FetchMetadata=False # Keep it fast
|
||||
)
|
||||
|
||||
all_objects.extend(response.contents)
|
||||
|
||||
if not response.is_truncated:
|
||||
break
|
||||
|
||||
continuation_token = response.next_continuation_token
|
||||
print(f"Fetched {len(all_objects)} objects so far...")
|
||||
|
||||
return all_objects
|
||||
|
||||
# Example: List all objects efficiently
|
||||
all_objects = paginated_listing('releases', page_size=100)
|
||||
print(f"Total objects: {len(all_objects)}")
|
||||
```
|
||||
|
||||
### Analytics Dashboard with Compression Stats
|
||||
|
||||
```python
|
||||
def dashboard_with_stats(bucket: str):
|
||||
"""Dashboard view with optional detailed stats."""
|
||||
|
||||
# Quick overview (fast - no metadata)
|
||||
stats = client.get_bucket_stats(bucket, detailed_stats=False)
|
||||
|
||||
print(f"Quick Stats for {bucket}:")
|
||||
print(f" Total Objects: {stats.object_count}")
|
||||
print(f" Delta Files: {stats.delta_objects}")
|
||||
print(f" Regular Files: {stats.direct_objects}")
|
||||
print(f" Total Size: {stats.total_size / (1024**3):.2f} GB")
|
||||
print(f" Stored Size: {stats.compressed_size / (1024**3):.2f} GB")
|
||||
|
||||
# Detailed compression analysis (slower - fetches metadata for deltas only)
|
||||
if stats.delta_objects > 0:
|
||||
detailed_stats = client.get_bucket_stats(bucket, detailed_stats=True)
|
||||
print(f"\nDetailed Compression Stats:")
|
||||
print(f" Average Compression: {detailed_stats.average_compression_ratio:.1%}")
|
||||
print(f" Space Saved: {detailed_stats.space_saved / (1024**3):.2f} GB")
|
||||
|
||||
# Example usage
|
||||
dashboard_with_stats('releases')
|
||||
```
|
||||
|
||||
### Smart Metadata Fetching for Analytics
|
||||
|
||||
```python
|
||||
def compression_analysis(bucket: str, prefix: str = ""):
|
||||
"""Analyze compression effectiveness with selective metadata fetching."""
|
||||
|
||||
# Only fetch metadata when we need compression stats
|
||||
response = client.list_objects(
|
||||
Bucket=bucket,
|
||||
Prefix=prefix,
|
||||
FetchMetadata=True # Fetches metadata ONLY for .delta files
|
||||
)
|
||||
|
||||
# Analyze compression effectiveness
|
||||
delta_files = [obj for obj in response.contents if obj.is_delta]
|
||||
|
||||
if delta_files:
|
||||
total_original = sum(obj.original_size for obj in delta_files)
|
||||
total_compressed = sum(obj.compressed_size for obj in delta_files)
|
||||
avg_ratio = (total_original - total_compressed) / total_original
|
||||
|
||||
print(f"Compression Analysis for {prefix or 'all files'}:")
|
||||
print(f" Delta Files: {len(delta_files)}")
|
||||
print(f" Original Size: {total_original / (1024**2):.1f} MB")
|
||||
print(f" Compressed Size: {total_compressed / (1024**2):.1f} MB")
|
||||
print(f" Average Compression: {avg_ratio:.1%}")
|
||||
|
||||
# Find best and worst compression
|
||||
best = max(delta_files, key=lambda x: x.compression_ratio or 0)
|
||||
worst = min(delta_files, key=lambda x: x.compression_ratio or 1)
|
||||
|
||||
print(f" Best Compression: {best.key} ({best.compression_ratio:.1%})")
|
||||
print(f" Worst Compression: {worst.key} ({worst.compression_ratio:.1%})")
|
||||
|
||||
# Example: Analyze v2.0 releases
|
||||
compression_analysis('releases', 'v2.0/')
|
||||
```
|
||||
|
||||
### Performance Comparison
|
||||
|
||||
```python
|
||||
def performance_comparison(bucket: str):
|
||||
"""Compare performance with and without metadata fetching."""
|
||||
import time
|
||||
|
||||
# Test 1: Fast listing (no metadata)
|
||||
start = time.time()
|
||||
response_fast = client.list_objects(
|
||||
Bucket=bucket,
|
||||
MaxKeys=100,
|
||||
FetchMetadata=False # Default
|
||||
)
|
||||
time_fast = (time.time() - start) * 1000
|
||||
|
||||
# Test 2: Detailed listing (with metadata for deltas)
|
||||
start = time.time()
|
||||
response_detailed = client.list_objects(
|
||||
Bucket=bucket,
|
||||
MaxKeys=100,
|
||||
FetchMetadata=True # Fetches for delta files only
|
||||
)
|
||||
time_detailed = (time.time() - start) * 1000
|
||||
|
||||
delta_count = sum(1 for obj in response_fast.contents if obj.is_delta)
|
||||
|
||||
print(f"Performance Comparison for {bucket}:")
|
||||
print(f" Fast Listing: {time_fast:.0f}ms (1 API call)")
|
||||
print(f" Detailed Listing: {time_detailed:.0f}ms (1 + {delta_count} API calls)")
|
||||
print(f" Speed Improvement: {time_detailed/time_fast:.1f}x slower with metadata")
|
||||
print(f"\nRecommendation: Use FetchMetadata=True only when you need:")
|
||||
print(" - Exact original file sizes for delta files")
|
||||
print(" - Accurate compression ratios")
|
||||
print(" - Reference key information")
|
||||
|
||||
# Example: Compare performance
|
||||
performance_comparison('releases')
|
||||
```
|
||||
|
||||
### Best Practices
|
||||
|
||||
1. **Default to Fast Mode**: Always use `FetchMetadata=False` (default) unless you specifically need compression stats.
|
||||
|
||||
2. **Never Fetch for Non-Deltas**: The SDK automatically skips metadata fetching for non-delta files even when `FetchMetadata=True`.
|
||||
|
||||
3. **Use Pagination**: For large buckets, use `MaxKeys` and `ContinuationToken` to paginate results.
|
||||
|
||||
4. **Cache Results**: If you need metadata frequently, consider caching the results to avoid repeated HEAD requests. A minimal caching sketch follows this list.
|
||||
|
||||
5. **Batch Analytics**: When doing analytics, fetch metadata once and process the results rather than making multiple calls.
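
A minimal sketch of practice 4 (the cache shape and TTL are illustrative; `client` is the DeltaGlider client created above):

```python
import time

_listing_cache: dict[str, tuple[float, dict]] = {}

def cached_detailed_listing(bucket: str, ttl_seconds: float = 300.0) -> dict:
    """Reuse a FetchMetadata=True listing so repeated calls skip the HEAD requests."""
    hit = _listing_cache.get(bucket)
    if hit and time.time() - hit[0] < ttl_seconds:
        return hit[1]
    response = client.list_objects(Bucket=bucket, FetchMetadata=True)
    _listing_cache[bucket] = (time.time(), response)
    return response
```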
|
||||
|
||||
## Bucket Management
|
||||
|
||||
DeltaGlider provides boto3-compatible bucket management methods for creating, listing, and deleting buckets without requiring boto3.
|
||||
|
||||
### Complete Bucket Lifecycle
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Create bucket
|
||||
client.create_bucket(Bucket='my-releases')
|
||||
|
||||
# Create bucket in specific region
|
||||
client.create_bucket(
|
||||
Bucket='eu-backups',
|
||||
CreateBucketConfiguration={'LocationConstraint': 'eu-west-1'}
|
||||
)
|
||||
|
||||
# List all buckets
|
||||
response = client.list_buckets()
|
||||
for bucket in response['Buckets']:
|
||||
print(f"{bucket['Name']} - Created: {bucket['CreationDate']}")
|
||||
|
||||
# Upload some objects
|
||||
with open('app-v1.0.0.zip', 'rb') as f:
|
||||
client.put_object(Bucket='my-releases', Key='v1.0.0/app.zip', Body=f)
|
||||
|
||||
# Delete objects first (bucket must be empty)
|
||||
client.delete_object(Bucket='my-releases', Key='v1.0.0/app.zip')
|
||||
|
||||
# Delete bucket
|
||||
client.delete_bucket(Bucket='my-releases')
|
||||
```
|
||||
|
||||
### Idempotent Operations
|
||||
|
||||
Bucket management operations are idempotent for safe automation:
|
||||
|
||||
```python
|
||||
# Creating existing bucket returns success (no error)
|
||||
client.create_bucket(Bucket='my-releases')
|
||||
client.create_bucket(Bucket='my-releases') # Safe, returns success
|
||||
|
||||
# Deleting non-existent bucket returns success (no error)
|
||||
client.delete_bucket(Bucket='non-existent') # Safe, returns success
|
||||
```
|
||||
|
||||
### Hybrid boto3/DeltaGlider Usage
|
||||
|
||||
For advanced S3 features not in DeltaGlider's 21 core methods, use boto3 directly:
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
import boto3
import json
|
||||
|
||||
# DeltaGlider for core operations with compression
|
||||
dg_client = create_client()
|
||||
|
||||
# boto3 for advanced features
|
||||
s3_client = boto3.client('s3')
|
||||
|
||||
# Use DeltaGlider for object operations (with compression)
|
||||
with open('release.zip', 'rb') as f:
|
||||
dg_client.put_object(Bucket='releases', Key='v1.0.0/release.zip', Body=f)
|
||||
|
||||
# Use boto3 for advanced bucket features
|
||||
s3_client.put_bucket_versioning(
|
||||
Bucket='releases',
|
||||
VersioningConfiguration={'Status': 'Enabled'}
|
||||
)
|
||||
|
||||
# Use boto3 for bucket policies
|
||||
policy = {
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [{
|
||||
"Effect": "Allow",
|
||||
"Principal": "*",
|
||||
"Action": "s3:GetObject",
|
||||
"Resource": "arn:aws:s3:::releases/*"
|
||||
}]
|
||||
}
|
||||
s3_client.put_bucket_policy(Bucket='releases', Policy=json.dumps(policy))
|
||||
```
|
||||
|
||||
See [BOTO3_COMPATIBILITY.md](../../BOTO3_COMPATIBILITY.md) for complete method coverage.
|
||||
|
||||
## Software Release Management
|
||||
|
||||
|
||||
@@ -69,6 +69,38 @@ Or via environment variable:
|
||||
export AWS_ENDPOINT_URL=http://minio.local:9000
|
||||
```
|
||||
|
||||
### DeltaGlider Configuration
|
||||
|
||||
DeltaGlider supports the following environment variables:
|
||||
|
||||
**Logging & Performance**:
|
||||
- `DG_LOG_LEVEL`: Logging level (default: `INFO`, options: `DEBUG`, `INFO`, `WARNING`, `ERROR`)
|
||||
- `DG_MAX_RATIO`: Maximum delta/file ratio (default: `0.5`, range: `0.0-1.0`)
|
||||
|
||||
**Cache Configuration**:
|
||||
- `DG_CACHE_BACKEND`: Cache backend type (default: `filesystem`, options: `filesystem`, `memory`)
|
||||
- `DG_CACHE_MEMORY_SIZE_MB`: Memory cache size in MB (default: `100`)
|
||||
- `DG_CACHE_ENCRYPTION_KEY`: Optional base64-encoded Fernet key for persistent encryption
|
||||
|
||||
**Security**:
|
||||
- Encryption is **always enabled** (cannot be disabled)
|
||||
- Ephemeral encryption keys per process (forward secrecy)
|
||||
- Corrupted cache files automatically deleted
|
||||
- Set `DG_CACHE_ENCRYPTION_KEY` only for cross-process cache sharing
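
If you do need a persistent key for cross-process sharing, one way to generate it (a sketch, assuming the documented base64-encoded Fernet key format) is with the `cryptography` package DeltaGlider already depends on:

```python
from cryptography.fernet import Fernet

# Print a base64-encoded Fernet key once, then export it as
# DG_CACHE_ENCRYPTION_KEY in every process that should share the cache.
print(Fernet.generate_key().decode())
```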
|
||||
|
||||
**Example**:
|
||||
```bash
|
||||
# Use memory cache for faster performance in CI/CD
|
||||
export DG_CACHE_BACKEND=memory
|
||||
export DG_CACHE_MEMORY_SIZE_MB=500
|
||||
|
||||
# Enable debug logging
|
||||
export DG_LOG_LEVEL=DEBUG
|
||||
|
||||
# Adjust delta compression threshold
|
||||
export DG_MAX_RATIO=0.3 # More aggressive compression
|
||||
```
|
||||
|
||||
## Your First Upload
|
||||
|
||||
### Basic Example
|
||||
|
||||
64
examples/boto3_compatible_types.py
Normal file
@@ -0,0 +1,64 @@
|
||||
"""Example: Using boto3-compatible responses without importing boto3.
|
||||
|
||||
This demonstrates how DeltaGlider provides full type safety and boto3 compatibility
|
||||
without requiring boto3 imports in user code.
|
||||
|
||||
As of v5.0.0, DeltaGlider returns plain dicts (not custom dataclasses) that are
|
||||
100% compatible with boto3 S3 responses. You get IDE autocomplete through TypedDict
|
||||
type hints without any runtime overhead.
|
||||
"""
|
||||
|
||||
from deltaglider import ListObjectsV2Response, S3Object, create_client
|
||||
|
||||
# Create client (no boto3 import needed!)
|
||||
client = create_client()
|
||||
|
||||
# Type hints work perfectly without boto3
|
||||
def process_files(bucket: str, prefix: str) -> None:
|
||||
"""Process files in S3 with full type safety."""
|
||||
# Return type is fully typed - IDE autocomplete works!
|
||||
response: ListObjectsV2Response = client.list_objects(
|
||||
Bucket=bucket, Prefix=prefix, Delimiter="/"
|
||||
)
|
||||
|
||||
# Response is a plain dict - 100% boto3-compatible
|
||||
# TypedDict provides autocomplete and type checking
|
||||
for obj in response["Contents"]:
|
||||
# obj is typed as S3Object - all fields have autocomplete!
|
||||
key: str = obj["Key"] # ✅ IDE knows this is str
|
||||
size: int = obj["Size"] # ✅ IDE knows this is int
|
||||
print(f"{key}: {size} bytes")
|
||||
|
||||
# DeltaGlider metadata is in the standard Metadata field
|
||||
metadata = obj.get("Metadata", {})
|
||||
if metadata.get("deltaglider-is-delta") == "true":
|
||||
compression = metadata.get("deltaglider-compression-ratio", "unknown")
|
||||
print(f" └─ Delta file (compression: {compression})")
|
||||
|
||||
# Optional fields work too
|
||||
for prefix_dict in response.get("CommonPrefixes", []):
|
||||
print(f"Directory: {prefix_dict['Prefix']}")
|
||||
|
||||
# Pagination info
|
||||
if response.get("IsTruncated"):
|
||||
next_token = response.get("NextContinuationToken")
|
||||
print(f"More results available, token: {next_token}")
|
||||
|
||||
|
||||
# This is 100% compatible with boto3 code!
|
||||
def works_with_boto3_or_deltaglider(s3_client) -> None:
|
||||
"""This function works with EITHER boto3 or DeltaGlider client."""
|
||||
# Because the response structure is identical!
|
||||
response = s3_client.list_objects(Bucket="my-bucket")
|
||||
|
||||
for obj in response["Contents"]:
|
||||
print(obj["Key"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Example usage
|
||||
print("✅ Full type safety without boto3 imports!")
|
||||
print("✅ 100% compatible with boto3")
|
||||
print("✅ Drop-in replacement")
|
||||
print("✅ Plain dict responses (not custom dataclasses)")
|
||||
print("✅ DeltaGlider metadata in standard Metadata field")
|
||||
116
examples/bucket_management.py
Normal file
@@ -0,0 +1,116 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Example: Bucket management without boto3.
|
||||
|
||||
This example shows how to use DeltaGlider's bucket management APIs
|
||||
to create, list, and delete buckets without needing boto3 directly.
|
||||
"""
|
||||
|
||||
from deltaglider import create_client
|
||||
|
||||
# Create client (works with AWS S3, MinIO, or any S3-compatible storage)
|
||||
client = create_client()
|
||||
|
||||
# For local MinIO/S3-compatible storage:
|
||||
# client = create_client(endpoint_url='http://localhost:9000')
|
||||
|
||||
print("=" * 70)
|
||||
print("DeltaGlider Bucket Management Example")
|
||||
print("=" * 70)
|
||||
|
||||
# 1. List existing buckets
|
||||
print("\n1. List all buckets:")
|
||||
try:
|
||||
response = client.list_buckets()
|
||||
if response["Buckets"]:
|
||||
for bucket in response["Buckets"]:
|
||||
print(f" - {bucket['Name']} (created: {bucket.get('CreationDate', 'unknown')})")
|
||||
else:
|
||||
print(" No buckets found")
|
||||
except Exception as e:
|
||||
print(f" Error: {e}")
|
||||
|
||||
# 2. Create a new bucket
|
||||
bucket_name = "my-deltaglider-bucket"
|
||||
print(f"\n2. Create bucket '{bucket_name}':")
|
||||
try:
|
||||
response = client.create_bucket(Bucket=bucket_name)
|
||||
print(f" ✅ Created: {response['Location']}")
|
||||
except Exception as e:
|
||||
print(f" Error: {e}")
|
||||
|
||||
# 3. Create bucket with region (if using AWS)
|
||||
# Uncomment for AWS S3:
|
||||
# print("\n3. Create bucket in specific region:")
|
||||
# try:
|
||||
# response = client.create_bucket(
|
||||
# Bucket='my-regional-bucket',
|
||||
# CreateBucketConfiguration={'LocationConstraint': 'us-west-2'}
|
||||
# )
|
||||
# print(f" ✅ Created: {response['Location']}")
|
||||
# except Exception as e:
|
||||
# print(f" Error: {e}")
|
||||
|
||||
# 4. Upload some files to the bucket
|
||||
print(f"\n4. Upload files to '{bucket_name}':")
|
||||
try:
|
||||
# Upload a simple file
|
||||
client.put_object(
|
||||
Bucket=bucket_name,
|
||||
Key="test-file.txt",
|
||||
Body=b"Hello from DeltaGlider!",
|
||||
)
|
||||
print(" ✅ Uploaded: test-file.txt")
|
||||
except Exception as e:
|
||||
print(f" Error: {e}")
|
||||
|
||||
# 5. List objects in the bucket
|
||||
print(f"\n5. List objects in '{bucket_name}':")
|
||||
try:
|
||||
response = client.list_objects(Bucket=bucket_name)
|
||||
if response.get("Contents"):
for obj in response["Contents"]:
print(f"   - {obj['Key']} ({obj['Size']} bytes)")
|
||||
else:
|
||||
print(" No objects found")
|
||||
except Exception as e:
|
||||
print(f" Error: {e}")
|
||||
|
||||
# 6. Delete all objects in the bucket (required before deleting bucket)
|
||||
print(f"\n6. Delete all objects in '{bucket_name}':")
|
||||
try:
|
||||
response = client.list_objects(Bucket=bucket_name)
|
||||
for obj in response.get("Contents", []):
client.delete_object(Bucket=bucket_name, Key=obj["Key"])
print(f"   ✅ Deleted: {obj['Key']}")
|
||||
except Exception as e:
|
||||
print(f" Error: {e}")
|
||||
|
||||
# 7. Delete the bucket
|
||||
print(f"\n7. Delete bucket '{bucket_name}':")
|
||||
try:
|
||||
response = client.delete_bucket(Bucket=bucket_name)
|
||||
print(f" ✅ Deleted bucket (status: {response['ResponseMetadata']['HTTPStatusCode']})")
|
||||
except Exception as e:
|
||||
print(f" Error: {e}")
|
||||
|
||||
# 8. Verify bucket is deleted
|
||||
print("\n8. Verify bucket deletion:")
|
||||
try:
|
||||
response = client.list_buckets()
|
||||
bucket_names = [b["Name"] for b in response["Buckets"]]
|
||||
if bucket_name in bucket_names:
|
||||
print(f" ❌ Bucket still exists!")
|
||||
else:
|
||||
print(f" ✅ Bucket successfully deleted")
|
||||
except Exception as e:
|
||||
print(f" Error: {e}")
|
||||
|
||||
print("\n" + "=" * 70)
|
||||
print("✅ Bucket management complete - no boto3 required!")
|
||||
print("=" * 70)
|
||||
|
||||
print("\n📚 Key Benefits:")
|
||||
print(" - No need to import boto3 directly")
|
||||
print(" - Consistent API with other DeltaGlider operations")
|
||||
print(" - Works with AWS S3, MinIO, and S3-compatible storage")
|
||||
print(" - Idempotent operations (safe to retry)")
|
||||
101
examples/credentials_example.py
Normal file
@@ -0,0 +1,101 @@
|
||||
"""Example: Using explicit AWS credentials with DeltaGlider.
|
||||
|
||||
This example demonstrates how to pass AWS credentials directly to
|
||||
DeltaGlider's create_client() function, which is useful when:
|
||||
|
||||
1. You need to use different credentials than your environment default
|
||||
2. You're working with temporary credentials (session tokens)
|
||||
3. You want to avoid relying on environment variables
|
||||
4. You're implementing multi-tenant systems with different AWS accounts
|
||||
"""
|
||||
|
||||
from deltaglider import create_client
|
||||
|
||||
|
||||
def example_basic_credentials():
|
||||
"""Use basic AWS credentials (access key + secret key)."""
|
||||
client = create_client(
|
||||
aws_access_key_id="AKIAIOSFODNN7EXAMPLE",
|
||||
aws_secret_access_key="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
|
||||
region_name="us-west-2",
|
||||
)
|
||||
|
||||
# Now use the client normally
|
||||
# client.put_object(Bucket="my-bucket", Key="file.zip", Body=b"data")
|
||||
print("✓ Created client with explicit credentials")
|
||||
|
||||
|
||||
def example_temporary_credentials():
|
||||
"""Use temporary AWS credentials (with session token)."""
|
||||
client = create_client(
|
||||
aws_access_key_id="ASIAIOSFODNN7EXAMPLE",
|
||||
aws_secret_access_key="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
|
||||
aws_session_token="FwoGZXIvYXdzEBEaDH...", # From STS
|
||||
region_name="us-east-1",
|
||||
)
|
||||
|
||||
print("✓ Created client with temporary credentials")
|
||||
|
||||
|
||||
def example_environment_credentials():
|
||||
"""Use default credential chain (environment variables, IAM role, etc.)."""
|
||||
# When credentials are omitted, DeltaGlider uses boto3's default credential chain:
|
||||
# 1. Environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
|
||||
# 2. AWS credentials file (~/.aws/credentials)
|
||||
# 3. IAM role (for EC2 instances)
|
||||
client = create_client()
|
||||
|
||||
print("✓ Created client with default credential chain")
|
||||
|
||||
|
||||
def example_minio_credentials():
|
||||
"""Use credentials for MinIO or other S3-compatible services."""
|
||||
client = create_client(
|
||||
endpoint_url="http://localhost:9000",
|
||||
aws_access_key_id="minioadmin",
|
||||
aws_secret_access_key="minioadmin",
|
||||
)
|
||||
|
||||
print("✓ Created client for MinIO with custom credentials")
|
||||
|
||||
|
||||
def example_multi_tenant():
|
||||
"""Example: Different credentials for different tenants."""
|
||||
|
||||
# Tenant A uses one AWS account
|
||||
tenant_a_client = create_client(
|
||||
aws_access_key_id="TENANT_A_KEY",
|
||||
aws_secret_access_key="TENANT_A_SECRET",
|
||||
region_name="us-west-2",
|
||||
)
|
||||
|
||||
# Tenant B uses a different AWS account
|
||||
tenant_b_client = create_client(
|
||||
aws_access_key_id="TENANT_B_KEY",
|
||||
aws_secret_access_key="TENANT_B_SECRET",
|
||||
region_name="eu-west-1",
|
||||
)
|
||||
|
||||
print("✓ Created separate clients for multi-tenant scenario")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("DeltaGlider Credentials Examples\n" + "=" * 40)
|
||||
|
||||
print("\n1. Basic credentials:")
|
||||
example_basic_credentials()
|
||||
|
||||
print("\n2. Temporary credentials:")
|
||||
example_temporary_credentials()
|
||||
|
||||
print("\n3. Environment credentials:")
|
||||
example_environment_credentials()
|
||||
|
||||
print("\n4. MinIO credentials:")
|
||||
example_minio_credentials()
|
||||
|
||||
print("\n5. Multi-tenant scenario:")
|
||||
example_multi_tenant()
|
||||
|
||||
print("\n" + "=" * 40)
|
||||
print("All examples completed successfully!")
|
||||
@@ -13,7 +13,7 @@ maintainers = [
|
||||
{name = "Beshu Tech Team", email = "info@beshu.tech"},
|
||||
]
|
||||
readme = "README.md"
|
||||
license = {text = "MIT"}
|
||||
license = "MIT"
|
||||
requires-python = ">=3.11"
|
||||
keywords = [
|
||||
"s3",
|
||||
@@ -35,7 +35,6 @@ classifiers = [
|
||||
"Development Status :: 4 - Beta",
|
||||
"Intended Audience :: Developers",
|
||||
"Intended Audience :: System Administrators",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: OS Independent",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
@@ -52,6 +51,7 @@ classifiers = [
|
||||
dependencies = [
|
||||
"boto3>=1.35.0",
|
||||
"click>=8.1.0",
|
||||
"cryptography>=42.0.0",
|
||||
"python-dateutil>=2.9.0",
|
||||
]
|
||||
|
||||
@@ -115,7 +115,6 @@ dev-dependencies = [
|
||||
[tool.setuptools_scm]
|
||||
# Automatically determine version from git tags
|
||||
write_to = "src/deltaglider/_version.py"
|
||||
version_scheme = "release-branch-semver"
|
||||
local_scheme = "no-local-version"
|
||||
|
||||
[tool.ruff]
|
||||
@@ -146,8 +145,12 @@ disallow_untyped_defs = true
|
||||
disallow_any_unimported = false
|
||||
no_implicit_optional = true
|
||||
check_untyped_defs = true
|
||||
namespace_packages = true
|
||||
explicit_package_bases = true
|
||||
namespace_packages = false
|
||||
mypy_path = "src"
|
||||
exclude = [
|
||||
"^build/",
|
||||
"^dist/",
|
||||
]
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
minversion = "8.0"
|
||||
|
||||
@@ -6,15 +6,55 @@ except ImportError:
|
||||
# Package is not installed, so version is not available
|
||||
__version__ = "0.0.0+unknown"
|
||||
|
||||
# Import simplified client API
|
||||
# Import client API
|
||||
from .client import DeltaGliderClient, create_client
|
||||
from .client_models import (
|
||||
BucketStats,
|
||||
CompressionEstimate,
|
||||
ListObjectsResponse,
|
||||
ObjectInfo,
|
||||
UploadSummary,
|
||||
)
|
||||
from .core import DeltaService, DeltaSpace, ObjectKey
|
||||
|
||||
# Import boto3-compatible type aliases (no boto3 import required!)
|
||||
from .types import (
|
||||
CopyObjectResponse,
|
||||
CreateBucketResponse,
|
||||
DeleteObjectResponse,
|
||||
DeleteObjectsResponse,
|
||||
GetObjectResponse,
|
||||
HeadObjectResponse,
|
||||
ListBucketsResponse,
|
||||
ListObjectsV2Response,
|
||||
PutObjectResponse,
|
||||
S3Object,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"__version__",
|
||||
# Client
|
||||
"DeltaGliderClient",
|
||||
"create_client",
|
||||
# Data classes (legacy - will be deprecated in favor of TypedDict)
|
||||
"UploadSummary",
|
||||
"CompressionEstimate",
|
||||
"ObjectInfo",
|
||||
"ListObjectsResponse",
|
||||
"BucketStats",
|
||||
# Core classes
|
||||
"DeltaService",
|
||||
"DeltaSpace",
|
||||
"ObjectKey",
|
||||
# boto3-compatible types (no boto3 import needed!)
|
||||
"ListObjectsV2Response",
|
||||
"PutObjectResponse",
|
||||
"GetObjectResponse",
|
||||
"DeleteObjectResponse",
|
||||
"DeleteObjectsResponse",
|
||||
"HeadObjectResponse",
|
||||
"ListBucketsResponse",
|
||||
"CreateBucketResponse",
|
||||
"CopyObjectResponse",
|
||||
"S3Object",
|
||||
]
|
||||
|
||||
@@ -1,34 +0,0 @@
|
||||
# file generated by setuptools-scm
|
||||
# don't change, don't track in version control
|
||||
|
||||
__all__ = [
|
||||
"__version__",
|
||||
"__version_tuple__",
|
||||
"version",
|
||||
"version_tuple",
|
||||
"__commit_id__",
|
||||
"commit_id",
|
||||
]
|
||||
|
||||
TYPE_CHECKING = False
|
||||
if TYPE_CHECKING:
|
||||
from typing import Tuple
|
||||
from typing import Union
|
||||
|
||||
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
||||
COMMIT_ID = Union[str, None]
|
||||
else:
|
||||
VERSION_TUPLE = object
|
||||
COMMIT_ID = object
|
||||
|
||||
version: str
|
||||
__version__: str
|
||||
__version_tuple__: VERSION_TUPLE
|
||||
version_tuple: VERSION_TUPLE
|
||||
commit_id: COMMIT_ID
|
||||
__commit_id__: COMMIT_ID
|
||||
|
||||
__version__ = version = '0.1.0'
|
||||
__version_tuple__ = version_tuple = (0, 1, 0)
|
||||
|
||||
__commit_id__ = commit_id = 'gf08960b6c'
|
||||
@@ -1,6 +1,9 @@
|
||||
"""Adapters for DeltaGlider."""
|
||||
|
||||
from .cache_cas import ContentAddressedCache
|
||||
from .cache_encrypted import EncryptedCache
|
||||
from .cache_fs import FsCacheAdapter
|
||||
from .cache_memory import MemoryCache
|
||||
from .clock_utc import UtcClockAdapter
|
||||
from .diff_xdelta import XdeltaAdapter
|
||||
from .hash_sha import Sha256Adapter
|
||||
@@ -13,6 +16,9 @@ __all__ = [
|
||||
"XdeltaAdapter",
|
||||
"Sha256Adapter",
|
||||
"FsCacheAdapter",
|
||||
"ContentAddressedCache",
|
||||
"EncryptedCache",
|
||||
"MemoryCache",
|
||||
"UtcClockAdapter",
|
||||
"StdLoggerAdapter",
|
||||
"NoopMetricsAdapter",
|
||||
|
||||
246
src/deltaglider/adapters/cache_cas.py
Normal file
@@ -0,0 +1,246 @@
|
||||
"""Content-Addressed Storage (CAS) cache adapter.
|
||||
|
||||
This adapter stores cached references using their SHA256 hash as the filename,
|
||||
eliminating collision risks and enabling automatic deduplication.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Unix-only imports for file locking
|
||||
if sys.platform != "win32":
|
||||
import fcntl
|
||||
|
||||
from ..core.errors import CacheCorruptionError, CacheMissError
|
||||
from ..ports.cache import CachePort
|
||||
from ..ports.hash import HashPort
|
||||
|
||||
|
||||
class ContentAddressedCache(CachePort):
|
||||
"""Content-addressed storage cache using SHA256 as filename.
|
||||
|
||||
Key Features:
|
||||
- Zero collision risk (SHA256 namespace is the filename)
|
||||
- Automatic deduplication (same content = same filename)
|
||||
- No metadata tracking needed (self-describing)
|
||||
- Secure by design (tampering changes SHA, breaks lookup)
|
||||
|
||||
Storage Layout:
|
||||
- base_dir/
|
||||
- ab/
|
||||
- cd/
|
||||
- abcdef123456... (full SHA256 as filename)
|
||||
|
||||
The two-level directory structure (first 2 chars, next 2 chars) prevents
|
||||
filesystem performance degradation from too many files in one directory.
|
||||
"""
|
||||
|
||||
def __init__(self, base_dir: Path, hasher: HashPort):
|
||||
"""Initialize content-addressed cache.
|
||||
|
||||
Args:
|
||||
base_dir: Root directory for cache storage
|
||||
hasher: Hash adapter for SHA256 computation
|
||||
"""
|
||||
self.base_dir = base_dir
|
||||
self.hasher = hasher
|
||||
# Mapping of (bucket, prefix) -> sha256 for compatibility
|
||||
# This is ephemeral and only used within a single process
|
||||
self._deltaspace_to_sha: dict[tuple[str, str], str] = {}
|
||||
|
||||
def _cas_path(self, sha256: str) -> Path:
|
||||
"""Get content-addressed path from SHA256 hash.
|
||||
|
||||
Uses two-level directory structure for filesystem optimization:
|
||||
- First 2 hex chars as L1 directory (256 buckets)
|
||||
- Next 2 hex chars as L2 directory (256 buckets per L1)
|
||||
- Full SHA as filename
|
||||
|
||||
Example: abcdef1234... -> ab/cd/abcdef1234...
|
||||
|
||||
Args:
|
||||
sha256: Full SHA256 hash (64 hex chars)
|
||||
|
||||
Returns:
|
||||
Path to file in content-addressed storage
|
||||
"""
|
||||
if len(sha256) < 4:
|
||||
raise ValueError(f"Invalid SHA256: {sha256}")
|
||||
|
||||
# Two-level directory structure
|
||||
l1_dir = sha256[:2] # First 2 chars
|
||||
l2_dir = sha256[2:4] # Next 2 chars
|
||||
|
||||
return self.base_dir / l1_dir / l2_dir / sha256
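A minimal standalone sketch of the same two-level fan-out, useful for checking where a given hash would land on disk; the base directory below is only an illustration.
# Standalone sketch of the two-level CAS layout (illustrative base_dir).
import hashlib
from pathlib import Path

def cas_path(base_dir: Path, sha256: str) -> Path:
    # ab/cd/abcdef... : first two hex chars, next two hex chars, full hash as filename
    if len(sha256) < 4:
        raise ValueError(f"Invalid SHA256: {sha256}")
    return base_dir / sha256[:2] / sha256[2:4] / sha256

digest = hashlib.sha256(b"example reference payload").hexdigest()
print(cas_path(Path("/tmp/deltaglider-cache"), digest))
# prints something like /tmp/deltaglider-cache/<aa>/<bb>/<full 64-char digest>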
|
||||
|
||||
def ref_path(self, bucket: str, prefix: str) -> Path:
|
||||
"""Get path where reference should be cached.
|
||||
|
||||
For CAS, we need the SHA to compute the path. This method looks up
|
||||
the SHA from the ephemeral mapping. If not found, it returns a
|
||||
placeholder path (backward compatibility with has_ref checks).
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
|
||||
Returns:
|
||||
Path to cached reference (may not exist)
|
||||
"""
|
||||
key = (bucket, prefix)
|
||||
|
||||
# If we have the SHA mapping, use CAS path
|
||||
if key in self._deltaspace_to_sha:
|
||||
sha = self._deltaspace_to_sha[key]
|
||||
return self._cas_path(sha)
|
||||
|
||||
# Fallback: return a non-existent placeholder
|
||||
# This enables has_ref to return False for unmapped deltaspaces
|
||||
return self.base_dir / "_unmapped" / bucket / prefix / "reference.bin"
|
||||
|
||||
def has_ref(self, bucket: str, prefix: str, sha: str) -> bool:
|
||||
"""Check if reference exists with given SHA.
|
||||
|
||||
In CAS, existence check is simple: if file exists at SHA path,
|
||||
it MUST have that SHA (content-addressed guarantee).
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
sha: Expected SHA256 hash
|
||||
|
||||
Returns:
|
||||
True if reference exists with this SHA
|
||||
"""
|
||||
path = self._cas_path(sha)
|
||||
return path.exists()
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference with atomic SHA validation.
|
||||
|
||||
In CAS, the SHA IS the filename, so if the file exists, it's already
|
||||
validated by definition. We still perform an integrity check to detect
|
||||
filesystem corruption.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
expected_sha: Expected SHA256 hash
|
||||
|
||||
Returns:
|
||||
Path to validated cached file
|
||||
|
||||
Raises:
|
||||
CacheMissError: File not found in cache
|
||||
CacheCorruptionError: SHA mismatch (filesystem corruption)
|
||||
"""
|
||||
path = self._cas_path(expected_sha)
|
||||
|
||||
if not path.exists():
|
||||
raise CacheMissError(f"Cache miss for SHA {expected_sha[:8]}...")
|
||||
|
||||
# Lock file and validate content atomically
|
||||
try:
|
||||
with open(path, "rb") as f:
|
||||
# Acquire shared lock (Unix only)
|
||||
if sys.platform != "win32":
|
||||
fcntl.flock(f.fileno(), fcntl.LOCK_SH)
|
||||
|
||||
# Read and hash content
|
||||
content = f.read()
|
||||
actual_sha = hashlib.sha256(content).hexdigest()
|
||||
|
||||
# Release lock automatically when exiting context
|
||||
|
||||
# Validate SHA (should never fail in CAS unless filesystem corruption)
|
||||
if actual_sha != expected_sha:
|
||||
# Filesystem corruption detected
|
||||
try:
|
||||
path.unlink()
|
||||
except OSError:
|
||||
pass # Best effort cleanup
|
||||
|
||||
raise CacheCorruptionError(
|
||||
f"Filesystem corruption detected: file {path.name} has wrong content. "
|
||||
f"Expected SHA {expected_sha}, got {actual_sha}"
|
||||
)
|
||||
|
||||
# Update mapping for ref_path compatibility
|
||||
self._deltaspace_to_sha[(bucket, prefix)] = expected_sha
|
||||
|
||||
return path
|
||||
|
||||
except OSError as e:
|
||||
raise CacheMissError(f"Cache read error for SHA {expected_sha[:8]}...: {e}") from e
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Cache reference file using content-addressed storage.
|
||||
|
||||
The file is stored at a path determined by its SHA256 hash.
|
||||
If a file with the same content already exists, it's reused
|
||||
(automatic deduplication).
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
src: Source file to cache
|
||||
|
||||
Returns:
|
||||
Path to cached file (content-addressed)
|
||||
"""
|
||||
# Compute SHA of source file
|
||||
sha = self.hasher.sha256(src)
|
||||
path = self._cas_path(sha)
|
||||
|
||||
# If file already exists, we're done (deduplication)
|
||||
if path.exists():
|
||||
# Update mapping
|
||||
self._deltaspace_to_sha[(bucket, prefix)] = sha
|
||||
return path
|
||||
|
||||
# Create directory structure with secure permissions
|
||||
path.parent.mkdir(parents=True, mode=0o700, exist_ok=True)
|
||||
|
||||
# Atomic write using temp file + rename
|
||||
temp_path = path.parent / f".tmp.{sha}"
|
||||
try:
|
||||
shutil.copy2(src, temp_path)
|
||||
# Atomic rename (POSIX guarantee)
|
||||
temp_path.rename(path)
|
||||
except Exception:
|
||||
# Cleanup on failure
|
||||
if temp_path.exists():
|
||||
temp_path.unlink()
|
||||
raise
|
||||
|
||||
# Update mapping
|
||||
self._deltaspace_to_sha[(bucket, prefix)] = sha
|
||||
|
||||
return path
|
||||
|
||||
def evict(self, bucket: str, prefix: str) -> None:
|
||||
"""Remove cached reference for given deltaspace.
|
||||
|
||||
In CAS, eviction is more complex because:
|
||||
1. Multiple deltaspaces may reference the same SHA (deduplication)
|
||||
2. We can't delete the file unless we know no other deltaspace uses it
|
||||
|
||||
For safety, we only remove the mapping, not the actual file.
|
||||
Orphaned files will be cleaned up by cache expiry (future feature).
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
"""
|
||||
key = (bucket, prefix)
|
||||
|
||||
# Remove mapping (safe operation)
|
||||
if key in self._deltaspace_to_sha:
|
||||
del self._deltaspace_to_sha[key]
|
||||
|
||||
# NOTE: We don't delete the actual CAS file because:
|
||||
# - Other deltaspaces may reference the same SHA
|
||||
# - The ephemeral cache will be cleaned on process exit anyway
|
||||
# - For persistent cache (future), we'd need reference counting
|
||||
src/deltaglider/adapters/cache_encrypted.py (new file, 283 lines)
@@ -0,0 +1,283 @@
|
||||
"""Encrypted cache wrapper using Fernet symmetric encryption.
|
||||
|
||||
This adapter wraps any CachePort implementation and adds transparent encryption/decryption.
|
||||
It uses Fernet (symmetric encryption based on AES-128-CBC with HMAC authentication).
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from cryptography.fernet import Fernet
|
||||
|
||||
from ..core.errors import CacheCorruptionError, CacheMissError
|
||||
from ..ports.cache import CachePort
|
||||
|
||||
|
||||
class EncryptedCache(CachePort):
|
||||
"""Encrypted cache wrapper using Fernet symmetric encryption.
|
||||
|
||||
Wraps any CachePort implementation and transparently encrypts data at rest.
|
||||
Uses Fernet which provides:
|
||||
- AES-128-CBC encryption
|
||||
- HMAC authentication (prevents tampering)
|
||||
- Automatic key rotation support
|
||||
- Safe for ephemeral process-isolated caches
|
||||
|
||||
Key Management:
|
||||
- Ephemeral key generated per process (default, most secure)
|
||||
- Or use DG_CACHE_ENCRYPTION_KEY env var (base64-encoded Fernet key)
|
||||
- For production: use secrets management system (AWS KMS, HashiCorp Vault, etc.)
|
||||
|
||||
Security Properties:
|
||||
- Confidentiality: Data encrypted at rest
|
||||
- Integrity: HMAC prevents tampering
|
||||
- Authenticity: Only valid keys can decrypt
|
||||
- Forward Secrecy: Ephemeral keys destroyed on process exit
|
||||
"""
|
||||
|
||||
def __init__(self, backend: CachePort, encryption_key: bytes | None = None):
|
||||
"""Initialize encrypted cache wrapper.
|
||||
|
||||
Args:
|
||||
backend: Underlying cache implementation (CAS, filesystem, memory, etc.)
|
||||
encryption_key: Optional Fernet key (32 bytes base64-encoded).
|
||||
If None, generates ephemeral key for this process.
|
||||
"""
|
||||
self.backend = backend
|
||||
|
||||
# Key management: ephemeral (default) or provided
|
||||
if encryption_key is None:
|
||||
# Generate ephemeral key for this process (most secure)
|
||||
self._key = Fernet.generate_key()
|
||||
self._ephemeral = True
|
||||
else:
|
||||
# Use provided key (for persistent cache scenarios)
|
||||
self._key = encryption_key
|
||||
self._ephemeral = False
|
||||
|
||||
self._cipher = Fernet(self._key)
|
||||
|
||||
# Mapping: (bucket, prefix) -> plaintext_sha256
|
||||
# Needed because backend uses SHA for storage, but encrypted content has different SHA
|
||||
self._plaintext_sha_map: dict[tuple[str, str], str] = {}
|
||||
|
||||
@classmethod
|
||||
def from_env(cls, backend: CachePort) -> "EncryptedCache":
|
||||
"""Create encrypted cache with key from environment.
|
||||
|
||||
Looks for DG_CACHE_ENCRYPTION_KEY environment variable.
|
||||
If not found, generates ephemeral key.
|
||||
|
||||
Args:
|
||||
backend: Underlying cache implementation
|
||||
|
||||
Returns:
|
||||
EncryptedCache instance
|
||||
"""
|
||||
key_str = os.environ.get("DG_CACHE_ENCRYPTION_KEY")
|
||||
if key_str:
|
||||
# Decode base64-encoded key
|
||||
encryption_key = key_str.encode("utf-8")
|
||||
else:
|
||||
# Use ephemeral key
|
||||
encryption_key = None
|
||||
|
||||
return cls(backend, encryption_key)
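As a rough illustration of the key hand-off, the sketch below generates a Fernet key, exports it the way from_env expects, and wraps a CAS backend; the cache path is a placeholder and the Sha256Adapter wiring is an assumption based on the adapters shown in this diff.
# Illustrative wiring only; constructors follow the signatures shown above.
import os
from pathlib import Path

from cryptography.fernet import Fernet

from deltaglider.adapters import ContentAddressedCache, EncryptedCache, Sha256Adapter

os.environ["DG_CACHE_ENCRYPTION_KEY"] = Fernet.generate_key().decode("utf-8")

backend = ContentAddressedCache(Path("/tmp/deltaglider-cache"), Sha256Adapter())
cache = EncryptedCache.from_env(backend)  # picks up the key; ephemeral if the var is unset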
|
||||
|
||||
def ref_path(self, bucket: str, prefix: str) -> Path:
|
||||
"""Get path where reference should be cached.
|
||||
|
||||
Delegates to backend. Path structure determined by backend
|
||||
(e.g., CAS uses SHA256-based paths).
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
|
||||
Returns:
|
||||
Path from backend
|
||||
"""
|
||||
return self.backend.ref_path(bucket, prefix)
|
||||
|
||||
def has_ref(self, bucket: str, prefix: str, sha: str) -> bool:
|
||||
"""Check if reference exists with given SHA.
|
||||
|
||||
Note: SHA is of the *unencrypted* content. The backend may store
|
||||
encrypted data, but we verify against original content hash.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
sha: SHA256 of unencrypted content
|
||||
|
||||
Returns:
|
||||
True if encrypted reference exists with this SHA
|
||||
"""
|
||||
# Delegate to backend
|
||||
# Backend may use SHA for content-addressed storage of encrypted data
|
||||
return self.backend.has_ref(bucket, prefix, sha)
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference with decryption and validation.
|
||||
|
||||
Retrieves encrypted data from backend, decrypts it, validates SHA,
|
||||
and returns path to decrypted temporary file.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
expected_sha: Expected SHA256 of *decrypted* content
|
||||
|
||||
Returns:
|
||||
Path to decrypted validated file (temporary)
|
||||
|
||||
Raises:
|
||||
CacheMissError: File not in cache
|
||||
CacheCorruptionError: Decryption failed or SHA mismatch
|
||||
"""
|
||||
# Check if we have this plaintext SHA mapped
|
||||
key = (bucket, prefix)
|
||||
if key not in self._plaintext_sha_map:
|
||||
raise CacheMissError(f"Cache miss for {bucket}/{prefix}")
|
||||
|
||||
# Verify the requested SHA matches our mapping
|
||||
if self._plaintext_sha_map[key] != expected_sha:
|
||||
raise CacheCorruptionError(
|
||||
f"SHA mismatch for {bucket}/{prefix}: "
|
||||
f"expected {expected_sha}, have {self._plaintext_sha_map[key]}"
|
||||
)
|
||||
|
||||
# Get encrypted file from backend using ref_path (not validated, we validate plaintext)
|
||||
encrypted_path = self.backend.ref_path(bucket, prefix)
|
||||
if not encrypted_path.exists():
|
||||
raise CacheMissError(f"Encrypted cache file not found for {bucket}/{prefix}")
|
||||
|
||||
# Read encrypted content
|
||||
try:
|
||||
with open(encrypted_path, "rb") as f:
|
||||
encrypted_data = f.read()
|
||||
except OSError as e:
|
||||
raise CacheMissError(f"Cannot read encrypted cache: {e}") from e
|
||||
|
||||
# Decrypt
|
||||
try:
|
||||
decrypted_data = self._cipher.decrypt(encrypted_data)
|
||||
except Exception as e:
|
||||
# Fernet raises InvalidToken for tampering/wrong key
|
||||
# SECURITY: Auto-delete corrupted cache files
|
||||
try:
|
||||
encrypted_path.unlink(missing_ok=True)
|
||||
# Clean up mapping
|
||||
if key in self._plaintext_sha_map:
|
||||
del self._plaintext_sha_map[key]
|
||||
except Exception:
|
||||
pass # Best effort cleanup
|
||||
raise CacheCorruptionError(
|
||||
f"Decryption failed for {bucket}/{prefix}: {e}. "
|
||||
f"Corrupted cache deleted automatically."
|
||||
) from e
|
||||
|
||||
# Validate SHA of decrypted content
|
||||
import hashlib
|
||||
|
||||
actual_sha = hashlib.sha256(decrypted_data).hexdigest()
|
||||
if actual_sha != expected_sha:
|
||||
# SECURITY: Auto-delete corrupted cache files
|
||||
try:
|
||||
encrypted_path.unlink(missing_ok=True)
|
||||
# Clean up mapping
|
||||
if key in self._plaintext_sha_map:
|
||||
del self._plaintext_sha_map[key]
|
||||
except Exception:
|
||||
pass # Best effort cleanup
|
||||
raise CacheCorruptionError(
|
||||
f"Decrypted content SHA mismatch for {bucket}/{prefix}: "
|
||||
f"expected {expected_sha}, got {actual_sha}. "
|
||||
f"Corrupted cache deleted automatically."
|
||||
)
|
||||
|
||||
# Write decrypted content to temporary file
|
||||
# Use same path as encrypted file but with .decrypted suffix
|
||||
decrypted_path = encrypted_path.with_suffix(".decrypted")
|
||||
try:
|
||||
with open(decrypted_path, "wb") as f:
|
||||
f.write(decrypted_data)
|
||||
except OSError as e:
|
||||
raise CacheCorruptionError(f"Cannot write decrypted cache: {e}") from e
|
||||
|
||||
return decrypted_path
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Encrypt and cache reference file.
|
||||
|
||||
Reads source file, encrypts it, and stores encrypted version via backend.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
src: Source file to encrypt and cache
|
||||
|
||||
Returns:
|
||||
Path to encrypted cached file (from backend)
|
||||
"""
|
||||
# Read source file
|
||||
try:
|
||||
with open(src, "rb") as f:
|
||||
plaintext_data = f.read()
|
||||
except OSError as e:
|
||||
raise CacheCorruptionError(f"Cannot read source file {src}: {e}") from e
|
||||
|
||||
# Compute plaintext SHA for mapping
|
||||
import hashlib
|
||||
|
||||
plaintext_sha = hashlib.sha256(plaintext_data).hexdigest()
|
||||
|
||||
# Encrypt
|
||||
encrypted_data = self._cipher.encrypt(plaintext_data)
|
||||
|
||||
# Write encrypted data to temporary file
|
||||
temp_encrypted = src.with_suffix(".encrypted.tmp")
|
||||
try:
|
||||
with open(temp_encrypted, "wb") as f:
|
||||
f.write(encrypted_data)
|
||||
|
||||
# Store encrypted file via backend
|
||||
result_path = self.backend.write_ref(bucket, prefix, temp_encrypted)
|
||||
|
||||
# Store mapping of plaintext SHA
|
||||
key = (bucket, prefix)
|
||||
self._plaintext_sha_map[key] = plaintext_sha
|
||||
|
||||
return result_path
|
||||
|
||||
finally:
|
||||
# Cleanup temporary file
|
||||
if temp_encrypted.exists():
|
||||
temp_encrypted.unlink()
|
||||
|
||||
def evict(self, bucket: str, prefix: str) -> None:
|
||||
"""Remove cached reference (encrypted version).
|
||||
|
||||
Delegates to backend. Also cleans up any .decrypted temporary files and mappings.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
"""
|
||||
# Remove from plaintext SHA mapping
|
||||
key = (bucket, prefix)
|
||||
if key in self._plaintext_sha_map:
|
||||
del self._plaintext_sha_map[key]
|
||||
|
||||
# Get path to potentially clean up .decrypted files
|
||||
try:
|
||||
path = self.backend.ref_path(bucket, prefix)
|
||||
decrypted_path = path.with_suffix(".decrypted")
|
||||
if decrypted_path.exists():
|
||||
decrypted_path.unlink()
|
||||
except Exception:
|
||||
# Best effort cleanup
|
||||
pass
|
||||
|
||||
# Evict from backend
|
||||
self.backend.evict(bucket, prefix)
|
||||
@@ -1,8 +1,15 @@
|
||||
"""Filesystem cache adapter."""
|
||||
|
||||
import hashlib
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Unix-only imports for file locking
|
||||
if sys.platform != "win32":
|
||||
import fcntl
|
||||
|
||||
from ..core.errors import CacheCorruptionError, CacheMissError
|
||||
from ..ports.cache import CachePort
|
||||
from ..ports.hash import HashPort
|
||||
|
||||
@@ -29,6 +36,60 @@ class FsCacheAdapter(CachePort):
|
||||
actual_sha = self.hasher.sha256(path)
|
||||
return actual_sha == sha
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference with atomic SHA validation.
|
||||
|
||||
This method prevents TOCTOU attacks by validating the SHA at use-time,
|
||||
not just at check-time.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Prefix/deltaspace within bucket
|
||||
expected_sha: Expected SHA256 hash
|
||||
|
||||
Returns:
|
||||
Path to validated cached file
|
||||
|
||||
Raises:
|
||||
CacheMissError: File not found in cache
|
||||
CacheCorruptionError: SHA mismatch detected
|
||||
"""
|
||||
path = self.ref_path(bucket, prefix)
|
||||
|
||||
if not path.exists():
|
||||
raise CacheMissError(f"Cache miss for {bucket}/{prefix}")
|
||||
|
||||
# Lock file and validate content atomically
|
||||
try:
|
||||
with open(path, "rb") as f:
|
||||
# Acquire shared lock (Unix only)
|
||||
if sys.platform != "win32":
|
||||
fcntl.flock(f.fileno(), fcntl.LOCK_SH)
|
||||
|
||||
# Read and hash content
|
||||
content = f.read()
|
||||
actual_sha = hashlib.sha256(content).hexdigest()
|
||||
|
||||
# Release lock automatically when exiting context
|
||||
|
||||
# Validate SHA
|
||||
if actual_sha != expected_sha:
|
||||
# File corrupted or tampered - remove it
|
||||
try:
|
||||
path.unlink()
|
||||
except OSError:
|
||||
pass # Best effort cleanup
|
||||
|
||||
raise CacheCorruptionError(
|
||||
f"Cache corruption detected for {bucket}/{prefix}: "
|
||||
f"expected {expected_sha}, got {actual_sha}"
|
||||
)
|
||||
|
||||
return path
|
||||
|
||||
except OSError as e:
|
||||
raise CacheMissError(f"Cache read error for {bucket}/{prefix}: {e}") from e
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Cache reference file."""
|
||||
path = self.ref_path(bucket, prefix)
|
||||
|
||||
src/deltaglider/adapters/cache_memory.py (new file, 279 lines)
@@ -0,0 +1,279 @@
|
||||
"""In-memory cache implementation with optional size limits.
|
||||
|
||||
This adapter stores cached references entirely in memory, avoiding filesystem I/O.
|
||||
Useful for:
|
||||
- High-performance scenarios where memory is abundant
|
||||
- Containerized environments with limited filesystem access
|
||||
- Testing and development
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Unix-only imports for compatibility
|
||||
if sys.platform != "win32":
|
||||
import fcntl # noqa: F401
|
||||
|
||||
from ..core.errors import CacheCorruptionError, CacheMissError
|
||||
from ..ports.cache import CachePort
|
||||
from ..ports.hash import HashPort
|
||||
|
||||
|
||||
class MemoryCache(CachePort):
|
||||
"""In-memory cache implementation with LRU eviction.
|
||||
|
||||
Stores cached references in memory as bytes. Useful for high-performance
|
||||
scenarios or when filesystem access is limited.
|
||||
|
||||
Features:
|
||||
- Zero filesystem I/O (everything in RAM)
|
||||
- Optional size limits with LRU eviction
|
||||
- Thread-safe operations
|
||||
- Temporary file creation for compatibility with file-based APIs
|
||||
|
||||
Limitations:
|
||||
- Data lost on process exit (ephemeral only)
|
||||
- Memory usage proportional to cache size
|
||||
- Not suitable for very large reference files
|
||||
|
||||
Storage Layout:
|
||||
- Key: (bucket, prefix) tuple
|
||||
- Value: (content_bytes, sha256) tuple
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hasher: HashPort,
|
||||
max_size_mb: int = 100,
|
||||
temp_dir: Path | None = None,
|
||||
):
|
||||
"""Initialize in-memory cache.
|
||||
|
||||
Args:
|
||||
hasher: Hash adapter for SHA256 computation
|
||||
max_size_mb: Maximum cache size in megabytes (default 100MB)
|
||||
temp_dir: Directory for temporary files (default: system temp)
|
||||
"""
|
||||
self.hasher = hasher
|
||||
self.max_size_bytes = max_size_mb * 1024 * 1024
|
||||
|
||||
# Storage: (bucket, prefix) -> (content_bytes, sha256)
|
||||
self._cache: dict[tuple[str, str], tuple[bytes, str]] = {}
|
||||
|
||||
# Size tracking
|
||||
self._current_size = 0
|
||||
|
||||
# Access order for LRU eviction: (bucket, prefix) list
|
||||
self._access_order: list[tuple[str, str]] = []
|
||||
|
||||
# Temp directory for file-based API compatibility
|
||||
if temp_dir is None:
|
||||
import tempfile
|
||||
|
||||
self.temp_dir = Path(tempfile.gettempdir()) / "deltaglider-mem-cache"
|
||||
else:
|
||||
self.temp_dir = temp_dir
|
||||
|
||||
self.temp_dir.mkdir(parents=True, exist_ok=True, mode=0o700)
|
||||
|
||||
def _update_access(self, key: tuple[str, str]) -> None:
|
||||
"""Update LRU access order.
|
||||
|
||||
Args:
|
||||
key: Cache key (bucket, prefix)
|
||||
"""
|
||||
# Remove old position if exists
|
||||
if key in self._access_order:
|
||||
self._access_order.remove(key)
|
||||
|
||||
# Add to end (most recently used)
|
||||
self._access_order.append(key)
|
||||
|
||||
def _evict_lru(self, needed_bytes: int) -> None:
|
||||
"""Evict least recently used entries to free space.
|
||||
|
||||
Args:
|
||||
needed_bytes: Bytes needed for new entry
|
||||
"""
|
||||
while self._current_size + needed_bytes > self.max_size_bytes and self._access_order:
|
||||
# Evict least recently used
|
||||
lru_key = self._access_order[0]
|
||||
bucket, prefix = lru_key
|
||||
|
||||
# Remove from cache
|
||||
if lru_key in self._cache:
|
||||
content, _ = self._cache[lru_key]
|
||||
self._current_size -= len(content)
|
||||
del self._cache[lru_key]
|
||||
|
||||
# Remove from access order
|
||||
self._access_order.remove(lru_key)
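A small usage sketch of the eviction behaviour: with a deliberately tiny cap, writing a second reference pushes out the least recently used one. The 1 MB limit, bucket name, and payload sizes are purely illustrative.
# Illustrative only: force LRU eviction with a 1 MB cap.
import tempfile
from pathlib import Path

from deltaglider.adapters import MemoryCache, Sha256Adapter

hasher = Sha256Adapter()
cache = MemoryCache(hasher, max_size_mb=1)

with tempfile.TemporaryDirectory() as tmp:
    a = Path(tmp) / "a.bin"
    b = Path(tmp) / "b.bin"
    a.write_bytes(b"x" * 700_000)
    b.write_bytes(b"y" * 700_000)

    cache.write_ref("bucket", "release/v1", a)
    cache.write_ref("bucket", "release/v2", b)  # exceeds 1 MB, evicts release/v1

    print(cache.has_ref("bucket", "release/v1", hasher.sha256(a)))  # False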
|
||||
|
||||
def ref_path(self, bucket: str, prefix: str) -> Path:
|
||||
"""Get placeholder path for in-memory reference.
|
||||
|
||||
Returns a virtual path that doesn't actually exist on filesystem.
|
||||
Used for API compatibility.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
|
||||
Returns:
|
||||
Virtual path (may not exist on filesystem)
|
||||
"""
|
||||
# Return virtual path for compatibility
|
||||
# Actual data is in memory, but we need Path for API
|
||||
safe_bucket = bucket.replace("/", "_")
|
||||
safe_prefix = prefix.replace("/", "_")
|
||||
return self.temp_dir / safe_bucket / safe_prefix / "reference.bin"
|
||||
|
||||
def has_ref(self, bucket: str, prefix: str, sha: str) -> bool:
|
||||
"""Check if reference exists in memory with given SHA.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
sha: Expected SHA256 hash
|
||||
|
||||
Returns:
|
||||
True if reference exists with this SHA
|
||||
"""
|
||||
key = (bucket, prefix)
|
||||
if key not in self._cache:
|
||||
return False
|
||||
|
||||
_, cached_sha = self._cache[key]
|
||||
return cached_sha == sha
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference from memory with validation.
|
||||
|
||||
Retrieves content from memory, validates SHA, and writes to
|
||||
temporary file for compatibility with file-based APIs.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
expected_sha: Expected SHA256 hash
|
||||
|
||||
Returns:
|
||||
Path to temporary file containing content
|
||||
|
||||
Raises:
|
||||
CacheMissError: Content not in cache
|
||||
CacheCorruptionError: SHA mismatch
|
||||
"""
|
||||
key = (bucket, prefix)
|
||||
|
||||
# Check if in cache
|
||||
if key not in self._cache:
|
||||
raise CacheMissError(f"Cache miss for {bucket}/{prefix}")
|
||||
|
||||
# Get content and validate
|
||||
content, cached_sha = self._cache[key]
|
||||
|
||||
# Update LRU
|
||||
self._update_access(key)
|
||||
|
||||
# Validate SHA
|
||||
if cached_sha != expected_sha:
|
||||
# SHA mismatch - possible corruption
|
||||
raise CacheCorruptionError(
|
||||
f"Memory cache SHA mismatch for {bucket}/{prefix}: "
|
||||
f"expected {expected_sha}, got {cached_sha}"
|
||||
)
|
||||
|
||||
# Write to temporary file for API compatibility
|
||||
temp_path = self.ref_path(bucket, prefix)
|
||||
temp_path.parent.mkdir(parents=True, exist_ok=True, mode=0o700)
|
||||
|
||||
try:
|
||||
with open(temp_path, "wb") as f:
|
||||
f.write(content)
|
||||
except OSError as e:
|
||||
raise CacheMissError(f"Cannot write temp file: {e}") from e
|
||||
|
||||
return temp_path
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Store reference file in memory.
|
||||
|
||||
Reads file content and stores in memory with SHA hash.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
src: Source file to cache
|
||||
|
||||
Returns:
|
||||
Virtual path (content is in memory)
|
||||
"""
|
||||
# Read source file
|
||||
try:
|
||||
with open(src, "rb") as f:
|
||||
content = f.read()
|
||||
except OSError as e:
|
||||
raise CacheCorruptionError(f"Cannot read source file {src}: {e}") from e
|
||||
|
||||
# Compute SHA
|
||||
sha = hashlib.sha256(content).hexdigest()
|
||||
|
||||
# Check if we need to evict
|
||||
content_size = len(content)
|
||||
if content_size > self.max_size_bytes:
|
||||
raise CacheCorruptionError(
|
||||
f"File too large for memory cache: {content_size} bytes "
|
||||
f"(limit: {self.max_size_bytes} bytes)"
|
||||
)
|
||||
|
||||
# Evict LRU entries if needed
|
||||
self._evict_lru(content_size)
|
||||
|
||||
# Store in memory
|
||||
key = (bucket, prefix)
|
||||
self._cache[key] = (content, sha)
|
||||
self._current_size += content_size
|
||||
|
||||
# Update LRU
|
||||
self._update_access(key)
|
||||
|
||||
# Return virtual path
|
||||
return self.ref_path(bucket, prefix)
|
||||
|
||||
def evict(self, bucket: str, prefix: str) -> None:
|
||||
"""Remove cached reference from memory.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
"""
|
||||
key = (bucket, prefix)
|
||||
|
||||
# Remove from cache
|
||||
if key in self._cache:
|
||||
content, _ = self._cache[key]
|
||||
self._current_size -= len(content)
|
||||
del self._cache[key]
|
||||
|
||||
# Remove from LRU tracking
|
||||
if key in self._access_order:
|
||||
self._access_order.remove(key)
|
||||
|
||||
# Clean up temp file if exists
|
||||
temp_path = self.ref_path(bucket, prefix)
|
||||
if temp_path.exists():
|
||||
try:
|
||||
temp_path.unlink()
|
||||
except OSError:
|
||||
pass # Best effort
|
||||
|
||||
def clear(self) -> None:
|
||||
"""Clear all cached content from memory.
|
||||
|
||||
Useful for testing and cleanup.
|
||||
"""
|
||||
self._cache.clear()
|
||||
self._access_order.clear()
|
||||
self._current_size = 0
|
||||
src/deltaglider/adapters/metrics_cloudwatch.py (new file, 215 lines)
@@ -0,0 +1,215 @@
|
||||
"""CloudWatch metrics adapter for production metrics collection."""
|
||||
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
import boto3
|
||||
from botocore.exceptions import ClientError
|
||||
|
||||
from ..ports.metrics import MetricsPort
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Constants for byte conversions
|
||||
BYTES_PER_KB = 1024
|
||||
BYTES_PER_MB = 1024 * 1024
|
||||
BYTES_PER_GB = 1024 * 1024 * 1024
|
||||
|
||||
|
||||
class CloudWatchMetricsAdapter(MetricsPort):
|
||||
"""CloudWatch implementation of MetricsPort for AWS-native metrics."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
namespace: str = "DeltaGlider",
|
||||
region: str | None = None,
|
||||
endpoint_url: str | None = None,
|
||||
):
|
||||
"""Initialize CloudWatch metrics adapter.
|
||||
|
||||
Args:
|
||||
namespace: CloudWatch namespace for metrics
|
||||
region: AWS region (uses default if None)
|
||||
endpoint_url: Override endpoint for testing
|
||||
"""
|
||||
self.namespace = namespace
|
||||
try:
|
||||
self.client = boto3.client(
|
||||
"cloudwatch",
|
||||
region_name=region,
|
||||
endpoint_url=endpoint_url,
|
||||
)
|
||||
self.enabled = True
|
||||
except Exception as e:
|
||||
logger.warning(f"CloudWatch metrics disabled: {e}")
|
||||
self.enabled = False
|
||||
self.client = None
|
||||
|
||||
def increment(self, name: str, value: int = 1, tags: dict[str, str] | None = None) -> None:
|
||||
"""Increment a counter metric.
|
||||
|
||||
Args:
|
||||
name: Metric name
|
||||
value: Increment value
|
||||
tags: Optional tags/dimensions
|
||||
"""
|
||||
if not self.enabled:
|
||||
return
|
||||
|
||||
try:
|
||||
dimensions = self._tags_to_dimensions(tags)
|
||||
self.client.put_metric_data(
|
||||
Namespace=self.namespace,
|
||||
MetricData=[
|
||||
{
|
||||
"MetricName": name,
|
||||
"Value": value,
|
||||
"Unit": "Count",
|
||||
"Timestamp": datetime.utcnow(),
|
||||
"Dimensions": dimensions,
|
||||
}
|
||||
],
|
||||
)
|
||||
except ClientError as e:
|
||||
logger.debug(f"Failed to send metric {name}: {e}")
|
||||
|
||||
def gauge(self, name: str, value: float, tags: dict[str, str] | None = None) -> None:
|
||||
"""Set a gauge metric value.
|
||||
|
||||
Args:
|
||||
name: Metric name
|
||||
value: Gauge value
|
||||
tags: Optional tags/dimensions
|
||||
"""
|
||||
if not self.enabled:
|
||||
return
|
||||
|
||||
try:
|
||||
dimensions = self._tags_to_dimensions(tags)
|
||||
|
||||
# Determine unit based on metric name
|
||||
unit = self._infer_unit(name, value)
|
||||
|
||||
self.client.put_metric_data(
|
||||
Namespace=self.namespace,
|
||||
MetricData=[
|
||||
{
|
||||
"MetricName": name,
|
||||
"Value": value,
|
||||
"Unit": unit,
|
||||
"Timestamp": datetime.utcnow(),
|
||||
"Dimensions": dimensions,
|
||||
}
|
||||
],
|
||||
)
|
||||
except ClientError as e:
|
||||
logger.debug(f"Failed to send gauge {name}: {e}")
|
||||
|
||||
def timing(self, name: str, value: float, tags: dict[str, str] | None = None) -> None:
|
||||
"""Record a timing metric.
|
||||
|
||||
Args:
|
||||
name: Metric name
|
||||
value: Time in milliseconds
|
||||
tags: Optional tags/dimensions
|
||||
"""
|
||||
if not self.enabled:
|
||||
return
|
||||
|
||||
try:
|
||||
dimensions = self._tags_to_dimensions(tags)
|
||||
self.client.put_metric_data(
|
||||
Namespace=self.namespace,
|
||||
MetricData=[
|
||||
{
|
||||
"MetricName": name,
|
||||
"Value": value,
|
||||
"Unit": "Milliseconds",
|
||||
"Timestamp": datetime.utcnow(),
|
||||
"Dimensions": dimensions,
|
||||
}
|
||||
],
|
||||
)
|
||||
except ClientError as e:
|
||||
logger.debug(f"Failed to send timing {name}: {e}")
|
||||
|
||||
def _tags_to_dimensions(self, tags: dict[str, str] | None) -> list[dict[str, str]]:
|
||||
"""Convert tags dict to CloudWatch dimensions format.
|
||||
|
||||
Args:
|
||||
tags: Tags dictionary
|
||||
|
||||
Returns:
|
||||
List of dimension dicts for CloudWatch
|
||||
"""
|
||||
if not tags:
|
||||
return []
|
||||
|
||||
return [
|
||||
{"Name": key, "Value": str(value)}
|
||||
for key, value in tags.items()
|
||||
if key and value # Skip empty keys/values
|
||||
][:10] # CloudWatch limit is 10 dimensions
|
||||
|
||||
def _infer_unit(self, name: str, value: float) -> str:
|
||||
"""Infer CloudWatch unit from metric name.
|
||||
|
||||
Args:
|
||||
name: Metric name
|
||||
value: Metric value
|
||||
|
||||
Returns:
|
||||
CloudWatch unit string
|
||||
"""
|
||||
name_lower = name.lower()
|
||||
|
||||
# Size metrics
|
||||
if any(x in name_lower for x in ["size", "bytes"]):
|
||||
if value > BYTES_PER_GB: # > 1GB
|
||||
return "Gigabytes"
|
||||
elif value > BYTES_PER_MB: # > 1MB
|
||||
return "Megabytes"
|
||||
elif value > BYTES_PER_KB: # > 1KB
|
||||
return "Kilobytes"
|
||||
return "Bytes"
|
||||
|
||||
# Time metrics
|
||||
if any(x in name_lower for x in ["time", "duration", "latency"]):
|
||||
if value > 1000: # > 1 second
|
||||
return "Seconds"
|
||||
return "Milliseconds"
|
||||
|
||||
# Percentage metrics
|
||||
if any(x in name_lower for x in ["ratio", "percent", "rate"]):
|
||||
return "Percent"
|
||||
|
||||
# Count metrics
|
||||
if any(x in name_lower for x in ["count", "total", "number"]):
|
||||
return "Count"
|
||||
|
||||
# Default to None (no unit)
|
||||
return "None"
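For orientation, a hedged usage sketch: the adapter degrades to a no-op when boto3 cannot build a CloudWatch client, so calling it locally is safe. The namespace, region, metric names, and tags below are invented for the example.
# Illustrative usage; names and values are examples only.
from deltaglider.adapters.metrics_cloudwatch import CloudWatchMetricsAdapter

metrics = CloudWatchMetricsAdapter(namespace="DeltaGlider/Dev", region="eu-west-1")
metrics.increment("upload.count", tags={"bucket": "releases"})
metrics.timing("upload.duration", 840.0, tags={"bucket": "releases"})  # value in milliseconds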
|
||||
|
||||
|
||||
class LoggingMetricsAdapter(MetricsPort):
|
||||
"""Simple logging-based metrics adapter for development/debugging."""
|
||||
|
||||
def __init__(self, log_level: str = "INFO"):
|
||||
"""Initialize logging metrics adapter.
|
||||
|
||||
Args:
|
||||
log_level: Logging level for metrics
|
||||
"""
|
||||
self.log_level = getattr(logging, log_level.upper(), logging.INFO)
|
||||
|
||||
def increment(self, name: str, value: int = 1, tags: dict[str, str] | None = None) -> None:
|
||||
"""Log counter increment."""
|
||||
logger.log(self.log_level, f"METRIC:INCREMENT {name}={value} tags={tags or {}}")
|
||||
|
||||
def gauge(self, name: str, value: float, tags: dict[str, str] | None = None) -> None:
|
||||
"""Log gauge value."""
|
||||
logger.log(self.log_level, f"METRIC:GAUGE {name}={value:.2f} tags={tags or {}}")
|
||||
|
||||
def timing(self, name: str, value: float, tags: dict[str, str] | None = None) -> None:
|
||||
"""Log timing value."""
|
||||
logger.log(self.log_level, f"METRIC:TIMING {name}={value:.2f}ms tags={tags or {}}")
|
||||
@@ -3,7 +3,7 @@
|
||||
import os
|
||||
from collections.abc import Iterator
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, BinaryIO, Optional
|
||||
from typing import TYPE_CHECKING, Any, BinaryIO, Optional
|
||||
|
||||
import boto3
|
||||
from botocore.exceptions import ClientError
|
||||
@@ -21,13 +21,31 @@ class S3StorageAdapter(StoragePort):
|
||||
self,
|
||||
client: Optional["S3Client"] = None,
|
||||
endpoint_url: str | None = None,
|
||||
boto3_kwargs: dict[str, Any] | None = None,
|
||||
):
|
||||
"""Initialize with S3 client."""
|
||||
"""Initialize with S3 client.
|
||||
|
||||
Args:
|
||||
client: Pre-configured S3 client (if None, one will be created)
|
||||
endpoint_url: S3 endpoint URL override (for MinIO, LocalStack, etc.)
|
||||
boto3_kwargs: Additional kwargs to pass to boto3.client() including:
|
||||
- aws_access_key_id: AWS access key
|
||||
- aws_secret_access_key: AWS secret key
|
||||
- aws_session_token: AWS session token (for temporary credentials)
|
||||
- region_name: AWS region name
|
||||
"""
|
||||
if client is None:
|
||||
self.client = boto3.client(
|
||||
"s3",
|
||||
endpoint_url=endpoint_url or os.environ.get("AWS_ENDPOINT_URL"),
|
||||
)
|
||||
# Build boto3 client parameters
|
||||
client_params: dict[str, Any] = {
|
||||
"service_name": "s3",
|
||||
"endpoint_url": endpoint_url or os.environ.get("AWS_ENDPOINT_URL"),
|
||||
}
|
||||
|
||||
# Merge in any additional boto3 kwargs (credentials, region, etc.)
|
||||
if boto3_kwargs:
|
||||
client_params.update(boto3_kwargs)
|
||||
|
||||
self.client = boto3.client(**client_params)
|
||||
else:
|
||||
self.client = client
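A sketch of how the widened constructor could be pointed at a local MinIO or LocalStack endpoint; the endpoint URL and credentials are placeholders.
# Placeholder endpoint/credentials; boto3_kwargs is forwarded to boto3.client("s3", ...).
from deltaglider.adapters import S3StorageAdapter

storage = S3StorageAdapter(
    endpoint_url="http://localhost:9000",
    boto3_kwargs={
        "aws_access_key_id": "minioadmin",
        "aws_secret_access_key": "minioadmin",
        "region_name": "us-east-1",
    },
)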
|
||||
|
||||
@@ -50,7 +68,11 @@ class S3StorageAdapter(StoragePort):
|
||||
raise
|
||||
|
||||
def list(self, prefix: str) -> Iterator[ObjectHead]:
|
||||
"""List objects by prefix."""
|
||||
"""List objects by prefix (implements StoragePort interface).
|
||||
|
||||
This is a simple iterator for core service compatibility.
|
||||
For advanced S3 features, use list_objects instead.
|
||||
"""
|
||||
# Handle bucket-only prefix (e.g., "bucket" or "bucket/")
|
||||
if "/" not in prefix:
|
||||
bucket = prefix
|
||||
@@ -68,13 +90,80 @@ class S3StorageAdapter(StoragePort):
|
||||
if head:
|
||||
yield head
|
||||
|
||||
def list_objects(
|
||||
self,
|
||||
bucket: str,
|
||||
prefix: str = "",
|
||||
delimiter: str = "",
|
||||
max_keys: int = 1000,
|
||||
start_after: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""List objects with S3-compatible response.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Filter results to keys beginning with prefix
|
||||
delimiter: Delimiter for grouping keys (e.g., '/' for folders)
|
||||
max_keys: Maximum number of keys to return
|
||||
start_after: Start listing after this key
|
||||
|
||||
Returns:
|
||||
Dict with objects, common_prefixes, and pagination info
|
||||
"""
|
||||
params: dict[str, Any] = {
|
||||
"Bucket": bucket,
|
||||
"MaxKeys": max_keys,
|
||||
}
|
||||
|
||||
if prefix:
|
||||
params["Prefix"] = prefix
|
||||
if delimiter:
|
||||
params["Delimiter"] = delimiter
|
||||
if start_after:
|
||||
params["StartAfter"] = start_after
|
||||
|
||||
try:
|
||||
response = self.client.list_objects_v2(**params)
|
||||
|
||||
# Process objects
|
||||
objects = []
|
||||
for obj in response.get("Contents", []):
|
||||
objects.append(
|
||||
{
|
||||
"key": obj["Key"],
|
||||
"size": obj["Size"],
|
||||
"last_modified": obj["LastModified"].isoformat()
|
||||
if hasattr(obj["LastModified"], "isoformat")
|
||||
else str(obj["LastModified"]),
|
||||
"etag": obj.get("ETag", "").strip('"'),
|
||||
"storage_class": obj.get("StorageClass", "STANDARD"),
|
||||
}
|
||||
)
|
||||
|
||||
# Process common prefixes (folders)
|
||||
common_prefixes = []
|
||||
for prefix_info in response.get("CommonPrefixes", []):
|
||||
common_prefixes.append(prefix_info["Prefix"])
|
||||
|
||||
return {
|
||||
"objects": objects,
|
||||
"common_prefixes": common_prefixes,
|
||||
"is_truncated": response.get("IsTruncated", False),
|
||||
"next_continuation_token": response.get("NextContinuationToken"),
|
||||
"key_count": response.get("KeyCount", len(objects)),
|
||||
}
|
||||
except ClientError as e:
|
||||
if e.response["Error"]["Code"] == "NoSuchBucket":
|
||||
raise FileNotFoundError(f"Bucket not found: {bucket}") from e
|
||||
raise
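A sketch of consuming the returned dict, with folder-style grouping via the delimiter; the bucket and prefix are placeholders, and continuation relies on the start_after parameter shown above.
# Illustrative listing; bucket/prefix are made up.
from deltaglider.adapters import S3StorageAdapter

storage = S3StorageAdapter()  # default AWS credentials/endpoint
page = storage.list_objects(bucket="releases", prefix="v1.0/", delimiter="/", max_keys=100)

for entry in page["objects"]:
    print(entry["key"], entry["size"], entry["last_modified"])
for folder in page["common_prefixes"]:
    print("PRE", folder)

if page["is_truncated"] and page["objects"]:
    # Resume after the last key seen (start_after-based paging).
    page = storage.list_objects(
        bucket="releases", prefix="v1.0/", delimiter="/", start_after=page["objects"][-1]["key"]
    )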
|
||||
|
||||
def get(self, key: str) -> BinaryIO:
|
||||
"""Get object content as stream."""
|
||||
bucket, object_key = self._parse_key(key)
|
||||
|
||||
try:
|
||||
response = self.client.get_object(Bucket=bucket, Key=object_key)
|
||||
return response["Body"] # type: ignore[return-value]
|
||||
return response["Body"] # type: ignore[no-any-return]
|
||||
except ClientError as e:
|
||||
if e.response["Error"]["Code"] == "NoSuchKey":
|
||||
raise FileNotFoundError(f"Object not found: {key}") from e
|
||||
|
||||
@@ -1,14 +1,16 @@
|
||||
"""CLI main entry point."""
|
||||
|
||||
import atexit
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
|
||||
from ...adapters import (
|
||||
FsCacheAdapter,
|
||||
NoopMetricsAdapter,
|
||||
S3StorageAdapter,
|
||||
Sha256Adapter,
|
||||
@@ -16,7 +18,9 @@ from ...adapters import (
|
||||
UtcClockAdapter,
|
||||
XdeltaAdapter,
|
||||
)
|
||||
from ...core import DeltaService, DeltaSpace, ObjectKey
|
||||
from ...core import DeltaService, ObjectKey
|
||||
from ...ports import MetricsPort
|
||||
from ...ports.cache import CachePort
|
||||
from .aws_compat import (
|
||||
copy_s3_to_s3,
|
||||
determine_operation,
|
||||
@@ -37,8 +41,13 @@ def create_service(
|
||||
) -> DeltaService:
|
||||
"""Create service with wired adapters."""
|
||||
# Get config from environment
|
||||
cache_dir = Path(os.environ.get("DG_CACHE_DIR", "/tmp/.deltaglider/reference_cache"))
|
||||
max_ratio = float(os.environ.get("DG_MAX_RATIO", "0.5"))
|
||||
metrics_type = os.environ.get("DG_METRICS", "logging") # Options: noop, logging, cloudwatch
|
||||
|
||||
# SECURITY: Always use ephemeral process-isolated cache
|
||||
cache_dir = Path(tempfile.mkdtemp(prefix="deltaglider-", dir="/tmp"))
|
||||
# Register cleanup handler to remove cache on exit
|
||||
atexit.register(lambda: shutil.rmtree(cache_dir, ignore_errors=True))
|
||||
|
||||
# Set AWS environment variables if provided
|
||||
if endpoint_url:
|
||||
@@ -52,10 +61,44 @@ def create_service(
|
||||
hasher = Sha256Adapter()
|
||||
storage = S3StorageAdapter(endpoint_url=endpoint_url)
|
||||
diff = XdeltaAdapter()
|
||||
cache = FsCacheAdapter(cache_dir, hasher)
|
||||
|
||||
# SECURITY: Configurable cache with encryption and backend selection
|
||||
from deltaglider.adapters import ContentAddressedCache, EncryptedCache, MemoryCache
|
||||
|
||||
# Select backend: memory or filesystem
|
||||
cache_backend = os.environ.get("DG_CACHE_BACKEND", "filesystem") # Options: filesystem, memory
|
||||
base_cache: CachePort
|
||||
if cache_backend == "memory":
|
||||
max_size_mb = int(os.environ.get("DG_CACHE_MEMORY_SIZE_MB", "100"))
|
||||
base_cache = MemoryCache(hasher, max_size_mb=max_size_mb, temp_dir=cache_dir)
|
||||
else:
|
||||
# Filesystem-backed with Content-Addressed Storage
|
||||
base_cache = ContentAddressedCache(cache_dir, hasher)
|
||||
|
||||
# Always apply encryption with ephemeral keys (security hardening)
|
||||
# Encryption key is optional via DG_CACHE_ENCRYPTION_KEY (ephemeral if not set)
|
||||
cache: CachePort = EncryptedCache.from_env(base_cache)
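A hedged sketch of driving the backend and encryption selection from the environment before the CLI wires the service; the values are examples only.
# Illustrative environment for create_service(); values are examples.
import os

os.environ["DG_CACHE_BACKEND"] = "memory"        # or "filesystem" (CAS-backed, the default)
os.environ["DG_CACHE_MEMORY_SIZE_MB"] = "256"    # only consulted by the memory backend
# Omit DG_CACHE_ENCRYPTION_KEY to get a per-process ephemeral Fernet key.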
|
||||
|
||||
clock = UtcClockAdapter()
|
||||
logger = StdLoggerAdapter(level=log_level)
|
||||
metrics = NoopMetricsAdapter()
|
||||
|
||||
# Create metrics adapter based on configuration
|
||||
metrics: MetricsPort
|
||||
if metrics_type == "cloudwatch":
|
||||
# Import here to avoid dependency if not used
|
||||
from ...adapters.metrics_cloudwatch import CloudWatchMetricsAdapter
|
||||
|
||||
metrics = CloudWatchMetricsAdapter(
|
||||
namespace=os.environ.get("DG_METRICS_NAMESPACE", "DeltaGlider"),
|
||||
region=region,
|
||||
endpoint_url=endpoint_url if endpoint_url and "localhost" in endpoint_url else None,
|
||||
)
|
||||
elif metrics_type == "logging":
|
||||
from ...adapters.metrics_cloudwatch import LoggingMetricsAdapter
|
||||
|
||||
metrics = LoggingMetricsAdapter(log_level=log_level)
|
||||
else:
|
||||
metrics = NoopMetricsAdapter()
|
||||
|
||||
# Create service
|
||||
return DeltaService(
|
||||
@@ -221,6 +264,13 @@ def ls(
|
||||
prefix_str: str
|
||||
bucket_name, prefix_str = parse_s3_url(s3_url)
|
||||
|
||||
# Ensure prefix ends with / if it's meant to be a directory
|
||||
# This helps with proper path handling
|
||||
if prefix_str and not prefix_str.endswith("/"):
|
||||
# Check if this is a file or directory by listing
|
||||
# For now, assume it's a directory prefix
|
||||
prefix_str = prefix_str + "/"
|
||||
|
||||
# Format bytes to human readable
|
||||
def format_bytes(size: int) -> str:
|
||||
if not human_readable:
|
||||
@@ -232,53 +282,66 @@ def ls(
|
||||
size_float /= 1024.0
|
||||
return f"{size_float:.1f}P"
|
||||
|
||||
# List objects
|
||||
list_prefix = f"{bucket_name}/{prefix_str}" if prefix_str else bucket_name
|
||||
objects = list(service.storage.list(list_prefix))
|
||||
# List objects using SDK (automatically filters .delta and reference.bin)
|
||||
from deltaglider.client import DeltaGliderClient
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
dg_response = client.list_objects(
|
||||
Bucket=bucket_name,
|
||||
Prefix=prefix_str,
|
||||
MaxKeys=10000,
|
||||
Delimiter="/" if not recursive else "",
|
||||
)
|
||||
objects = dg_response["Contents"]
|
||||
|
||||
# Filter by recursive flag
|
||||
if not recursive:
|
||||
# Only show direct children
|
||||
seen_prefixes = set()
|
||||
# Show common prefixes (subdirectories) from S3 response
|
||||
for common_prefix in dg_response.get("CommonPrefixes", []):
|
||||
prefix_path = common_prefix.get("Prefix", "")
|
||||
# Show only the directory name, not the full path
|
||||
if prefix_str:
|
||||
# Strip the current prefix to show only the subdirectory
|
||||
display_name = prefix_path[len(prefix_str) :]
|
||||
else:
|
||||
display_name = prefix_path
|
||||
click.echo(f" PRE {display_name}")
|
||||
|
||||
# Only show files at current level (not in subdirectories)
|
||||
filtered_objects = []
|
||||
for obj in objects:
|
||||
rel_path = obj.key[len(prefix_str) :] if prefix_str else obj.key
|
||||
if "/" in rel_path:
|
||||
# It's in a subdirectory
|
||||
subdir = rel_path.split("/")[0] + "/"
|
||||
if subdir not in seen_prefixes:
|
||||
seen_prefixes.add(subdir)
|
||||
# Show as directory
|
||||
full_prefix = f"{prefix_str}{subdir}" if prefix_str else subdir
|
||||
click.echo(f" PRE {full_prefix}")
|
||||
else:
|
||||
# Direct file
|
||||
if rel_path: # Only add if there's actually a file at this level
|
||||
filtered_objects.append(obj)
|
||||
obj_key = obj["Key"]
|
||||
rel_path = obj_key[len(prefix_str) :] if prefix_str else obj_key
|
||||
# Only include if it's a direct child (no / in relative path)
|
||||
if "/" not in rel_path and rel_path:
|
||||
filtered_objects.append(obj)
|
||||
objects = filtered_objects
|
||||
|
||||
# Display objects
|
||||
# Display objects (SDK already filters reference.bin and strips .delta)
|
||||
total_size = 0
|
||||
total_count = 0
|
||||
|
||||
for obj in objects:
|
||||
# Skip reference.bin files (internal)
|
||||
if obj.key.endswith("/reference.bin"):
|
||||
continue
|
||||
|
||||
total_size += obj.size
|
||||
total_size += obj["Size"]
|
||||
total_count += 1
|
||||
|
||||
# Format the display
|
||||
size_str = format_bytes(obj.size)
|
||||
date_str = obj.last_modified.strftime("%Y-%m-%d %H:%M:%S")
|
||||
size_str = format_bytes(obj["Size"])
|
||||
# last_modified is a string from SDK, parse it if needed
|
||||
last_modified = obj.get("LastModified", "")
|
||||
if isinstance(last_modified, str):
|
||||
# Already a string, extract date portion
|
||||
date_str = last_modified[:19].replace("T", " ")
|
||||
else:
|
||||
date_str = last_modified.strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
# Remove .delta extension from display
|
||||
display_key = obj.key
|
||||
if display_key.endswith(".delta"):
|
||||
display_key = display_key[:-6]
|
||||
# Show only the filename relative to current prefix (like AWS CLI)
|
||||
if prefix_str:
|
||||
display_key = obj["Key"][len(prefix_str) :]
|
||||
else:
|
||||
display_key = obj["Key"]
|
||||
|
||||
click.echo(f"{date_str} {size_str:>10} s3://{bucket_name}/{display_key}")
|
||||
click.echo(f"{date_str} {size_str:>10} {display_key}")
|
||||
|
||||
# Show summary if requested
|
||||
if summarize:
|
||||
@@ -386,28 +449,45 @@ def rm(
|
||||
click.echo("Error: Cannot remove directories. Use --recursive", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
# List all objects with prefix
|
||||
list_prefix = f"{bucket}/{prefix}" if prefix else bucket
|
||||
objects = list(service.storage.list(list_prefix))
|
||||
|
||||
if not objects:
|
||||
if not quiet:
|
||||
click.echo(f"delete: No objects found with prefix: s3://{bucket}/{prefix}")
|
||||
return
|
||||
|
||||
# Delete all objects
|
||||
deleted_count = 0
|
||||
for obj in objects:
|
||||
if dryrun:
|
||||
click.echo(f"(dryrun) delete: s3://{bucket}/{obj.key}")
|
||||
else:
|
||||
service.storage.delete(f"{bucket}/{obj.key}")
|
||||
# Use the service's delete_recursive method for proper delta-aware deletion
|
||||
if dryrun:
|
||||
# For dryrun, we need to simulate what would be deleted
|
||||
objects = list(service.storage.list(f"{bucket}/{prefix}" if prefix else bucket))
|
||||
if not objects:
|
||||
if not quiet:
|
||||
click.echo(f"delete: s3://{bucket}/{obj.key}")
|
||||
deleted_count += 1
|
||||
click.echo(f"delete: No objects found with prefix: s3://{bucket}/{prefix}")
|
||||
return
|
||||
|
||||
if not quiet and not dryrun:
|
||||
click.echo(f"Deleted {deleted_count} object(s)")
|
||||
for obj in objects:
|
||||
click.echo(f"(dryrun) delete: s3://{bucket}/{obj.key}")
|
||||
|
||||
if not quiet:
|
||||
click.echo(f"Would delete {len(objects)} object(s)")
|
||||
else:
|
||||
# Use the core service method for actual deletion
|
||||
result = service.delete_recursive(bucket, prefix)
|
||||
|
||||
# Report the results
|
||||
if not quiet:
|
||||
if result["deleted_count"] == 0:
|
||||
click.echo(f"delete: No objects found with prefix: s3://{bucket}/{prefix}")
|
||||
else:
|
||||
click.echo(f"Deleted {result['deleted_count']} object(s)")
|
||||
|
||||
# Show warnings if any references were kept
|
||||
for warning in result.get("warnings", []):
|
||||
if "Kept reference" in warning:
|
||||
click.echo(
|
||||
f"Keeping reference file (still in use): s3://{bucket}/{warning.split()[2]}"
|
||||
)
|
||||
|
||||
# Report any errors
|
||||
if result["failed_count"] > 0:
|
||||
for error in result.get("errors", []):
|
||||
click.echo(f"Error: {error}", err=True)
|
||||
|
||||
if result["failed_count"] > 0:
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
click.echo(f"delete failed: {e}", err=True)
|
||||
@@ -519,130 +599,6 @@ def sync(
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.argument("file", type=click.Path(exists=True, path_type=Path))
|
||||
@click.argument("s3_url")
|
||||
@click.option("--max-ratio", type=float, help="Max delta/file ratio (default: 0.5)")
|
||||
@click.pass_obj
|
||||
def put(service: DeltaService, file: Path, s3_url: str, max_ratio: float | None) -> None:
|
||||
"""Upload file as reference or delta (legacy command, use 'cp' instead)."""
|
||||
# Parse S3 URL
|
||||
if not s3_url.startswith("s3://"):
|
||||
click.echo(f"Error: Invalid S3 URL: {s3_url}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
# Extract bucket and prefix
|
||||
s3_path = s3_url[5:].rstrip("/")
|
||||
parts = s3_path.split("/", 1)
|
||||
bucket = parts[0]
|
||||
prefix = parts[1] if len(parts) > 1 else ""
|
||||
|
||||
delta_space = DeltaSpace(bucket=bucket, prefix=prefix)
|
||||
|
||||
try:
|
||||
summary = service.put(file, delta_space, max_ratio)
|
||||
|
||||
# Output JSON summary
|
||||
output = {
|
||||
"operation": summary.operation,
|
||||
"bucket": summary.bucket,
|
||||
"key": summary.key,
|
||||
"original_name": summary.original_name,
|
||||
"file_size": summary.file_size,
|
||||
"file_sha256": summary.file_sha256,
|
||||
}
|
||||
|
||||
if summary.delta_size is not None:
|
||||
output["delta_size"] = summary.delta_size
|
||||
output["delta_ratio"] = round(summary.delta_ratio or 0, 3)
|
||||
|
||||
if summary.ref_key:
|
||||
output["ref_key"] = summary.ref_key
|
||||
output["ref_sha256"] = summary.ref_sha256
|
||||
|
||||
output["cache_hit"] = summary.cache_hit
|
||||
|
||||
click.echo(json.dumps(output, indent=2))
|
||||
|
||||
except Exception as e:
|
||||
click.echo(f"Error: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.argument("s3_url")
|
||||
@click.option("-o", "--output", type=click.Path(path_type=Path), help="Output file path")
|
||||
@click.pass_obj
|
||||
def get(service: DeltaService, s3_url: str, output: Path | None) -> None:
|
||||
"""Download and hydrate delta file.
|
||||
|
||||
The S3 URL can be either:
|
||||
- Full path to delta file: s3://bucket/path/to/file.zip.delta
|
||||
- Path to original file (will append .delta): s3://bucket/path/to/file.zip
|
||||
"""
|
||||
# Parse S3 URL
|
||||
if not s3_url.startswith("s3://"):
|
||||
click.echo(f"Error: Invalid S3 URL: {s3_url}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
s3_path = s3_url[5:]
|
||||
parts = s3_path.split("/", 1)
|
||||
if len(parts) != 2:
|
||||
click.echo(f"Error: Invalid S3 URL: {s3_url}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
bucket = parts[0]
|
||||
key = parts[1]
|
||||
|
||||
# Try to determine if this is a direct file or needs .delta appended
|
||||
# First try the key as-is
|
||||
obj_key = ObjectKey(bucket=bucket, key=key)
|
||||
|
||||
# Check if the file exists using the service's storage port
|
||||
# which already has proper credentials configured
|
||||
try:
|
||||
# Try to head the object as-is
|
||||
obj_head = service.storage.head(f"{bucket}/{key}")
|
||||
if obj_head is not None:
|
||||
click.echo(f"Found file: s3://{bucket}/{key}")
|
||||
else:
|
||||
# If not found and doesn't end with .delta, try adding .delta
|
||||
if not key.endswith(".delta"):
|
||||
delta_key = f"{key}.delta"
|
||||
delta_head = service.storage.head(f"{bucket}/{delta_key}")
|
||||
if delta_head is not None:
|
||||
key = delta_key
|
||||
obj_key = ObjectKey(bucket=bucket, key=key)
|
||||
click.echo(f"Found delta file: s3://{bucket}/{key}")
|
||||
else:
|
||||
click.echo(
|
||||
f"Error: File not found: s3://{bucket}/{key} (also tried .delta)", err=True
|
||||
)
|
||||
sys.exit(1)
|
||||
else:
|
||||
click.echo(f"Error: File not found: s3://{bucket}/{key}", err=True)
|
||||
sys.exit(1)
|
||||
except Exception:
|
||||
# For unexpected errors, just proceed with the original key
|
||||
click.echo(f"Warning: Could not check file existence, proceeding with: s3://{bucket}/{key}")
|
||||
|
||||
# Determine output path
|
||||
if output is None:
|
||||
# Extract original name from delta name
|
||||
if key.endswith(".delta"):
|
||||
output = Path(Path(key).stem)
|
||||
else:
|
||||
output = Path(Path(key).name)
|
||||
|
||||
try:
|
||||
service.get(obj_key, output)
|
||||
click.echo(f"Successfully retrieved: {output}")
|
||||
|
||||
except Exception as e:
|
||||
click.echo(f"Error: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.argument("s3_url")
|
||||
@click.pass_obj
|
||||
|
||||
File diff suppressed because it is too large.
src/deltaglider/client_delete_helpers.py (new file, 35 lines)
@@ -0,0 +1,35 @@
|
||||
"""Helper utilities for client delete operations."""
|
||||
|
||||
from .core import DeltaService, ObjectKey
|
||||
from .core.errors import NotFoundError
|
||||
|
||||
|
||||
def delete_with_delta_suffix(
|
||||
service: DeltaService, bucket: str, key: str
|
||||
) -> tuple[str, dict[str, object]]:
|
||||
"""Delete an object, retrying with '.delta' suffix when needed.
|
||||
|
||||
Args:
|
||||
service: DeltaService-like instance exposing ``delete(ObjectKey)``.
|
||||
bucket: Target bucket.
|
||||
key: Requested key (without forcing .delta suffix).
|
||||
|
||||
Returns:
|
||||
Tuple containing the actual key deleted in storage and the delete result dict.
|
||||
|
||||
Raises:
|
||||
NotFoundError: Propagated when both the direct and '.delta' keys are missing.
|
||||
"""
|
||||
actual_key = key
|
||||
object_key = ObjectKey(bucket=bucket, key=actual_key)
|
||||
|
||||
try:
|
||||
delete_result = service.delete(object_key)
|
||||
except NotFoundError:
|
||||
if key.endswith(".delta"):
|
||||
raise
|
||||
actual_key = f"{key}.delta"
|
||||
object_key = ObjectKey(bucket=bucket, key=actual_key)
|
||||
delete_result = service.delete(object_key)
|
||||
|
||||
return actual_key, delete_result
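A brief usage sketch, assuming an already wired DeltaService instance named service; the bucket and key are placeholders.
# Illustrative call; "service" is assumed to be a configured DeltaService.
from deltaglider.client_delete_helpers import delete_with_delta_suffix

actual_key, result = delete_with_delta_suffix(service, "releases", "v1.2.3/app.zip")
print(actual_key)  # the key as given, or the same key with ".delta" appended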
|
||||
src/deltaglider/client_models.py (new file, 99 lines)
@@ -0,0 +1,99 @@
|
||||
"""Shared data models for the DeltaGlider client."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
|
||||
class UploadSummary:
|
||||
"""User-friendly upload summary."""
|
||||
|
||||
operation: str
|
||||
bucket: str
|
||||
key: str
|
||||
original_size: int
|
||||
stored_size: int
|
||||
is_delta: bool
|
||||
delta_ratio: float = 0.0
|
||||
|
||||
@property
|
||||
def original_size_mb(self) -> float:
|
||||
"""Original size in MB."""
|
||||
return self.original_size / (1024 * 1024)
|
||||
|
||||
@property
|
||||
def stored_size_mb(self) -> float:
|
||||
"""Stored size in MB."""
|
||||
return self.stored_size / (1024 * 1024)
|
||||
|
||||
@property
|
||||
def savings_percent(self) -> float:
|
||||
"""Percentage saved through compression."""
|
||||
if self.original_size == 0:
|
||||
return 0.0
|
||||
return ((self.original_size - self.stored_size) / self.original_size) * 100
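A quick worked example of the arithmetic: a 100 MB artifact stored as a 2 MB delta yields 98% savings. The field values below are made up for illustration.
# Worked example with invented sizes.
from deltaglider.client_models import UploadSummary

summary = UploadSummary(
    operation="put",
    bucket="releases",
    key="v1.0.1/app.zip",
    original_size=100 * 1024 * 1024,
    stored_size=2 * 1024 * 1024,
    is_delta=True,
    delta_ratio=0.02,
)
print(f"{summary.savings_percent:.1f}%")  # 98.0%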
|
||||
|
||||
|
||||
@dataclass
|
||||
class CompressionEstimate:
|
||||
"""Compression estimate for a file."""
|
||||
|
||||
original_size: int
|
||||
estimated_compressed_size: int
|
||||
estimated_ratio: float
|
||||
confidence: float
|
||||
recommended_reference: str | None = None
|
||||
should_use_delta: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class ObjectInfo:
|
||||
"""Detailed object information with compression stats."""
|
||||
|
||||
key: str
|
||||
size: int
|
||||
last_modified: str
|
||||
etag: str | None = None
|
||||
storage_class: str = "STANDARD"
|
||||
|
||||
# DeltaGlider-specific fields
|
||||
original_size: int | None = None
|
||||
compressed_size: int | None = None
|
||||
compression_ratio: float | None = None
|
||||
is_delta: bool = False
|
||||
reference_key: str | None = None
|
||||
delta_chain_length: int = 0
|
||||
|
||||
|
||||
@dataclass
|
||||
class ListObjectsResponse:
|
||||
"""Response from list_objects, compatible with boto3."""
|
||||
|
||||
name: str # Bucket name
|
||||
prefix: str = ""
|
||||
delimiter: str = ""
|
||||
max_keys: int = 1000
|
||||
common_prefixes: list[dict[str, str]] = field(default_factory=list)
|
||||
contents: list[ObjectInfo] = field(default_factory=list)
|
||||
is_truncated: bool = False
|
||||
next_continuation_token: str | None = None
|
||||
continuation_token: str | None = None
|
||||
key_count: int = 0
|
||||
|
||||
@property
|
||||
def objects(self) -> list[ObjectInfo]:
|
||||
"""Alias for contents, for convenience."""
|
||||
return self.contents
|
||||
|
||||
|
||||
@dataclass
|
||||
class BucketStats:
|
||||
"""Statistics for a bucket."""
|
||||
|
||||
bucket: str
|
||||
object_count: int
|
||||
total_size: int
|
||||
compressed_size: int
|
||||
space_saved: int
|
||||
average_compression_ratio: float
|
||||
delta_objects: int
|
||||
direct_objects: int
|
||||
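A quick illustration of the derived properties on `UploadSummary` (all numbers made up):

```python
from deltaglider.client_models import UploadSummary

summary = UploadSummary(
    operation="upload",
    bucket="releases",
    key="v1.2/app-1.2.3.zip.delta",
    original_size=100 * 1024 * 1024,  # 100 MB input file
    stored_size=2 * 1024 * 1024,      # 2 MB delta actually stored
    is_delta=True,
)
# 100 MB stored as 2 MB -> savings_percent == 98.0
print(f"{summary.original_size_mb:.0f} MB -> {summary.stored_size_mb:.0f} MB "
      f"({summary.savings_percent:.1f}% saved)")
```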
37
src/deltaglider/client_operations/__init__.py
Normal file
@@ -0,0 +1,37 @@
"""Client operation modules for DeltaGliderClient.

This package contains modular operation implementations:
- bucket: S3 bucket management (create, delete, list)
- presigned: Presigned URL generation for temporary access
- batch: Batch upload/download operations
- stats: Statistics and analytics operations
"""

from .batch import download_batch, upload_batch, upload_chunked
from .bucket import create_bucket, delete_bucket, list_buckets
from .presigned import generate_presigned_post, generate_presigned_url
from .stats import (
    estimate_compression,
    find_similar_files,
    get_bucket_stats,
    get_object_info,
)

__all__ = [
    # Bucket operations
    "create_bucket",
    "delete_bucket",
    "list_buckets",
    # Presigned operations
    "generate_presigned_url",
    "generate_presigned_post",
    # Batch operations
    "upload_chunked",
    "upload_batch",
    "download_batch",
    # Stats operations
    "get_bucket_stats",
    "get_object_info",
    "estimate_compression",
    "find_similar_files",
]
159
src/deltaglider/client_operations/batch.py
Normal file
@@ -0,0 +1,159 @@
"""Batch upload/download operations for DeltaGlider client.

This module contains DeltaGlider-specific batch operations:
- upload_batch
- download_batch
- upload_chunked
"""

from collections.abc import Callable
from pathlib import Path
from typing import Any

from ..client_models import UploadSummary


def upload_chunked(
    client: Any,  # DeltaGliderClient
    file_path: str | Path,
    s3_url: str,
    chunk_size: int = 5 * 1024 * 1024,
    progress_callback: Callable[[int, int, int, int], None] | None = None,
    max_ratio: float = 0.5,
) -> UploadSummary:
    """Upload a file in chunks with progress callback.

    This method reads the file in chunks to avoid loading large files entirely into memory,
    making it suitable for uploading very large files. Progress is reported after each chunk.

    Args:
        client: DeltaGliderClient instance
        file_path: Local file to upload
        s3_url: S3 destination URL (s3://bucket/path/filename)
        chunk_size: Size of each chunk in bytes (default 5MB)
        progress_callback: Callback(chunk_number, total_chunks, bytes_sent, total_bytes)
        max_ratio: Maximum acceptable delta/file ratio for compression

    Returns:
        UploadSummary with compression statistics

    Example:
        def on_progress(chunk_num, total_chunks, bytes_sent, total_bytes):
            percent = (bytes_sent / total_bytes) * 100
            print(f"Upload progress: {percent:.1f}%")

        client.upload_chunked(
            "large_file.zip",
            "s3://bucket/releases/large_file.zip",
            chunk_size=10 * 1024 * 1024,  # 10MB chunks
            progress_callback=on_progress
        )
    """
    file_path = Path(file_path)
    file_size = file_path.stat().st_size

    # For small files, just use regular upload
    if file_size <= chunk_size:
        if progress_callback:
            progress_callback(1, 1, file_size, file_size)
        result: UploadSummary = client.upload(file_path, s3_url, max_ratio=max_ratio)
        return result

    # Calculate chunks
    total_chunks = (file_size + chunk_size - 1) // chunk_size

    # Create a temporary file for chunked processing
    # For now, we read the entire file but report progress in chunks
    # Future enhancement: implement true streaming upload in storage adapter
    bytes_read = 0

    with open(file_path, "rb") as f:
        for chunk_num in range(1, total_chunks + 1):
            # Read chunk (simulated for progress reporting)
            chunk_data = f.read(chunk_size)
            bytes_read += len(chunk_data)

            if progress_callback:
                progress_callback(chunk_num, total_chunks, bytes_read, file_size)

    # Perform the actual upload
    # TODO: When storage adapter supports streaming, pass chunks directly
    upload_result: UploadSummary = client.upload(file_path, s3_url, max_ratio=max_ratio)

    # Final progress callback
    if progress_callback:
        progress_callback(total_chunks, total_chunks, file_size, file_size)

    return upload_result


def upload_batch(
    client: Any,  # DeltaGliderClient
    files: list[str | Path],
    s3_prefix: str,
    max_ratio: float = 0.5,
    progress_callback: Callable[[str, int, int], None] | None = None,
) -> list[UploadSummary]:
    """Upload multiple files in batch.

    Args:
        client: DeltaGliderClient instance
        files: List of local file paths
        s3_prefix: S3 destination prefix (s3://bucket/prefix/)
        max_ratio: Maximum acceptable delta/file ratio
        progress_callback: Callback(filename, current_file_index, total_files)

    Returns:
        List of UploadSummary objects
    """
    results = []

    for i, file_path in enumerate(files):
        file_path = Path(file_path)

        if progress_callback:
            progress_callback(file_path.name, i + 1, len(files))

        # Upload each file
        s3_url = f"{s3_prefix.rstrip('/')}/{file_path.name}"
        summary = client.upload(file_path, s3_url, max_ratio=max_ratio)
        results.append(summary)

    return results


def download_batch(
    client: Any,  # DeltaGliderClient
    s3_urls: list[str],
    output_dir: str | Path,
    progress_callback: Callable[[str, int, int], None] | None = None,
) -> list[Path]:
    """Download multiple files in batch.

    Args:
        client: DeltaGliderClient instance
        s3_urls: List of S3 URLs to download
        output_dir: Local directory to save files
        progress_callback: Callback(filename, current_file_index, total_files)

    Returns:
        List of downloaded file paths
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    results = []

    for i, s3_url in enumerate(s3_urls):
        # Extract filename from URL
        filename = s3_url.split("/")[-1]
        if filename.endswith(".delta"):
            filename = filename[:-6]  # Remove .delta suffix

        if progress_callback:
            progress_callback(filename, i + 1, len(s3_urls))

        output_path = output_dir / filename
        client.download(s3_url, output_path)
        results.append(output_path)

    return results
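Assuming `DeltaGliderClient` exposes these helpers as bound methods (the `upload_chunked` docstring above already calls `client.upload_chunked(...)`), a batch upload with progress reporting might look like this; the file names and prefix are illustrative:

```python
from pathlib import Path

def on_file(name: str, index: int, total: int) -> None:
    print(f"[{index}/{total}] uploading {name}")

summaries = client.upload_batch(
    files=[Path("dist/app-1.2.3.zip"), Path("dist/app-1.2.4.zip")],
    s3_prefix="s3://releases/v1.2/",
    progress_callback=on_file,
)
print(sum(s.stored_size for s in summaries), "bytes actually stored")
```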
152
src/deltaglider/client_operations/bucket.py
Normal file
@@ -0,0 +1,152 @@
"""Bucket management operations for DeltaGlider client.

This module contains boto3-compatible bucket operations:
- create_bucket
- delete_bucket
- list_buckets
"""

from typing import Any


def create_bucket(
    client: Any,  # DeltaGliderClient (avoiding circular import)
    Bucket: str,
    CreateBucketConfiguration: dict[str, str] | None = None,
    **kwargs: Any,
) -> dict[str, Any]:
    """Create an S3 bucket (boto3-compatible).

    Args:
        client: DeltaGliderClient instance
        Bucket: Bucket name to create
        CreateBucketConfiguration: Optional bucket configuration (e.g., LocationConstraint)
        **kwargs: Additional S3 parameters (for compatibility)

    Returns:
        Response dict with bucket location

    Example:
        >>> client = create_client()
        >>> client.create_bucket(Bucket='my-bucket')
        >>> # With region
        >>> client.create_bucket(
        ...     Bucket='my-bucket',
        ...     CreateBucketConfiguration={'LocationConstraint': 'us-west-2'}
        ... )
    """
    storage_adapter = client.service.storage

    # Check if storage adapter has boto3 client
    if hasattr(storage_adapter, "client"):
        try:
            params: dict[str, Any] = {"Bucket": Bucket}
            if CreateBucketConfiguration:
                params["CreateBucketConfiguration"] = CreateBucketConfiguration

            response = storage_adapter.client.create_bucket(**params)
            return {
                "Location": response.get("Location", f"/{Bucket}"),
                "ResponseMetadata": {
                    "HTTPStatusCode": 200,
                },
            }
        except Exception as e:
            error_msg = str(e)
            if "BucketAlreadyExists" in error_msg or "BucketAlreadyOwnedByYou" in error_msg:
                # Bucket already exists - return success
                client.service.logger.debug(f"Bucket {Bucket} already exists")
                return {
                    "Location": f"/{Bucket}",
                    "ResponseMetadata": {
                        "HTTPStatusCode": 200,
                    },
                }
            raise RuntimeError(f"Failed to create bucket: {e}") from e
    else:
        raise NotImplementedError("Storage adapter does not support bucket creation")


def delete_bucket(
    client: Any,  # DeltaGliderClient
    Bucket: str,
    **kwargs: Any,
) -> dict[str, Any]:
    """Delete an S3 bucket (boto3-compatible).

    Note: Bucket must be empty before deletion.

    Args:
        client: DeltaGliderClient instance
        Bucket: Bucket name to delete
        **kwargs: Additional S3 parameters (for compatibility)

    Returns:
        Response dict with deletion status

    Example:
        >>> client = create_client()
        >>> client.delete_bucket(Bucket='my-bucket')
    """
    storage_adapter = client.service.storage

    # Check if storage adapter has boto3 client
    if hasattr(storage_adapter, "client"):
        try:
            storage_adapter.client.delete_bucket(Bucket=Bucket)
            return {
                "ResponseMetadata": {
                    "HTTPStatusCode": 204,
                },
            }
        except Exception as e:
            error_msg = str(e)
            if "NoSuchBucket" in error_msg:
                # Bucket doesn't exist - return success
                client.service.logger.debug(f"Bucket {Bucket} does not exist")
                return {
                    "ResponseMetadata": {
                        "HTTPStatusCode": 204,
                    },
                }
            raise RuntimeError(f"Failed to delete bucket: {e}") from e
    else:
        raise NotImplementedError("Storage adapter does not support bucket deletion")


def list_buckets(
    client: Any,  # DeltaGliderClient
    **kwargs: Any,
) -> dict[str, Any]:
    """List all S3 buckets (boto3-compatible).

    Args:
        client: DeltaGliderClient instance
        **kwargs: Additional S3 parameters (for compatibility)

    Returns:
        Response dict with bucket list

    Example:
        >>> client = create_client()
        >>> response = client.list_buckets()
        >>> for bucket in response['Buckets']:
        ...     print(bucket['Name'])
    """
    storage_adapter = client.service.storage

    # Check if storage adapter has boto3 client
    if hasattr(storage_adapter, "client"):
        try:
            response = storage_adapter.client.list_buckets()
            return {
                "Buckets": response.get("Buckets", []),
                "Owner": response.get("Owner", {}),
                "ResponseMetadata": {
                    "HTTPStatusCode": 200,
                },
            }
        except Exception as e:
            raise RuntimeError(f"Failed to list buckets: {e}") from e
    else:
        raise NotImplementedError("Storage adapter does not support bucket listing")
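Because "already exists" and "does not exist" errors are swallowed above, these calls are effectively idempotent. A short sketch using the `create_client()` factory referenced in the docstrings:

```python
from deltaglider import create_client

client = create_client()

resp = client.create_bucket(Bucket="releases")   # safe to repeat
assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200

client.delete_bucket(Bucket="releases")          # 204 even if already gone
```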
124
src/deltaglider/client_operations/presigned.py
Normal file
@@ -0,0 +1,124 @@
"""Presigned URL operations for DeltaGlider client.

This module contains boto3-compatible presigned URL operations:
- generate_presigned_url
- generate_presigned_post
"""

from typing import Any


def try_boto3_presigned_operation(
    client: Any,  # DeltaGliderClient
    operation: str,
    **kwargs: Any,
) -> Any | None:
    """Try to generate presigned operation using boto3 client, return None if not available."""
    storage_adapter = client.service.storage

    # Check if storage adapter has boto3 client
    if hasattr(storage_adapter, "client"):
        try:
            if operation == "url":
                return str(storage_adapter.client.generate_presigned_url(**kwargs))
            elif operation == "post":
                return dict(storage_adapter.client.generate_presigned_post(**kwargs))
        except AttributeError:
            # storage_adapter does not have a 'client' attribute
            pass
        except Exception as e:
            # Fall back to manual construction if needed
            client.service.logger.warning(f"Failed to generate presigned {operation}: {e}")

    return None


def generate_presigned_url(
    client: Any,  # DeltaGliderClient
    ClientMethod: str,
    Params: dict[str, Any],
    ExpiresIn: int = 3600,
) -> str:
    """Generate presigned URL (boto3-compatible).

    Args:
        client: DeltaGliderClient instance
        ClientMethod: Method name ('get_object' or 'put_object')
        Params: Parameters dict with Bucket and Key
        ExpiresIn: URL expiration in seconds

    Returns:
        Presigned URL string
    """
    # Try boto3 first, fallback to manual construction
    url = try_boto3_presigned_operation(
        client,
        "url",
        ClientMethod=ClientMethod,
        Params=Params,
        ExpiresIn=ExpiresIn,
    )
    if url is not None:
        return str(url)

    # Fallback: construct URL manually (less secure, for dev/testing only)
    bucket = Params.get("Bucket", "")
    key = Params.get("Key", "")

    if client.endpoint_url:
        base_url = client.endpoint_url
    else:
        base_url = f"https://{bucket}.s3.amazonaws.com"

    # Warning: This is not a real presigned URL, just a placeholder
    client.service.logger.warning("Using placeholder presigned URL - not suitable for production")
    return f"{base_url}/{key}?expires={ExpiresIn}"


def generate_presigned_post(
    client: Any,  # DeltaGliderClient
    Bucket: str,
    Key: str,
    Fields: dict[str, str] | None = None,
    Conditions: list[Any] | None = None,
    ExpiresIn: int = 3600,
) -> dict[str, Any]:
    """Generate presigned POST data for HTML forms (boto3-compatible).

    Args:
        client: DeltaGliderClient instance
        Bucket: S3 bucket name
        Key: Object key
        Fields: Additional fields to include
        Conditions: Upload conditions
        ExpiresIn: URL expiration in seconds

    Returns:
        Dict with 'url' and 'fields' for form submission
    """
    # Try boto3 first, fallback to manual construction
    response = try_boto3_presigned_operation(
        client,
        "post",
        Bucket=Bucket,
        Key=Key,
        Fields=Fields,
        Conditions=Conditions,
        ExpiresIn=ExpiresIn,
    )
    if response is not None:
        return dict(response)

    # Fallback: return minimal structure for compatibility
    if client.endpoint_url:
        url = f"{client.endpoint_url}/{Bucket}"
    else:
        url = f"https://{Bucket}.s3.amazonaws.com"

    return {
        "url": url,
        "fields": {
            "key": Key,
            **(Fields or {}),
        },
    }
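A usage sketch for the URL path (method binding on the client is assumed here, as with the bucket operations). With a boto3-backed storage adapter this returns a signed URL; otherwise the fallback above returns an unsigned placeholder and logs a warning:

```python
url = client.generate_presigned_url(
    ClientMethod="get_object",
    Params={"Bucket": "releases", "Key": "v1.2/app-1.2.3.zip"},
    ExpiresIn=900,
)
print(url)
```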
337
src/deltaglider/client_operations/stats.py
Normal file
@@ -0,0 +1,337 @@
"""Statistics and analysis operations for DeltaGlider client.

This module contains DeltaGlider-specific statistics operations:
- get_bucket_stats
- get_object_info
- estimate_compression
- find_similar_files
"""

import re
from pathlib import Path
from typing import Any

from ..client_models import BucketStats, CompressionEstimate, ObjectInfo


def get_object_info(
    client: Any,  # DeltaGliderClient
    s3_url: str,
) -> ObjectInfo:
    """Get detailed object information including compression stats.

    Args:
        client: DeltaGliderClient instance
        s3_url: S3 URL of the object

    Returns:
        ObjectInfo with detailed metadata
    """
    # Parse URL
    if not s3_url.startswith("s3://"):
        raise ValueError(f"Invalid S3 URL: {s3_url}")

    s3_path = s3_url[5:]
    parts = s3_path.split("/", 1)
    bucket = parts[0]
    key = parts[1] if len(parts) > 1 else ""

    # Get object metadata
    obj_head = client.service.storage.head(f"{bucket}/{key}")
    if not obj_head:
        raise FileNotFoundError(f"Object not found: {s3_url}")

    metadata = obj_head.metadata
    is_delta = key.endswith(".delta")

    return ObjectInfo(
        key=key,
        size=obj_head.size,
        last_modified=metadata.get("last_modified", ""),
        etag=metadata.get("etag"),
        original_size=int(metadata.get("file_size", obj_head.size)),
        compressed_size=obj_head.size,
        compression_ratio=float(metadata.get("compression_ratio", 0.0)),
        is_delta=is_delta,
        reference_key=metadata.get("ref_key"),
    )


def get_bucket_stats(
    client: Any,  # DeltaGliderClient
    bucket: str,
    detailed_stats: bool = False,
) -> BucketStats:
    """Get statistics for a bucket with optional detailed compression metrics.

    This method provides two modes:
    - Quick stats (default): Fast overview using LIST only (~50ms)
    - Detailed stats: Accurate compression metrics with HEAD requests (slower)

    Args:
        client: DeltaGliderClient instance
        bucket: S3 bucket name
        detailed_stats: If True, fetch accurate compression ratios for delta files (default: False)

    Returns:
        BucketStats with compression and space savings info

    Performance:
        - With detailed_stats=False: ~50ms for any bucket size (1 LIST call per 1000 objects)
        - With detailed_stats=True: ~2-3s per 1000 objects (adds HEAD calls for delta files only)

    Example:
        # Quick stats for dashboard display
        stats = client.get_bucket_stats('releases')
        print(f"Objects: {stats.object_count}, Size: {stats.total_size}")

        # Detailed stats for analytics (slower but accurate)
        stats = client.get_bucket_stats('releases', detailed_stats=True)
        print(f"Compression ratio: {stats.average_compression_ratio:.1%}")
    """
    # List all objects with smart metadata fetching
    all_objects = []
    continuation_token = None

    while True:
        response = client.list_objects(
            Bucket=bucket,
            MaxKeys=1000,
            ContinuationToken=continuation_token,
            FetchMetadata=detailed_stats,  # Only fetch metadata if detailed stats requested
        )

        # Extract S3Objects from response (with Metadata containing DeltaGlider info)
        for obj_dict in response["Contents"]:
            # Convert dict back to ObjectInfo for backward compatibility with stats calculation
            metadata = obj_dict.get("Metadata", {})
            # Parse compression ratio safely (handle "unknown" value)
            compression_ratio_str = metadata.get("deltaglider-compression-ratio", "0.0")
            try:
                compression_ratio = (
                    float(compression_ratio_str) if compression_ratio_str != "unknown" else 0.0
                )
            except ValueError:
                compression_ratio = 0.0

            all_objects.append(
                ObjectInfo(
                    key=obj_dict["Key"],
                    size=obj_dict["Size"],
                    last_modified=obj_dict.get("LastModified", ""),
                    etag=obj_dict.get("ETag"),
                    storage_class=obj_dict.get("StorageClass", "STANDARD"),
                    original_size=int(metadata.get("deltaglider-original-size", obj_dict["Size"])),
                    compressed_size=obj_dict["Size"],
                    is_delta=metadata.get("deltaglider-is-delta", "false") == "true",
                    compression_ratio=compression_ratio,
                    reference_key=metadata.get("deltaglider-reference-key"),
                )
            )

        if not response.get("IsTruncated"):
            break

        continuation_token = response.get("NextContinuationToken")

    # Calculate statistics
    total_size = 0
    compressed_size = 0
    delta_count = 0
    direct_count = 0

    for obj in all_objects:
        # Skip reference.bin files - they are internal implementation details
        # and their size is already accounted for in delta metadata
        if obj.key.endswith("/reference.bin") or obj.key == "reference.bin":
            continue

        compressed_size += obj.size

        if obj.is_delta:
            delta_count += 1
            # Use actual original size if we have it, otherwise estimate
            total_size += obj.original_size or obj.size
        else:
            direct_count += 1
            # For non-delta files, original equals compressed
            total_size += obj.size

    space_saved = total_size - compressed_size
    avg_ratio = (space_saved / total_size) if total_size > 0 else 0.0

    return BucketStats(
        bucket=bucket,
        object_count=len(all_objects),
        total_size=total_size,
        compressed_size=compressed_size,
        space_saved=space_saved,
        average_compression_ratio=avg_ratio,
        delta_objects=delta_count,
        direct_objects=direct_count,
    )


def estimate_compression(
    client: Any,  # DeltaGliderClient
    file_path: str | Path,
    bucket: str,
    prefix: str = "",
    sample_size: int = 1024 * 1024,
) -> CompressionEstimate:
    """Estimate compression ratio before upload.

    Args:
        client: DeltaGliderClient instance
        file_path: Local file to estimate
        bucket: Target bucket
        prefix: Target prefix (for finding similar files)
        sample_size: Bytes to sample for estimation (default 1MB)

    Returns:
        CompressionEstimate with predicted compression
    """
    file_path = Path(file_path)
    file_size = file_path.stat().st_size

    # Check file extension
    ext = file_path.suffix.lower()
    delta_extensions = {
        ".zip",
        ".tar",
        ".gz",
        ".tar.gz",
        ".tgz",
        ".bz2",
        ".tar.bz2",
        ".xz",
        ".tar.xz",
        ".7z",
        ".rar",
        ".dmg",
        ".iso",
        ".pkg",
        ".deb",
        ".rpm",
        ".apk",
        ".jar",
        ".war",
        ".ear",
    }

    # Already compressed formats that won't benefit from delta
    incompressible = {".jpg", ".jpeg", ".png", ".mp4", ".mp3", ".avi", ".mov"}

    if ext in incompressible:
        return CompressionEstimate(
            original_size=file_size,
            estimated_compressed_size=file_size,
            estimated_ratio=0.0,
            confidence=0.95,
            should_use_delta=False,
        )

    if ext not in delta_extensions:
        # Unknown type, conservative estimate
        return CompressionEstimate(
            original_size=file_size,
            estimated_compressed_size=file_size,
            estimated_ratio=0.0,
            confidence=0.5,
            should_use_delta=file_size > 1024 * 1024,  # Only for files > 1MB
        )

    # Look for similar files in the target location
    similar_files = find_similar_files(client, bucket, prefix, file_path.name)

    if similar_files:
        # If we have similar files, estimate high compression
        estimated_ratio = 0.99  # 99% compression typical for similar versions
        confidence = 0.9
        recommended_ref = similar_files[0]["Key"] if similar_files else None
    else:
        # First file of its type
        estimated_ratio = 0.0
        confidence = 0.7
        recommended_ref = None

    estimated_size = int(file_size * (1 - estimated_ratio))

    return CompressionEstimate(
        original_size=file_size,
        estimated_compressed_size=estimated_size,
        estimated_ratio=estimated_ratio,
        confidence=confidence,
        recommended_reference=recommended_ref,
        should_use_delta=True,
    )


def find_similar_files(
    client: Any,  # DeltaGliderClient
    bucket: str,
    prefix: str,
    filename: str,
    limit: int = 5,
) -> list[dict[str, Any]]:
    """Find similar files that could serve as references.

    Args:
        client: DeltaGliderClient instance
        bucket: S3 bucket
        prefix: Prefix to search in
        filename: Filename to match against
        limit: Maximum number of results

    Returns:
        List of similar files with scores
    """
    # List objects in the prefix (no metadata needed for similarity check)
    response = client.list_objects(
        Bucket=bucket,
        Prefix=prefix,
        MaxKeys=1000,
        FetchMetadata=False,  # Don't need metadata for similarity
    )

    similar: list[dict[str, Any]] = []
    base_name = Path(filename).stem
    ext = Path(filename).suffix

    for obj in response["Contents"]:
        obj_key = obj["Key"]
        obj_base = Path(obj_key).stem
        obj_ext = Path(obj_key).suffix

        # Skip delta files and references
        if obj_key.endswith(".delta") or obj_key.endswith("reference.bin"):
            continue

        score = 0.0

        # Extension match
        if ext == obj_ext:
            score += 0.5

        # Base name similarity
        if base_name in obj_base or obj_base in base_name:
            score += 0.3

        # Version pattern match
        if re.search(r"v?\d+[\.\d]*", base_name) and re.search(r"v?\d+[\.\d]*", obj_base):
            score += 0.2

        if score > 0.5:
            similar.append(
                {
                    "Key": obj_key,
                    "Size": obj["Size"],
                    "Similarity": score,
                    "LastModified": obj["LastModified"],
                }
            )

    # Sort by similarity
    similar.sort(key=lambda x: x["Similarity"], reverse=True)  # type: ignore

    return similar[:limit]
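The two modes described in `get_bucket_stats` in practice (bucket name is illustrative):

```python
stats = client.get_bucket_stats("releases")                          # LIST-only, fast
detailed = client.get_bucket_stats("releases", detailed_stats=True)  # adds HEAD calls

print(f"{stats.object_count} objects, {stats.total_size} logical bytes")
print(f"~{detailed.average_compression_ratio:.1%} of logical size saved")
```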
@@ -47,3 +47,15 @@ class PolicyViolationWarning(Warning):
    """Policy violation warning."""

    pass


class CacheMissError(DeltaGliderError):
    """Cache miss - file not found in cache."""

    pass


class CacheCorruptionError(DeltaGliderError):
    """Cache corruption - SHA mismatch or tampering detected."""

    pass

@@ -3,7 +3,7 @@
import tempfile
import warnings
from pathlib import Path
from typing import BinaryIO
from typing import Any, BinaryIO

from ..ports import (
    CachePort,
@@ -21,7 +21,6 @@ from .errors import (
    IntegrityMismatchError,
    NotFoundError,
    PolicyViolationWarning,
    StorageIOError,
)
from .models import (
    DeltaMeta,
@@ -171,10 +170,28 @@ class DeltaService:
        if obj_head is None:
            raise NotFoundError(f"Object not found: {object_key.key}")

        # Check if this is a regular S3 object (not uploaded via DeltaGlider)
        # Regular S3 objects won't have DeltaGlider metadata
        if "file_sha256" not in obj_head.metadata:
            raise StorageIOError(f"Missing metadata on {object_key.key}")
            # This is a regular S3 object, download it directly
            self.logger.info(
                "Downloading regular S3 object (no DeltaGlider metadata)",
                key=object_key.key,
            )
            self._get_direct(object_key, obj_head, out)
            duration = (self.clock.now() - start_time).total_seconds()
            self.logger.log_operation(
                op="get",
                key=object_key.key,
                deltaspace=f"{object_key.bucket}",
                sizes={"file": obj_head.size},
                durations={"total": duration},
                cache_hit=False,
            )
            self.metrics.timing("deltaglider.get.duration", duration)
            return

        # Check if this is a direct upload (non-delta)
        # Check if this is a direct upload (non-delta) uploaded via DeltaGlider
        if obj_head.metadata.get("compression") == "none":
            # Direct download without delta processing
            self._get_direct(object_key, obj_head, out)
@@ -213,7 +230,10 @@
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_path = Path(tmpdir)
            delta_path = tmp_path / "delta"
            ref_path = self.cache.ref_path(delta_space.bucket, delta_space.prefix)
            # SECURITY: Use validated ref to prevent TOCTOU attacks
            ref_path = self.cache.get_validated_ref(
                delta_space.bucket, delta_space.prefix, delta_meta.ref_sha256
            )
            out_path = tmp_path / "output"

            # Download delta
@@ -391,7 +411,8 @@
        if not cache_hit:
            self._cache_reference(delta_space, ref_sha256)

        ref_path = self.cache.ref_path(delta_space.bucket, delta_space.prefix)
        # SECURITY: Use validated ref to prevent TOCTOU attacks
        ref_path = self.cache.get_validated_ref(delta_space.bucket, delta_space.prefix, ref_sha256)

        # Create delta
        with tempfile.NamedTemporaryFile(suffix=".delta") as delta_file:
@@ -584,3 +605,319 @@
            file_size=file_size,
            file_sha256=file_sha256,
        )

    def delete(self, object_key: ObjectKey) -> dict[str, Any]:
        """Delete an object (delta-aware).

        For delta files, just deletes the delta.
        For reference files, checks if any deltas depend on it first.
        For direct uploads, simply deletes the file.

        Returns:
            dict with deletion details including type and any warnings
        """
        start_time = self.clock.now()
        full_key = f"{object_key.bucket}/{object_key.key}"

        self.logger.info("Starting delete operation", key=object_key.key)

        # Check if object exists
        obj_head = self.storage.head(full_key)
        if obj_head is None:
            raise NotFoundError(f"Object not found: {object_key.key}")

        # Determine object type
        is_reference = object_key.key.endswith("/reference.bin")
        is_delta = object_key.key.endswith(".delta")
        is_direct = obj_head.metadata.get("compression") == "none"

        result: dict[str, Any] = {
            "key": object_key.key,
            "bucket": object_key.bucket,
            "deleted": False,
            "type": "unknown",
            "warnings": [],
        }

        if is_reference:
            # Check if any deltas depend on this reference
            prefix = object_key.key.rsplit("/", 1)[0] if "/" in object_key.key else ""
            dependent_deltas = []

            for obj in self.storage.list(f"{object_key.bucket}/{prefix}"):
                if obj.key.endswith(".delta") and obj.key != object_key.key:
                    # Check if this delta references our reference
                    delta_head = self.storage.head(f"{object_key.bucket}/{obj.key}")
                    if delta_head and delta_head.metadata.get("ref_key") == object_key.key:
                        dependent_deltas.append(obj.key)

            if dependent_deltas:
                warnings_list = result["warnings"]
                assert isinstance(warnings_list, list)
                warnings_list.append(
                    f"Reference has {len(dependent_deltas)} dependent delta(s). "
                    "Deleting this will make those deltas unrecoverable."
                )
                self.logger.warning(
                    "Reference has dependent deltas",
                    ref_key=object_key.key,
                    delta_count=len(dependent_deltas),
                    deltas=dependent_deltas[:5],  # Log first 5
                )

            # Delete the reference
            self.storage.delete(full_key)
            result["deleted"] = True
            result["type"] = "reference"
            result["dependent_deltas"] = len(dependent_deltas)

            # Clear from cache if present
            if "/" in object_key.key:
                deltaspace_prefix = object_key.key.rsplit("/", 1)[0]
                try:
                    self.cache.evict(object_key.bucket, deltaspace_prefix)
                except Exception as e:
                    self.logger.debug(f"Could not clear cache for {object_key.key}: {e}")

        elif is_delta:
            # Delete the delta file
            self.storage.delete(full_key)
            result["deleted"] = True
            result["type"] = "delta"
            result["original_name"] = obj_head.metadata.get("original_name", "unknown")

            # Check if this was the last delta in the DeltaSpace - if so, clean up reference.bin
            if "/" in object_key.key:
                deltaspace_prefix = "/".join(object_key.key.split("/")[:-1])
                ref_key = f"{deltaspace_prefix}/reference.bin"

                # Check if any other delta files exist in this DeltaSpace
                remaining_deltas = []
                for obj in self.storage.list(f"{object_key.bucket}/{deltaspace_prefix}"):
                    if obj.key.endswith(".delta") and obj.key != object_key.key:
                        remaining_deltas.append(obj.key)

                if not remaining_deltas:
                    # No more deltas - clean up the orphaned reference.bin
                    ref_full_key = f"{object_key.bucket}/{ref_key}"
                    ref_head = self.storage.head(ref_full_key)
                    if ref_head:
                        self.storage.delete(ref_full_key)
                        self.logger.info(
                            "Cleaned up orphaned reference.bin",
                            ref_key=ref_key,
                            reason="no remaining deltas",
                        )
                        result["cleaned_reference"] = ref_key

                # Clear from cache
                try:
                    self.cache.evict(object_key.bucket, deltaspace_prefix)
                except Exception as e:
                    self.logger.debug(f"Could not clear cache for {deltaspace_prefix}: {e}")

        elif is_direct:
            # Simply delete the direct upload
            self.storage.delete(full_key)
            result["deleted"] = True
            result["type"] = "direct"
            result["original_name"] = obj_head.metadata.get("original_name", object_key.key)

        else:
            # Unknown file type, delete anyway
            self.storage.delete(full_key)
            result["deleted"] = True
            result["type"] = "unknown"

        duration = (self.clock.now() - start_time).total_seconds()
        self.logger.log_operation(
            op="delete",
            key=object_key.key,
            deltaspace=f"{object_key.bucket}",
            durations={"total": duration},
            sizes={},
            cache_hit=False,
        )
        self.metrics.timing("deltaglider.delete.duration", duration)
        self.metrics.increment(f"deltaglider.delete.{result['type']}")

        return result

    def delete_recursive(self, bucket: str, prefix: str) -> dict[str, Any]:
        """Recursively delete all objects under a prefix (delta-aware).

        Handles delta relationships intelligently:
        - Deletes deltas before references
        - Warns about orphaned deltas
        - Handles direct uploads

        Args:
            bucket: S3 bucket name
            prefix: Prefix to delete recursively

        Returns:
            dict with deletion statistics and any warnings
        """
        start_time = self.clock.now()
        self.logger.info("Starting recursive delete", bucket=bucket, prefix=prefix)

        # Ensure prefix ends with / for proper directory deletion
        if prefix and not prefix.endswith("/"):
            prefix = f"{prefix}/"

        # Collect all objects under prefix
        objects_to_delete = []
        references = []
        deltas = []
        direct_uploads = []
        affected_deltaspaces = set()

        for obj in self.storage.list(f"{bucket}/{prefix}" if prefix else bucket):
            if not obj.key.startswith(prefix) and prefix:
                continue

            if obj.key.endswith("/reference.bin"):
                references.append(obj.key)
            elif obj.key.endswith(".delta"):
                deltas.append(obj.key)
                # Track which deltaspaces are affected by this deletion
                if "/" in obj.key:
                    deltaspace_prefix = "/".join(obj.key.split("/")[:-1])
                    affected_deltaspaces.add(deltaspace_prefix)
            else:
                # Check if it's a direct upload
                obj_head = self.storage.head(f"{bucket}/{obj.key}")
                if obj_head and obj_head.metadata.get("compression") == "none":
                    direct_uploads.append(obj.key)
                else:
                    objects_to_delete.append(obj.key)

        # Also check for references in parent directories that might be affected
        # by the deletion of delta files in affected deltaspaces
        for deltaspace_prefix in affected_deltaspaces:
            ref_key = f"{deltaspace_prefix}/reference.bin"
            if ref_key not in references:
                # Check if this reference exists
                ref_head = self.storage.head(f"{bucket}/{ref_key}")
                if ref_head:
                    references.append(ref_key)

        result: dict[str, Any] = {
            "bucket": bucket,
            "prefix": prefix,
            "deleted_count": 0,
            "failed_count": 0,
            "deltas_deleted": len(deltas),
            "references_deleted": len(references),
            "direct_deleted": len(direct_uploads),
            "other_deleted": len(objects_to_delete),
            "errors": [],
            "warnings": [],
        }

        # Delete in order: other files -> direct uploads -> deltas -> references (with checks)
        # This ensures we don't delete references that deltas depend on prematurely
        regular_files = objects_to_delete + direct_uploads + deltas

        # Delete regular files first
        for key in regular_files:
            try:
                self.storage.delete(f"{bucket}/{key}")
                deleted_count = result["deleted_count"]
                assert isinstance(deleted_count, int)
                result["deleted_count"] = deleted_count + 1
                self.logger.debug(f"Deleted {key}")
            except Exception as e:
                failed_count = result["failed_count"]
                assert isinstance(failed_count, int)
                result["failed_count"] = failed_count + 1
                errors_list = result["errors"]
                assert isinstance(errors_list, list)
                errors_list.append(f"Failed to delete {key}: {str(e)}")
                self.logger.error(f"Failed to delete {key}: {e}")

        # Handle references intelligently - only delete if no files outside deletion scope depend on them
        references_kept = 0
        for ref_key in references:
            try:
                # Extract deltaspace prefix from reference.bin path
                if ref_key.endswith("/reference.bin"):
                    deltaspace_prefix = ref_key[:-14]  # Remove "/reference.bin"
                else:
                    deltaspace_prefix = ""

                # Check if there are any remaining files in this deltaspace
                # (outside of the deletion prefix)
                deltaspace_list_prefix = (
                    f"{bucket}/{deltaspace_prefix}" if deltaspace_prefix else bucket
                )
                remaining_objects = list(self.storage.list(deltaspace_list_prefix))

                # Filter out objects that are being deleted (within our deletion scope)
                # and the reference.bin file itself
                deletion_prefix_full = f"{bucket}/{prefix}" if prefix else bucket
                has_remaining_files = False

                for remaining_obj in remaining_objects:
                    obj_full_path = f"{bucket}/{remaining_obj.key}"
                    # Skip if this object is within our deletion scope
                    if prefix and obj_full_path.startswith(deletion_prefix_full):
                        continue
                    # Skip if this is the reference.bin file itself
                    if remaining_obj.key == ref_key:
                        continue
                    # If we find any other file, the reference is still needed
                    has_remaining_files = True
                    break

                if not has_remaining_files:
                    # Safe to delete this reference.bin
                    self.storage.delete(f"{bucket}/{ref_key}")
                    deleted_count = result["deleted_count"]
                    assert isinstance(deleted_count, int)
                    result["deleted_count"] = deleted_count + 1
                    self.logger.debug(f"Deleted reference {ref_key}")
                else:
                    # Keep the reference as it's still needed
                    references_kept += 1
                    warnings_list = result["warnings"]
                    assert isinstance(warnings_list, list)
                    warnings_list.append(f"Kept reference {ref_key} (still in use)")
                    self.logger.info(
                        f"Kept reference {ref_key} - still in use outside deletion scope"
                    )

            except Exception as e:
                failed_count = result["failed_count"]
                assert isinstance(failed_count, int)
                result["failed_count"] = failed_count + 1
                errors_list = result["errors"]
                assert isinstance(errors_list, list)
                errors_list.append(f"Failed to delete reference {ref_key}: {str(e)}")
                self.logger.error(f"Failed to delete reference {ref_key}: {e}")

        # Update reference deletion count
        references_deleted = result["references_deleted"]
        assert isinstance(references_deleted, int)
        result["references_deleted"] = references_deleted - references_kept

        # Clear any cached references for this prefix
        if references:
            try:
                self.cache.evict(bucket, prefix.rstrip("/") if prefix else "")
            except Exception as e:
                self.logger.debug(f"Could not clear cache for {bucket}/{prefix}: {e}")

        duration = (self.clock.now() - start_time).total_seconds()
        self.logger.info(
            "Recursive delete complete",
            bucket=bucket,
            prefix=prefix,
            deleted=result["deleted_count"],
            failed=result["failed_count"],
            duration=duration,
        )
        self.metrics.timing("deltaglider.delete_recursive.duration", duration)
        self.metrics.increment("deltaglider.delete_recursive.completed")

        return result
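A sketch of how the new delete methods might be driven directly against a `DeltaService` (import path assumed from the relative imports seen earlier in this diff; keys are illustrative):

```python
from deltaglider.core import ObjectKey

result = service.delete(ObjectKey(bucket="releases", key="v1.2/app-1.2.3.zip.delta"))
# e.g. {"deleted": True, "type": "delta", "warnings": [], ...}
for warning in result["warnings"]:
    print("warning:", warning)

summary = service.delete_recursive("releases", "v1.2/")
print(summary["deleted_count"], "deleted,", summary["failed_count"], "failed")
```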
@@ -15,6 +15,26 @@ class CachePort(Protocol):
        """Check if reference exists and matches SHA."""
        ...

    def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
        """Get cached reference with atomic SHA validation.

        This method MUST be used instead of ref_path() to prevent TOCTOU attacks.
        It validates the SHA256 hash at the time of use, not just at cache check time.

        Args:
            bucket: S3 bucket name
            prefix: Prefix/deltaspace within bucket
            expected_sha: Expected SHA256 hash of the file

        Returns:
            Path to the validated cached file

        Raises:
            CacheMissError: If cached file doesn't exist
            CacheCorruptionError: If SHA doesn't match (file corrupted or tampered)
        """
        ...

    def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
        """Cache reference file."""
        ...

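One way a filesystem cache adapter could satisfy the new protocol method, hashing at the moment of use rather than at check time; the error import path and the reuse of `ref_path()` are assumptions, not taken from this diff:

```python
import hashlib
from pathlib import Path

from deltaglider.core.errors import CacheCorruptionError, CacheMissError  # path assumed

def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
    path = self.ref_path(bucket, prefix)
    if not path.exists():
        raise CacheMissError(f"No cached reference for {bucket}/{prefix}")
    actual_sha = hashlib.sha256(path.read_bytes()).hexdigest()
    if actual_sha != expected_sha:
        raise CacheCorruptionError(f"SHA mismatch for {path}")
    return path
```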
152
src/deltaglider/response_builders.py
Normal file
@@ -0,0 +1,152 @@
"""Type-safe response builders using TypedDicts for internal type safety.

This module provides builder functions that construct boto3-compatible responses
with full compile-time type validation using TypedDicts. At runtime, TypedDicts
are plain dicts, so there's no conversion overhead.

Benefits:
- Field name typos caught by mypy (e.g., "HTTPStatusCode" → "HttpStatusCode")
- Wrong types caught by mypy (e.g., string instead of int)
- Missing required fields caught by mypy
- Extra unknown fields caught by mypy
"""

from typing import Any

from .types import (
    CommonPrefix,
    DeleteObjectResponse,
    GetObjectResponse,
    ListObjectsV2Response,
    PutObjectResponse,
    ResponseMetadata,
    S3Object,
)


def build_response_metadata(status_code: int = 200) -> ResponseMetadata:
    """Build ResponseMetadata with full type safety via TypedDict.

    TypedDict is a dict at runtime - no conversion needed!
    mypy validates all fields match ResponseMetadata TypedDict.
    Uses our types.py TypedDict which has proper NotRequired fields.
    """
    # Build as TypedDict - mypy validates field names and types!
    metadata: ResponseMetadata = {
        "HTTPStatusCode": status_code,
        # All other fields are NotRequired - can be omitted!
    }
    return metadata  # Returns dict at runtime, ResponseMetadata type at compile-time


def build_put_response(
    etag: str,
    *,
    version_id: str | None = None,
    deltaglider_info: dict[str, Any] | None = None,
) -> PutObjectResponse:
    """Build PutObjectResponse with full type safety via TypedDict.

    Uses our types.py TypedDict which has proper NotRequired fields.
    mypy validates all field names, types, and structure.
    """
    # Build as TypedDict - mypy catches typos and type errors!
    response: PutObjectResponse = {
        "ETag": etag,
        "ResponseMetadata": build_response_metadata(),
    }

    if version_id:
        response["VersionId"] = version_id

    # DeltaGlider extension - add as Any field
    if deltaglider_info:
        response["DeltaGliderInfo"] = deltaglider_info  # type: ignore[typeddict-item]

    return response  # Returns dict at runtime, PutObjectResponse type at compile-time


def build_get_response(
    body: Any,
    content_length: int,
    etag: str,
    metadata: dict[str, Any],
) -> GetObjectResponse:
    """Build GetObjectResponse with full type safety via TypedDict.

    Uses our types.py TypedDict which has proper NotRequired fields.
    mypy validates all field names, types, and structure.
    """
    # Build as TypedDict - mypy catches typos and type errors!
    response: GetObjectResponse = {
        "Body": body,
        "ContentLength": content_length,
        "ETag": etag,
        "Metadata": metadata,
        "ResponseMetadata": build_response_metadata(),
    }
    return response  # Returns dict at runtime, GetObjectResponse type at compile-time


def build_list_objects_response(
    bucket: str,
    prefix: str,
    delimiter: str,
    max_keys: int,
    contents: list[S3Object],
    common_prefixes: list[CommonPrefix] | None,
    is_truncated: bool,
    next_continuation_token: str | None,
    continuation_token: str | None,
) -> ListObjectsV2Response:
    """Build ListObjectsV2Response with full type safety via TypedDict.

    Uses our types.py TypedDict which has proper NotRequired fields.
    mypy validates all field names, types, and structure.
    """
    # Build as TypedDict - mypy catches typos and type errors!
    response: ListObjectsV2Response = {
        "IsTruncated": is_truncated,
        "Contents": contents,
        "Name": bucket,
        "Prefix": prefix,
        "Delimiter": delimiter,
        "MaxKeys": max_keys,
        "KeyCount": len(contents),
        "ResponseMetadata": build_response_metadata(),
    }

    # Add optional fields
    if common_prefixes:
        response["CommonPrefixes"] = common_prefixes

    if next_continuation_token:
        response["NextContinuationToken"] = next_continuation_token

    if continuation_token:
        response["ContinuationToken"] = continuation_token

    return response  # Returns dict at runtime, ListObjectsV2Response type at compile-time


def build_delete_response(
    delete_marker: bool = False,
    status_code: int = 204,
    deltaglider_info: dict[str, Any] | None = None,
) -> DeleteObjectResponse:
    """Build DeleteObjectResponse with full type safety via TypedDict.

    Uses our types.py TypedDict which has proper NotRequired fields.
    mypy validates all field names, types, and structure.
    """
    # Build as TypedDict - mypy catches typos and type errors!
    response: DeleteObjectResponse = {
        "DeleteMarker": delete_marker,
        "ResponseMetadata": build_response_metadata(status_code),
    }

    # DeltaGlider extension
    if deltaglider_info:
        response["DeltaGliderInfo"] = deltaglider_info  # type: ignore[typeddict-item]

    return response  # Returns dict at runtime, DeleteObjectResponse type at compile-time
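A small sketch of the builders in use; the point is that mypy checks the literal keys against the TypedDicts while the returned value stays a plain dict:

```python
from deltaglider.response_builders import build_put_response

resp = build_put_response(etag='"abc123"', deltaglider_info={"is_delta": True})
print(resp["ETag"], resp["ResponseMetadata"]["HTTPStatusCode"])

# Misspelling a field when building by hand (e.g. {"HttpStatusCode": 200}) would be
# rejected by mypy against ResponseMetadata - that is what these builders guarantee.
```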
355
src/deltaglider/types.py
Normal file
355
src/deltaglider/types.py
Normal file
@@ -0,0 +1,355 @@
|
||||
"""Type definitions for boto3-compatible responses.
|
||||
|
||||
These TypedDict definitions provide type hints for DeltaGlider's boto3-compatible
|
||||
responses. All methods return plain `dict[str, Any]` at runtime for maximum
|
||||
flexibility and boto3 compatibility.
|
||||
|
||||
## Basic Usage (Recommended)
|
||||
|
||||
Use DeltaGlider with simple dict access - no type imports needed:
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Returns plain dict - 100% boto3 compatible
|
||||
response = client.put_object(Bucket='my-bucket', Key='file.zip', Body=data)
|
||||
print(response['ETag'])
|
||||
|
||||
# List objects with dict access
|
||||
listing = client.list_objects(Bucket='my-bucket')
|
||||
for obj in listing['Contents']:
|
||||
print(f"{obj['Key']}: {obj['Size']} bytes")
|
||||
```
|
||||
|
||||
## Optional Type Hints
|
||||
|
||||
For IDE autocomplete and type checking, you can use our convenience TypedDicts:
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
from deltaglider.types import PutObjectResponse, ListObjectsV2Response
|
||||
|
||||
client = create_client()
|
||||
response: PutObjectResponse = client.put_object(...) # IDE autocomplete
|
||||
listing: ListObjectsV2Response = client.list_objects(...)
|
||||
```
|
||||
|
||||
## Advanced: boto3-stubs Integration
|
||||
|
||||
For strictest type checking (requires boto3-stubs installation):
|
||||
|
||||
```bash
|
||||
pip install boto3-stubs[s3]
|
||||
```
|
||||
|
||||
```python
|
||||
from mypy_boto3_s3.type_defs import PutObjectOutputTypeDef
|
||||
response: PutObjectOutputTypeDef = client.put_object(...)
|
||||
```
|
||||
|
||||
**Note**: boto3-stubs TypedDefs are very strict and require ALL optional fields.
|
||||
DeltaGlider returns partial dicts for better boto3 compatibility, so boto3-stubs
|
||||
types may show false positive errors. Use `dict[str, Any]` or our TypedDicts instead.
|
||||
|
||||
## Design Philosophy
|
||||
|
||||
DeltaGlider returns `dict[str, Any]` from all boto3-compatible methods because:
|
||||
1. **Flexibility**: boto3 responses vary by service and operation
|
||||
2. **Compatibility**: Exact match with boto3 runtime behavior
|
||||
3. **Simplicity**: No complex type dependencies for users
|
||||
4. **Optional Typing**: Users choose their preferred level of type safety
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any, Literal, NotRequired, TypedDict
|
||||
|
||||
# ============================================================================
|
||||
# S3 Object Types
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class S3Object(TypedDict):
|
||||
"""An S3 object returned in list operations.
|
||||
|
||||
Compatible with boto3's S3.Client.list_objects_v2() response Contents.
|
||||
"""
|
||||
|
||||
Key: str
|
||||
Size: int
|
||||
LastModified: datetime
|
||||
ETag: NotRequired[str]
|
||||
StorageClass: NotRequired[str]
|
||||
Owner: NotRequired[dict[str, str]]
|
||||
Metadata: NotRequired[dict[str, str]]
|
||||
|
||||
|
||||
class CommonPrefix(TypedDict):
|
||||
"""A common prefix (directory) in S3 listing.
|
||||
|
||||
Compatible with boto3's S3.Client.list_objects_v2() response CommonPrefixes.
|
||||
"""
|
||||
|
||||
Prefix: str
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Response Metadata (used in all responses)
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class ResponseMetadata(TypedDict):
|
||||
"""Metadata about the API response.
|
||||
|
||||
Compatible with all boto3 responses.
|
||||
"""
|
||||
|
||||
RequestId: NotRequired[str]
|
||||
HostId: NotRequired[str]
|
||||
HTTPStatusCode: int
|
||||
HTTPHeaders: NotRequired[dict[str, str]]
|
||||
RetryAttempts: NotRequired[int]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# List Operations Response Types
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class ListObjectsV2Response(TypedDict):
|
||||
"""Response from list_objects_v2 operation.
|
||||
|
||||
100% compatible with boto3's S3.Client.list_objects_v2() response.
|
||||
|
||||
Example:
|
||||
```python
|
||||
client = create_client()
|
||||
response: ListObjectsV2Response = client.list_objects(
|
||||
Bucket='my-bucket',
|
||||
Prefix='path/',
|
||||
Delimiter='/'
|
||||
)
|
||||
|
||||
for obj in response['Contents']:
|
||||
print(f"{obj['Key']}: {obj['Size']} bytes")
|
||||
|
||||
for prefix in response.get('CommonPrefixes', []):
|
||||
print(f"Directory: {prefix['Prefix']}")
|
||||
```
|
||||
"""
|
||||
|
||||
Contents: list[S3Object]
|
||||
Name: NotRequired[str] # Bucket name
|
||||
Prefix: NotRequired[str]
|
||||
Delimiter: NotRequired[str]
|
||||
MaxKeys: NotRequired[int]
|
||||
CommonPrefixes: NotRequired[list[CommonPrefix]]
|
||||
EncodingType: NotRequired[str]
|
||||
KeyCount: NotRequired[int]
|
||||
ContinuationToken: NotRequired[str]
|
||||
NextContinuationToken: NotRequired[str]
|
||||
StartAfter: NotRequired[str]
|
||||
IsTruncated: NotRequired[bool]
|
||||
ResponseMetadata: NotRequired[ResponseMetadata]
|
||||
|
||||
|
||||
# ============================================================================
# Put/Get/Delete Response Types
# ============================================================================


class PutObjectResponse(TypedDict):
    """Response from put_object operation.

    Compatible with boto3's S3.Client.put_object() response.
    """

    ETag: str
    VersionId: NotRequired[str]
    ServerSideEncryption: NotRequired[str]
    ResponseMetadata: NotRequired[ResponseMetadata]


class GetObjectResponse(TypedDict):
    """Response from get_object operation.

    Compatible with boto3's S3.Client.get_object() response.
    """

    Body: Any  # StreamingBody in boto3, bytes in DeltaGlider
    ContentLength: int
    ContentType: NotRequired[str]
    ETag: NotRequired[str]
    LastModified: NotRequired[datetime]
    Metadata: NotRequired[dict[str, str]]
    VersionId: NotRequired[str]
    StorageClass: NotRequired[str]
    ResponseMetadata: NotRequired[ResponseMetadata]


class DeleteObjectResponse(TypedDict):
    """Response from delete_object operation.

    Compatible with boto3's S3.Client.delete_object() response.
    """

    DeleteMarker: NotRequired[bool]
    VersionId: NotRequired[str]
    ResponseMetadata: NotRequired[ResponseMetadata]


class DeletedObject(TypedDict):
    """A successfully deleted object.

    Compatible with boto3's S3.Client.delete_objects() response Deleted.
    """

    Key: str
    VersionId: NotRequired[str]
    DeleteMarker: NotRequired[bool]
    DeleteMarkerVersionId: NotRequired[str]


class DeleteError(TypedDict):
    """An error that occurred during deletion.

    Compatible with boto3's S3.Client.delete_objects() response Errors.
    """

    Key: str
    Code: str
    Message: str
    VersionId: NotRequired[str]


class DeleteObjectsResponse(TypedDict):
    """Response from delete_objects operation.

    Compatible with boto3's S3.Client.delete_objects() response.
    """

    Deleted: NotRequired[list[DeletedObject]]
    Errors: NotRequired[list[DeleteError]]
    ResponseMetadata: NotRequired[ResponseMetadata]

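Because Deleted and Errors are both optional in DeleteObjectsResponse, callers should read either list defensively. A small sketch mirroring the delete_objects shape exercised by the tests later in this diff:

```python
# Sketch: inspecting a batch delete result; Deleted and Errors may each be absent.
from deltaglider import create_client

client = create_client()
response = client.delete_objects(
    Bucket="my-bucket",
    Delete={"Objects": [{"Key": "old-v1.zip"}, {"Key": "old-v2.zip"}]},
)
for deleted in response.get("Deleted", []):
    print("removed", deleted["Key"])
for err in response.get("Errors", []):
    print(f"failed {err['Key']}: {err['Code']} - {err['Message']}")
```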
# ============================================================================
# Head Object Response
# ============================================================================


class HeadObjectResponse(TypedDict):
    """Response from head_object operation.

    Compatible with boto3's S3.Client.head_object() response.
    """

    ContentLength: int
    ContentType: NotRequired[str]
    ETag: NotRequired[str]
    LastModified: NotRequired[datetime]
    Metadata: NotRequired[dict[str, str]]
    VersionId: NotRequired[str]
    StorageClass: NotRequired[str]
    ResponseMetadata: NotRequired[ResponseMetadata]

# ============================================================================
# Bucket Operations
# ============================================================================


class Bucket(TypedDict):
    """An S3 bucket.

    Compatible with boto3's S3.Client.list_buckets() response Buckets.
    """

    Name: str
    CreationDate: datetime


class ListBucketsResponse(TypedDict):
    """Response from list_buckets operation.

    Compatible with boto3's S3.Client.list_buckets() response.
    """

    Buckets: list[Bucket]
    Owner: NotRequired[dict[str, str]]
    ResponseMetadata: NotRequired[ResponseMetadata]


class CreateBucketResponse(TypedDict):
    """Response from create_bucket operation.

    Compatible with boto3's S3.Client.create_bucket() response.
    """

    Location: NotRequired[str]
    ResponseMetadata: NotRequired[ResponseMetadata]

# ============================================================================
# Multipart Upload Types
# ============================================================================


class CompletedPart(TypedDict):
    """A completed part in a multipart upload."""

    PartNumber: int
    ETag: str


class CompleteMultipartUploadResponse(TypedDict):
    """Response from complete_multipart_upload operation."""

    Location: NotRequired[str]
    Bucket: NotRequired[str]
    Key: NotRequired[str]
    ETag: NotRequired[str]
    VersionId: NotRequired[str]
    ResponseMetadata: NotRequired[ResponseMetadata]

# ============================================================================
# Copy Operations
# ============================================================================


class CopyObjectResponse(TypedDict):
    """Response from copy_object operation.

    Compatible with boto3's S3.Client.copy_object() response.
    """

    CopyObjectResult: NotRequired[dict[str, Any]]
    ETag: NotRequired[str]
    LastModified: NotRequired[datetime]
    VersionId: NotRequired[str]
    ResponseMetadata: NotRequired[ResponseMetadata]

# ============================================================================
# Type Aliases for Convenience
# ============================================================================

# Common parameter types
BucketName = str
ObjectKey = str
Prefix = str
Delimiter = str

# Storage class options
StorageClass = Literal[
    "STANDARD",
    "REDUCED_REDUNDANCY",
    "STANDARD_IA",
    "ONEZONE_IA",
    "INTELLIGENT_TIERING",
    "GLACIER",
    "DEEP_ARCHIVE",
    "GLACIER_IR",
]
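The aliases give callers readable annotations, and StorageClass enumerates the classes S3 accepts. A hedged sketch of how they might be used; upload_release is a hypothetical helper, and passing StorageClass through put_object is assumed to behave as in boto3:

```python
# Sketch only: upload_release is a hypothetical helper, and forwarding StorageClass
# through put_object is assumed to work as it does in boto3.
def upload_release(
    client,
    bucket: BucketName,
    key: ObjectKey,
    data: bytes,
    storage_class: StorageClass = "STANDARD",
) -> PutObjectResponse:
    return client.put_object(
        Bucket=bucket,
        Key=key,
        Body=data,
        StorageClass=storage_class,
    )
```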
@@ -8,7 +8,7 @@ from unittest.mock import Mock
|
||||
import pytest
|
||||
|
||||
from deltaglider.adapters import (
|
||||
FsCacheAdapter,
|
||||
ContentAddressedCache,
|
||||
NoopMetricsAdapter,
|
||||
Sha256Adapter,
|
||||
StdLoggerAdapter,
|
||||
@@ -59,9 +59,9 @@ def real_hasher():
|
||||
|
||||
@pytest.fixture
|
||||
def cache_adapter(temp_dir, real_hasher):
|
||||
"""Create filesystem cache adapter."""
|
||||
"""Create content-addressed storage cache adapter."""
|
||||
cache_dir = temp_dir / "cache"
|
||||
return FsCacheAdapter(cache_dir, real_hasher)
|
||||
return ContentAddressedCache(cache_dir, real_hasher)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
||||
@@ -15,10 +15,19 @@ from deltaglider.app.cli.main import cli
|
||||
def extract_json_from_cli_output(output: str) -> dict:
|
||||
"""Extract JSON from CLI output that may contain log messages."""
|
||||
lines = output.split("\n")
|
||||
json_start = next(i for i, line in enumerate(lines) if line.strip().startswith("{"))
|
||||
json_end = next(i for i in range(json_start, len(lines)) if lines[i].strip() == "}") + 1
|
||||
json_text = "\n".join(lines[json_start:json_end])
|
||||
return json.loads(json_text)
|
||||
for i, line in enumerate(lines):
|
||||
if line.strip().startswith("{"):
|
||||
json_start = i
|
||||
json_end = (
|
||||
next(
|
||||
(j for j in range(json_start, len(lines)) if lines[j].strip() == "}"),
|
||||
len(lines) - 1,
|
||||
)
|
||||
+ 1
|
||||
)
|
||||
json_text = "\n".join(lines[json_start:json_end])
|
||||
return json.loads(json_text)
|
||||
raise ValueError("No JSON found in CLI output")
|
||||
|
||||
|
||||
@pytest.mark.e2e
|
||||
@@ -72,34 +81,35 @@ class TestLocalStackE2E:
|
||||
file2.write_text("Plugin version 1.0.1 content with minor changes")
|
||||
|
||||
# Upload first file (becomes reference)
|
||||
result = runner.invoke(cli, ["put", str(file1), f"s3://{test_bucket}/plugins/"])
|
||||
result = runner.invoke(cli, ["cp", str(file1), f"s3://{test_bucket}/plugins/"])
|
||||
assert result.exit_code == 0
|
||||
output1 = extract_json_from_cli_output(result.output)
|
||||
assert output1["operation"] == "create_reference"
|
||||
assert output1["key"] == "plugins/reference.bin"
|
||||
assert "reference" in result.output.lower() or "upload:" in result.output
|
||||
|
||||
# Verify reference was created
|
||||
objects = s3_client.list_objects_v2(Bucket=test_bucket, Prefix="plugins/")
|
||||
# Verify reference was created (deltaspace is root, files are at root level)
|
||||
objects = s3_client.list_objects_v2(Bucket=test_bucket)
|
||||
assert "Contents" in objects
|
||||
keys = [obj["Key"] for obj in objects["Contents"]]
|
||||
assert "plugins/reference.bin" in keys
|
||||
assert "plugins/plugin-v1.0.0.zip.delta" in keys
|
||||
# Files are stored at root level: reference.bin and plugin-v1.0.0.zip.delta
|
||||
assert "reference.bin" in keys
|
||||
assert "plugin-v1.0.0.zip.delta" in keys
|
||||
|
||||
# Upload second file (creates delta)
|
||||
result = runner.invoke(cli, ["put", str(file2), f"s3://{test_bucket}/plugins/"])
|
||||
result = runner.invoke(cli, ["cp", str(file2), f"s3://{test_bucket}/plugins/"])
|
||||
assert result.exit_code == 0
|
||||
output2 = extract_json_from_cli_output(result.output)
|
||||
assert output2["operation"] == "create_delta"
|
||||
assert output2["key"] == "plugins/plugin-v1.0.1.zip.delta"
|
||||
assert "delta_ratio" in output2
|
||||
assert "upload:" in result.output
|
||||
|
||||
# Verify delta was created
|
||||
objects = s3_client.list_objects_v2(Bucket=test_bucket)
|
||||
keys = [obj["Key"] for obj in objects["Contents"]]
|
||||
assert "plugin-v1.0.1.zip.delta" in keys
|
||||
|
||||
# Download and verify second file
|
||||
output_file = tmpdir / "downloaded.zip"
|
||||
result = runner.invoke(
|
||||
cli,
|
||||
[
|
||||
"get",
|
||||
f"s3://{test_bucket}/plugins/plugin-v1.0.1.zip.delta",
|
||||
"-o",
|
||||
"cp",
|
||||
f"s3://{test_bucket}/plugin-v1.0.1.zip.delta",
|
||||
str(output_file),
|
||||
],
|
||||
)
|
||||
@@ -109,41 +119,42 @@ class TestLocalStackE2E:
|
||||
# Verify integrity
|
||||
result = runner.invoke(
|
||||
cli,
|
||||
["verify", f"s3://{test_bucket}/plugins/plugin-v1.0.1.zip.delta"],
|
||||
["verify", f"s3://{test_bucket}/plugin-v1.0.1.zip.delta"],
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
verify_output = extract_json_from_cli_output(result.output)
|
||||
assert verify_output["valid"] is True
|
||||
|
||||
def test_multiple_deltaspaces(self, test_bucket, s3_client):
|
||||
"""Test multiple deltaspace directories with separate references."""
|
||||
"""Test shared deltaspace with multiple files."""
|
||||
runner = CliRunner()
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Create test files for different deltaspaces
|
||||
# Create test files for the same deltaspace
|
||||
file_a1 = tmpdir / "app-a-v1.zip"
|
||||
file_a1.write_text("Application A version 1")
|
||||
|
||||
file_b1 = tmpdir / "app-b-v1.zip"
|
||||
file_b1.write_text("Application B version 1")
|
||||
|
||||
# Upload to different deltaspaces
|
||||
result = runner.invoke(cli, ["put", str(file_a1), f"s3://{test_bucket}/apps/app-a/"])
|
||||
# Upload to same deltaspace (apps/) with different target paths
|
||||
result = runner.invoke(cli, ["cp", str(file_a1), f"s3://{test_bucket}/apps/app-a/"])
|
||||
assert result.exit_code == 0
|
||||
|
||||
result = runner.invoke(cli, ["put", str(file_b1), f"s3://{test_bucket}/apps/app-b/"])
|
||||
result = runner.invoke(cli, ["cp", str(file_b1), f"s3://{test_bucket}/apps/app-b/"])
|
||||
assert result.exit_code == 0
|
||||
|
||||
# Verify each deltaspace has its own reference
|
||||
objects_a = s3_client.list_objects_v2(Bucket=test_bucket, Prefix="apps/app-a/")
|
||||
keys_a = [obj["Key"] for obj in objects_a["Contents"]]
|
||||
assert "apps/app-a/reference.bin" in keys_a
|
||||
|
||||
objects_b = s3_client.list_objects_v2(Bucket=test_bucket, Prefix="apps/app-b/")
|
||||
keys_b = [obj["Key"] for obj in objects_b["Contents"]]
|
||||
assert "apps/app-b/reference.bin" in keys_b
|
||||
# Verify deltaspace has reference (both files share apps/ deltaspace)
|
||||
objects = s3_client.list_objects_v2(Bucket=test_bucket, Prefix="apps/")
|
||||
assert "Contents" in objects
|
||||
keys = [obj["Key"] for obj in objects["Contents"]]
|
||||
# Should have: apps/reference.bin, apps/app-a-v1.zip.delta, apps/app-b-v1.zip.delta
|
||||
# Both files share the same deltaspace (apps/) so only one reference
|
||||
assert "apps/reference.bin" in keys
|
||||
assert "apps/app-a-v1.zip.delta" in keys
|
||||
assert "apps/app-b-v1.zip.delta" in keys
|
||||
|
||||
def test_large_delta_warning(self, test_bucket, s3_client):
|
||||
"""Test delta compression with different content."""
|
||||
@@ -160,14 +171,14 @@ class TestLocalStackE2E:
|
||||
file2.write_text("B" * 1000) # Completely different
|
||||
|
||||
# Upload first file
|
||||
result = runner.invoke(cli, ["put", str(file1), f"s3://{test_bucket}/test/"])
|
||||
result = runner.invoke(cli, ["cp", str(file1), f"s3://{test_bucket}/test/"])
|
||||
assert result.exit_code == 0
|
||||
|
||||
# Upload second file with low max-ratio
|
||||
result = runner.invoke(
|
||||
cli,
|
||||
[
|
||||
"put",
|
||||
"cp",
|
||||
str(file2),
|
||||
f"s3://{test_bucket}/test/",
|
||||
"--max-ratio",
|
||||
@@ -175,9 +186,11 @@ class TestLocalStackE2E:
|
||||
], # Very low threshold
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
# Even with completely different content, xdelta3 is efficient
|
||||
output = extract_json_from_cli_output(result.output)
|
||||
assert output["operation"] == "create_delta"
|
||||
# Delta ratio should be small even for different files (xdelta3 is very efficient)
|
||||
assert "delta_ratio" in output
|
||||
assert output["delta_ratio"] > 0.01 # Should exceed the very low threshold we set
|
||||
# Should still upload successfully even though delta exceeds threshold
|
||||
assert "upload:" in result.output
|
||||
|
||||
# Verify delta was created
|
||||
objects = s3_client.list_objects_v2(Bucket=test_bucket)
|
||||
assert "Contents" in objects
|
||||
keys = [obj["Key"] for obj in objects["Contents"]]
|
||||
assert "file2.zip.delta" in keys
|
||||
|
||||
237
tests/integration/test_bucket_management.py
Normal file
@@ -0,0 +1,237 @@
|
||||
"""Tests for bucket management APIs."""
|
||||
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from deltaglider.app.cli.main import create_service
|
||||
from deltaglider.client import DeltaGliderClient
|
||||
|
||||
|
||||
class TestBucketManagement:
|
||||
"""Test bucket creation, listing, and deletion."""
|
||||
|
||||
def test_create_bucket_success(self):
|
||||
"""Test creating a bucket successfully."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock boto3 client
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.create_bucket.return_value = {"Location": "/test-bucket"}
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.create_bucket(Bucket="test-bucket")
|
||||
|
||||
# Verify response
|
||||
assert response["Location"] == "/test-bucket"
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
|
||||
# Verify boto3 was called correctly
|
||||
mock_boto3_client.create_bucket.assert_called_once_with(Bucket="test-bucket")
|
||||
|
||||
def test_create_bucket_with_region(self):
|
||||
"""Test creating a bucket in a specific region."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock boto3 client
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.create_bucket.return_value = {
|
||||
"Location": "http://test-bucket.s3.us-west-2.amazonaws.com/"
|
||||
}
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.create_bucket(
|
||||
Bucket="test-bucket",
|
||||
CreateBucketConfiguration={"LocationConstraint": "us-west-2"},
|
||||
)
|
||||
|
||||
# Verify response
|
||||
assert "Location" in response
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
|
||||
# Verify boto3 was called with region config
|
||||
mock_boto3_client.create_bucket.assert_called_once_with(
|
||||
Bucket="test-bucket", CreateBucketConfiguration={"LocationConstraint": "us-west-2"}
|
||||
)
|
||||
|
||||
def test_create_bucket_already_exists(self):
|
||||
"""Test creating a bucket that already exists returns success."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock boto3 client to raise BucketAlreadyExists
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.create_bucket.side_effect = Exception("BucketAlreadyOwnedByYou")
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.create_bucket(Bucket="existing-bucket")
|
||||
|
||||
# Should return success (idempotent)
|
||||
assert response["Location"] == "/existing-bucket"
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
|
||||
def test_list_buckets_success(self):
|
||||
"""Test listing buckets."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock boto3 client
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.list_buckets.return_value = {
|
||||
"Buckets": [
|
||||
{"Name": "bucket1", "CreationDate": "2025-01-01T00:00:00Z"},
|
||||
{"Name": "bucket2", "CreationDate": "2025-01-02T00:00:00Z"},
|
||||
],
|
||||
"Owner": {"DisplayName": "test-user", "ID": "12345"},
|
||||
}
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.list_buckets()
|
||||
|
||||
# Verify response
|
||||
assert len(response["Buckets"]) == 2
|
||||
assert response["Buckets"][0]["Name"] == "bucket1"
|
||||
assert response["Buckets"][1]["Name"] == "bucket2"
|
||||
assert response["Owner"]["DisplayName"] == "test-user"
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
|
||||
def test_list_buckets_empty(self):
|
||||
"""Test listing buckets when none exist."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock boto3 client with empty result
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.list_buckets.return_value = {"Buckets": [], "Owner": {}}
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.list_buckets()
|
||||
|
||||
# Verify empty list
|
||||
assert response["Buckets"] == []
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
|
||||
def test_delete_bucket_success(self):
|
||||
"""Test deleting a bucket successfully."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock boto3 client
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.delete_bucket.return_value = None
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.delete_bucket(Bucket="test-bucket")
|
||||
|
||||
# Verify response
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 204
|
||||
|
||||
# Verify boto3 was called
|
||||
mock_boto3_client.delete_bucket.assert_called_once_with(Bucket="test-bucket")
|
||||
|
||||
def test_delete_bucket_not_found(self):
|
||||
"""Test deleting a bucket that doesn't exist returns success."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock boto3 client to raise NoSuchBucket
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.delete_bucket.side_effect = Exception("NoSuchBucket")
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.delete_bucket(Bucket="nonexistent-bucket")
|
||||
|
||||
# Should return success (idempotent)
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 204
|
||||
|
||||
def test_delete_bucket_not_empty_raises_error(self):
|
||||
"""Test deleting a non-empty bucket raises an error."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock boto3 client to raise BucketNotEmpty
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.delete_bucket.side_effect = Exception(
|
||||
"BucketNotEmpty: The bucket you tried to delete is not empty"
|
||||
)
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
|
||||
with pytest.raises(RuntimeError, match="Failed to delete bucket"):
|
||||
client.delete_bucket(Bucket="full-bucket")
|
||||
|
||||
def test_bucket_methods_without_boto3_client(self):
|
||||
"""Test that bucket methods raise NotImplementedError when storage doesn't support it."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Storage adapter without boto3 client (no 'client' attribute)
|
||||
delattr(mock_storage, "client")
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
|
||||
# All bucket methods should raise NotImplementedError
|
||||
with pytest.raises(NotImplementedError):
|
||||
client.create_bucket(Bucket="test")
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
client.delete_bucket(Bucket="test")
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
client.list_buckets()
|
||||
|
||||
def test_complete_bucket_lifecycle(self):
|
||||
"""Test complete bucket lifecycle: create, use, delete."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock boto3 client
|
||||
mock_boto3_client = Mock()
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
# Setup responses
|
||||
mock_boto3_client.create_bucket.return_value = {"Location": "/test-lifecycle"}
|
||||
mock_boto3_client.list_buckets.return_value = {
|
||||
"Buckets": [{"Name": "test-lifecycle", "CreationDate": "2025-01-01T00:00:00Z"}],
|
||||
"Owner": {},
|
||||
}
|
||||
mock_boto3_client.delete_bucket.return_value = None
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
|
||||
# 1. Create bucket
|
||||
create_response = client.create_bucket(Bucket="test-lifecycle")
|
||||
assert create_response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
|
||||
# 2. List buckets - verify it exists
|
||||
list_response = client.list_buckets()
|
||||
bucket_names = [b["Name"] for b in list_response["Buckets"]]
|
||||
assert "test-lifecycle" in bucket_names
|
||||
|
||||
# 3. Delete bucket
|
||||
delete_response = client.delete_bucket(Bucket="test-lifecycle")
|
||||
assert delete_response["ResponseMetadata"]["HTTPStatusCode"] == 204
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v"])
|
||||
481
tests/integration/test_client.py
Normal file
@@ -0,0 +1,481 @@
|
||||
"""Tests for the DeltaGlider client with boto3-compatible APIs."""
|
||||
|
||||
import hashlib
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from deltaglider import create_client
|
||||
from deltaglider.client import (
|
||||
BucketStats,
|
||||
CompressionEstimate,
|
||||
ObjectInfo,
|
||||
)
|
||||
|
||||
|
||||
class MockStorage:
|
||||
"""Mock storage for testing."""
|
||||
|
||||
def __init__(self):
|
||||
self.objects = {}
|
||||
|
||||
def head(self, key):
|
||||
"""Mock head operation."""
|
||||
from deltaglider.ports.storage import ObjectHead
|
||||
|
||||
if key in self.objects:
|
||||
obj = self.objects[key]
|
||||
return ObjectHead(
|
||||
key=key,
|
||||
size=obj["size"],
|
||||
etag=obj.get("etag", "mock-etag"),
|
||||
last_modified=obj.get("last_modified", datetime.now(UTC)),
|
||||
metadata=obj.get("metadata", {}),
|
||||
)
|
||||
return None
|
||||
|
||||
def list(self, prefix):
|
||||
"""Mock list operation for StoragePort interface."""
|
||||
for key, _obj in self.objects.items():
|
||||
if key.startswith(prefix):
|
||||
obj_head = self.head(key)
|
||||
if obj_head is not None:
|
||||
yield obj_head
|
||||
|
||||
def list_objects(self, bucket, prefix="", delimiter="", max_keys=1000, start_after=None):
|
||||
"""Mock list_objects operation for S3 features."""
|
||||
objects = []
|
||||
common_prefixes = set()
|
||||
|
||||
for key in sorted(self.objects.keys()):
|
||||
if not key.startswith(f"{bucket}/"):
|
||||
continue
|
||||
|
||||
obj_key = key[len(bucket) + 1 :] # Remove bucket prefix
|
||||
if prefix and not obj_key.startswith(prefix):
|
||||
continue
|
||||
|
||||
if delimiter:
|
||||
# Find common prefixes
|
||||
rel_key = obj_key[len(prefix) :] if prefix else obj_key
|
||||
delimiter_pos = rel_key.find(delimiter)
|
||||
if delimiter_pos > -1:
|
||||
common_prefix = prefix + rel_key[: delimiter_pos + 1]
|
||||
common_prefixes.add(common_prefix)
|
||||
continue
|
||||
|
||||
obj = self.objects[key]
|
||||
objects.append(
|
||||
{
|
||||
"key": obj_key,
|
||||
"size": obj["size"],
|
||||
"last_modified": obj.get("last_modified", "2025-01-01T00:00:00Z"),
|
||||
"etag": obj.get("etag", "mock-etag"),
|
||||
"storage_class": obj.get("storage_class", "STANDARD"),
|
||||
}
|
||||
)
|
||||
|
||||
if len(objects) >= max_keys:
|
||||
break
|
||||
|
||||
return {
|
||||
"objects": objects,
|
||||
"common_prefixes": sorted(list(common_prefixes)),
|
||||
"is_truncated": False,
|
||||
"next_continuation_token": None,
|
||||
"key_count": len(objects),
|
||||
}
|
||||
|
||||
def get(self, key):
|
||||
"""Mock get operation."""
|
||||
import io
|
||||
|
||||
if key in self.objects:
|
||||
return io.BytesIO(self.objects[key].get("data", b"mock data"))
|
||||
raise FileNotFoundError(f"Object not found: {key}")
|
||||
|
||||
def put(self, key, body, metadata, content_type="application/octet-stream"):
|
||||
"""Mock put operation."""
|
||||
from deltaglider.ports.storage import PutResult
|
||||
|
||||
if hasattr(body, "read"):
|
||||
data = body.read()
|
||||
elif isinstance(body, Path):
|
||||
data = body.read_bytes()
|
||||
else:
|
||||
data = body
|
||||
|
||||
self.objects[key] = {
|
||||
"data": data,
|
||||
"size": len(data),
|
||||
"metadata": metadata,
|
||||
"content_type": content_type,
|
||||
}
|
||||
|
||||
return PutResult(etag="mock-etag", version_id=None)
|
||||
|
||||
def delete(self, key):
|
||||
"""Mock delete operation."""
|
||||
if key in self.objects:
|
||||
del self.objects[key]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def client(tmp_path):
|
||||
"""Create a client with mocked storage."""
|
||||
client = create_client()
|
||||
|
||||
# Replace storage with mock
|
||||
mock_storage = MockStorage()
|
||||
client.service.storage = mock_storage
|
||||
|
||||
# Pre-populate some test objects
|
||||
mock_storage.objects = {
|
||||
"test-bucket/file1.txt": {"size": 100, "metadata": {}},
|
||||
"test-bucket/folder1/file2.txt": {"size": 200, "metadata": {}},
|
||||
"test-bucket/folder1/file3.txt": {"size": 300, "metadata": {}},
|
||||
"test-bucket/folder2/file4.txt": {"size": 400, "metadata": {}},
|
||||
"test-bucket/archive.zip.delta": {
|
||||
"size": 50,
|
||||
"metadata": {"file_size": "1000", "compression_ratio": "0.95"},
|
||||
},
|
||||
}
|
||||
|
||||
return client
|
||||
|
||||
|
||||
class TestCredentialHandling:
|
||||
"""Test AWS credential passing."""
|
||||
|
||||
def test_create_client_with_explicit_credentials(self, tmp_path):
|
||||
"""Test that credentials can be passed directly to create_client."""
|
||||
# This test verifies the API accepts credentials, not that they work
|
||||
# (we'd need a real S3 or LocalStack for that)
|
||||
client = create_client(
|
||||
aws_access_key_id="AKIAIOSFODNN7EXAMPLE",
|
||||
aws_secret_access_key="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
|
||||
region_name="us-west-2",
|
||||
)
|
||||
|
||||
# Verify the client was created
|
||||
assert client is not None
|
||||
assert client.service is not None
|
||||
|
||||
# Verify credentials were passed to the storage adapter's boto3 client
|
||||
# The storage adapter should have a client with these credentials
|
||||
storage = client.service.storage
|
||||
assert hasattr(storage, "client")
|
||||
|
||||
# Check that the boto3 client was configured with our credentials
|
||||
# Note: boto3 doesn't expose credentials directly, but we can verify
|
||||
# the client was created (if credentials were invalid, this would fail)
|
||||
assert storage.client is not None
|
||||
|
||||
def test_create_client_with_session_token(self, tmp_path):
|
||||
"""Test passing temporary credentials with session token."""
|
||||
client = create_client(
|
||||
aws_access_key_id="ASIAIOSFODNN7EXAMPLE",
|
||||
aws_secret_access_key="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
|
||||
aws_session_token="FwoGZXIvYXdzEBEaDH...",
|
||||
)
|
||||
|
||||
assert client is not None
|
||||
assert client.service.storage.client is not None
|
||||
|
||||
def test_create_client_without_credentials_uses_environment(self, tmp_path):
|
||||
"""Test that omitting credentials falls back to environment/IAM."""
|
||||
# This should use boto3's default credential chain
|
||||
client = create_client()
|
||||
|
||||
assert client is not None
|
||||
assert client.service.storage.client is not None
|
||||
|
||||
def test_create_client_with_endpoint_and_credentials(self, tmp_path):
|
||||
"""Test passing both endpoint URL and credentials."""
|
||||
client = create_client(
|
||||
endpoint_url="http://localhost:9000",
|
||||
aws_access_key_id="minioadmin",
|
||||
aws_secret_access_key="minioadmin",
|
||||
)
|
||||
|
||||
assert client is not None
|
||||
# Endpoint should be available
|
||||
assert client.endpoint_url == "http://localhost:9000"
|
||||
|
||||
|
||||
class TestBoto3Compatibility:
|
||||
"""Test boto3-compatible methods."""
|
||||
|
||||
def test_put_object_with_bytes(self, client):
|
||||
"""Test put_object with byte data."""
|
||||
response = client.put_object(Bucket="test-bucket", Key="test.txt", Body=b"Hello World")
|
||||
|
||||
assert "ETag" in response
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
|
||||
# Check object was stored
|
||||
obj = client.service.storage.objects["test-bucket/test.txt"]
|
||||
assert obj["data"] == b"Hello World"
|
||||
|
||||
def test_put_object_with_string(self, client):
|
||||
"""Test put_object with string data."""
|
||||
response = client.put_object(Bucket="test-bucket", Key="test2.txt", Body="Hello String")
|
||||
|
||||
assert "ETag" in response
|
||||
obj = client.service.storage.objects["test-bucket/test2.txt"]
|
||||
assert obj["data"] == b"Hello String"
|
||||
|
||||
def test_get_object(self, client):
|
||||
"""Test get_object retrieval."""
|
||||
# For this test, we'll bypass the DeltaGlider logic and test the client directly
|
||||
# Since the core DeltaGlider always looks for .delta files, we'll mock a .delta file
|
||||
import hashlib
|
||||
|
||||
content = b"Test Content"
|
||||
sha256 = hashlib.sha256(content).hexdigest()
|
||||
|
||||
# Add as a direct file (not delta)
|
||||
client.service.storage.objects["test-bucket/get-test.txt"] = {
|
||||
"data": content,
|
||||
"size": len(content),
|
||||
"metadata": {
|
||||
"file_sha256": sha256,
|
||||
"file_size": str(len(content)),
|
||||
"original_name": "get-test.txt",
|
||||
"compression": "none", # Mark as direct upload
|
||||
"tool": "deltaglider/0.2.0",
|
||||
},
|
||||
}
|
||||
|
||||
response = client.get_object(Bucket="test-bucket", Key="get-test.txt")
|
||||
|
||||
assert "Body" in response
|
||||
content = response["Body"].read()
|
||||
assert content == b"Test Content"
|
||||
|
||||
def test_get_object_regular_s3_file(self, client):
|
||||
"""Test get_object with regular S3 files (not uploaded via DeltaGlider)."""
|
||||
|
||||
content = b"Regular S3 File Content"
|
||||
|
||||
# Add as a regular S3 object WITHOUT DeltaGlider metadata
|
||||
client.service.storage.objects["test-bucket/regular-file.pdf"] = {
|
||||
"data": content,
|
||||
"size": len(content),
|
||||
"metadata": {}, # No DeltaGlider metadata
|
||||
}
|
||||
|
||||
# Should successfully download the regular S3 object
|
||||
response = client.get_object(Bucket="test-bucket", Key="regular-file.pdf")
|
||||
|
||||
assert "Body" in response
|
||||
downloaded_content = response["Body"].read()
|
||||
assert downloaded_content == content
|
||||
assert response["ContentLength"] == len(content)
|
||||
|
||||
def test_list_objects(self, client):
|
||||
"""Test list_objects with various options (boto3-compatible dict response)."""
|
||||
# List all objects (default: FetchMetadata=False)
|
||||
response = client.list_objects(Bucket="test-bucket")
|
||||
|
||||
# Response is now a boto3-compatible dict (not ListObjectsResponse)
|
||||
assert isinstance(response, dict)
|
||||
assert response["KeyCount"] > 0
|
||||
assert len(response["Contents"]) > 0
|
||||
|
||||
# Verify S3Object structure
|
||||
for obj in response["Contents"]:
|
||||
assert "Key" in obj
|
||||
assert "Size" in obj
|
||||
assert "LastModified" in obj
|
||||
assert "Metadata" in obj # DeltaGlider metadata
|
||||
|
||||
# Test with FetchMetadata=True (should only affect delta files)
|
||||
response_with_metadata = client.list_objects(Bucket="test-bucket", FetchMetadata=True)
|
||||
assert isinstance(response_with_metadata, dict)
|
||||
assert response_with_metadata["KeyCount"] > 0
|
||||
|
||||
def test_list_objects_with_delimiter(self, client):
|
||||
"""Test list_objects with delimiter for folder simulation (boto3-compatible dict response)."""
|
||||
response = client.list_objects(Bucket="test-bucket", Prefix="", Delimiter="/")
|
||||
|
||||
# Should have common prefixes for folders
|
||||
assert len(response.get("CommonPrefixes", [])) > 0
|
||||
assert {"Prefix": "folder1/"} in response["CommonPrefixes"]
|
||||
assert {"Prefix": "folder2/"} in response["CommonPrefixes"]
|
||||
|
||||
def test_delete_object(self, client):
|
||||
"""Test delete_object."""
|
||||
# Add object
|
||||
client.service.storage.objects["test-bucket/to-delete.txt"] = {"size": 10}
|
||||
|
||||
response = client.delete_object(Bucket="test-bucket", Key="to-delete.txt")
|
||||
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 204
|
||||
assert "test-bucket/to-delete.txt" not in client.service.storage.objects
|
||||
|
||||
def test_delete_object_with_delta_suffix_fallback(self, client):
|
||||
"""Test delete_object with automatic .delta suffix fallback."""
|
||||
# Add object with .delta suffix (as DeltaGlider stores it)
|
||||
client.service.storage.objects["test-bucket/file.zip.delta"] = {
|
||||
"size": 100,
|
||||
"metadata": {
|
||||
"original_name": "file.zip",
|
||||
"compression": "delta",
|
||||
},
|
||||
}
|
||||
|
||||
# Delete using original name (without .delta)
|
||||
response = client.delete_object(Bucket="test-bucket", Key="file.zip")
|
||||
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 204
|
||||
assert response["DeltaGliderInfo"]["Deleted"] is True
|
||||
assert "test-bucket/file.zip.delta" not in client.service.storage.objects
|
||||
|
||||
def test_delete_objects(self, client):
|
||||
"""Test batch delete."""
|
||||
# Add objects
|
||||
client.service.storage.objects["test-bucket/del1.txt"] = {"size": 10}
|
||||
client.service.storage.objects["test-bucket/del2.txt"] = {"size": 20}
|
||||
|
||||
response = client.delete_objects(
|
||||
Bucket="test-bucket",
|
||||
Delete={"Objects": [{"Key": "del1.txt"}, {"Key": "del2.txt"}]},
|
||||
)
|
||||
|
||||
assert len(response["Deleted"]) == 2
|
||||
assert "test-bucket/del1.txt" not in client.service.storage.objects
|
||||
|
||||
|
||||
class TestDeltaGliderFeatures:
|
||||
"""Test DeltaGlider-specific features."""
|
||||
|
||||
def test_compression_estimation_for_archive(self, client, tmp_path):
|
||||
"""Test compression estimation for archive files."""
|
||||
# Create a fake zip file
|
||||
test_file = tmp_path / "test.zip"
|
||||
test_file.write_bytes(b"PK\x03\x04" + b"0" * 1000)
|
||||
|
||||
estimate = client.estimate_compression(test_file, "test-bucket", "archives/")
|
||||
|
||||
assert isinstance(estimate, CompressionEstimate)
|
||||
assert estimate.should_use_delta is True
|
||||
assert estimate.original_size == test_file.stat().st_size
|
||||
|
||||
def test_compression_estimation_for_image(self, client, tmp_path):
|
||||
"""Test compression estimation for incompressible files."""
|
||||
test_file = tmp_path / "image.jpg"
|
||||
test_file.write_bytes(b"\xff\xd8\xff" + b"0" * 1000) # JPEG header
|
||||
|
||||
estimate = client.estimate_compression(test_file, "test-bucket", "images/")
|
||||
|
||||
assert estimate.should_use_delta is False
|
||||
assert estimate.estimated_ratio == 0.0
|
||||
|
||||
def test_find_similar_files(self, client):
|
||||
"""Test finding similar files for delta compression."""
|
||||
similar = client.find_similar_files("test-bucket", "folder1/", "file_v1.txt")
|
||||
|
||||
assert isinstance(similar, list)
|
||||
# Should find files in folder1
|
||||
assert any("folder1/" in item["Key"] for item in similar)
|
||||
|
||||
def test_upload_batch(self, client, tmp_path):
|
||||
"""Test batch upload functionality."""
|
||||
# Create test files
|
||||
files = []
|
||||
for i in range(3):
|
||||
f = tmp_path / f"batch{i}.txt"
|
||||
f.write_text(f"Content {i}")
|
||||
files.append(f)
|
||||
|
||||
results = client.upload_batch(files, "s3://test-bucket/batch/")
|
||||
|
||||
assert len(results) == 3
|
||||
for result in results:
|
||||
assert result.original_size > 0
|
||||
|
||||
def test_download_batch(self, client, tmp_path):
|
||||
"""Test batch download functionality."""
|
||||
# Add test objects with proper metadata
|
||||
for i in range(3):
|
||||
key = f"test-bucket/download/file{i}.txt"
|
||||
content = f"Content {i}".encode()
|
||||
client.service.storage.objects[key] = {
|
||||
"data": content,
|
||||
"size": len(content),
|
||||
"metadata": {
|
||||
"file_sha256": hashlib.sha256(content).hexdigest(),
|
||||
"file_size": str(len(content)),
|
||||
"compression": "none", # Mark as direct upload
|
||||
"tool": "deltaglider/0.2.0",
|
||||
},
|
||||
}
|
||||
|
||||
s3_urls = [f"s3://test-bucket/download/file{i}.txt" for i in range(3)]
|
||||
results = client.download_batch(s3_urls, tmp_path)
|
||||
|
||||
assert len(results) == 3
|
||||
for i, path in enumerate(results):
|
||||
assert path.exists()
|
||||
assert path.read_text() == f"Content {i}"
|
||||
|
||||
def test_get_object_info(self, client):
|
||||
"""Test getting detailed object information."""
|
||||
# Use the pre-populated delta object
|
||||
info = client.get_object_info("s3://test-bucket/archive.zip.delta")
|
||||
|
||||
assert isinstance(info, ObjectInfo)
|
||||
assert info.is_delta is True
|
||||
assert info.original_size == 1000
|
||||
assert info.compressed_size == 50
|
||||
assert info.compression_ratio == 0.95
|
||||
|
||||
def test_get_bucket_stats(self, client):
|
||||
"""Test getting bucket statistics."""
|
||||
# Test quick stats (default: detailed_stats=False)
|
||||
stats = client.get_bucket_stats("test-bucket")
|
||||
|
||||
assert isinstance(stats, BucketStats)
|
||||
assert stats.object_count > 0
|
||||
assert stats.total_size > 0
|
||||
assert stats.delta_objects >= 1 # We have archive.zip.delta
|
||||
|
||||
# Test with detailed_stats=True
|
||||
detailed_stats = client.get_bucket_stats("test-bucket", detailed_stats=True)
|
||||
assert isinstance(detailed_stats, BucketStats)
|
||||
assert detailed_stats.object_count == stats.object_count
|
||||
|
||||
def test_upload_chunked(self, client, tmp_path):
|
||||
"""Test chunked upload with progress callback."""
|
||||
# Create a test file
|
||||
test_file = tmp_path / "large.bin"
|
||||
test_file.write_bytes(b"X" * (10 * 1024)) # 10KB
|
||||
|
||||
progress_calls = []
|
||||
|
||||
def progress_callback(chunk_num, total_chunks, bytes_sent, total_bytes):
|
||||
progress_calls.append((chunk_num, total_chunks, bytes_sent, total_bytes))
|
||||
|
||||
result = client.upload_chunked(
|
||||
test_file,
|
||||
"s3://test-bucket/large.bin",
|
||||
chunk_size=3 * 1024, # 3KB chunks
|
||||
progress_callback=progress_callback,
|
||||
)
|
||||
|
||||
assert result.original_size == 10 * 1024
|
||||
assert len(progress_calls) > 0 # Progress was reported
|
||||
|
||||
def test_generate_presigned_url(self, client):
|
||||
"""Test presigned URL generation (placeholder)."""
|
||||
url = client.generate_presigned_url(
|
||||
ClientMethod="get_object",
|
||||
Params={"Bucket": "test-bucket", "Key": "file.txt"},
|
||||
ExpiresIn=3600,
|
||||
)
|
||||
|
||||
assert isinstance(url, str)
|
||||
assert "file.txt" in url
|
||||
assert "expires=3600" in url
|
||||
524
tests/integration/test_delete_objects_recursive.py
Normal file
@@ -0,0 +1,524 @@
|
||||
"""Comprehensive tests for DeltaGliderClient.delete_objects_recursive() method."""
|
||||
|
||||
from datetime import UTC, datetime
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from deltaglider import create_client
|
||||
|
||||
|
||||
class MockStorage:
|
||||
"""Mock storage for testing."""
|
||||
|
||||
def __init__(self):
|
||||
self.objects = {}
|
||||
self.delete_calls = []
|
||||
|
||||
def head(self, key):
|
||||
"""Mock head operation."""
|
||||
from deltaglider.ports.storage import ObjectHead
|
||||
|
||||
if key in self.objects:
|
||||
obj = self.objects[key]
|
||||
return ObjectHead(
|
||||
key=key,
|
||||
size=obj["size"],
|
||||
etag=obj.get("etag", "mock-etag"),
|
||||
last_modified=obj.get("last_modified", datetime.now(UTC)),
|
||||
metadata=obj.get("metadata", {}),
|
||||
)
|
||||
return None
|
||||
|
||||
def list(self, prefix):
|
||||
"""Mock list operation for StoragePort interface."""
|
||||
for key, _obj in self.objects.items():
|
||||
if key.startswith(prefix):
|
||||
obj_head = self.head(key)
|
||||
if obj_head is not None:
|
||||
yield obj_head
|
||||
|
||||
def delete(self, key):
|
||||
"""Mock delete operation."""
|
||||
self.delete_calls.append(key)
|
||||
if key in self.objects:
|
||||
del self.objects[key]
|
||||
return True
|
||||
return False
|
||||
|
||||
def get(self, key):
|
||||
"""Mock get operation."""
|
||||
if key in self.objects:
|
||||
return self.objects[key].get("content", b"mock-content")
|
||||
return None
|
||||
|
||||
def put(self, key, data, metadata=None):
|
||||
"""Mock put operation."""
|
||||
self.objects[key] = {
|
||||
"size": len(data),
|
||||
"content": data,
|
||||
"metadata": metadata or {},
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_storage():
|
||||
"""Create mock storage."""
|
||||
return MockStorage()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def client(tmp_path):
|
||||
"""Create DeltaGliderClient with mock storage."""
|
||||
# Use create_client to get a properly configured client
|
||||
client = create_client()
|
||||
|
||||
# Replace storage with mock
|
||||
mock_storage = MockStorage()
|
||||
client.service.storage = mock_storage
|
||||
|
||||
return client
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveBasicFunctionality:
|
||||
"""Test basic functionality of delete_objects_recursive."""
|
||||
|
||||
def test_delete_single_object_with_file_prefix(self, client):
|
||||
"""Test deleting a single object when prefix is a file (no trailing slash)."""
|
||||
# Setup: Add a regular file
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.txt")
|
||||
|
||||
# Verify response structure
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
assert "DeletedCount" in response
|
||||
assert "FailedCount" in response
|
||||
assert "DeltaGliderInfo" in response
|
||||
|
||||
# Verify DeltaGliderInfo structure
|
||||
info = response["DeltaGliderInfo"]
|
||||
assert "DeltasDeleted" in info
|
||||
assert "ReferencesDeleted" in info
|
||||
assert "DirectDeleted" in info
|
||||
assert "OtherDeleted" in info
|
||||
|
||||
def test_delete_directory_with_trailing_slash(self, client):
|
||||
"""Test deleting all objects under a prefix with trailing slash."""
|
||||
# Setup: Add multiple files under a prefix
|
||||
client.service.storage.objects["test-bucket/dir/file1.txt"] = {"size": 100}
|
||||
client.service.storage.objects["test-bucket/dir/file2.txt"] = {"size": 200}
|
||||
client.service.storage.objects["test-bucket/dir/sub/file3.txt"] = {"size": 300}
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="dir/")
|
||||
|
||||
# Verify
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
assert response["DeletedCount"] >= 0
|
||||
assert response["FailedCount"] == 0
|
||||
|
||||
def test_delete_empty_prefix_returns_zero_counts(self, client):
|
||||
"""Test deleting with empty prefix returns zero counts."""
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="")
|
||||
|
||||
# Verify
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
assert response["DeletedCount"] >= 0
|
||||
assert response["FailedCount"] == 0
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveDeltaSuffixHandling:
|
||||
"""Test delta suffix fallback logic."""
|
||||
|
||||
def test_delete_file_with_delta_suffix_fallback(self, client):
|
||||
"""Test that delete falls back to .delta suffix if original not found."""
|
||||
# Setup: Add file with .delta suffix
|
||||
client.service.storage.objects["test-bucket/archive.zip.delta"] = {
|
||||
"size": 500,
|
||||
"metadata": {"original_name": "archive.zip"},
|
||||
}
|
||||
|
||||
# Execute: Delete using original name (without .delta)
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="archive.zip")
|
||||
|
||||
# Verify
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
assert "test-bucket/archive.zip.delta" not in client.service.storage.objects
|
||||
|
||||
def test_delete_file_already_with_delta_suffix(self, client):
|
||||
"""Test deleting a file that already has .delta suffix."""
|
||||
# Setup
|
||||
client.service.storage.objects["test-bucket/file.zip.delta"] = {"size": 300}
|
||||
|
||||
# Execute: Delete using .delta suffix directly
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.zip.delta")
|
||||
|
||||
# Verify
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
|
||||
def test_delta_suffix_not_added_for_directory_prefix(self, client):
|
||||
"""Test that .delta suffix is not added when prefix ends with /."""
|
||||
# Setup
|
||||
client.service.storage.objects["test-bucket/dir/file.txt"] = {"size": 100}
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="dir/")
|
||||
|
||||
# Verify - should not attempt to delete "dir/.delta"
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveStatisticsAggregation:
|
||||
"""Test statistics aggregation from core service."""
|
||||
|
||||
def test_aggregates_deleted_count_from_service_and_single_deletes(self, client):
|
||||
"""Test that deleted counts are aggregated correctly."""
|
||||
# Setup: Mock service.delete_recursive to return specific counts
|
||||
mock_result = {
|
||||
"deleted_count": 5,
|
||||
"failed_count": 0,
|
||||
"deltas_deleted": 2,
|
||||
"references_deleted": 1,
|
||||
"direct_deleted": 2,
|
||||
"other_deleted": 0,
|
||||
}
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="test/")
|
||||
|
||||
# Verify aggregation
|
||||
assert response["DeletedCount"] == 5
|
||||
assert response["FailedCount"] == 0
|
||||
assert response["DeltaGliderInfo"]["DeltasDeleted"] == 2
|
||||
assert response["DeltaGliderInfo"]["ReferencesDeleted"] == 1
|
||||
assert response["DeltaGliderInfo"]["DirectDeleted"] == 2
|
||||
assert response["DeltaGliderInfo"]["OtherDeleted"] == 0
|
||||
|
||||
def test_aggregates_single_delete_counts_with_service_counts(self, client):
|
||||
"""Test that single file deletes are aggregated with service counts."""
|
||||
# Setup: Add file to trigger single delete path
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Mock service.delete_recursive to return additional counts
|
||||
mock_result = {
|
||||
"deleted_count": 3,
|
||||
"failed_count": 0,
|
||||
"deltas_deleted": 1,
|
||||
"references_deleted": 0,
|
||||
"direct_deleted": 2,
|
||||
"other_deleted": 0,
|
||||
}
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.txt")
|
||||
|
||||
# Verify that counts include both single delete and service delete
|
||||
assert response["DeletedCount"] >= 3 # At least service count
|
||||
assert response["DeltaGliderInfo"]["DeltasDeleted"] >= 1
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveErrorHandling:
|
||||
"""Test error handling and error aggregation."""
|
||||
|
||||
def test_single_delete_error_captured_in_errors_list(self, client):
|
||||
"""Test that errors from single deletes are captured."""
|
||||
# Setup: Add file
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Mock delete_with_delta_suffix to raise exception
|
||||
with patch("deltaglider.client.delete_with_delta_suffix") as mock_delete:
|
||||
mock_delete.side_effect = RuntimeError("Simulated delete error")
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.txt")
|
||||
|
||||
# Verify error captured
|
||||
assert response["FailedCount"] > 0
|
||||
assert "Errors" in response
|
||||
assert any("Simulated delete error" in err for err in response["Errors"])
|
||||
|
||||
def test_service_errors_propagated_in_response(self, client):
|
||||
"""Test that errors from service.delete_recursive are propagated."""
|
||||
# Mock service to return errors
|
||||
mock_result = {
|
||||
"deleted_count": 2,
|
||||
"failed_count": 1,
|
||||
"deltas_deleted": 2,
|
||||
"references_deleted": 0,
|
||||
"direct_deleted": 0,
|
||||
"other_deleted": 0,
|
||||
"errors": ["Error deleting object1", "Error deleting object2"],
|
||||
}
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="test/")
|
||||
|
||||
# Verify
|
||||
assert response["FailedCount"] == 1
|
||||
assert "Errors" in response
|
||||
assert "Error deleting object1" in response["Errors"]
|
||||
assert "Error deleting object2" in response["Errors"]
|
||||
|
||||
def test_combines_single_and_service_errors(self, client):
|
||||
"""Test that errors from both single deletes and service are combined."""
|
||||
# Setup
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Mock service to also return errors
|
||||
mock_result = {
|
||||
"deleted_count": 1,
|
||||
"failed_count": 1,
|
||||
"deltas_deleted": 0,
|
||||
"references_deleted": 0,
|
||||
"direct_deleted": 0,
|
||||
"other_deleted": 0,
|
||||
"errors": ["Service delete error"],
|
||||
}
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Mock delete_with_delta_suffix to raise exception
|
||||
with patch("deltaglider.client.delete_with_delta_suffix") as mock_delete:
|
||||
mock_delete.side_effect = RuntimeError("Single delete error")
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.txt")
|
||||
|
||||
# Verify both errors present
|
||||
assert "Errors" in response
|
||||
errors_str = " ".join(response["Errors"])
|
||||
assert "Single delete error" in errors_str
|
||||
assert "Service delete error" in errors_str
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveWarningsHandling:
|
||||
"""Test warning aggregation."""
|
||||
|
||||
def test_service_warnings_propagated_in_response(self, client):
|
||||
"""Test that warnings from service.delete_recursive are propagated."""
|
||||
# Mock service to return warnings
|
||||
mock_result = {
|
||||
"deleted_count": 3,
|
||||
"failed_count": 0,
|
||||
"deltas_deleted": 2,
|
||||
"references_deleted": 1,
|
||||
"direct_deleted": 0,
|
||||
"other_deleted": 0,
|
||||
"warnings": ["Reference deleted, 2 dependent deltas invalidated"],
|
||||
}
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="test/")
|
||||
|
||||
# Verify
|
||||
assert "Warnings" in response
|
||||
assert "Reference deleted, 2 dependent deltas invalidated" in response["Warnings"]
|
||||
|
||||
def test_single_delete_warnings_propagated(self, client):
|
||||
"""Test that warnings from single deletes are captured."""
|
||||
# Setup
|
||||
client.service.storage.objects["test-bucket/ref.bin"] = {"size": 100}
|
||||
|
||||
# Mock service
|
||||
mock_result = {
|
||||
"deleted_count": 0,
|
||||
"failed_count": 0,
|
||||
"deltas_deleted": 0,
|
||||
"references_deleted": 0,
|
||||
"direct_deleted": 0,
|
||||
"other_deleted": 0,
|
||||
}
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Mock delete_with_delta_suffix to return warnings
|
||||
with patch("deltaglider.client.delete_with_delta_suffix") as mock_delete:
|
||||
mock_delete.return_value = (
|
||||
"ref.bin",
|
||||
{
|
||||
"deleted": True,
|
||||
"type": "reference",
|
||||
"warnings": ["Warning from single delete"],
|
||||
},
|
||||
)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="ref.bin")
|
||||
|
||||
# Verify
|
||||
assert "Warnings" in response
|
||||
assert "Warning from single delete" in response["Warnings"]
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveSingleDeleteDetails:
    """Test SingleDeletes detail tracking."""

    def test_single_delete_details_included_for_file_prefix(self, client):
        """Test that SingleDeletes details are included when deleting file prefix."""
        # Setup
        client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}

        # Mock service
        mock_result = {
            "deleted_count": 0,
            "failed_count": 0,
            "deltas_deleted": 0,
            "references_deleted": 0,
            "direct_deleted": 0,
            "other_deleted": 0,
        }
        client.service.delete_recursive = Mock(return_value=mock_result)

        # Mock delete_with_delta_suffix
        with patch("deltaglider.client.delete_with_delta_suffix") as mock_delete:
            mock_delete.return_value = (
                "file.txt",
                {
                    "deleted": True,
                    "type": "direct",
                    "dependent_deltas": 0,
                    "warnings": [],
                },
            )

            # Execute
            response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.txt")

            # Verify
            assert "SingleDeletes" in response["DeltaGliderInfo"]
            single_deletes = response["DeltaGliderInfo"]["SingleDeletes"]
            assert len(single_deletes) > 0
            assert single_deletes[0]["Key"] == "file.txt"
            assert single_deletes[0]["Type"] == "direct"
            assert "DependentDeltas" in single_deletes[0]
            assert "Warnings" in single_deletes[0]

    def test_single_delete_includes_stored_key_when_different(self, client):
        """Test that StoredKey is included when actual key differs from requested."""
        # Setup
        client.service.storage.objects["test-bucket/file.zip.delta"] = {"size": 200}

        # Mock delete_with_delta_suffix to return different key
        from deltaglider import client_delete_helpers

        original_delete = client_delete_helpers.delete_with_delta_suffix

        def mock_delete(service, bucket, key):
            actual_key = "file.zip.delta" if key == "file.zip" else key
            return (
                actual_key,
                {
                    "deleted": True,
                    "type": "delta",
                    "dependent_deltas": 0,
                    "warnings": [],
                },
            )

        client_delete_helpers.delete_with_delta_suffix = mock_delete

        # Mock service
        mock_result = {
            "deleted_count": 0,
            "failed_count": 0,
            "deltas_deleted": 0,
            "references_deleted": 0,
            "direct_deleted": 0,
            "other_deleted": 0,
        }
        client.service.delete_recursive = Mock(return_value=mock_result)

        try:
            # Execute
            response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.zip")

            # Verify
            assert "SingleDeletes" in response["DeltaGliderInfo"]
            single_deletes = response["DeltaGliderInfo"]["SingleDeletes"]
            if len(single_deletes) > 0:
                # If actual key differs, StoredKey should be present
                detail = single_deletes[0]
                if detail["Key"] != "file.zip.delta":
                    assert "StoredKey" in detail
        finally:
            client_delete_helpers.delete_with_delta_suffix = original_delete


class TestDeleteObjectsRecursiveEdgeCases:
    """Test edge cases and boundary conditions."""

    def test_nonexistent_prefix_returns_zero_counts(self, client):
        """Test deleting nonexistent prefix returns zero counts."""
        # Execute
        response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="nonexistent/path/")

        # Verify
        assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
        assert response["DeletedCount"] >= 0
        assert response["FailedCount"] == 0

    def test_duplicate_candidates_handled_correctly(self, client):
        """Test that duplicate delete candidates are handled correctly."""
        # Setup: This tests the seen_candidates logic
        client.service.storage.objects["test-bucket/file.delta"] = {"size": 100}

        # Execute: Should not attempt to delete "file.delta" twice
        response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.delta")

        # Verify no errors
        assert response["ResponseMetadata"]["HTTPStatusCode"] == 200

    def test_unknown_result_type_categorized_as_other(self, client):
        """Test that unknown result types are categorized as 'other'."""
        # Setup
        client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}

        # Mock service
        mock_result = {
            "deleted_count": 0,
            "failed_count": 0,
            "deltas_deleted": 0,
            "references_deleted": 0,
            "direct_deleted": 0,
            "other_deleted": 0,
        }
        client.service.delete_recursive = Mock(return_value=mock_result)

        # Mock delete_with_delta_suffix to return unknown type
        with patch("deltaglider.client.delete_with_delta_suffix") as mock_delete:
            mock_delete.return_value = (
                "file.txt",
                {
                    "deleted": True,
                    "type": "unknown_type",  # Not in single_counts keys
                    "dependent_deltas": 0,
                    "warnings": [],
                },
            )

            # Execute
            response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.txt")

            # Verify it's categorized as "other"
            assert response["DeltaGliderInfo"]["OtherDeleted"] >= 1
            # Also verify the detail shows the unknown type
            if "SingleDeletes" in response["DeltaGliderInfo"]:
                assert response["DeltaGliderInfo"]["SingleDeletes"][0]["Type"] == "unknown_type"

    def test_kwargs_parameter_accepted(self, client):
        """Test that additional kwargs are accepted without error."""
        # Execute with extra parameters
        response = client.delete_objects_recursive(
            Bucket="test-bucket",
            Prefix="test/",
            ExtraParam="value",  # Should be ignored
            AnotherParam=123,
        )

        # Verify no errors
        assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
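Taken together, these tests pin down the boto3-style response shape of delete_objects_recursive. As a reading aid, a minimal sketch of that shape (field values are illustrative, not taken from the implementation):

# Illustrative only: response shape asserted by the tests above.
response = {
    "ResponseMetadata": {"HTTPStatusCode": 200},
    "DeletedCount": 1,
    "FailedCount": 0,
    "DeltaGliderInfo": {
        "OtherDeleted": 0,
        "SingleDeletes": [
            {
                "Key": "file.zip",              # key as requested by the caller
                "StoredKey": "file.zip.delta",  # only present when the stored key differs
                "Type": "delta",                # "delta", "direct", or an unknown type counted as "other"
                "DependentDeltas": 0,
                "Warnings": [],
            }
        ],
    },
}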
445
tests/integration/test_filtering_and_cleanup.py
Normal file
@@ -0,0 +1,445 @@
"""Tests for SDK filtering and delete cleanup functionality."""

from datetime import UTC, datetime
from unittest.mock import Mock

import pytest

from deltaglider.app.cli.main import create_service
from deltaglider.client import DeltaGliderClient
from deltaglider.core import ObjectKey
from deltaglider.ports.storage import ObjectHead


class TestSDKFiltering:
    """Test that SDK filters .delta and reference.bin from list_objects()."""

def test_list_objects_filters_delta_suffix(self):
|
||||
"""Test that .delta suffix is stripped from object keys."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock list_objects response with .delta files
|
||||
mock_storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{
|
||||
"key": "releases/app-v1.zip.delta",
|
||||
"size": 1000,
|
||||
"last_modified": "2025-01-01T00:00:00Z",
|
||||
"etag": "abc123",
|
||||
"storage_class": "STANDARD",
|
||||
},
|
||||
{
|
||||
"key": "releases/app-v2.zip.delta",
|
||||
"size": 1500,
|
||||
"last_modified": "2025-01-02T00:00:00Z",
|
||||
"etag": "def456",
|
||||
"storage_class": "STANDARD",
|
||||
},
|
||||
{
|
||||
"key": "releases/README.md",
|
||||
"size": 500,
|
||||
"last_modified": "2025-01-03T00:00:00Z",
|
||||
"etag": "ghi789",
|
||||
"storage_class": "STANDARD",
|
||||
},
|
||||
],
|
||||
"common_prefixes": [],
|
||||
"is_truncated": False,
|
||||
"next_continuation_token": None,
|
||||
}
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.list_objects(Bucket="test-bucket", Prefix="releases/")
|
||||
|
||||
# Response is now a boto3-compatible dict
|
||||
contents = response["Contents"]
|
||||
|
||||
# Verify .delta suffix is stripped
|
||||
keys = [obj["Key"] for obj in contents]
|
||||
assert "releases/app-v1.zip" in keys
|
||||
assert "releases/app-v2.zip" in keys
|
||||
assert "releases/README.md" in keys
|
||||
|
||||
# Verify NO .delta suffixes in output
|
||||
for key in keys:
|
||||
assert not key.endswith(".delta"), f"Found .delta suffix in: {key}"
|
||||
|
||||
# Verify is_delta flag is set correctly in Metadata
|
||||
delta_objects = [
|
||||
obj for obj in contents if obj.get("Metadata", {}).get("deltaglider-is-delta") == "true"
|
||||
]
|
||||
assert len(delta_objects) == 2
|
||||
|
||||
def test_list_objects_filters_reference_bin(self):
|
||||
"""Test that reference.bin files are completely filtered out."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock list_objects response with reference.bin files
|
||||
mock_storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{
|
||||
"key": "releases/reference.bin",
|
||||
"size": 50000,
|
||||
"last_modified": "2025-01-01T00:00:00Z",
|
||||
"etag": "ref123",
|
||||
"storage_class": "STANDARD",
|
||||
},
|
||||
{
|
||||
"key": "releases/1.0/reference.bin",
|
||||
"size": 50000,
|
||||
"last_modified": "2025-01-01T00:00:00Z",
|
||||
"etag": "ref456",
|
||||
"storage_class": "STANDARD",
|
||||
},
|
||||
{
|
||||
"key": "releases/app.zip.delta",
|
||||
"size": 1000,
|
||||
"last_modified": "2025-01-02T00:00:00Z",
|
||||
"etag": "app123",
|
||||
"storage_class": "STANDARD",
|
||||
},
|
||||
],
|
||||
"common_prefixes": [],
|
||||
"is_truncated": False,
|
||||
"next_continuation_token": None,
|
||||
}
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.list_objects(Bucket="test-bucket", Prefix="releases/")
|
||||
|
||||
# Response is now a boto3-compatible dict
|
||||
contents = response["Contents"]
|
||||
|
||||
# Verify NO reference.bin files in output
|
||||
keys = [obj["Key"] for obj in contents]
|
||||
for key in keys:
|
||||
assert not key.endswith("reference.bin"), f"Found reference.bin in: {key}"
|
||||
|
||||
# Should only have the app.zip (with .delta stripped)
|
||||
assert len(contents) == 1
|
||||
assert contents[0]["Key"] == "releases/app.zip"
|
||||
assert contents[0].get("Metadata", {}).get("deltaglider-is-delta") == "true"
|
||||
|
||||
def test_list_objects_combined_filtering(self):
|
||||
"""Test filtering of both .delta and reference.bin together."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock comprehensive file list
|
||||
mock_storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{
|
||||
"key": "data/reference.bin",
|
||||
"size": 50000,
|
||||
"last_modified": "2025-01-01T00:00:00Z",
|
||||
"etag": "1",
|
||||
},
|
||||
{
|
||||
"key": "data/file1.zip.delta",
|
||||
"size": 1000,
|
||||
"last_modified": "2025-01-01T00:00:00Z",
|
||||
"etag": "2",
|
||||
},
|
||||
{
|
||||
"key": "data/file2.zip.delta",
|
||||
"size": 1500,
|
||||
"last_modified": "2025-01-01T00:00:00Z",
|
||||
"etag": "3",
|
||||
},
|
||||
{
|
||||
"key": "data/file3.txt",
|
||||
"size": 500,
|
||||
"last_modified": "2025-01-01T00:00:00Z",
|
||||
"etag": "4",
|
||||
},
|
||||
{
|
||||
"key": "data/sub/reference.bin",
|
||||
"size": 50000,
|
||||
"last_modified": "2025-01-01T00:00:00Z",
|
||||
"etag": "5",
|
||||
},
|
||||
{
|
||||
"key": "data/sub/app.jar.delta",
|
||||
"size": 2000,
|
||||
"last_modified": "2025-01-01T00:00:00Z",
|
||||
"etag": "6",
|
||||
},
|
||||
],
|
||||
"common_prefixes": [],
|
||||
"is_truncated": False,
|
||||
"next_continuation_token": None,
|
||||
}
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.list_objects(Bucket="test-bucket", Prefix="data/")
|
||||
|
||||
# Response is now a boto3-compatible dict
|
||||
contents = response["Contents"]
|
||||
|
||||
# Should filter out 2 reference.bin files
|
||||
# Should strip .delta from 3 files
|
||||
# Should keep 1 regular file as-is
|
||||
assert len(contents) == 4 # 3 deltas + 1 regular file
|
||||
|
||||
keys = [obj["Key"] for obj in contents]
|
||||
expected_keys = ["data/file1.zip", "data/file2.zip", "data/file3.txt", "data/sub/app.jar"]
|
||||
assert sorted(keys) == sorted(expected_keys)
|
||||
|
||||
# Verify no internal files visible
|
||||
for key in keys:
|
||||
assert not key.endswith(".delta")
|
||||
assert not key.endswith("reference.bin")
|
||||
|
||||
|
||||
class TestSingleDeleteCleanup:
|
||||
"""Test that single delete() cleans up orphaned reference.bin."""
|
||||
|
||||
def test_delete_last_delta_cleans_reference(self):
|
||||
"""Test that deleting the last delta file removes orphaned reference.bin."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock head for both delta and reference.bin
|
||||
def mock_head_func(key):
|
||||
if key.endswith("app.zip.delta"):
|
||||
return ObjectHead(
|
||||
key="releases/app.zip.delta",
|
||||
size=1000,
|
||||
etag="abc123",
|
||||
last_modified=datetime.now(UTC),
|
||||
metadata={"original_name": "app.zip", "ref_key": "releases/reference.bin"},
|
||||
)
|
||||
elif key.endswith("reference.bin"):
|
||||
return ObjectHead(
|
||||
key="releases/reference.bin",
|
||||
size=50000,
|
||||
etag="ref123",
|
||||
last_modified=datetime.now(UTC),
|
||||
metadata={},
|
||||
)
|
||||
return None
|
||||
|
||||
mock_storage.head.side_effect = mock_head_func
|
||||
|
||||
# Mock list to show NO other deltas remain
|
||||
mock_storage.list.return_value = [
|
||||
ObjectHead(
|
||||
key="releases/reference.bin",
|
||||
size=50000,
|
||||
etag="ref123",
|
||||
last_modified=datetime.now(UTC),
|
||||
metadata={},
|
||||
),
|
||||
]
|
||||
mock_storage.delete.return_value = None
|
||||
|
||||
# Delete the last delta
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="releases/app.zip.delta"))
|
||||
|
||||
# Verify delta was deleted
|
||||
assert result["deleted"] is True
|
||||
assert result["type"] == "delta"
|
||||
|
||||
# Verify reference.bin cleanup was triggered
|
||||
assert "cleaned_reference" in result
|
||||
assert result["cleaned_reference"] == "releases/reference.bin"
|
||||
|
||||
# Verify both files were deleted
|
||||
assert mock_storage.delete.call_count == 2
|
||||
delete_calls = [call[0][0] for call in mock_storage.delete.call_args_list]
|
||||
assert "test-bucket/releases/app.zip.delta" in delete_calls
|
||||
assert "test-bucket/releases/reference.bin" in delete_calls
|
||||
|
||||
def test_delete_delta_keeps_reference_when_others_exist(self):
|
||||
"""Test that reference.bin is kept when other deltas remain."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock the delta file being deleted
|
||||
mock_storage.head.return_value = ObjectHead(
|
||||
key="releases/app-v1.zip.delta",
|
||||
size=1000,
|
||||
etag="abc123",
|
||||
last_modified=datetime.now(UTC),
|
||||
metadata={"original_name": "app-v1.zip"},
|
||||
)
|
||||
|
||||
# Mock list to show OTHER deltas still exist
|
||||
mock_storage.list.return_value = [
|
||||
ObjectHead(
|
||||
key="releases/app-v2.zip.delta",
|
||||
size=1500,
|
||||
etag="def456",
|
||||
last_modified=datetime.now(UTC),
|
||||
metadata={},
|
||||
),
|
||||
ObjectHead(
|
||||
key="releases/reference.bin",
|
||||
size=50000,
|
||||
etag="ref123",
|
||||
last_modified=datetime.now(UTC),
|
||||
metadata={},
|
||||
),
|
||||
]
|
||||
|
||||
mock_storage.delete.return_value = None
|
||||
|
||||
# Delete one delta (but others remain)
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="releases/app-v1.zip.delta"))
|
||||
|
||||
# Verify delta was deleted
|
||||
assert result["deleted"] is True
|
||||
assert result["type"] == "delta"
|
||||
|
||||
# Verify reference.bin was NOT cleaned up
|
||||
assert "cleaned_reference" not in result
|
||||
|
||||
# Verify only the delta was deleted, not reference.bin
|
||||
assert mock_storage.delete.call_count == 1
|
||||
mock_storage.delete.assert_called_once_with("test-bucket/releases/app-v1.zip.delta")
|
||||
|
||||
def test_delete_delta_no_reference_exists(self):
|
||||
"""Test deleting delta when reference.bin doesn't exist (edge case)."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock the delta file
|
||||
mock_storage.head.return_value = ObjectHead(
|
||||
key="releases/app.zip.delta",
|
||||
size=1000,
|
||||
etag="abc123",
|
||||
last_modified=datetime.now(UTC),
|
||||
metadata={"original_name": "app.zip"},
|
||||
)
|
||||
|
||||
# Mock list shows no other deltas
|
||||
mock_storage.list.return_value = []
|
||||
|
||||
# Mock head for reference.bin returns None (doesn't exist)
|
||||
def mock_head_func(key):
|
||||
if key.endswith("reference.bin"):
|
||||
return None
|
||||
return ObjectHead(
|
||||
key="releases/app.zip.delta",
|
||||
size=1000,
|
||||
etag="abc123",
|
||||
last_modified=datetime.now(UTC),
|
||||
metadata={},
|
||||
)
|
||||
|
||||
mock_storage.head.side_effect = mock_head_func
|
||||
mock_storage.delete.return_value = None
|
||||
|
||||
# Delete the delta
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="releases/app.zip.delta"))
|
||||
|
||||
# Verify delta was deleted
|
||||
assert result["deleted"] is True
|
||||
assert result["type"] == "delta"
|
||||
|
||||
# Verify no reference cleanup (since it didn't exist)
|
||||
assert "cleaned_reference" not in result
|
||||
|
||||
# Only delta should be deleted
|
||||
assert mock_storage.delete.call_count == 1
|
||||
|
||||
def test_delete_isolated_deltaspaces(self):
|
||||
"""Test that cleanup only affects the specific DeltaSpace."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock head for both delta and reference.bin
|
||||
def mock_head_func(key):
|
||||
if "1.0/app.zip.delta" in key:
|
||||
return ObjectHead(
|
||||
key="releases/1.0/app.zip.delta",
|
||||
size=1000,
|
||||
etag="abc123",
|
||||
last_modified=datetime.now(UTC),
|
||||
metadata={"original_name": "app.zip"},
|
||||
)
|
||||
elif "1.0/reference.bin" in key:
|
||||
return ObjectHead(
|
||||
key="releases/1.0/reference.bin",
|
||||
size=50000,
|
||||
etag="ref1",
|
||||
last_modified=datetime.now(UTC),
|
||||
metadata={},
|
||||
)
|
||||
return None
|
||||
|
||||
mock_storage.head.side_effect = mock_head_func
|
||||
|
||||
# Mock list for 1.0 - no other deltas
|
||||
mock_storage.list.return_value = [
|
||||
ObjectHead(
|
||||
key="releases/1.0/reference.bin",
|
||||
size=50000,
|
||||
etag="ref1",
|
||||
last_modified=datetime.now(UTC),
|
||||
metadata={},
|
||||
),
|
||||
]
|
||||
mock_storage.delete.return_value = None
|
||||
|
||||
# Delete from 1.0
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="releases/1.0/app.zip.delta"))
|
||||
|
||||
# Should clean up only 1.0/reference.bin
|
||||
assert result["cleaned_reference"] == "releases/1.0/reference.bin"
|
||||
|
||||
# Verify correct files deleted
|
||||
delete_calls = [call[0][0] for call in mock_storage.delete.call_args_list]
|
||||
assert "test-bucket/releases/1.0/app.zip.delta" in delete_calls
|
||||
assert "test-bucket/releases/1.0/reference.bin" in delete_calls
|
||||
|
||||
|
||||
class TestRecursiveDeleteCleanup:
    """Test that recursive delete properly cleans up references."""

    def test_recursive_delete_reference_cleanup_already_works(self):
        """Verify existing recursive delete reference cleanup is working."""
        service = create_service()
        mock_storage = Mock()
        service.storage = mock_storage

        # Mock objects in deltaspace
        mock_storage.list.return_value = [
            ObjectHead(
                key="data/app.zip.delta",
                size=1000,
                etag="1",
                last_modified=datetime.now(UTC),
                metadata={},
            ),
            ObjectHead(
                key="data/reference.bin",
                size=50000,
                etag="2",
                last_modified=datetime.now(UTC),
                metadata={},
            ),
        ]

        mock_storage.head.return_value = None
        mock_storage.delete.return_value = None

        result = service.delete_recursive("test-bucket", "data/")

        # Should delete both delta and reference
        assert result["deleted_count"] == 2
        assert result["deltas_deleted"] == 1
        assert result["references_deleted"] == 1


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
@@ -1,146 +0,0 @@
"""Integration test for get command."""

import tempfile
from pathlib import Path
from unittest.mock import Mock, patch

import pytest
from click.testing import CliRunner

from deltaglider.app.cli.main import cli
from deltaglider.core import ObjectKey


@pytest.fixture
def mock_service():
    """Create a mock DeltaService."""
    return Mock()


def test_get_command_with_original_name(mock_service):
|
||||
"""Test get command with original filename (auto-appends .delta)."""
|
||||
runner = CliRunner()
|
||||
|
||||
# Mock the service.get method and storage.head
|
||||
mock_service.get = Mock()
|
||||
mock_service.storage.head = Mock(
|
||||
side_effect=[
|
||||
None, # First check for original file returns None
|
||||
Mock(), # Second check for .delta file returns something
|
||||
]
|
||||
)
|
||||
|
||||
with patch("deltaglider.app.cli.main.create_service", return_value=mock_service):
|
||||
# Run get with original filename (should auto-append .delta)
|
||||
result = runner.invoke(cli, ["get", "s3://test-bucket/data/myfile.zip"])
|
||||
|
||||
# Check it was successful
|
||||
assert result.exit_code == 0
|
||||
assert "Found delta file: s3://test-bucket/data/myfile.zip.delta" in result.output
|
||||
assert "Successfully retrieved: myfile.zip" in result.output
|
||||
|
||||
# Verify the service was called with the correct arguments
|
||||
mock_service.get.assert_called_once()
|
||||
call_args = mock_service.get.call_args
|
||||
obj_key = call_args[0][0]
|
||||
output_path = call_args[0][1]
|
||||
|
||||
assert isinstance(obj_key, ObjectKey)
|
||||
assert obj_key.bucket == "test-bucket"
|
||||
assert obj_key.key == "data/myfile.zip.delta"
|
||||
assert output_path == Path("myfile.zip")
|
||||
|
||||
|
||||
def test_get_command_with_delta_name(mock_service):
|
||||
"""Test get command with explicit .delta filename."""
|
||||
runner = CliRunner()
|
||||
|
||||
# Mock the service.get method and storage.head
|
||||
mock_service.get = Mock()
|
||||
mock_service.storage.head = Mock(return_value=Mock()) # File exists
|
||||
|
||||
with patch("deltaglider.app.cli.main.create_service", return_value=mock_service):
|
||||
# Run get with explicit .delta filename
|
||||
result = runner.invoke(cli, ["get", "s3://test-bucket/data/myfile.zip.delta"])
|
||||
|
||||
# Check it was successful
|
||||
assert result.exit_code == 0
|
||||
assert "Found file: s3://test-bucket/data/myfile.zip.delta" in result.output
|
||||
assert "Successfully retrieved: myfile.zip" in result.output
|
||||
|
||||
# Verify the service was called with the correct arguments
|
||||
mock_service.get.assert_called_once()
|
||||
call_args = mock_service.get.call_args
|
||||
obj_key = call_args[0][0]
|
||||
output_path = call_args[0][1]
|
||||
|
||||
assert isinstance(obj_key, ObjectKey)
|
||||
assert obj_key.bucket == "test-bucket"
|
||||
assert obj_key.key == "data/myfile.zip.delta"
|
||||
assert output_path == Path("myfile.zip")
|
||||
|
||||
|
||||
def test_get_command_with_output_option(mock_service):
|
||||
"""Test get command with custom output path."""
|
||||
runner = CliRunner()
|
||||
|
||||
# Mock the service.get method and storage.head
|
||||
mock_service.get = Mock()
|
||||
mock_service.storage.head = Mock(
|
||||
side_effect=[
|
||||
None, # First check for original file returns None
|
||||
Mock(), # Second check for .delta file returns something
|
||||
]
|
||||
)
|
||||
|
||||
with patch("deltaglider.app.cli.main.create_service", return_value=mock_service):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
output_file = Path(tmpdir) / "custom_output.zip"
|
||||
|
||||
# Run get with custom output path
|
||||
result = runner.invoke(
|
||||
cli, ["get", "s3://test-bucket/data/myfile.zip", "-o", str(output_file)]
|
||||
)
|
||||
|
||||
# Check it was successful
|
||||
assert result.exit_code == 0
|
||||
assert f"Successfully retrieved: {output_file}" in result.output
|
||||
|
||||
# Verify the service was called with the correct arguments
|
||||
mock_service.get.assert_called_once()
|
||||
call_args = mock_service.get.call_args
|
||||
obj_key = call_args[0][0]
|
||||
output_path = call_args[0][1]
|
||||
|
||||
assert isinstance(obj_key, ObjectKey)
|
||||
assert obj_key.bucket == "test-bucket"
|
||||
assert obj_key.key == "data/myfile.zip.delta"
|
||||
assert output_path == output_file
|
||||
|
||||
|
||||
def test_get_command_error_handling(mock_service):
|
||||
"""Test get command error handling."""
|
||||
runner = CliRunner()
|
||||
|
||||
# Mock the service.get method to raise an error
|
||||
mock_service.get = Mock(side_effect=FileNotFoundError("Delta not found"))
|
||||
|
||||
with patch("deltaglider.app.cli.main.create_service", return_value=mock_service):
|
||||
# Run get command
|
||||
result = runner.invoke(cli, ["get", "s3://test-bucket/data/missing.zip"])
|
||||
|
||||
# Check it failed with error message
|
||||
assert result.exit_code == 1
|
||||
assert "Error: Delta not found" in result.output
|
||||
|
||||
|
||||
def test_get_command_invalid_url():
|
||||
"""Test get command with invalid S3 URL."""
|
||||
runner = CliRunner()
|
||||
|
||||
# Run get with invalid URL
|
||||
result = runner.invoke(cli, ["get", "http://invalid-url/file.zip"])
|
||||
|
||||
# Check it failed with error message
|
||||
assert result.exit_code == 1
|
||||
assert "Error: Invalid S3 URL" in result.output
|
||||
397
tests/integration/test_recursive_delete_reference_cleanup.py
Normal file
@@ -0,0 +1,397 @@
"""Focused tests for recursive delete reference cleanup functionality."""

from unittest.mock import Mock, patch

import pytest

from deltaglider.app.cli.main import create_service
from deltaglider.ports.storage import ObjectHead


class TestRecursiveDeleteReferenceCleanup:
    """Test the core reference cleanup intelligence in recursive delete."""

def test_core_service_delete_recursive_method_exists(self):
|
||||
"""Test that the core service has the delete_recursive method."""
|
||||
service = create_service()
|
||||
assert hasattr(service, "delete_recursive")
|
||||
assert callable(service.delete_recursive)
|
||||
|
||||
def test_delete_recursive_handles_empty_prefix(self):
|
||||
"""Test delete_recursive gracefully handles empty prefixes."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock empty result
|
||||
mock_storage.list.return_value = []
|
||||
|
||||
result = service.delete_recursive("test-bucket", "nonexistent/")
|
||||
|
||||
assert result["deleted_count"] == 0
|
||||
assert result["failed_count"] == 0
|
||||
assert isinstance(result["errors"], list)
|
||||
assert isinstance(result["warnings"], list)
|
||||
|
||||
def test_delete_recursive_returns_structured_result(self):
|
||||
"""Test that delete_recursive returns a properly structured result."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock some objects
|
||||
mock_storage.list.return_value = [
|
||||
ObjectHead(
|
||||
key="test/file1.zip.delta", size=100, etag="1", last_modified=None, metadata={}
|
||||
),
|
||||
ObjectHead(
|
||||
key="test/file2.txt",
|
||||
size=200,
|
||||
etag="2",
|
||||
last_modified=None,
|
||||
metadata={"compression": "none"},
|
||||
),
|
||||
]
|
||||
mock_storage.head.return_value = None
|
||||
mock_storage.delete.return_value = None
|
||||
|
||||
result = service.delete_recursive("test-bucket", "test/")
|
||||
|
||||
# Verify structure
|
||||
required_keys = [
|
||||
"bucket",
|
||||
"prefix",
|
||||
"deleted_count",
|
||||
"failed_count",
|
||||
"deltas_deleted",
|
||||
"references_deleted",
|
||||
"direct_deleted",
|
||||
"other_deleted",
|
||||
"errors",
|
||||
"warnings",
|
||||
]
|
||||
for key in required_keys:
|
||||
assert key in result, f"Missing key: {key}"
|
||||
|
||||
assert isinstance(result["deleted_count"], int)
|
||||
assert isinstance(result["failed_count"], int)
|
||||
assert isinstance(result["errors"], list)
|
||||
assert isinstance(result["warnings"], list)
|
||||
|
||||
def test_delete_recursive_categorizes_objects_correctly(self):
|
||||
"""Test that delete_recursive correctly categorizes different object types."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock different types of objects
|
||||
mock_objects = [
|
||||
ObjectHead(
|
||||
key="test/app.zip.delta",
|
||||
size=100,
|
||||
etag="1",
|
||||
last_modified=None,
|
||||
metadata={"ref_key": "test/reference.bin"},
|
||||
),
|
||||
ObjectHead(
|
||||
key="test/reference.bin",
|
||||
size=50,
|
||||
etag="2",
|
||||
last_modified=None,
|
||||
metadata={"file_sha256": "abc123"},
|
||||
),
|
||||
ObjectHead(
|
||||
key="test/readme.txt",
|
||||
size=200,
|
||||
etag="3",
|
||||
last_modified=None,
|
||||
metadata={"compression": "none"},
|
||||
),
|
||||
ObjectHead(key="test/config.json", size=300, etag="4", last_modified=None, metadata={}),
|
||||
]
|
||||
|
||||
mock_storage.list.return_value = mock_objects
|
||||
mock_storage.head.return_value = None # No dependencies found
|
||||
mock_storage.delete.return_value = None
|
||||
|
||||
result = service.delete_recursive("test-bucket", "test/")
|
||||
|
||||
# Should categorize correctly - the exact categorization depends on implementation
|
||||
assert result["deltas_deleted"] == 1 # app.zip.delta
|
||||
assert result["references_deleted"] == 1 # reference.bin
|
||||
# Direct and other files may be categorized differently based on metadata detection
|
||||
assert result["direct_deleted"] + result["other_deleted"] == 2 # readme.txt + config.json
|
||||
assert result["deleted_count"] == 4 # total
|
||||
assert result["failed_count"] == 0
|
||||
|
||||
def test_delete_recursive_handles_storage_errors_gracefully(self):
|
||||
"""Test that delete_recursive handles individual storage errors gracefully."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mock objects
|
||||
mock_storage.list.return_value = [
|
||||
ObjectHead(
|
||||
key="test/good.zip.delta", size=100, etag="1", last_modified=None, metadata={}
|
||||
),
|
||||
ObjectHead(
|
||||
key="test/bad.zip.delta", size=200, etag="2", last_modified=None, metadata={}
|
||||
),
|
||||
]
|
||||
mock_storage.head.return_value = None
|
||||
|
||||
# Mock delete to fail for one file
|
||||
def failing_delete(key):
|
||||
if "bad" in key:
|
||||
raise Exception("Simulated S3 error")
|
||||
|
||||
mock_storage.delete.side_effect = failing_delete
|
||||
|
||||
result = service.delete_recursive("test-bucket", "test/")
|
||||
|
||||
# Should handle partial failure
|
||||
assert result["deleted_count"] == 1 # good.zip.delta succeeded
|
||||
assert result["failed_count"] == 1 # bad.zip.delta failed
|
||||
assert len(result["errors"]) == 1
|
||||
assert "bad" in result["errors"][0]
|
||||
|
||||
def test_affected_deltaspaces_discovery(self):
|
||||
"""Test that the system discovers affected deltaspaces when deleting deltas."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Create delta files that should trigger parent reference checking
|
||||
mock_objects = [
|
||||
ObjectHead(
|
||||
key="project/team-a/v1/app.zip.delta",
|
||||
size=100,
|
||||
etag="1",
|
||||
last_modified=None,
|
||||
metadata={"ref_key": "project/reference.bin"},
|
||||
),
|
||||
]
|
||||
|
||||
# Mock list to return objects for initial scan, then parent reference when checked
|
||||
list_calls = []
|
||||
|
||||
def mock_list(prefix):
|
||||
list_calls.append(prefix)
|
||||
if prefix == "test-bucket/project/team-a/v1/":
|
||||
return mock_objects
|
||||
elif prefix == "test-bucket/project":
|
||||
# Return parent reference when checking deltaspace
|
||||
return [
|
||||
ObjectHead(
|
||||
key="project/reference.bin",
|
||||
size=50,
|
||||
etag="ref",
|
||||
last_modified=None,
|
||||
metadata={"file_sha256": "abc123"},
|
||||
)
|
||||
]
|
||||
return []
|
||||
|
||||
mock_storage.list.side_effect = mock_list
|
||||
mock_storage.head.return_value = ObjectHead(
|
||||
key="project/reference.bin",
|
||||
size=50,
|
||||
etag="ref",
|
||||
last_modified=None,
|
||||
metadata={"file_sha256": "abc123"},
|
||||
)
|
||||
mock_storage.delete.return_value = None
|
||||
|
||||
result = service.delete_recursive("test-bucket", "project/team-a/v1/")
|
||||
|
||||
# Should have discovered and evaluated the parent reference
|
||||
assert result["deleted_count"] >= 1 # At least the delta file
|
||||
assert result["failed_count"] == 0
|
||||
|
||||
def test_cli_uses_core_service_method(self):
|
||||
"""Test that CLI rm -r command uses the core service delete_recursive method."""
|
||||
from click.testing import CliRunner
|
||||
|
||||
from deltaglider.app.cli.main import cli
|
||||
|
||||
runner = CliRunner()
|
||||
|
||||
with patch("deltaglider.app.cli.main.create_service") as mock_create_service:
|
||||
mock_service = Mock()
|
||||
mock_create_service.return_value = mock_service
|
||||
|
||||
# Mock successful deletion
|
||||
mock_service.delete_recursive.return_value = {
|
||||
"bucket": "test-bucket",
|
||||
"prefix": "test/",
|
||||
"deleted_count": 2,
|
||||
"failed_count": 0,
|
||||
"warnings": [],
|
||||
"errors": [],
|
||||
}
|
||||
|
||||
result = runner.invoke(cli, ["rm", "-r", "s3://test-bucket/test/"])
|
||||
|
||||
assert result.exit_code == 0
|
||||
mock_service.delete_recursive.assert_called_once_with("test-bucket", "test")
|
||||
assert "Deleted 2 object(s)" in result.output
|
||||
|
||||
def test_cli_dryrun_does_not_call_delete_recursive(self):
|
||||
"""Test that CLI dryrun does not call the actual delete_recursive method."""
|
||||
from click.testing import CliRunner
|
||||
|
||||
from deltaglider.app.cli.main import cli
|
||||
|
||||
runner = CliRunner()
|
||||
|
||||
with patch("deltaglider.app.cli.main.create_service") as mock_create_service:
|
||||
mock_service = Mock()
|
||||
mock_create_service.return_value = mock_service
|
||||
|
||||
# Mock list for dryrun preview
|
||||
mock_service.storage.list.return_value = [
|
||||
ObjectHead(
|
||||
key="test/file1.zip.delta", size=100, etag="1", last_modified=None, metadata={}
|
||||
),
|
||||
ObjectHead(
|
||||
key="test/file2.txt", size=200, etag="2", last_modified=None, metadata={}
|
||||
),
|
||||
]
|
||||
|
||||
result = runner.invoke(cli, ["rm", "-r", "--dryrun", "s3://test-bucket/test/"])
|
||||
|
||||
assert result.exit_code == 0
|
||||
mock_service.delete_recursive.assert_not_called() # Should not call actual deletion
|
||||
assert "(dryrun) delete:" in result.output
|
||||
assert "Would delete 2 object(s)" in result.output
|
||||
|
||||
def test_integration_with_existing_single_delete(self):
|
||||
"""Test that recursive delete integrates well with existing single delete functionality."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Test that both methods exist and are callable
|
||||
assert hasattr(service, "delete")
|
||||
assert hasattr(service, "delete_recursive")
|
||||
assert callable(service.delete)
|
||||
assert callable(service.delete_recursive)
|
||||
|
||||
# Mock for single delete
|
||||
mock_storage.head.return_value = ObjectHead(
|
||||
key="test/file.zip.delta",
|
||||
size=100,
|
||||
etag="1",
|
||||
last_modified=None,
|
||||
metadata={"original_name": "file.zip"},
|
||||
)
|
||||
mock_storage.list.return_value = [] # No other deltas remain
|
||||
mock_storage.delete.return_value = None
|
||||
|
||||
# Test single delete
|
||||
from deltaglider.core import ObjectKey
|
||||
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="test/file.zip.delta"))
|
||||
|
||||
assert result["deleted"]
|
||||
assert result["type"] == "delta"
|
||||
|
||||
def test_reference_cleanup_intelligence_basic(self):
|
||||
"""Basic test to verify reference cleanup intelligence is working."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Simple scenario: one delta and its reference
|
||||
mock_objects = [
|
||||
ObjectHead(
|
||||
key="simple/file.zip.delta",
|
||||
size=100,
|
||||
etag="1",
|
||||
last_modified=None,
|
||||
metadata={"ref_key": "simple/reference.bin"},
|
||||
),
|
||||
ObjectHead(
|
||||
key="simple/reference.bin",
|
||||
size=50,
|
||||
etag="2",
|
||||
last_modified=None,
|
||||
metadata={"file_sha256": "abc123"},
|
||||
),
|
||||
]
|
||||
|
||||
mock_storage.list.return_value = mock_objects
|
||||
mock_storage.head.return_value = None # No other dependencies
|
||||
mock_storage.delete.return_value = None
|
||||
|
||||
result = service.delete_recursive("test-bucket", "simple/")
|
||||
|
||||
# Should delete both delta and reference since there are no other dependencies
|
||||
assert result["deleted_count"] == 2
|
||||
assert result["deltas_deleted"] == 1
|
||||
assert result["references_deleted"] == 1
|
||||
assert result["failed_count"] == 0
|
||||
|
||||
def test_comprehensive_result_validation(self):
|
||||
"""Test that all result fields are properly populated."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
# Mix of different object types
|
||||
mock_objects = [
|
||||
ObjectHead(
|
||||
key="mixed/app.zip.delta", size=100, etag="1", last_modified=None, metadata={}
|
||||
),
|
||||
ObjectHead(
|
||||
key="mixed/reference.bin", size=50, etag="2", last_modified=None, metadata={}
|
||||
),
|
||||
ObjectHead(
|
||||
key="mixed/readme.txt",
|
||||
size=200,
|
||||
etag="3",
|
||||
last_modified=None,
|
||||
metadata={"compression": "none"},
|
||||
),
|
||||
ObjectHead(
|
||||
key="mixed/config.json", size=300, etag="4", last_modified=None, metadata={}
|
||||
),
|
||||
]
|
||||
|
||||
mock_storage.list.return_value = mock_objects
|
||||
mock_storage.head.return_value = None
|
||||
mock_storage.delete.return_value = None
|
||||
|
||||
result = service.delete_recursive("test-bucket", "mixed/")
|
||||
|
||||
# Validate all expected fields are present and have correct types
|
||||
assert isinstance(result["bucket"], str)
|
||||
assert isinstance(result["prefix"], str)
|
||||
assert isinstance(result["deleted_count"], int)
|
||||
assert isinstance(result["failed_count"], int)
|
||||
assert isinstance(result["deltas_deleted"], int)
|
||||
assert isinstance(result["references_deleted"], int)
|
||||
assert isinstance(result["direct_deleted"], int)
|
||||
assert isinstance(result["other_deleted"], int)
|
||||
assert isinstance(result["errors"], list)
|
||||
assert isinstance(result["warnings"], list)
|
||||
|
||||
# Validate counts add up
|
||||
total_by_type = (
|
||||
result["deltas_deleted"]
|
||||
+ result["references_deleted"]
|
||||
+ result["direct_deleted"]
|
||||
+ result["other_deleted"]
|
||||
)
|
||||
assert result["deleted_count"] == total_by_type
|
||||
|
||||
# Validate specific counts for this scenario
|
||||
assert result["deltas_deleted"] == 1
|
||||
assert result["references_deleted"] == 1
|
||||
# Direct and other files may be categorized differently
|
||||
assert result["direct_deleted"] + result["other_deleted"] == 2
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v"])
|
||||
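For reference, the structured result these tests validate boils down to the following dict (a sketch assembled from the asserted keys; the values are illustrative):

# Illustrative only: shape of the dict returned by service.delete_recursive(bucket, prefix).
result = {
    "bucket": "test-bucket",
    "prefix": "data/",
    "deleted_count": 2,       # equals the sum of the four per-type counters below
    "failed_count": 0,
    "deltas_deleted": 1,
    "references_deleted": 1,
    "direct_deleted": 0,
    "other_deleted": 0,
    "errors": [],             # one entry per object that failed to delete
    "warnings": [],
}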
189
tests/unit/test_cache_encrypted.py
Normal file
@@ -0,0 +1,189 @@
"""Tests for encrypted cache adapter."""

import tempfile
from pathlib import Path

import pytest
from cryptography.fernet import Fernet

from deltaglider.adapters import ContentAddressedCache, EncryptedCache, Sha256Adapter
from deltaglider.core.errors import CacheCorruptionError, CacheMissError


class TestEncryptedCache:
    """Test encrypted cache wrapper functionality."""

@pytest.fixture
|
||||
def temp_dir(self):
|
||||
"""Create temporary directory for tests."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
yield Path(tmpdir)
|
||||
|
||||
@pytest.fixture
|
||||
def hasher(self):
|
||||
"""Create SHA256 hasher."""
|
||||
return Sha256Adapter()
|
||||
|
||||
@pytest.fixture
|
||||
def backend(self, temp_dir, hasher):
|
||||
"""Create CAS backend."""
|
||||
return ContentAddressedCache(temp_dir, hasher)
|
||||
|
||||
@pytest.fixture
|
||||
def encrypted_cache(self, backend):
|
||||
"""Create encrypted cache with ephemeral key."""
|
||||
return EncryptedCache(backend)
|
||||
|
||||
def test_ephemeral_key_generation(self, backend):
|
||||
"""Test that ephemeral key is generated automatically."""
|
||||
cache = EncryptedCache(backend)
|
||||
|
||||
assert cache._ephemeral is True
|
||||
assert cache._key is not None
|
||||
assert len(cache._key) == 44 # Base64-encoded 32-byte key
|
||||
|
||||
def test_provided_key_usage(self, backend):
|
||||
"""Test using provided encryption key."""
|
||||
key = Fernet.generate_key()
|
||||
cache = EncryptedCache(backend, encryption_key=key)
|
||||
|
||||
assert cache._ephemeral is False
|
||||
assert cache._key == key
|
||||
|
||||
def test_write_and_read_encrypted(self, encrypted_cache, temp_dir):
|
||||
"""Test writing and reading encrypted content."""
|
||||
# Create test file
|
||||
test_file = temp_dir / "test.txt"
|
||||
test_content = b"Secret data that should be encrypted"
|
||||
test_file.write_bytes(test_content)
|
||||
|
||||
# Compute expected SHA
|
||||
import hashlib
|
||||
|
||||
expected_sha = hashlib.sha256(test_content).hexdigest()
|
||||
|
||||
# Write to encrypted cache
|
||||
encrypted_cache.write_ref("test-bucket", "test-prefix", test_file)
|
||||
|
||||
# Read back and validate
|
||||
decrypted_path = encrypted_cache.get_validated_ref(
|
||||
"test-bucket", "test-prefix", expected_sha
|
||||
)
|
||||
|
||||
# Verify decrypted content matches original
|
||||
decrypted_content = decrypted_path.read_bytes()
|
||||
assert decrypted_content == test_content
|
||||
|
||||
def test_encrypted_storage_not_readable(self, encrypted_cache, backend, temp_dir):
|
||||
"""Test that stored data is actually encrypted."""
|
||||
# Create test file
|
||||
test_file = temp_dir / "test.txt"
|
||||
test_content = b"Plaintext secret"
|
||||
test_file.write_bytes(test_content)
|
||||
|
||||
# Write to encrypted cache
|
||||
encrypted_cache.write_ref("test-bucket", "test-prefix", test_file)
|
||||
|
||||
# Get the encrypted file path from backend
|
||||
backend_path = backend.ref_path("test-bucket", "test-prefix")
|
||||
|
||||
# Read encrypted content directly
|
||||
encrypted_content = backend_path.read_bytes()
|
||||
|
||||
# Verify content is NOT the same as plaintext
|
||||
assert encrypted_content != test_content
|
||||
# Verify content doesn't contain plaintext substring
|
||||
assert b"secret" not in encrypted_content.lower()
|
||||
|
||||
def test_cache_miss(self, encrypted_cache):
|
||||
"""Test cache miss error."""
|
||||
with pytest.raises(CacheMissError):
|
||||
encrypted_cache.get_validated_ref("no-bucket", "no-prefix", "fakehash")
|
||||
|
||||
def test_decryption_with_wrong_sha(self, encrypted_cache, temp_dir):
|
||||
"""Test that wrong SHA is detected after decryption."""
|
||||
# Create test file
|
||||
test_file = temp_dir / "test.txt"
|
||||
test_content = b"Test content"
|
||||
test_file.write_bytes(test_content)
|
||||
|
||||
# Write to cache
|
||||
encrypted_cache.write_ref("test-bucket", "test-prefix", test_file)
|
||||
|
||||
# Try to read with wrong SHA
|
||||
with pytest.raises(CacheCorruptionError, match="SHA mismatch"):
|
||||
encrypted_cache.get_validated_ref("test-bucket", "test-prefix", "wrong_sha_hash_here")
|
||||
|
||||
def test_decryption_with_wrong_key(self, temp_dir):
|
||||
"""Test that decryption fails with wrong key."""
|
||||
# Create shared backend
|
||||
from deltaglider.adapters import ContentAddressedCache, Sha256Adapter
|
||||
|
||||
hasher = Sha256Adapter()
|
||||
backend = ContentAddressedCache(temp_dir / "shared", hasher)
|
||||
|
||||
# Create two caches with different keys sharing same backend
|
||||
cache1 = EncryptedCache(backend)
|
||||
|
||||
# Write with cache1
|
||||
test_file = temp_dir / "test.txt"
|
||||
test_content = b"Encrypted data"
|
||||
test_file.write_bytes(test_content)
|
||||
|
||||
import hashlib
|
||||
|
||||
expected_sha = hashlib.sha256(test_content).hexdigest()
|
||||
|
||||
cache1.write_ref("test-bucket", "test-prefix", test_file)
|
||||
|
||||
# Create cache2 with different key (fresh instance, different ephemeral key)
|
||||
# and manually add to its mapping (simulating persistent storage scenario)
|
||||
cache2 = EncryptedCache(backend)
|
||||
cache2._plaintext_sha_map[("test-bucket", "test-prefix")] = expected_sha
|
||||
|
||||
# Try to read with cache2 (different key) - should fail decryption
|
||||
with pytest.raises(CacheCorruptionError, match="Decryption failed"):
|
||||
cache2.get_validated_ref("test-bucket", "test-prefix", expected_sha)
|
||||
|
||||
def test_evict_cleans_decrypted_files(self, encrypted_cache, temp_dir):
|
||||
"""Test that evict cleans up .decrypted temporary files."""
|
||||
# Create and store file
|
||||
test_file = temp_dir / "test.txt"
|
||||
test_content = b"Test"
|
||||
test_file.write_bytes(test_content)
|
||||
|
||||
import hashlib
|
||||
|
||||
expected_sha = hashlib.sha256(test_content).hexdigest()
|
||||
|
||||
encrypted_cache.write_ref("test-bucket", "test-prefix", test_file)
|
||||
|
||||
# Read to create .decrypted file
|
||||
decrypted_path = encrypted_cache.get_validated_ref(
|
||||
"test-bucket", "test-prefix", expected_sha
|
||||
)
|
||||
assert decrypted_path.exists()
|
||||
|
||||
# Evict
|
||||
encrypted_cache.evict("test-bucket", "test-prefix")
|
||||
|
||||
# Verify .decrypted file is removed
|
||||
assert not decrypted_path.exists()
|
||||
|
||||
def test_from_env_with_no_key(self, backend, monkeypatch):
|
||||
"""Test from_env creates ephemeral key when env var not set."""
|
||||
monkeypatch.delenv("DG_CACHE_ENCRYPTION_KEY", raising=False)
|
||||
|
||||
cache = EncryptedCache.from_env(backend)
|
||||
|
||||
assert cache._ephemeral is True
|
||||
|
||||
def test_from_env_with_key(self, backend, monkeypatch):
|
||||
"""Test from_env uses key from environment."""
|
||||
key = Fernet.generate_key()
|
||||
monkeypatch.setenv("DG_CACHE_ENCRYPTION_KEY", key.decode("utf-8"))
|
||||
|
||||
cache = EncryptedCache.from_env(backend)
|
||||
|
||||
assert cache._ephemeral is False
|
||||
assert cache._key == key
|
||||
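Put together, the intended usage of the encrypted cache looks roughly like this (a sketch assuming the constructor and method signatures exercised above; the paths are placeholders):

import hashlib
from pathlib import Path

from deltaglider.adapters import ContentAddressedCache, EncryptedCache, Sha256Adapter

# Sketch only: mirrors the calls made in the tests above.
backend = ContentAddressedCache(Path("/tmp/dg-cache"), Sha256Adapter())
cache = EncryptedCache.from_env(backend)  # ephemeral key unless DG_CACHE_ENCRYPTION_KEY is set

ref = Path("/tmp/reference.bin")
sha = hashlib.sha256(ref.read_bytes()).hexdigest()
cache.write_ref("my-bucket", "releases", ref)                       # stored encrypted on disk
plaintext = cache.get_validated_ref("my-bucket", "releases", sha)   # decrypted and SHA-checked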
200
tests/unit/test_cache_memory.py
Normal file
@@ -0,0 +1,200 @@
"""Tests for in-memory cache adapter."""

import tempfile
from pathlib import Path

import pytest

from deltaglider.adapters import MemoryCache, Sha256Adapter
from deltaglider.core.errors import CacheCorruptionError, CacheMissError


class TestMemoryCache:
    """Test in-memory cache functionality."""

@pytest.fixture
|
||||
def temp_dir(self):
|
||||
"""Create temporary directory for tests."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
yield Path(tmpdir)
|
||||
|
||||
@pytest.fixture
|
||||
def hasher(self):
|
||||
"""Create SHA256 hasher."""
|
||||
return Sha256Adapter()
|
||||
|
||||
@pytest.fixture
|
||||
def memory_cache(self, hasher, temp_dir):
|
||||
"""Create memory cache with 1MB limit."""
|
||||
return MemoryCache(hasher, max_size_mb=1, temp_dir=temp_dir)
|
||||
|
||||
def test_write_and_read(self, memory_cache, temp_dir):
|
||||
"""Test basic write and read functionality."""
|
||||
# Create test file
|
||||
test_file = temp_dir / "test.txt"
|
||||
test_content = b"Hello, memory cache!"
|
||||
test_file.write_bytes(test_content)
|
||||
|
||||
# Compute expected SHA
|
||||
import hashlib
|
||||
|
||||
expected_sha = hashlib.sha256(test_content).hexdigest()
|
||||
|
||||
# Write to memory cache
|
||||
memory_cache.write_ref("test-bucket", "test-prefix", test_file)
|
||||
|
||||
# Read back
|
||||
retrieved_path = memory_cache.get_validated_ref("test-bucket", "test-prefix", expected_sha)
|
||||
|
||||
# Verify content
|
||||
assert retrieved_path.read_bytes() == test_content
|
||||
|
||||
def test_has_ref_true(self, memory_cache, temp_dir):
|
||||
"""Test has_ref returns True for existing content."""
|
||||
test_file = temp_dir / "test.txt"
|
||||
test_content = b"Test"
|
||||
test_file.write_bytes(test_content)
|
||||
|
||||
import hashlib
|
||||
|
||||
sha = hashlib.sha256(test_content).hexdigest()
|
||||
|
||||
memory_cache.write_ref("test-bucket", "test-prefix", test_file)
|
||||
|
||||
assert memory_cache.has_ref("test-bucket", "test-prefix", sha) is True
|
||||
|
||||
def test_has_ref_false(self, memory_cache):
|
||||
"""Test has_ref returns False for non-existent content."""
|
||||
assert memory_cache.has_ref("no-bucket", "no-prefix", "fakehash") is False
|
||||
|
||||
def test_cache_miss(self, memory_cache):
|
||||
"""Test cache miss error."""
|
||||
with pytest.raises(CacheMissError):
|
||||
memory_cache.get_validated_ref("no-bucket", "no-prefix", "fakehash")
|
||||
|
||||
def test_sha_mismatch_detection(self, memory_cache, temp_dir):
|
||||
"""Test that SHA mismatch is detected."""
|
||||
test_file = temp_dir / "test.txt"
|
||||
test_file.write_bytes(b"Content")
|
||||
|
||||
memory_cache.write_ref("test-bucket", "test-prefix", test_file)
|
||||
|
||||
# Try to read with wrong SHA
|
||||
with pytest.raises(CacheCorruptionError, match="SHA mismatch"):
|
||||
memory_cache.get_validated_ref("test-bucket", "test-prefix", "wrong_sha")
|
||||
|
||||
def test_lru_eviction(self, hasher, temp_dir):
|
||||
"""Test LRU eviction when cache is full."""
|
||||
# Create small cache (only 10KB)
|
||||
small_cache = MemoryCache(hasher, max_size_mb=0.01, temp_dir=temp_dir)
|
||||
|
||||
# Create files that will exceed cache limit
|
||||
file1 = temp_dir / "file1.txt"
|
||||
file2 = temp_dir / "file2.txt"
|
||||
file3 = temp_dir / "file3.txt"
|
||||
|
||||
# Each file is 5KB
|
||||
file1.write_bytes(b"A" * 5000)
|
||||
file2.write_bytes(b"B" * 5000)
|
||||
file3.write_bytes(b"C" * 5000)
|
||||
|
||||
# Write file1 and file2 (total 10KB, at limit)
|
||||
small_cache.write_ref("bucket", "prefix1", file1)
|
||||
small_cache.write_ref("bucket", "prefix2", file2)
|
||||
|
||||
# Verify both are in cache
|
||||
import hashlib
|
||||
|
||||
sha1 = hashlib.sha256(b"A" * 5000).hexdigest()
|
||||
sha2 = hashlib.sha256(b"B" * 5000).hexdigest()
|
||||
|
||||
assert small_cache.has_ref("bucket", "prefix1", sha1) is True
|
||||
assert small_cache.has_ref("bucket", "prefix2", sha2) is True
|
||||
|
||||
# Write file3 (5KB) - should evict file1 (LRU)
|
||||
small_cache.write_ref("bucket", "prefix3", file3)
|
||||
|
||||
# file1 should be evicted
|
||||
assert small_cache.has_ref("bucket", "prefix1", sha1) is False
|
||||
|
||||
# file2 and file3 should still be in cache
|
||||
sha3 = hashlib.sha256(b"C" * 5000).hexdigest()
|
||||
assert small_cache.has_ref("bucket", "prefix2", sha2) is True
|
||||
assert small_cache.has_ref("bucket", "prefix3", sha3) is True
|
||||
|
||||
def test_file_too_large_for_cache(self, hasher, temp_dir):
|
||||
"""Test error when file exceeds cache size limit."""
|
||||
small_cache = MemoryCache(hasher, max_size_mb=0.001, temp_dir=temp_dir) # 1KB limit
|
||||
|
||||
large_file = temp_dir / "large.txt"
|
||||
large_file.write_bytes(b"X" * 2000) # 2KB file
|
||||
|
||||
with pytest.raises(CacheCorruptionError, match="too large"):
|
||||
small_cache.write_ref("bucket", "prefix", large_file)
|
||||
|
||||
def test_evict_removes_from_memory(self, memory_cache, temp_dir):
|
||||
"""Test that evict removes content from memory."""
|
||||
test_file = temp_dir / "test.txt"
|
||||
test_content = b"Test"
|
||||
test_file.write_bytes(test_content)
|
||||
|
||||
import hashlib
|
||||
|
||||
sha = hashlib.sha256(test_content).hexdigest()
|
||||
|
||||
memory_cache.write_ref("test-bucket", "test-prefix", test_file)
|
||||
|
||||
# Verify it's in cache
|
||||
assert memory_cache.has_ref("test-bucket", "test-prefix", sha) is True
|
||||
|
||||
# Evict
|
||||
memory_cache.evict("test-bucket", "test-prefix")
|
||||
|
||||
# Verify it's gone
|
||||
assert memory_cache.has_ref("test-bucket", "test-prefix", sha) is False
|
||||
|
||||
def test_clear_removes_all(self, memory_cache, temp_dir):
|
||||
"""Test that clear removes all cached content."""
|
||||
# Add multiple files
|
||||
for i in range(3):
|
||||
test_file = temp_dir / f"test{i}.txt"
|
||||
test_file.write_bytes(f"Content {i}".encode())
|
||||
memory_cache.write_ref("bucket", f"prefix{i}", test_file)
|
||||
|
||||
# Verify cache is not empty
|
||||
assert memory_cache._current_size > 0
|
||||
assert len(memory_cache._cache) == 3
|
||||
|
||||
# Clear
|
||||
memory_cache.clear()
|
||||
|
||||
# Verify cache is empty
|
||||
assert memory_cache._current_size == 0
|
||||
assert len(memory_cache._cache) == 0
|
||||
assert len(memory_cache._access_order) == 0
|
||||
|
||||
def test_access_order_updated_on_read(self, memory_cache, temp_dir):
|
||||
"""Test that LRU access order is updated on reads."""
|
||||
# Create two files
|
||||
file1 = temp_dir / "file1.txt"
|
||||
file2 = temp_dir / "file2.txt"
|
||||
file1.write_bytes(b"File 1")
|
||||
file2.write_bytes(b"File 2")
|
||||
|
||||
# Write both
|
||||
memory_cache.write_ref("bucket", "prefix1", file1)
|
||||
memory_cache.write_ref("bucket", "prefix2", file2)
|
||||
|
||||
# Access order should be: [prefix1, prefix2]
|
||||
assert memory_cache._access_order[0] == ("bucket", "prefix1")
|
||||
assert memory_cache._access_order[1] == ("bucket", "prefix2")
|
||||
|
||||
# Read prefix1 again
|
||||
import hashlib
|
||||
|
||||
sha1 = hashlib.sha256(b"File 1").hexdigest()
|
||||
memory_cache.get_validated_ref("bucket", "prefix1", sha1)
|
||||
|
||||
# Access order should now be: [prefix2, prefix1]
|
||||
assert memory_cache._access_order[0] == ("bucket", "prefix2")
|
||||
assert memory_cache._access_order[1] == ("bucket", "prefix1")
|
||||
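The in-memory adapter exposes the same cache interface, but bounds total size and evicts least-recently-used entries. A usage sketch under the same assumptions:

import hashlib
from pathlib import Path

from deltaglider.adapters import MemoryCache, Sha256Adapter

# Sketch only: constructor arguments follow the fixtures in the tests above.
cache = MemoryCache(Sha256Adapter(), max_size_mb=64, temp_dir=Path("/tmp/dg-mem"))

data = Path("/tmp/reference.bin")
cache.write_ref("my-bucket", "releases", data)
sha = hashlib.sha256(data.read_bytes()).hexdigest()
if cache.has_ref("my-bucket", "releases", sha):
    local = cache.get_validated_ref("my-bucket", "releases", sha)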
@@ -147,22 +147,36 @@ class TestDeltaServiceGet:
        service.get(delta_key, temp_dir / "output.zip")

    def test_get_missing_metadata(self, service, mock_storage, temp_dir):
        """Test get with missing metadata."""
        """Test get with missing metadata (regular S3 object)."""
        # Setup
        delta_key = ObjectKey(bucket="test-bucket", key="test/file.zip.delta")

        # Create test content
        test_content = b"regular S3 file content"

        # Mock a regular S3 object without DeltaGlider metadata
        mock_storage.head.return_value = ObjectHead(
            key="test/file.zip.delta",
            size=100,
            size=len(test_content),
            etag="abc",
            last_modified=None,
            metadata={},  # Missing required metadata
            metadata={},  # Missing DeltaGlider metadata - this is a regular S3 object
        )

        # Execute and verify
        from deltaglider.core.errors import StorageIOError
        # Mock the storage.get to return the content
        from unittest.mock import MagicMock

        with pytest.raises(StorageIOError):
            service.get(delta_key, temp_dir / "output.zip")
        mock_stream = MagicMock()
        mock_stream.read.side_effect = [test_content, b""]  # Return content then EOF
        mock_storage.get.return_value = mock_stream

        # Execute - should successfully download regular S3 object
        output_path = temp_dir / "output.zip"
        service.get(delta_key, output_path)

        # Verify - file should be downloaded
        assert output_path.exists()
        assert output_path.read_bytes() == test_content


class TestDeltaServiceVerify: