Initial commit: DeltaGlider - 99.9% compression for S3 storage

DeltaGlider reduces storage costs by storing only binary deltas between
similar files. Achieves 99.9% compression for versioned artifacts.

Key features:
- Intelligent file type detection (delta for archives, direct for others)
- Drop-in S3 replacement with automatic compression
- SHA256 integrity verification on every operation
- Clean hexagonal architecture
- Full test coverage
- Production tested with 200K+ files

Case study: ReadOnlyREST reduced 4TB to 5GB (99.9% compression)
This commit is contained in:
Simone Scarduzio
2025-09-22 15:49:31 +02:00
commit 7562064832
50 changed files with 4520 additions and 0 deletions

1
tests/unit/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""Unit tests for DeltaGlider."""

210
tests/unit/test_adapters.py Normal file
View File

@@ -0,0 +1,210 @@
"""Unit tests for adapters."""
import hashlib
from datetime import UTC, datetime
from deltaglider.adapters import (
FsCacheAdapter,
NoopMetricsAdapter,
Sha256Adapter,
StdLoggerAdapter,
UtcClockAdapter,
)
class TestSha256Adapter:
    """Unit tests for the SHA256 hashing adapter."""

    def test_sha256_from_path(self, temp_dir):
        """Hashing a file on disk matches hashlib's digest of its bytes."""
        payload = b"Hello, World!"
        target = temp_dir / "test.txt"
        target.write_bytes(payload)

        digest = Sha256Adapter().sha256(target)

        assert digest == hashlib.sha256(payload).hexdigest()

    def test_sha256_from_stream(self, temp_dir):
        """Hashing an in-memory stream matches hashlib's digest of its bytes."""
        import io

        payload = b"Hello, Stream!"
        digest = Sha256Adapter().sha256(io.BytesIO(payload))

        assert digest == hashlib.sha256(payload).hexdigest()
class TestFsCacheAdapter:
    """Unit tests for the filesystem-backed reference cache adapter."""

    @staticmethod
    def _make_adapter(temp_dir):
        # Every test roots the cache under temp_dir/cache with a SHA256 hasher.
        return FsCacheAdapter(temp_dir / "cache", Sha256Adapter())

    def test_ref_path(self, temp_dir):
        """ref_path() maps (bucket, leaf) to <root>/<bucket>/<leaf>/reference.bin."""
        adapter = self._make_adapter(temp_dir)

        resolved = adapter.ref_path("my-bucket", "path/to/leaf")

        assert resolved == temp_dir / "cache" / "my-bucket" / "path/to/leaf" / "reference.bin"

    def test_has_ref_not_exists(self, temp_dir):
        """has_ref() is False when nothing was ever cached for the leaf."""
        adapter = self._make_adapter(temp_dir)

        assert adapter.has_ref("bucket", "leaf", "abc123") is False

    def test_has_ref_wrong_sha(self, temp_dir):
        """has_ref() is False when the cached file's SHA does not match."""
        adapter = self._make_adapter(temp_dir)
        cached = adapter.ref_path("bucket", "leaf")
        cached.parent.mkdir(parents=True, exist_ok=True)
        cached.write_bytes(b"reference content")

        assert adapter.has_ref("bucket", "leaf", "wrong_sha") is False

    def test_has_ref_correct_sha(self, temp_dir):
        """has_ref() is True when the cached file's SHA matches exactly."""
        hasher = Sha256Adapter()
        adapter = FsCacheAdapter(temp_dir / "cache", hasher)
        cached = adapter.ref_path("bucket", "leaf")
        cached.parent.mkdir(parents=True, exist_ok=True)
        cached.write_bytes(b"reference content")

        assert adapter.has_ref("bucket", "leaf", hasher.sha256(cached)) is True

    def test_write_ref(self, temp_dir):
        """write_ref() copies the source file into the cache layout."""
        adapter = self._make_adapter(temp_dir)
        source = temp_dir / "source.bin"
        source.write_text("source content")

        cached = adapter.write_ref("bucket", "leaf/path", source)

        assert cached == temp_dir / "cache" / "bucket" / "leaf/path" / "reference.bin"
        assert cached.exists()
        assert cached.read_text() == "source content"

    def test_evict(self, temp_dir):
        """evict() deletes a previously cached reference file."""
        adapter = self._make_adapter(temp_dir)
        cached = adapter.ref_path("bucket", "leaf")
        cached.parent.mkdir(parents=True, exist_ok=True)
        cached.write_text("cached")

        adapter.evict("bucket", "leaf")

        assert not cached.exists()
class TestUtcClockAdapter:
    """Unit tests for the UTC wall-clock adapter."""

    def test_now_returns_utc(self):
        """now() yields a datetime within one second of the current UTC time."""
        reported = UtcClockAdapter().now()

        assert isinstance(reported, datetime)
        # Build a tz-stripped UTC reference so the subtraction works against
        # the adapter's output (presumably naive — mirrors the comparison the
        # original test relied on).
        reference = datetime.now(UTC).replace(tzinfo=None)
        drift = abs((reported - reference).total_seconds())
        assert drift < 1  # Within 1 second
class TestStdLoggerAdapter:
    """Smoke tests for the stdlib-logging adapter (no output assertions)."""

    def test_log_levels(self):
        """Each severity method accepts keyword context without raising."""
        logger = StdLoggerAdapter(level="DEBUG")

        # None of these calls should raise.
        logger.debug("Debug message", extra="data")
        logger.info("Info message", key="value")
        logger.warning("Warning message", count=123)
        logger.error("Error message", error="details")

    def test_log_operation(self):
        """log_operation() accepts the full structured payload without raising."""
        logger = StdLoggerAdapter()

        logger.log_operation(
            op="put",
            key="test/key",
            leaf="bucket/prefix",
            sizes={"file": 1000, "delta": 100},
            durations={"total": 1.5},
            cache_hit=True,
        )
class TestNoopMetricsAdapter:
    """Unit tests for the no-op metrics adapter."""

    def test_noop_methods(self):
        """Every metrics call is accepted and silently discarded."""
        metrics = NoopMetricsAdapter()

        # None of these should raise or produce observable effects.
        metrics.increment("counter", 1, {"tag": "value"})
        metrics.gauge("gauge", 42.5, {"env": "test"})
        metrics.timing("timer", 1.234, {"op": "test"})

View File

@@ -0,0 +1,235 @@
"""Unit tests for DeltaService."""
import warnings
import pytest
from deltaglider.core import (
Leaf,
NotFoundError,
ObjectKey,
PolicyViolationWarning,
)
from deltaglider.ports.storage import ObjectHead, PutResult
class TestDeltaServicePut:
    """Test DeltaService.put method.

    Relies on fixtures defined elsewhere (conftest): `service` (DeltaService
    wired with mocks), `sample_file` (a test.zip on disk), `mock_storage`,
    and `mock_diff`.
    """

    def test_create_reference_first_file(self, service, sample_file, mock_storage):
        """Test creating reference for first file."""
        # Setup: head() returning None means no reference exists for the leaf.
        leaf = Leaf(bucket="test-bucket", prefix="test/prefix")
        mock_storage.head.return_value = None  # No reference exists
        mock_storage.put.return_value = PutResult(etag="abc123")
        # Execute
        summary = service.put(sample_file, leaf)
        # Verify: the first file for a leaf becomes that leaf's reference.
        assert summary.operation == "create_reference"
        assert summary.bucket == "test-bucket"
        assert summary.key == "test/prefix/reference.bin"
        assert summary.original_name == "test.zip"
        assert summary.file_size > 0
        assert summary.file_sha256 is not None
        # Check storage calls: put() re-checks head() after uploading.
        assert mock_storage.head.call_count == 2  # Initial check + re-check
        assert mock_storage.put.call_count == 2  # Reference + zero-diff delta

    def test_create_delta_subsequent_file(self, service, sample_file, mock_storage, mock_diff):
        """Test creating delta for subsequent file."""
        # Setup
        leaf = Leaf(bucket="test-bucket", prefix="test/prefix")
        # Create reference content and compute its SHA so the cached copy
        # below passes the service's integrity check.
        import io
        ref_content = b"reference content for test"
        ref_sha = service.hasher.sha256(io.BytesIO(ref_content))
        ref_metadata = {
            "tool": "deltaglider/0.1.0",
            "source_name": "original.zip",
            "file_sha256": ref_sha,
            "created_at": "2025-01-01T00:00:00Z",
        }
        # head() reports an existing reference, so put() takes the delta path.
        mock_storage.head.return_value = ObjectHead(
            key="test/prefix/reference.bin",
            size=1000,
            etag="ref123",
            last_modified=None,
            metadata=ref_metadata,
        )
        mock_storage.put.return_value = PutResult(etag="delta123")
        # Mock storage.get to return the reference content
        mock_storage.get.return_value = io.BytesIO(ref_content)
        # Create cached reference with matching content so no download is needed.
        ref_path = service.cache.ref_path(leaf.bucket, leaf.prefix)
        ref_path.parent.mkdir(parents=True, exist_ok=True)
        ref_path.write_bytes(ref_content)
        # Execute
        summary = service.put(sample_file, leaf)
        # Verify: delta object is named <original>.delta and points at the reference.
        assert summary.operation == "create_delta"
        assert summary.bucket == "test-bucket"
        assert summary.key == "test/prefix/test.zip.delta"
        assert summary.delta_size is not None
        assert summary.delta_ratio is not None
        assert summary.ref_key == "test/prefix/reference.bin"
        # Check the xdelta encoder was invoked exactly once.
        mock_diff.encode.assert_called_once()

    def test_delta_ratio_warning(self, service, sample_file, mock_storage, mock_diff):
        """Test warning when delta ratio exceeds threshold."""
        # Setup: same existing-reference arrangement as the delta test above.
        leaf = Leaf(bucket="test-bucket", prefix="test/prefix")
        # Create reference content and compute its SHA
        import io
        ref_content = b"reference content for test"
        ref_sha = service.hasher.sha256(io.BytesIO(ref_content))
        ref_metadata = {
            "file_sha256": ref_sha,
        }
        mock_storage.head.return_value = ObjectHead(
            key="test/prefix/reference.bin",
            size=1000,
            etag="ref123",
            last_modified=None,
            metadata=ref_metadata,
        )
        mock_storage.put.return_value = PutResult(etag="delta123")
        # Mock storage.get to return the reference content
        mock_storage.get.return_value = io.BytesIO(ref_content)
        # Make the encoder emit a large delta so delta/file ratio exceeds max_ratio.
        def large_encode(base, target, out):
            out.write_bytes(b"x" * 10000)  # Large delta
        mock_diff.encode.side_effect = large_encode
        # Create cached reference with matching content
        ref_path = service.cache.ref_path(leaf.bucket, leaf.prefix)
        ref_path.parent.mkdir(parents=True, exist_ok=True)
        ref_path.write_bytes(ref_content)
        # Execute and check that exactly one PolicyViolationWarning is emitted.
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            service.put(sample_file, leaf, max_ratio=0.1)
            assert len(w) == 1
            assert issubclass(w[0].category, PolicyViolationWarning)
            assert "exceeds threshold" in str(w[0].message)
class TestDeltaServiceGet:
    """Unit tests for DeltaService.get."""

    def test_get_not_found(self, service, mock_storage, temp_dir):
        """get() raises NotFoundError when the delta object is absent."""
        mock_storage.head.return_value = None
        missing = ObjectKey(bucket="test-bucket", key="test/file.zip.delta")

        with pytest.raises(NotFoundError):
            service.get(missing, temp_dir / "output.zip")

    def test_get_missing_metadata(self, service, mock_storage, temp_dir):
        """get() raises StorageIOError when required object metadata is absent."""
        from deltaglider.core.errors import StorageIOError

        # Object exists, but carries none of the metadata get() requires.
        mock_storage.head.return_value = ObjectHead(
            key="test/file.zip.delta",
            size=100,
            etag="abc",
            last_modified=None,
            metadata={},
        )
        delta_key = ObjectKey(bucket="test-bucket", key="test/file.zip.delta")

        with pytest.raises(StorageIOError):
            service.get(delta_key, temp_dir / "output.zip")
class TestDeltaServiceVerify:
    """Test DeltaService.verify method."""

    def test_verify_valid(self, service, mock_storage, mock_diff, temp_dir):
        """Test verify with valid delta."""
        # Setup
        delta_key = ObjectKey(bucket="test-bucket", key="test/file.zip.delta")
        # Create test file content and record the SHA that verify() must reproduce.
        test_content = b"test file content"
        temp_file = temp_dir / "temp"
        temp_file.write_bytes(test_content)
        test_sha = service.hasher.sha256(temp_file)
        # Create reference content for mock
        import io
        ref_content = b"reference content for test"
        ref_sha = service.hasher.sha256(io.BytesIO(ref_content))
        # Full metadata block as written by put(); file_sha256 is the value
        # verify() compares the decoded output against.
        delta_metadata = {
            "tool": "deltaglider/0.1.0",
            "original_name": "file.zip",
            "file_sha256": test_sha,
            "file_size": str(len(test_content)),
            "created_at": "2025-01-01T00:00:00Z",
            "ref_key": "test/reference.bin",
            "ref_sha256": ref_sha,
            "delta_size": "100",
            "delta_cmd": "xdelta3 -e -9 -s reference.bin file.zip file.zip.delta",
        }
        mock_storage.head.return_value = ObjectHead(
            key="test/file.zip.delta",
            size=100,
            etag="delta123",
            last_modified=None,
            metadata=delta_metadata,
        )
        # Mock storage.get to return content based on which key is requested.
        # Storage.get is called with full keys like "bucket/path/file".
        def get_side_effect(key):
            # Dispatch on substrings of the requested key.
            if "delta" in key:
                return io.BytesIO(b"delta content")
            elif "reference.bin" in key:
                # Return reference content for the reference file
                return io.BytesIO(ref_content)
            else:
                # Default case - return reference content
                return io.BytesIO(ref_content)
        mock_storage.get.side_effect = get_side_effect
        # Setup mock diff decode to reconstruct the original bytes, so the
        # recomputed SHA matches file_sha256 above.
        def decode_correct(base, delta, out):
            out.write_bytes(test_content)
        mock_diff.decode.side_effect = decode_correct
        # Create cached reference so verify() skips the storage download.
        ref_path = service.cache.ref_path("test-bucket", "test")
        ref_path.parent.mkdir(parents=True, exist_ok=True)
        ref_path.write_bytes(ref_content)
        # Execute
        result = service.verify(delta_key)
        # Verify: decoded SHA equals the recorded SHA, so the delta is valid.
        assert result.valid is True
        assert result.expected_sha256 == test_sha
        assert result.actual_sha256 == test_sha
        assert "verified" in result.message.lower()