Initial commit: DeltaGlider - 99.9% compression for S3 storage

DeltaGlider reduces storage costs by storing only binary deltas between
similar files. Achieves 99.9% compression for versioned artifacts.

Key features:
- Intelligent file type detection (delta for archives, direct for others)
- Drop-in S3 replacement with automatic compression
- SHA256 integrity verification on every operation
- Clean hexagonal architecture
- Full test coverage
- Production tested with 200K+ files

Case study: ReadOnlyREST reduced 4TB to 5GB (99.9% compression)
This commit is contained in:
Simone Scarduzio
2025-09-22 15:49:31 +02:00
commit 7562064832
50 changed files with 4520 additions and 0 deletions

View File

@@ -0,0 +1 @@
"""Integration tests for DeltaGlider."""

View File

@@ -0,0 +1,191 @@
"""Integration test for full put/get workflow."""
import io
import tempfile
from pathlib import Path
from unittest.mock import Mock
import pytest
from deltaglider.core import DeltaService, Leaf, ObjectKey
def test_full_put_get_workflow(service, temp_dir, mock_storage, mock_diff):
    """Test complete workflow: put a file, then get it back.

    An in-memory dict (``storage_data``) stands in for the S3 backend; the
    three closures below are wired into ``mock_storage`` so the service's
    put/get round-trip can be observed end to end without real storage.
    Keys in ``storage_data`` are stored as ``"<bucket>/<key>"``.
    """
    # Create test files
    file1_content = b"This is the first version of the file."
    file2_content = b"This is the second version of the file with changes."
    file1 = temp_dir / "version1.txt"
    file2 = temp_dir / "version2.txt"
    output_file = temp_dir / "recovered.txt"
    file1.write_bytes(file1_content)
    file2.write_bytes(file2_content)

    # Set up mock_diff decode to write the target content (we don't run a
    # real xdelta3 here, so "decoding" just materializes version 2's bytes).
    def decode_side_effect(base, delta, out):
        out.write_bytes(file2_content)

    mock_diff.decode.side_effect = decode_side_effect
    leaf = Leaf(bucket="test-bucket", prefix="test/data")

    # Storage state tracking: shared mutable state captured by the closures.
    storage_data = {}

    def mock_head(key):
        """Mock head_object. Returns the stored head or None if absent."""
        if key in storage_data:
            return storage_data[key]["head"]
        return None

    def mock_put(key, body, metadata, content_type="application/octet-stream"):
        """Mock put_object. Accepts Path, bytes, or file-like bodies."""
        from deltaglider.ports.storage import PutResult, ObjectHead
        # Read content if it's a Path
        if isinstance(body, Path):
            content = body.read_bytes()
        elif isinstance(body, bytes):
            content = body
        else:
            content = body.read()
        storage_data[key] = {
            "content": content,
            "head": ObjectHead(
                # Strip the leading "<bucket>/" so the head carries the bare key.
                key=key.split("/", 1)[1],
                size=len(content),
                etag="mock-etag",
                last_modified=None,
                metadata=metadata,
            )
        }
        return PutResult(etag="mock-etag")

    def mock_get(key):
        """Mock get_object. Returns a fresh BytesIO over the stored content."""
        # The key might come without bucket prefix, so check both formats
        if key in storage_data:
            return io.BytesIO(storage_data[key]["content"])
        # Also try with test-bucket prefix if not found
        full_key = f"test-bucket/{key}" if not key.startswith("test-bucket/") else key
        if full_key in storage_data:
            return io.BytesIO(storage_data[full_key]["content"])
        raise FileNotFoundError(f"Object not found: {key}")

    mock_storage.head.side_effect = mock_head
    mock_storage.put.side_effect = mock_put
    mock_storage.get.side_effect = mock_get

    # Step 1: Put the first file (creates reference)
    summary1 = service.put(file1, leaf)
    assert summary1.operation == "create_reference"
    assert summary1.key == "test/data/reference.bin"

    # Verify reference was stored
    ref_key = f"{leaf.bucket}/{leaf.reference_key()}"
    assert ref_key in storage_data
    assert storage_data[ref_key]["content"] == file1_content

    # Step 2: Put the second file (creates delta)
    summary2 = service.put(file2, leaf)
    assert summary2.operation == "create_delta"
    assert summary2.key == "test/data/version2.txt.delta"
    assert summary2.delta_size is not None
    assert summary2.ref_key == "test/data/reference.bin"

    # Verify delta was stored
    delta_key = f"{leaf.bucket}/{summary2.key}"
    assert delta_key in storage_data

    # Step 3: Get the delta file back
    obj_key = ObjectKey(bucket=leaf.bucket, key=summary2.key)
    service.get(obj_key, output_file)

    # Step 4: Verify the recovered file matches the original
    recovered_content = output_file.read_bytes()
    assert recovered_content == file2_content
def test_get_with_auto_delta_suffix(service, temp_dir, mock_storage, mock_diff):
    """Test get command behavior when .delta suffix is auto-appended.

    Reuses the same in-memory storage scaffolding as the full round-trip
    test: ``storage_data`` maps ``"<bucket>/<key>"`` to stored content and
    a fake head record.
    """
    # Create test file
    file_content = b"Test file content for auto-suffix test."
    test_file = temp_dir / "mydata.zip"
    test_file.write_bytes(file_content)

    # Set up mock_diff decode to write the target content (no real xdelta3).
    def decode_side_effect(base, delta, out):
        out.write_bytes(file_content)

    mock_diff.decode.side_effect = decode_side_effect
    leaf = Leaf(bucket="test-bucket", prefix="archive")

    # Storage state tracking: shared mutable state captured by the closures.
    storage_data = {}

    def mock_head(key):
        """Mock head_object. Returns the stored head or None if absent."""
        if key in storage_data:
            return storage_data[key]["head"]
        return None

    def mock_put(key, body, metadata, content_type="application/octet-stream"):
        """Mock put_object. Accepts Path, bytes, or file-like bodies."""
        from deltaglider.ports.storage import PutResult, ObjectHead
        # Read content if it's a Path
        if isinstance(body, Path):
            content = body.read_bytes()
        elif isinstance(body, bytes):
            content = body
        else:
            content = body.read()
        storage_data[key] = {
            "content": content,
            "head": ObjectHead(
                # Strip the leading "<bucket>/" so the head carries the bare key.
                key=key.split("/", 1)[1],
                size=len(content),
                etag="mock-etag",
                last_modified=None,
                metadata=metadata,
            )
        }
        return PutResult(etag="mock-etag")

    def mock_get(key):
        """Mock get_object. Returns a fresh BytesIO over the stored content."""
        # The key might come without bucket prefix, so check both formats
        if key in storage_data:
            return io.BytesIO(storage_data[key]["content"])
        # Also try with test-bucket prefix if not found
        full_key = f"test-bucket/{key}" if not key.startswith("test-bucket/") else key
        if full_key in storage_data:
            return io.BytesIO(storage_data[full_key]["content"])
        raise FileNotFoundError(f"Object not found: {key}")

    mock_storage.head.side_effect = mock_head
    mock_storage.put.side_effect = mock_put
    mock_storage.get.side_effect = mock_get

    # Put the file
    summary = service.put(test_file, leaf)

    # Get it back using original name (without .delta)
    # The service should internally look for "mydata.zip.delta"
    output_file = temp_dir / "recovered.zip"

    # Use the key without .delta suffix
    if summary.operation == "create_reference":
        # If it's a reference, the zero-diff delta was created
        # (presumably alongside the reference — confirmed by the get below).
        obj_key = ObjectKey(bucket=leaf.bucket, key="archive/mydata.zip.delta")
    else:
        obj_key = ObjectKey(bucket=leaf.bucket, key=summary.key)
    service.get(obj_key, output_file)

    # Verify the recovered file matches the original
    recovered_content = output_file.read_bytes()
    assert recovered_content == file_content

View File

@@ -0,0 +1,135 @@
"""Integration test for get command."""
import tempfile
from pathlib import Path
from unittest.mock import Mock, patch
import pytest
from click.testing import CliRunner
from deltaglider.app.cli.main import cli
from deltaglider.core import ObjectKey
@pytest.fixture
def mock_service():
    """Provide a Mock standing in for DeltaService in CLI tests."""
    service = Mock()
    return service
def test_get_command_with_original_name(mock_service):
    """Test get command with original filename (auto-appends .delta)."""
    mock_service.get = Mock()
    runner = CliRunner()

    with patch("deltaglider.app.cli.main.create_service", return_value=mock_service):
        # Invoke get with the original name; the CLI should resolve .delta.
        result = runner.invoke(cli, ["get", "s3://test-bucket/data/myfile.zip"])

    # Command succeeds and reports both the .delta lookup and the rebuild.
    assert result.exit_code == 0
    assert "Looking for delta file: s3://test-bucket/data/myfile.zip.delta" in result.output
    assert "Successfully reconstructed: myfile.zip" in result.output

    # The service receives the resolved delta key and the default output path.
    mock_service.get.assert_called_once()
    (obj_key, output_path), _kwargs = mock_service.get.call_args
    assert isinstance(obj_key, ObjectKey)
    assert obj_key.bucket == "test-bucket"
    assert obj_key.key == "data/myfile.zip.delta"
    assert output_path == Path("myfile.zip")
def test_get_command_with_delta_name(mock_service):
    """Test get command with explicit .delta filename."""
    mock_service.get = Mock()
    runner = CliRunner()

    with patch("deltaglider.app.cli.main.create_service", return_value=mock_service):
        # Invoke get with the .delta key spelled out explicitly.
        result = runner.invoke(cli, ["get", "s3://test-bucket/data/myfile.zip.delta"])

    assert result.exit_code == 0
    # No suffix-resolution message should appear for an explicit .delta key.
    assert "Looking for delta file" not in result.output  # Should not print this message
    assert "Successfully reconstructed: myfile.zip" in result.output

    # The service is called with the delta key as given, output name derived.
    mock_service.get.assert_called_once()
    (obj_key, output_path), _kwargs = mock_service.get.call_args
    assert isinstance(obj_key, ObjectKey)
    assert obj_key.bucket == "test-bucket"
    assert obj_key.key == "data/myfile.zip.delta"
    assert output_path == Path("myfile.zip")
def test_get_command_with_output_option(mock_service):
    """Test get command with custom output path."""
    mock_service.get = Mock()
    runner = CliRunner()

    with patch("deltaglider.app.cli.main.create_service", return_value=mock_service):
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = Path(tmpdir) / "custom_output.zip"

            # Invoke get with an explicit -o destination.
            result = runner.invoke(
                cli,
                ["get", "s3://test-bucket/data/myfile.zip", "-o", str(output_file)],
            )

            assert result.exit_code == 0
            assert f"Successfully reconstructed: {output_file}" in result.output

            # The custom path must be forwarded to the service unchanged.
            mock_service.get.assert_called_once()
            (obj_key, output_path), _kwargs = mock_service.get.call_args
            assert isinstance(obj_key, ObjectKey)
            assert obj_key.bucket == "test-bucket"
            assert obj_key.key == "data/myfile.zip.delta"
            assert output_path == output_file
def test_get_command_error_handling(mock_service):
    """Test get command error handling."""
    # Simulate a missing delta so the CLI has a failure to surface.
    mock_service.get = Mock(side_effect=FileNotFoundError("Delta not found"))
    runner = CliRunner()

    with patch("deltaglider.app.cli.main.create_service", return_value=mock_service):
        result = runner.invoke(cli, ["get", "s3://test-bucket/data/missing.zip"])

    # The CLI exits non-zero and echoes the underlying error message.
    assert result.exit_code == 1
    assert "Error: Delta not found" in result.output
def test_get_command_invalid_url():
    """Test get command with invalid S3 URL."""
    runner = CliRunner()

    # A non-s3:// URL must be rejected with a clear error.
    result = runner.invoke(cli, ["get", "http://invalid-url/file.zip"])

    assert result.exit_code == 1
    assert "Error: Invalid S3 URL" in result.output

View File

@@ -0,0 +1,106 @@
"""Integration tests for xdelta3."""
import pytest
from deltaglider.adapters import XdeltaAdapter
@pytest.mark.usefixtures("skip_if_no_xdelta")
class TestXdeltaIntegration:
    """Exercise the XdeltaAdapter against a real xdelta3 installation.

    All tests are skipped (via the ``skip_if_no_xdelta`` fixture) when the
    xdelta3 binary is unavailable.
    """

    def test_encode_decode_roundtrip(self, temp_dir):
        """Encoding then decoding must reproduce the target file exactly."""
        adapter = XdeltaAdapter()

        # Prepare a base file and a modified target file.
        base_file = temp_dir / "base.txt"
        target_file = temp_dir / "target.txt"
        base_file.write_text("This is the base file content.")
        target_file.write_text("This is the modified target file content with changes.")

        delta_file = temp_dir / "delta.bin"
        restored_file = temp_dir / "output.txt"

        # Encoding must produce a non-empty delta on disk.
        adapter.encode(base_file, target_file, delta_file)
        assert delta_file.exists()
        assert delta_file.stat().st_size > 0

        # Applying the delta to the base must recover the target.
        adapter.decode(base_file, delta_file, restored_file)
        assert restored_file.read_text() == target_file.read_text()

    def test_encode_identical_files(self, temp_dir):
        """Identical inputs should yield a delta far smaller than the content."""
        adapter = XdeltaAdapter()

        shared_content = "This is identical content in both files." * 100
        base_file = temp_dir / "base.txt"
        base_file.write_text(shared_content)
        target_file = temp_dir / "target.txt"
        target_file.write_text(shared_content)

        delta_file = temp_dir / "delta.bin"
        adapter.encode(base_file, target_file, delta_file)

        # With no differences the delta should be under 10% of the original.
        assert delta_file.exists()
        assert delta_file.stat().st_size < len(shared_content) / 10

    def test_encode_completely_different_files(self, temp_dir):
        """Encoding two unrelated files still succeeds."""
        adapter = XdeltaAdapter()

        base_file = temp_dir / "base.txt"
        base_file.write_text("A" * 1000)
        target_file = temp_dir / "target.txt"
        target_file.write_text("B" * 1000)

        delta_file = temp_dir / "delta.bin"
        adapter.encode(base_file, target_file, delta_file)

        # With nothing shared, the delta is roughly target-sized; xdelta3's
        # internal compression may still shrink it somewhat, so only assert
        # that the file was produced.
        assert delta_file.exists()

    def test_encode_binary_files(self, temp_dir):
        """Round-trip binary (non-text) content through encode/decode."""
        adapter = XdeltaAdapter()

        base_file = temp_dir / "base.bin"
        base_file.write_bytes(b"\x00\x01\x02\x03" * 256)
        target_file = temp_dir / "target.bin"
        target_file.write_bytes(b"\x00\x01\x02\x03" * 200 + b"\xFF\xFE\xFD\xFC" * 56)

        delta_file = temp_dir / "delta.bin"
        restored_file = temp_dir / "output.bin"

        adapter.encode(base_file, target_file, delta_file)
        adapter.decode(base_file, delta_file, restored_file)

        # The decoded bytes must match the target byte for byte.
        assert restored_file.read_bytes() == target_file.read_bytes()