From 90a342dc33bb71503ec2727e3784a00274658320 Mon Sep 17 00:00:00 2001 From: Simone Scarduzio Date: Fri, 10 Oct 2025 09:06:29 +0200 Subject: [PATCH] feat: Implement Content-Addressed Storage (CAS) cache Implemented SHA256-based Content-Addressed Storage to eliminate cache collisions and enable automatic deduplication. Key Features: - Zero collision risk: SHA256 namespace guarantees uniqueness - Automatic deduplication: same content = same filename - Tampering protection: changing content changes SHA, breaks lookup - Two-level directory structure (ab/cd/abcdef...) for filesystem optimization Changes: - Added ContentAddressedCache adapter in adapters/cache_cas.py - Updated CLI and SDK to use CAS instead of FsCacheAdapter - Updated all tests to use ContentAddressedCache - Documented CAS architecture in CLAUDE.md and SECURITY_FIX_ROADMAP.md Security Benefits: - Eliminates cross-endpoint collision vulnerabilities - Self-describing cache (filename IS the checksum) - Natural cache validation without external metadata All quality checks passing: - 99 tests passing (0 failures) - Type checking: 0 errors (mypy) - Linting: All checks passed (ruff) Completed Phase 2 of SECURITY_FIX_ROADMAP.md --- CHANGELOG.md | 17 +- CLAUDE.md | 5 +- SECURITY_FIX_ROADMAP.md | 18 +- src/deltaglider/adapters/__init__.py | 2 + src/deltaglider/adapters/cache_cas.py | 246 ++++++++++++++++++++++++++ src/deltaglider/app/cli/main.py | 7 +- src/deltaglider/client.py | 7 +- tests/conftest.py | 6 +- 8 files changed, 291 insertions(+), 17 deletions(-) create mode 100644 src/deltaglider/adapters/cache_cas.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 4219e22..3a76f92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Security - **BREAKING**: Removed all legacy shared cache code for security - Ephemeral process-isolated cache is now the ONLY mode (no opt-out) +- **Content-Addressed Storage (CAS)**: 
Implemented SHA256-based cache storage + - Zero collision risk (SHA256 namespace guarantees uniqueness) + - Automatic deduplication (same content = same filename) + - Tampering protection (changing content changes SHA, breaks lookup) + - Two-level directory structure for filesystem optimization - Fixed TOCTOU vulnerabilities with atomic SHA validation at use-time - Added `get_validated_ref()` method to prevent cache poisoning - Eliminated multi-user data exposure through mandatory cache isolation @@ -26,10 +31,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - All cache operations use file locking (Unix) and SHA validation - Added `CacheMissError` and `CacheCorruptionError` exceptions +### Added +- New `ContentAddressedCache` adapter in `adapters/cache_cas.py` +- Self-describing cache structure with SHA256-based filenames + ### Internal -- Updated all tests to use ephemeral cache -- All 99 tests passing -- Completed Phase 1 of SECURITY_FIX_ROADMAP.md +- Updated all tests to use Content-Addressed Storage +- All 99 tests passing with zero errors +- Type checking: 0 errors (mypy) +- Linting: All checks passed (ruff) +- Completed Phase 1 & Phase 2 of SECURITY_FIX_ROADMAP.md ## [5.0.1] - 2025-01-10 diff --git a/CLAUDE.md b/CLAUDE.md index 31ec0ee..ed0db00 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -140,7 +140,10 @@ src/deltaglider/ 2. **Reference Management** (`core/service.py`): - Reference stored at `{deltaspace.prefix}/reference.bin` - SHA256 verification on every read/write - - Local cache in `/tmp/.deltaglider/reference_cache` for performance + - **Content-Addressed Storage (CAS)** cache in `/tmp/deltaglider-*` (ephemeral) + - Cache uses SHA256 as filename with two-level directory structure (ab/cd/abcdef...) + - Automatic deduplication: same content = same SHA = same cache file + - Zero collision risk: SHA256 namespace guarantees uniqueness 3. 
**Sync Algorithm** (`app/cli/sync.py`): - Compares local vs S3 using size and modification time diff --git a/SECURITY_FIX_ROADMAP.md b/SECURITY_FIX_ROADMAP.md index 73cbfdb..9b9db2b 100644 --- a/SECURITY_FIX_ROADMAP.md +++ b/SECURITY_FIX_ROADMAP.md @@ -86,10 +86,10 @@ ref_path = self.cache.get_validated_ref( --- -### **DAY 3-5: Quick Wins** (v5.1.0) +### **DAY 3-5: Quick Wins** (v5.0.3) ✅ COMPLETED *Low-risk improvements with high security impact* -#### 4. **Implement Content-Addressed Storage** (4 hours) +#### 4. **Implement Content-Addressed Storage** (4 hours) ✅ COMPLETED ```python # src/deltaglider/adapters/cache_cas.py class ContentAddressedCache(CachePort): @@ -122,11 +122,17 @@ class ContentAddressedCache(CachePort): return path ``` -**Benefits**: -- Same file cached once regardless of bucket/prefix -- Automatic deduplication -- No collision possible (SHA256 uniqueness) +**Benefits**: ✅ ACHIEVED +- Same file cached once regardless of bucket/prefix (automatic deduplication) +- No collision possible (SHA256 uniqueness guarantees) - Natural cache validation (filename IS the checksum) +- Two-level directory structure (ab/cd/abcdef...) for filesystem optimization + +**Implementation**: Complete in `src/deltaglider/adapters/cache_cas.py` with: +- `_cas_path()` method for SHA256-based path computation +- `get_validated_ref()` with atomic validation and locking +- `write_ref()` with atomic temp-file + rename pattern +- Ephemeral deltaspace-to-SHA mapping for compatibility #### 5. 
**Add Secure Directory Creation** (2 hours)
 ```python
diff --git a/src/deltaglider/adapters/__init__.py b/src/deltaglider/adapters/__init__.py
index 355d50a..609f5b7 100644
--- a/src/deltaglider/adapters/__init__.py
+++ b/src/deltaglider/adapters/__init__.py
@@ -1,5 +1,6 @@
 """Adapters for DeltaGlider."""
 
+from .cache_cas import ContentAddressedCache
 from .cache_fs import FsCacheAdapter
 from .clock_utc import UtcClockAdapter
 from .diff_xdelta import XdeltaAdapter
@@ -13,6 +14,7 @@ __all__ = [
     "XdeltaAdapter",
     "Sha256Adapter",
     "FsCacheAdapter",
+    "ContentAddressedCache",
     "UtcClockAdapter",
     "StdLoggerAdapter",
     "NoopMetricsAdapter",
diff --git a/src/deltaglider/adapters/cache_cas.py b/src/deltaglider/adapters/cache_cas.py
new file mode 100644
index 0000000..3514973
--- /dev/null
+++ b/src/deltaglider/adapters/cache_cas.py
@@ -0,0 +1,271 @@
+"""Content-Addressed Storage (CAS) cache adapter.
+
+This adapter stores cached references using their SHA256 hash as the filename,
+eliminating collision risks and enabling automatic deduplication.
+"""
+
+import hashlib
+import os
+import re
+import shutil
+import sys
+import tempfile
+from pathlib import Path
+
+# Unix-only imports for file locking
+if sys.platform != "win32":
+    import fcntl
+
+from ..core.errors import CacheCorruptionError, CacheMissError
+from ..ports.cache import CachePort
+from ..ports.hash import HashPort
+
+# Cache keys must be purely hexadecimal. Rejecting everything else stops a
+# crafted "SHA" such as "../../etc/passwd" from resolving outside base_dir.
+_HEX_DIGEST_RE = re.compile(r"[0-9a-fA-F]{4,}")
+
+# Read size used when hashing cached files, so memory use stays bounded.
+_HASH_CHUNK_SIZE = 1024 * 1024
+
+# Owner-only permissions for every directory the cache creates.
+_DIR_MODE = 0o700
+
+
+class ContentAddressedCache(CachePort):
+    """Content-addressed storage cache using SHA256 as filename.
+
+    Key Features:
+    - No cross-deltaspace collisions (the SHA256 digest is the filename)
+    - Automatic deduplication (same content = same filename)
+    - No metadata tracking needed (self-describing)
+    - Tamper-evident (get_validated_ref re-hashes content before use)
+
+    Storage Layout:
+    - base_dir/
+      - ab/
+        - cd/
+          - abcdef123456... (full SHA256 as filename)
+
+    The two-level directory structure (first 2 chars, next 2 chars) prevents
+    filesystem performance degradation from too many files in one directory.
+    """
+
+    def __init__(self, base_dir: Path, hasher: HashPort):
+        """Initialize content-addressed cache.
+
+        Args:
+            base_dir: Root directory for cache storage
+            hasher: Hash adapter for SHA256 computation
+        """
+        self.base_dir = base_dir
+        self.hasher = hasher
+        # Mapping of (bucket, prefix) -> sha256 for compatibility
+        # This is ephemeral and only used within a single process
+        self._deltaspace_to_sha: dict[tuple[str, str], str] = {}
+
+    def _cas_path(self, sha256: str) -> Path:
+        """Get content-addressed path from SHA256 hash.
+
+        Uses two-level directory structure for filesystem optimization:
+        - First 2 hex chars as L1 directory (256 buckets)
+        - Next 2 hex chars as L2 directory (256 buckets per L1)
+        - Full SHA as filename
+
+        Example: abcdef1234... -> ab/cd/abcdef1234...
+
+        Args:
+            sha256: SHA256 hash as a hex string (normally 64 chars; at least
+                4 are required to derive both directory levels)
+
+        Returns:
+            Path to file in content-addressed storage
+
+        Raises:
+            ValueError: If sha256 is too short or not purely hexadecimal
+        """
+        # SECURITY: this value is supplied by callers. Without the check, a
+        # value containing "/" or ".." would resolve outside base_dir, where
+        # get_validated_ref() would read it and, on mismatch, delete it.
+        if not _HEX_DIGEST_RE.fullmatch(sha256):
+            raise ValueError(f"Invalid SHA256: {sha256}")
+
+        # Hex digests are case-insensitive; normalise so the same content maps
+        # to one path however the caller spelled the digest.
+        digest = sha256.lower()
+
+        return self.base_dir / digest[:2] / digest[2:4] / digest
+
+    def ref_path(self, bucket: str, prefix: str) -> Path:
+        """Get path where reference should be cached.
+
+        For CAS, we need the SHA to compute the path. This method looks up
+        the SHA from the ephemeral mapping. If not found, it returns a
+        placeholder path (backward compatibility with has_ref checks).
+
+        Args:
+            bucket: S3 bucket name
+            prefix: Deltaspace prefix
+
+        Returns:
+            Path to cached reference (may not exist)
+        """
+        # If we have the SHA mapping, use CAS path
+        sha = self._deltaspace_to_sha.get((bucket, prefix))
+        if sha is not None:
+            return self._cas_path(sha)
+
+        # Fallback: return a placeholder for unmapped deltaspaces. Leading
+        # slashes are stripped because joining an absolute component would
+        # discard base_dir and yield a path outside the cache.
+        relative_prefix = prefix.lstrip("/")
+        return self.base_dir / "_unmapped" / bucket / relative_prefix / "reference.bin"
+
+    def has_ref(self, bucket: str, prefix: str, sha: str) -> bool:
+        """Check if a reference with the given SHA is present in the cache.
+
+        This is an existence check only; the file is not re-hashed here. Use
+        get_validated_ref() to obtain a path whose content has been verified.
+
+        Args:
+            bucket: S3 bucket name (unused: entries are keyed by SHA only)
+            prefix: Deltaspace prefix (unused: entries are keyed by SHA only)
+            sha: Expected SHA256 hash
+
+        Returns:
+            True if a regular file exists at the path derived from this SHA
+
+        Raises:
+            ValueError: If sha is not a valid hex digest
+        """
+        return self._cas_path(sha).is_file()
+
+    def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
+        """Get cached reference with SHA validation at use-time.
+
+        The file is re-hashed under a shared lock, so an entry that was
+        corrupted or modified after being written is detected here.
+
+        Args:
+            bucket: S3 bucket name
+            prefix: Deltaspace prefix
+            expected_sha: Expected SHA256 hash
+
+        Returns:
+            Path to validated cached file
+
+        Raises:
+            CacheMissError: File not found in cache, or it could not be read
+            CacheCorruptionError: Content does not match expected_sha
+            ValueError: If expected_sha is not a valid hex digest
+        """
+        path = self._cas_path(expected_sha)
+
+        # Open directly instead of checking exists() first: one less race
+        try:
+            with open(path, "rb") as f:
+                # Acquire shared lock (Unix only); released when f is closed
+                if sys.platform != "win32":
+                    fcntl.flock(f.fileno(), fcntl.LOCK_SH)
+
+                # Hash in chunks so a large reference is never fully in memory
+                digest = hashlib.sha256()
+                while chunk := f.read(_HASH_CHUNK_SIZE):
+                    digest.update(chunk)
+        except FileNotFoundError:
+            raise CacheMissError(f"Cache miss for SHA {expected_sha[:8]}...") from None
+        except OSError as e:
+            raise CacheMissError(f"Cache read error for SHA {expected_sha[:8]}...: {e}") from e
+
+        actual_sha = digest.hexdigest()
+        if actual_sha != expected_sha.lower():
+            # Drop the bad entry so the next lookup is a clean miss
+            try:
+                path.unlink()
+            except OSError:
+                pass  # Best effort cleanup
+
+            raise CacheCorruptionError(
+                f"Filesystem corruption detected: file {path.name} has wrong content. "
+                f"Expected SHA {expected_sha}, got {actual_sha}"
+            )
+
+        # Update mapping for ref_path compatibility
+        self._deltaspace_to_sha[(bucket, prefix)] = expected_sha
+
+        return path
+
+    def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
+        """Cache reference file using content-addressed storage.
+
+        The file is stored at a path determined by its SHA256 hash.
+        If a file with the same content already exists, it's reused
+        (automatic deduplication).
+
+        Args:
+            bucket: S3 bucket name
+            prefix: Deltaspace prefix
+            src: Source file to cache
+
+        Returns:
+            Path to cached file (content-addressed)
+        """
+        # Compute SHA of source file
+        sha = self.hasher.sha256(src)
+        path = self._cas_path(sha)
+
+        # Copy only when this content is not cached yet (deduplication)
+        if not path.is_file():
+            self._store(src, path)
+
+        # Update mapping
+        self._deltaspace_to_sha[(bucket, prefix)] = sha
+
+        return path
+
+    def _store(self, src: Path, path: Path) -> None:
+        """Atomically copy src to its content-addressed path.
+
+        Args:
+            src: Source file to copy
+            path: Destination inside the cache, as returned by _cas_path()
+        """
+        # Path.mkdir(parents=True, mode=...) applies mode to the leaf only, so
+        # create each cache level explicitly to keep all of them owner-only.
+        self.base_dir.mkdir(parents=True, mode=_DIR_MODE, exist_ok=True)
+        path.parent.parent.mkdir(mode=_DIR_MODE, exist_ok=True)
+        path.parent.mkdir(mode=_DIR_MODE, exist_ok=True)
+
+        # A uniquely named temp file keeps concurrent writers of the same
+        # content from clobbering each other's partially written copy.
+        fd, temp_name = tempfile.mkstemp(dir=path.parent, prefix=".tmp.")
+        os.close(fd)
+        temp_path = Path(temp_name)
+        try:
+            shutil.copy2(src, temp_path)
+            # os.replace is an atomic rename that also overwrites an existing
+            # destination on Windows, where Path.rename would fail instead.
+            os.replace(temp_path, path)
+        except BaseException:
+            # Cleanup on failure, then let the original error propagate
+            temp_path.unlink(missing_ok=True)
+            raise
+
+    def evict(self, bucket: str, prefix: str) -> None:
+        """Remove cached reference for given deltaspace.
+
+        In CAS, eviction is more complex because:
+        1. Multiple deltaspaces may reference the same SHA (deduplication)
+        2. We can't delete the file unless we know no other deltaspace uses it
+
+        For safety, we only remove the mapping, not the actual file.
+        Orphaned files will be cleaned up by cache expiry (future feature).
+
+        Args:
+            bucket: S3 bucket name
+            prefix: Deltaspace prefix
+        """
+        # Remove only the mapping (a no-op when absent). The CAS file is kept:
+        # - Other deltaspaces may reference the same SHA
+        # - The ephemeral cache will be cleaned on process exit anyway
+        # - For persistent cache (future), we'd need reference counting
+        self._deltaspace_to_sha.pop((bucket, prefix), None)
diff --git a/src/deltaglider/app/cli/main.py b/src/deltaglider/app/cli/main.py
index 6ae25fc..91df087 100644
--- a/src/deltaglider/app/cli/main.py
+++ b/src/deltaglider/app/cli/main.py
@@ -11,7 +11,7 @@ from pathlib import Path
 import click
 
 from ...adapters import (
-    FsCacheAdapter,
+    ContentAddressedCache,
     NoopMetricsAdapter,
     S3StorageAdapter,
     Sha256Adapter,
@@ -61,7 +61,10 @@ def create_service(
     hasher = Sha256Adapter()
     storage = S3StorageAdapter(endpoint_url=endpoint_url)
     diff = XdeltaAdapter()
-    cache = FsCacheAdapter(cache_dir, hasher)
+
+    # SECURITY: Use Content-Addressed Storage for zero-collision guarantee
+    cache = ContentAddressedCache(cache_dir, hasher)
+
     clock = UtcClockAdapter()
     logger = StdLoggerAdapter(level=log_level)
 
diff --git a/src/deltaglider/client.py b/src/deltaglider/client.py
index 5ee1189..dd22279 100644
--- a/src/deltaglider/client.py
+++ b/src/deltaglider/client.py
@@ -1114,7 +1114,7 @@ def create_client(
     """
     # Import here to avoid circular dependency
     from .adapters import (
-        FsCacheAdapter,
+        ContentAddressedCache,
         NoopMetricsAdapter,
         S3StorageAdapter,
         Sha256Adapter,
@@ -1143,7 +1143,10 @@ def create_client(
     hasher = Sha256Adapter()
     storage = S3StorageAdapter(endpoint_url=endpoint_url, boto3_kwargs=boto3_kwargs)
     diff = XdeltaAdapter()
-    cache = FsCacheAdapter(cache_dir, hasher)
+
+    # SECURITY: Use Content-Addressed Storage for zero-collision guarantee
+    cache = ContentAddressedCache(cache_dir, hasher)
+
     clock = UtcClockAdapter()
     logger =
StdLoggerAdapter(level=log_level) metrics = NoopMetricsAdapter() diff --git a/tests/conftest.py b/tests/conftest.py index f45cbe6..7c8a4de 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,7 +8,7 @@ from unittest.mock import Mock import pytest from deltaglider.adapters import ( - FsCacheAdapter, + ContentAddressedCache, NoopMetricsAdapter, Sha256Adapter, StdLoggerAdapter, @@ -59,9 +59,9 @@ def real_hasher(): @pytest.fixture def cache_adapter(temp_dir, real_hasher): - """Create filesystem cache adapter.""" + """Create content-addressed storage cache adapter.""" cache_dir = temp_dir / "cache" - return FsCacheAdapter(cache_dir, real_hasher) + return ContentAddressedCache(cache_dir, real_hasher) @pytest.fixture