Merge pull request #2273 from krish341360/fix/concurrent-storage-race-condition
Release / Create Release (push) Waiting to run
Release / Create Release (push) Waiting to run
fix: race condition in ConcurrentStorage and cache invalidation bug
This commit is contained in:
@@ -167,14 +167,18 @@ class ConcurrentStorage:
|
|||||||
run: Run to save
|
run: Run to save
|
||||||
immediate: If True, save immediately (bypasses batching)
|
immediate: If True, save immediately (bypasses batching)
|
||||||
"""
|
"""
|
||||||
|
# Invalidate summary cache since the run data is changing
|
||||||
|
# This ensures load_summary() fetches fresh data after the save
|
||||||
|
self._cache.pop(f"summary:{run.id}", None)
|
||||||
|
|
||||||
if immediate or not self._running:
|
if immediate or not self._running:
|
||||||
await self._save_run_locked(run)
|
await self._save_run_locked(run)
|
||||||
|
# Update cache only after successful immediate write
|
||||||
|
self._cache[f"run:{run.id}"] = CacheEntry(run, time.time())
|
||||||
else:
|
else:
|
||||||
|
# For batched writes, cache will be updated in _flush_batch after successful write
|
||||||
await self._write_queue.put(("run", run))
|
await self._write_queue.put(("run", run))
|
||||||
|
|
||||||
# Update cache
|
|
||||||
self._cache[f"run:{run.id}"] = CacheEntry(run, time.time())
|
|
||||||
|
|
||||||
async def _save_run_locked(self, run: Run) -> None:
|
async def _save_run_locked(self, run: Run) -> None:
|
||||||
"""Save a run with file locking, including index locks."""
|
"""Save a run with file locking, including index locks."""
|
||||||
lock_key = f"run:{run.id}"
|
lock_key = f"run:{run.id}"
|
||||||
@@ -363,8 +367,12 @@ class ConcurrentStorage:
|
|||||||
try:
|
try:
|
||||||
if item_type == "run":
|
if item_type == "run":
|
||||||
await self._save_run_locked(item)
|
await self._save_run_locked(item)
|
||||||
|
# Update cache only after successful batched write
|
||||||
|
# This fixes the race condition where cache was updated before write completed
|
||||||
|
self._cache[f"run:{item.id}"] = CacheEntry(item, time.time())
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to save {item_type}: {e}")
|
logger.error(f"Failed to save {item_type}: {e}")
|
||||||
|
# Cache is NOT updated on failure - prevents stale/inconsistent cache state
|
||||||
|
|
||||||
async def _flush_pending(self) -> None:
|
async def _flush_pending(self) -> None:
|
||||||
"""Flush all pending writes."""
|
"""Flush all pending writes."""
|
||||||
|
|||||||
@@ -0,0 +1,162 @@
|
|||||||
|
"""Tests for ConcurrentStorage race condition and cache invalidation fixes."""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from framework.schemas.run import Run, RunMetrics, RunStatus
|
||||||
|
from framework.storage.concurrent import ConcurrentStorage
|
||||||
|
|
||||||
|
|
||||||
|
def create_test_run(
    run_id: str, goal_id: str = "test-goal", status: RunStatus = RunStatus.RUNNING
) -> Run:
    """Build a bare-bones Run suitable for storage tests.

    Only the fields the Run schema requires are populated; metrics start
    with no executed nodes, and decisions/problems begin empty.
    """
    empty_metrics = RunMetrics(
        nodes_executed=[],
    )
    return Run(
        id=run_id,
        goal_id=goal_id,
        status=status,
        narrative="Test run",
        metrics=empty_metrics,
        decisions=[],
        problems=[],
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_cache_invalidation_on_save(tmp_path: Path):
    """Saving a run must evict its summary cache entry.

    Regression test for the cache invalidation bug where load_summary()
    kept returning a stale cached summary after the run was re-saved.
    """
    storage = ConcurrentStorage(tmp_path)
    await storage.start()

    try:
        run_id = "test-run-1"

        # Persist the run in its initial state.
        run = create_test_run(run_id, status=RunStatus.RUNNING)
        await storage.save_run(run, immediate=True)

        # First load_summary() populates the summary cache.
        first = await storage.load_summary(run_id)
        assert first is not None
        assert first.status == RunStatus.RUNNING

        # Re-save with a different status; this must invalidate the cache.
        run.status = RunStatus.COMPLETED
        await storage.save_run(run, immediate=True)

        # The next load must reflect the updated state, not the cached one.
        second = await storage.load_summary(run_id)
        assert second is not None
        assert second.status == RunStatus.COMPLETED, (
            "Summary cache should be invalidated on save - got stale data"
        )
    finally:
        await storage.stop()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_batched_write_cache_consistency(tmp_path: Path):
    """Cache entries for batched writes appear only after the flush.

    Regression test for the race condition where save_run(immediate=False)
    put the run into the cache before the queued write had completed.
    """
    storage = ConcurrentStorage(tmp_path, batch_interval=0.05)
    await storage.start()

    try:
        run_id = "test-run-2"
        cache_key = f"run:{run_id}"

        # Queue the run through the batching path.
        run = create_test_run(run_id, status=RunStatus.RUNNING)
        await storage.save_run(run, immediate=False)

        # The fix: no cache entry may exist until the batch is written out.
        assert cache_key not in storage._cache, (
            "Cache should not be updated before batch is flushed"
        )

        # Give the background flusher time to drain the queue.
        await asyncio.sleep(0.1)

        assert cache_key in storage._cache, "Cache should be updated after batch flush"

        # The on-disk copy must agree with what was queued.
        persisted = await storage.load_run(run_id, use_cache=False)
        assert persisted is not None
        assert persisted.id == run_id
        assert persisted.status == RunStatus.RUNNING
    finally:
        await storage.stop()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_immediate_write_updates_cache(tmp_path: Path):
    """An immediate save populates the run cache right away."""
    storage = ConcurrentStorage(tmp_path)
    await storage.start()

    try:
        run_id = "test-run-3"
        cache_key = f"run:{run_id}"

        # Immediate writes bypass the batch queue entirely.
        run = create_test_run(run_id, status=RunStatus.COMPLETED)
        await storage.save_run(run, immediate=True)

        assert cache_key in storage._cache, "Cache should be updated after immediate write"

        # The cached entry must hold the exact run that was just saved.
        entry = storage._cache[cache_key].value
        assert entry.id == run_id
        assert entry.status == RunStatus.COMPLETED
    finally:
        await storage.stop()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_summary_cache_invalidated_on_multiple_saves(tmp_path: Path):
    """Test that summary cache is invalidated on each save, not just the first.

    Each save flips the status to a value different from the one that is
    currently cached, so a stale summary cache entry would make the
    following assertion fail. (The original second save re-assigned
    RunStatus.RUNNING — the status the run already had — so its assertion
    passed even with a stale cache and proved nothing.)
    """
    storage = ConcurrentStorage(tmp_path)
    await storage.start()

    try:
        run_id = "test-run-4"

        # First save: RUNNING.
        run = create_test_run(run_id, status=RunStatus.RUNNING)
        await storage.save_run(run, immediate=True)

        # Load summary to cache it.
        summary1 = await storage.load_summary(run_id)
        assert summary1.status == RunStatus.RUNNING

        # Second save with a genuinely different status.
        run.status = RunStatus.COMPLETED
        await storage.save_run(run, immediate=True)

        # Load summary - must reflect the new status, not the cached one.
        summary2 = await storage.load_summary(run_id)
        assert summary2.status == RunStatus.COMPLETED

        # Third save flips the status back, forcing another invalidation.
        run.status = RunStatus.RUNNING
        await storage.save_run(run, immediate=True)

        # Load summary - should be fresh again.
        summary3 = await storage.load_summary(run_id)
        assert summary3.status == RunStatus.RUNNING
    finally:
        await storage.stop()
|
||||||
Reference in New Issue
Block a user