Merge pull request #2273 from krish341360/fix/concurrent-storage-race-condition
Release / Create Release (push) Waiting to run

fix: race condition in ConcurrentStorage and cache invalidation bug
This commit is contained in:
Timothy @aden
2026-02-02 10:57:45 -08:00
committed by GitHub
2 changed files with 173 additions and 3 deletions
+11 -3
View File
@@ -167,14 +167,18 @@ class ConcurrentStorage:
run: Run to save
immediate: If True, save immediately (bypasses batching)
"""
# Invalidate summary cache since the run data is changing
# This ensures load_summary() fetches fresh data after the save
self._cache.pop(f"summary:{run.id}", None)
if immediate or not self._running:
await self._save_run_locked(run)
# Update cache only after successful immediate write
self._cache[f"run:{run.id}"] = CacheEntry(run, time.time())
else:
# For batched writes, cache will be updated in _flush_batch after successful write
await self._write_queue.put(("run", run))
# Update cache
self._cache[f"run:{run.id}"] = CacheEntry(run, time.time())
async def _save_run_locked(self, run: Run) -> None:
"""Save a run with file locking, including index locks."""
lock_key = f"run:{run.id}"
@@ -363,8 +367,12 @@ class ConcurrentStorage:
try:
if item_type == "run":
await self._save_run_locked(item)
# Update cache only after successful batched write
# This fixes the race condition where cache was updated before write completed
self._cache[f"run:{item.id}"] = CacheEntry(item, time.time())
except Exception as e:
logger.error(f"Failed to save {item_type}: {e}")
# Cache is NOT updated on failure - prevents stale/inconsistent cache state
async def _flush_pending(self) -> None:
"""Flush all pending writes."""
+162
View File
@@ -0,0 +1,162 @@
"""Tests for ConcurrentStorage race condition and cache invalidation fixes."""
import asyncio
from pathlib import Path
import pytest
from framework.schemas.run import Run, RunMetrics, RunStatus
from framework.storage.concurrent import ConcurrentStorage
def create_test_run(
run_id: str, goal_id: str = "test-goal", status: RunStatus = RunStatus.RUNNING
) -> Run:
"""Create a minimal test Run object."""
return Run(
id=run_id,
goal_id=goal_id,
status=status,
narrative="Test run",
metrics=RunMetrics(
nodes_executed=[],
),
decisions=[],
problems=[],
)
@pytest.mark.asyncio
async def test_cache_invalidation_on_save(tmp_path: Path):
"""Test that summary cache is invalidated when a run is saved.
This tests the fix for the cache invalidation bug where load_summary()
would return stale data after a run was updated.
"""
storage = ConcurrentStorage(tmp_path)
await storage.start()
try:
run_id = "test-run-1"
# Create and save initial run
run = create_test_run(run_id, status=RunStatus.RUNNING)
await storage.save_run(run, immediate=True)
# Load summary to populate the cache
summary = await storage.load_summary(run_id)
assert summary is not None
assert summary.status == RunStatus.RUNNING
# Update run with new status
run.status = RunStatus.COMPLETED
await storage.save_run(run, immediate=True)
# Load summary again - should get fresh data, not cached stale data
summary = await storage.load_summary(run_id)
assert summary is not None
assert summary.status == RunStatus.COMPLETED, (
"Summary cache should be invalidated on save - got stale data"
)
finally:
await storage.stop()
@pytest.mark.asyncio
async def test_batched_write_cache_consistency(tmp_path: Path):
"""Test that cache is only updated after successful batched write.
This tests the fix for the race condition where cache was updated
before the batched write completed.
"""
storage = ConcurrentStorage(tmp_path, batch_interval=0.05)
await storage.start()
try:
run_id = "test-run-2"
# Save via batching (immediate=False)
run = create_test_run(run_id, status=RunStatus.RUNNING)
await storage.save_run(run, immediate=False)
# Before batch flush, cache should NOT contain the run
# (This is the fix - previously cache was updated immediately)
cache_key = f"run:{run_id}"
assert cache_key not in storage._cache, (
"Cache should not be updated before batch is flushed"
)
# Wait for batch to flush
await asyncio.sleep(0.1)
# After batch flush, cache should contain the run
assert cache_key in storage._cache, "Cache should be updated after batch flush"
# Verify data on disk matches cache
loaded_run = await storage.load_run(run_id, use_cache=False)
assert loaded_run is not None
assert loaded_run.id == run_id
assert loaded_run.status == RunStatus.RUNNING
finally:
await storage.stop()
@pytest.mark.asyncio
async def test_immediate_write_updates_cache(tmp_path: Path):
"""Test that immediate writes still update cache correctly."""
storage = ConcurrentStorage(tmp_path)
await storage.start()
try:
run_id = "test-run-3"
# Save with immediate=True
run = create_test_run(run_id, status=RunStatus.COMPLETED)
await storage.save_run(run, immediate=True)
# Cache should be updated immediately for immediate writes
cache_key = f"run:{run_id}"
assert cache_key in storage._cache, "Cache should be updated after immediate write"
# Verify cached value is correct
cached_run = storage._cache[cache_key].value
assert cached_run.id == run_id
assert cached_run.status == RunStatus.COMPLETED
finally:
await storage.stop()
@pytest.mark.asyncio
async def test_summary_cache_invalidated_on_multiple_saves(tmp_path: Path):
"""Test that summary cache is invalidated on each save, not just the first."""
storage = ConcurrentStorage(tmp_path)
await storage.start()
try:
run_id = "test-run-4"
# First save
run = create_test_run(run_id, status=RunStatus.RUNNING)
await storage.save_run(run, immediate=True)
# Load summary to cache it
summary1 = await storage.load_summary(run_id)
assert summary1.status == RunStatus.RUNNING
# Second save with new status
run.status = RunStatus.RUNNING
await storage.save_run(run, immediate=True)
# Load summary - should be fresh
summary2 = await storage.load_summary(run_id)
assert summary2.status == RunStatus.RUNNING
# Third save with final status
run.status = RunStatus.COMPLETED
await storage.save_run(run, immediate=True)
# Load summary - should be fresh again
summary3 = await storage.load_summary(run_id)
assert summary3.status == RunStatus.COMPLETED
finally:
await storage.stop()