Files
hive/core/framework/storage/backend.py
T
2026-01-28 11:06:27 +05:30

213 lines
7.5 KiB
Python

"""
File-based storage backend for runtime data.
Stores runs as JSON files with indexes for efficient querying.
Uses Pydantic's built-in serialization.
"""
import json
from pathlib import Path
from framework.schemas.run import Run, RunStatus, RunSummary
class FileStorage:
"""
Simple file-based storage for runs.
Directory structure:
{base_path}/
runs/
{run_id}.json # Full run data
indexes/
by_goal/
{goal_id}.json # List of run IDs for this goal
by_status/
{status}.json # List of run IDs with this status
by_node/
{node_id}.json # List of run IDs that used this node
summaries/
{run_id}.json # Run summary (for quick loading)
"""
def __init__(self, base_path: str | Path):
self.base_path = Path(base_path)
self._ensure_dirs()
def _ensure_dirs(self) -> None:
"""Create directory structure if it doesn't exist."""
dirs = [
self.base_path / "runs",
self.base_path / "indexes" / "by_goal",
self.base_path / "indexes" / "by_status",
self.base_path / "indexes" / "by_node",
self.base_path / "summaries",
]
for d in dirs:
d.mkdir(parents=True, exist_ok=True)
def _validate_key(self, key: str) -> None:
"""
Validate key to prevent path traversal attacks.
Args:
key: The key to validate
Raises:
ValueError: If key contains path traversal or dangerous patterns
"""
if not key or key.strip() == "":
raise ValueError("Key cannot be empty")
# Block path separators
if "/" in key or "\\" in key:
raise ValueError(f"Invalid key format: path separators not allowed in '{key}'")
# Block parent directory references
if ".." in key or key.startswith("."):
raise ValueError(f"Invalid key format: path traversal detected in '{key}'")
# Block absolute paths
if key.startswith("/") or (len(key) > 1 and key[1] == ":"):
raise ValueError(f"Invalid key format: absolute paths not allowed in '{key}'")
# Block null bytes (Unix path injection)
if "\x00" in key:
raise ValueError("Invalid key format: null bytes not allowed")
# Block other dangerous special characters
dangerous_chars = {"<", ">", "|", "&", "$", "`", "'", '"'}
if any(char in key for char in dangerous_chars):
raise ValueError(f"Invalid key format: contains dangerous characters in '{key}'")
# === RUN OPERATIONS ===
def save_run(self, run: Run) -> None:
"""Save a run to storage."""
# Save full run using Pydantic's model_dump_json
run_path = self.base_path / "runs" / f"{run.id}.json"
with open(run_path, "w") as f:
f.write(run.model_dump_json(indent=2))
# Save summary
summary = RunSummary.from_run(run)
summary_path = self.base_path / "summaries" / f"{run.id}.json"
with open(summary_path, "w") as f:
f.write(summary.model_dump_json(indent=2))
# Update indexes
self._add_to_index("by_goal", run.goal_id, run.id)
self._add_to_index("by_status", run.status.value, run.id)
for node_id in run.metrics.nodes_executed:
self._add_to_index("by_node", node_id, run.id)
def load_run(self, run_id: str) -> Run | None:
"""Load a run from storage."""
run_path = self.base_path / "runs" / f"{run_id}.json"
if not run_path.exists():
return None
with open(run_path) as f:
return Run.model_validate_json(f.read())
def load_summary(self, run_id: str) -> RunSummary | None:
"""Load just the summary (faster than full run)."""
summary_path = self.base_path / "summaries" / f"{run_id}.json"
if not summary_path.exists():
# Fall back to computing from full run
run = self.load_run(run_id)
if run:
return RunSummary.from_run(run)
return None
with open(summary_path) as f:
return RunSummary.model_validate_json(f.read())
def delete_run(self, run_id: str) -> bool:
"""Delete a run from storage."""
run_path = self.base_path / "runs" / f"{run_id}.json"
summary_path = self.base_path / "summaries" / f"{run_id}.json"
if not run_path.exists():
return False
# Load run to get index keys
run = self.load_run(run_id)
if run:
self._remove_from_index("by_goal", run.goal_id, run_id)
self._remove_from_index("by_status", run.status.value, run_id)
for node_id in run.metrics.nodes_executed:
self._remove_from_index("by_node", node_id, run_id)
run_path.unlink()
if summary_path.exists():
summary_path.unlink()
return True
# === QUERY OPERATIONS ===
def get_runs_by_goal(self, goal_id: str) -> list[str]:
"""Get all run IDs for a goal."""
return self._get_index("by_goal", goal_id)
def get_runs_by_status(self, status: str | RunStatus) -> list[str]:
"""Get all run IDs with a status."""
if isinstance(status, RunStatus):
status = status.value
return self._get_index("by_status", status)
def get_runs_by_node(self, node_id: str) -> list[str]:
"""Get all run IDs that executed a node."""
return self._get_index("by_node", node_id)
def list_all_runs(self) -> list[str]:
"""List all run IDs."""
runs_dir = self.base_path / "runs"
return [f.stem for f in runs_dir.glob("*.json")]
def list_all_goals(self) -> list[str]:
"""List all goal IDs that have runs."""
goals_dir = self.base_path / "indexes" / "by_goal"
return [f.stem for f in goals_dir.glob("*.json")]
# === INDEX OPERATIONS ===
def _get_index(self, index_type: str, key: str) -> list[str]:
"""Get values from an index."""
self._validate_key(key) # Prevent path traversal
index_path = self.base_path / "indexes" / index_type / f"{key}.json"
if not index_path.exists():
return []
with open(index_path) as f:
return json.load(f)
def _add_to_index(self, index_type: str, key: str, value: str) -> None:
"""Add a value to an index."""
self._validate_key(key) # Prevent path traversal
index_path = self.base_path / "indexes" / index_type / f"{key}.json"
values = self._get_index(index_type, key) # Already validated in _get_index
if value not in values:
values.append(value)
with open(index_path, "w") as f:
json.dump(values, f)
def _remove_from_index(self, index_type: str, key: str, value: str) -> None:
"""Remove a value from an index."""
self._validate_key(key) # Prevent path traversal
index_path = self.base_path / "indexes" / index_type / f"{key}.json"
values = self._get_index(index_type, key) # Already validated in _get_index
if value in values:
values.remove(value)
with open(index_path, "w") as f:
json.dump(values, f)
# === UTILITY ===
def get_stats(self) -> dict:
"""Get storage statistics."""
return {
"total_runs": len(self.list_all_runs()),
"total_goals": len(self.list_all_goals()),
"storage_path": str(self.base_path),
}