consolidation of some of the modules, localization fixes, product advisories work, qa work

This commit is contained in:
master
2026-03-05 03:54:22 +02:00
parent 7bafcc3eef
commit 8e1cb9448d
3878 changed files with 72600 additions and 46861 deletions

View File

@@ -0,0 +1,21 @@
# Worker SDK (Python) — Agent Charter
## Mission
Publish the Python client library for StellaOps orchestrated workers. Provide asyncio-friendly claim/heartbeat/progress APIs, artifact publishing helpers, error handling, and observability hooks aligned with Epic 9 requirements and the imposed rule for cross-component parity.
## Responsibilities
- Maintain typed client (httpx/async) with retry/backoff primitives mirroring jobengine expectations.
- Surface structured metrics/logging instrumentation and pluggable exporters.
- Enforce idempotency token usage, artifact checksum publication, and watermark/backfill helpers.
- Coordinate versioning with Go SDK, jobengine service contracts, DevOps packaging, and Offline Kit deliverables.
## Required Reading
- `docs/modules/jobengine/architecture.md`
- `docs/modules/platform/architecture-overview.md`
## Working Agreement
- 1. Update task status to `DOING`/`DONE` in both the corresponding sprint file `/docs/implplan/SPRINT_*.md` and the local `TASKS.md` when you start or finish work.
- 2. Review this charter and the Required Reading documents before coding; confirm prerequisites are met.
- 3. Keep changes deterministic (stable ordering, timestamps, hashes) and align with offline/air-gap expectations.
- 4. Coordinate doc updates, tests, and cross-guild communication whenever contracts or workflows change.
- 5. Revert to `TODO` if you pause the task without shipping changes; leave notes in commit/PR descriptions for context.

View File

@@ -0,0 +1,10 @@
# StellaOps Orchestrator Worker SDK (Python)
Async-friendly SDK for StellaOps workers: claim jobs, acknowledge results, and attach tenant-aware auth headers. The default transport is dependency-free and can be swapped for aiohttp/httpx as needed.
## Quick start
```bash
export ORCH_BASE_URL=http://localhost:8080
export ORCH_API_KEY=dev-token
python sample_worker.py
```

View File

@@ -0,0 +1,11 @@
[project]
name = "stellaops-jobengine-worker"
version = "0.1.0"
description = "Async worker SDK for StellaOps Orchestrator"
authors = [{name = "StellaOps"}]
readme = "README.md"
requires-python = ">=3.10"
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

View File

@@ -0,0 +1,41 @@
import asyncio
import os
from stellaops_jobengine_worker import (
AckJobRequest,
ClaimJobRequest,
Config,
OrchestratorClient,
)
from stellaops_jobengine_worker.retry import RetryPolicy, retry
async def main():
    """Claim one job, simulate work, then report progress and acknowledge it."""
    env = os.environ.get
    cfg = Config(
        base_url=env("ORCH_BASE_URL", "http://localhost:8080"),
        api_key=env("ORCH_API_KEY", "dev-token"),
        tenant_id=env("ORCH_TENANT", "local-tenant"),
        project_id=env("ORCH_PROJECT", "demo-project"),
    )
    client = OrchestratorClient(cfg)
    request = ClaimJobRequest(worker_id="py-worker", capabilities=["pack-run"])
    claim = await client.claim(request)
    if claim is None:
        print("no work available")
        return
    # ... perform actual work described by claim.payload ...
    await client.heartbeat(job_id=claim.job_id, lease_id=claim.lease_id)
    await client.progress(job_id=claim.job_id, lease_id=claim.lease_id, pct=50, message="halfway")

    async def _ack():
        # Acknowledge with retry so a transient orchestrator hiccup does not drop the result.
        ack_request = AckJobRequest(job_id=claim.job_id, lease_id=claim.lease_id, status="succeeded")
        await client.ack(ack_request)

    await retry(RetryPolicy(), _ack)
    print(f"acknowledged job {claim.job_id}")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,37 @@
"""Async worker SDK for StellaOps Orchestrator."""
from .client import OrchestratorClient, ClaimJobRequest, AckJobRequest, ClaimJobResponse
from .config import Config
from .metrics import MetricsSink, NoopMetrics
from .transport import Transport, InMemoryTransport, TransportRequest, TransportResponse
from .retry import RetryPolicy, retry
from .storage import publish_artifact, InMemoryStorage, ArtifactPublishResult, Storage
from .errors import ErrorCode, classify_status
from .backfill import Range, WatermarkHandshake, Deduper, execute_range, verify_and_publish_artifact
__all__ = [
"OrchestratorClient",
"ClaimJobRequest",
"ClaimJobResponse",
"AckJobRequest",
"Config",
"MetricsSink",
"NoopMetrics",
"RetryPolicy",
"retry",
"Storage",
"publish_artifact",
"InMemoryStorage",
"ArtifactPublishResult",
"Range",
"WatermarkHandshake",
"Deduper",
"execute_range",
"verify_and_publish_artifact",
"ErrorCode",
"classify_status",
"Transport",
"InMemoryTransport",
"TransportRequest",
"TransportResponse",
]

View File

@@ -0,0 +1,81 @@
from __future__ import annotations
import asyncio
import datetime as dt
from dataclasses import dataclass
from typing import Awaitable, Callable, Optional
from .storage import publish_artifact, ArtifactPublishResult, Storage
@dataclass
class Range:
    """Inclusive datetime window processed by a backfill pass."""

    start: dt.datetime
    end: dt.datetime

    def validate(self) -> None:
        """Raise ValueError when the window is inverted (end precedes start)."""
        if self.start > self.end:
            raise ValueError("range end before start")
@dataclass
class WatermarkHandshake:
    """Pair of watermarks compared before publishing backfill output."""

    expected: str
    current: str

    def validate(self) -> None:
        """Ensure an expected watermark exists and matches the current one."""
        if not self.expected:
            raise ValueError("expected watermark required")
        if self.current != self.expected:
            raise ValueError("watermark mismatch")
class Deduper:
    """Tracks idempotency keys so repeated publishes can be rejected."""

    def __init__(self):
        self._observed: set[str] = set()

    def seen(self, key: str) -> bool:
        """Return True when *key* was recorded before; record it otherwise.

        Empty keys are never treated as duplicates.
        """
        if not key:
            return False
        duplicate = key in self._observed
        if not duplicate:
            self._observed.add(key)
        return duplicate
async def execute_range(r: Range, step: dt.timedelta, fn: Callable[[dt.datetime], Awaitable[None]]) -> None:
    """Await *fn* for each timestamp from r.start through r.end (inclusive), *step* apart.

    Raises ValueError for an inverted range or a non-positive step.
    """
    r.validate()
    if not step.total_seconds() > 0:
        raise ValueError("step must be positive")
    cursor = r.start
    while True:
        if cursor > r.end:
            break
        await fn(cursor)
        cursor += step
async def verify_and_publish_artifact(
    *,
    storage: Storage,
    wm: WatermarkHandshake,
    dedupe: Optional[Deduper],
    job_id: str,
    lease_id: str,
    object_key: str,
    content: bytes,
    content_type: str = "application/octet-stream",
    artifact_type: Optional[str] = None,
    idempotency_key: Optional[str] = None,
) -> ArtifactPublishResult:
    """Validate the watermark handshake and idempotency key, then publish.

    Raises ValueError on a watermark mismatch or a repeated idempotency key;
    otherwise delegates to publish_artifact and returns its result.
    """
    wm.validate()
    is_duplicate = bool(dedupe) and bool(idempotency_key) and dedupe.seen(idempotency_key)
    if is_duplicate:
        raise ValueError("duplicate artifact idempotency key")
    return await publish_artifact(
        storage=storage,
        job_id=job_id,
        lease_id=lease_id,
        object_key=object_key,
        content=content,
        content_type=content_type,
        artifact_type=artifact_type,
        idempotency_key=idempotency_key,
    )

View File

@@ -0,0 +1,111 @@
from __future__ import annotations
import json
from dataclasses import dataclass
from typing import Optional
from urllib.parse import urljoin
from .config import Config
from .metrics import MetricsSink
from .transport import Transport, TransportRequest, TransportResponse
@dataclass
class ClaimJobRequest:
    """Payload for POST /api/jobs/lease."""

    worker_id: str  # identifier of the claiming worker; required by claim()
    capabilities: Optional[list[str]] = None  # job types this worker can run
@dataclass
class ClaimJobResponse:
    """Decoded response from a successful claim."""

    job_id: str
    lease_id: str  # must be echoed on heartbeat/progress/ack calls
    job_type: Optional[str]
    payload: dict  # opaque job parameters supplied by the orchestrator
    expires_at: Optional[str] = None  # lease expiry timestamp string, if the server returned one
    retry_after_seconds: Optional[int] = None
@dataclass
class AckJobRequest:
    """Payload for POST /api/jobs/{job_id}/ack."""

    job_id: str
    lease_id: str
    status: str  # terminal status string, e.g. "succeeded"
    message: Optional[str] = None  # optional human-readable detail
class OrchestratorClient:
    """Async client for job claim/heartbeat/progress/ack operations.

    Every call goes through the configured Transport; auth, tenant, and
    project headers are attached from Config on each request.
    """

    def __init__(self, config: Config, *, transport: Optional[Transport] = None):
        config.validate()
        self._cfg = config
        self._transport = transport or config.get_transport()
        self._metrics: MetricsSink = config.get_metrics()

    async def claim(self, request: ClaimJobRequest) -> Optional[ClaimJobResponse]:
        """Claim the next available job; returns None when no work is pending (204).

        Raises ValueError for a missing worker_id and RuntimeError on error responses.
        """
        if not request.worker_id:
            raise ValueError("worker_id is required")
        body = json.dumps(request.__dict__).encode()
        resp = await self._execute("POST", "/api/jobs/lease", body)
        if resp.status == 204:
            return None
        if resp.status >= 300:
            raise RuntimeError(f"claim failed: {resp.status} {resp.body.decode(errors='ignore')}")
        data = json.loads(resp.body)
        self._metrics.inc_claimed()
        return ClaimJobResponse(
            job_id=data["job_id"],
            lease_id=data["lease_id"],
            job_type=data.get("job_type"),
            payload=data.get("payload", {}),
            expires_at=data.get("expires_at"),
            retry_after_seconds=data.get("retry_after_seconds"),
        )

    async def ack(self, request: AckJobRequest) -> None:
        """Report a terminal job status back to the orchestrator."""
        if not request.job_id or not request.lease_id or not request.status:
            raise ValueError("job_id, lease_id, and status are required")
        body = json.dumps(request.__dict__).encode()
        resp = await self._execute("POST", f"/api/jobs/{request.job_id}/ack", body)
        if resp.status >= 300:
            raise RuntimeError(f"ack failed: {resp.status} {resp.body.decode(errors='ignore')}")
        self._metrics.inc_ack(request.status)

    async def heartbeat(self, *, job_id: str, lease_id: str) -> None:
        """Renew the lease for an in-flight job."""
        if not job_id or not lease_id:
            raise ValueError("job_id and lease_id are required")
        body = json.dumps({"lease_id": lease_id}).encode()
        resp = await self._execute("POST", f"/api/jobs/{job_id}/heartbeat", body)
        if resp.status >= 300:
            self._metrics.inc_heartbeat_failures()
            raise RuntimeError(f"heartbeat failed: {resp.status} {resp.body.decode(errors='ignore')}")
        # latency recorded by caller; keep simple here

    async def progress(self, *, job_id: str, lease_id: str, pct: int, message: Optional[str] = None) -> None:
        """Publish a 0-100 progress update, with an optional status message."""
        # Validate ids up front, consistent with ack()/heartbeat(), instead of
        # sending a request that is guaranteed to be rejected.
        if not job_id or not lease_id:
            raise ValueError("job_id and lease_id are required")
        if pct < 0 or pct > 100:
            raise ValueError("pct must be 0-100")
        payload = {"lease_id": lease_id, "progress": pct}
        if message:
            payload["message"] = message
        body = json.dumps(payload).encode()
        resp = await self._execute("POST", f"/api/jobs/{job_id}/progress", body)
        if resp.status >= 300:
            raise RuntimeError(f"progress failed: {resp.status} {resp.body.decode(errors='ignore')}")

    async def _execute(self, method: str, path: str, body: Optional[bytes]) -> TransportResponse:
        """Build headers from Config and dispatch one request via the transport."""
        url = urljoin(self._cfg.base_url.rstrip("/") + "/", path.lstrip("/"))
        headers = {
            "Accept": "application/json",
            "Content-Type": "application/json",
            "User-Agent": self._cfg.user_agent,
        }
        if self._cfg.api_key:
            headers["Authorization"] = f"Bearer {self._cfg.api_key}"
        if self._cfg.tenant_id:
            headers["X-StellaOps-Tenant"] = self._cfg.tenant_id
        if self._cfg.project_id:
            headers["X-StellaOps-Project"] = self._cfg.project_id
        req = TransportRequest(method=method, url=url, headers=headers, body=body)
        return await self._transport.execute(req)

View File

@@ -0,0 +1,30 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Optional
from .metrics import MetricsSink, NoopMetrics
from .transport import Transport, default_transport
@dataclass
class Config:
    """SDK configuration.

    Only base_url is mandatory; transport and metrics fall back to the
    module defaults when not supplied.
    """

    base_url: str
    api_key: Optional[str] = None
    tenant_id: Optional[str] = None
    project_id: Optional[str] = None
    user_agent: str = "stellaops-worker-sdk-py/0.1"
    transport: Optional[Transport] = None
    metrics: Optional[MetricsSink] = None

    def validate(self) -> None:
        """Raise ValueError when a required setting is missing."""
        if not self.base_url:
            raise ValueError("base_url is required")

    def get_transport(self) -> Transport:
        """Return the configured transport, falling back to the stdlib default."""
        configured = self.transport
        return configured if configured else default_transport()

    def get_metrics(self) -> MetricsSink:
        """Return the configured metrics sink, falling back to a no-op sink."""
        configured = self.metrics
        return configured if configured else NoopMetrics()

View File

@@ -0,0 +1,24 @@
from __future__ import annotations
from enum import Enum
class ErrorCode(str, Enum):
    """Coarse error classes used to decide retry behavior."""

    TEMPORARY = "temporary"
    PERMANENT = "permanent"
    FATAL = "fatal"
    UNAUTHORIZED = "unauthorized"
    QUOTA = "quota_exceeded"
    VALIDATION = "validation"


def classify_status(status: int) -> tuple[ErrorCode | None, bool]:
    """Map an HTTP status to an (error code, retryable) pair.

    Statuses outside 400-599 yield (None, False); 429 and 5xx are retryable.
    """
    if status in (401, 403):
        return ErrorCode.UNAUTHORIZED, False
    if status == 429:
        return ErrorCode.QUOTA, True
    if status < 400 or status >= 600:
        return None, False
    if status >= 500:
        return ErrorCode.TEMPORARY, True
    return ErrorCode.PERMANENT, False

View File

@@ -0,0 +1,24 @@
from __future__ import annotations
from typing import Protocol
class MetricsSink(Protocol):
    """Structural interface for SDK instrumentation hooks.

    Implementations receive claim/ack counters and heartbeat telemetry.
    """
    def inc_claimed(self) -> None: ...
    def inc_ack(self, status: str) -> None: ...
    def observe_heartbeat_latency(self, seconds: float) -> None: ...
    def inc_heartbeat_failures(self) -> None: ...
class NoopMetrics:
    """MetricsSink implementation that silently discards every measurement."""

    def inc_claimed(self) -> None:
        pass

    def inc_ack(self, status: str) -> None:
        pass

    def observe_heartbeat_latency(self, seconds: float) -> None:
        pass

    def inc_heartbeat_failures(self) -> None:
        pass

View File

@@ -0,0 +1,34 @@
from __future__ import annotations
import asyncio
import random
from dataclasses import dataclass
from typing import Awaitable, Callable
@dataclass
class RetryPolicy:
    """Exponential-backoff retry settings."""

    max_attempts: int = 5
    base_delay: float = 0.2  # seconds before the first retry
    max_delay: float = 5.0  # ceiling for any single sleep, in seconds
    jitter: float = 0.2  # +/- fraction applied to each delay (20%)


def _jittered(delay: float, jitter: float) -> float:
    """Scale *delay* by a uniform random factor in [1 - jitter, 1 + jitter]."""
    if jitter <= 0:
        return delay
    factor = 1 + ((random.random() * 2 - 1) * jitter)
    return delay * factor


async def retry(
    policy: RetryPolicy,
    fn: Callable[[], Awaitable[None]],
    *,
    retry_on: tuple[type[BaseException], ...] = (Exception,),
) -> None:
    """Await *fn*, retrying with exponential backoff and jitter.

    Only exceptions matching *retry_on* (default: any Exception) are retried;
    anything else — and the final failed attempt — propagates unchanged.
    The new keyword-only parameter is backward compatible with the previous
    catch-all behavior.
    """
    delay = policy.base_delay
    for attempt in range(1, policy.max_attempts + 1):
        try:
            await fn()
            return
        except retry_on:
            if attempt == policy.max_attempts:
                raise
            await asyncio.sleep(min(_jittered(delay, policy.jitter), policy.max_delay))
            delay = min(delay * 2, policy.max_delay)

View File

@@ -0,0 +1,56 @@
from __future__ import annotations
import hashlib
from dataclasses import dataclass
from typing import Protocol, Dict, Optional
class Storage(Protocol):
    """Structural interface for artifact object stores."""

    async def put_object(self, key: str, data: bytes, metadata: Dict[str, str]) -> None: ...


@dataclass
class ArtifactPublishResult:
    """Checksum and byte length of a published artifact."""

    sha256: str
    size: int


async def publish_artifact(
    *,
    storage: Storage,
    job_id: str,
    lease_id: str,
    object_key: str,
    content: bytes,
    content_type: str = "application/octet-stream",
    artifact_type: Optional[str] = None,
    idempotency_key: Optional[str] = None,
) -> ArtifactPublishResult:
    """Upload *content* with StellaOps metadata and return its sha256 and size.

    Raises ValueError when job/lease ids, the object key, or storage is missing.
    """
    if not job_id or not lease_id:
        raise ValueError("job_id and lease_id are required")
    if not object_key:
        raise ValueError("object_key is required")
    if storage is None:
        raise ValueError("storage is required")
    digest = hashlib.sha256(content).hexdigest()
    metadata = {
        "x-stellaops-job-id": job_id,
        "x-stellaops-lease": lease_id,
        "x-stellaops-ct": content_type,
    }
    if artifact_type:
        metadata["x-stellaops-type"] = artifact_type
    if idempotency_key:
        metadata["x-idempotency-key"] = idempotency_key
    await storage.put_object(object_key, content, metadata)
    return ArtifactPublishResult(sha256=digest, size=len(content))


class InMemoryStorage(Storage):
    """Test double that records every put_object call in memory."""

    def __init__(self):
        self.calls = []  # list of (key, data, metadata) tuples, in call order

    async def put_object(self, key: str, data: bytes, metadata: Dict[str, str]) -> None:
        self.calls.append((key, data, metadata))

View File

@@ -0,0 +1,164 @@
import asyncio
import json
import unittest
import datetime as dt
from stellaops_jobengine_worker import (
AckJobRequest,
ClaimJobRequest,
Config,
ErrorCode,
Deduper,
Range,
WatermarkHandshake,
execute_range,
verify_and_publish_artifact,
InMemoryStorage,
InMemoryTransport,
MetricsSink,
OrchestratorClient,
TransportRequest,
TransportResponse,
classify_status,
publish_artifact,
)
class ClientTests(unittest.TestCase):
    """Unit tests for client headers/metrics, storage, error classification, and backfill helpers.

    Fix: the previous version mixed asyncio.new_event_loop()/set_event_loop()
    (loops never closed, leaked between tests) with the deprecated
    asyncio.get_event_loop(); every coroutine now runs via asyncio.run, which
    creates and closes a fresh loop per call.
    """

    def test_claim_and_ack_headers(self):
        seen = {}
        metric_calls = {"claimed": 0, "ack": 0, "hb_fail": 0}

        class Metrics(MetricsSink):
            def inc_claimed(self) -> None:
                metric_calls["claimed"] += 1

            def inc_ack(self, status: str) -> None:
                metric_calls["ack"] += 1

            def observe_heartbeat_latency(self, seconds: float) -> None:
                metric_calls["latency"] = seconds

            def inc_heartbeat_failures(self) -> None:
                metric_calls["hb_fail"] += 1

        def handler(req: TransportRequest) -> TransportResponse:
            if req.url.endswith("/api/jobs/lease"):
                seen["claim_headers"] = req.headers
                seen["claim_url"] = req.url
                body = json.loads(req.body)
                self.assertEqual(body["worker_id"], "w1")
                payload = {
                    "job_id": "123",
                    "lease_id": "l1",
                    "job_type": "demo",
                    "payload": {"k": "v"},
                }
                return TransportResponse(status=200, headers={}, body=json.dumps(payload).encode())
            if req.url.endswith("/api/jobs/123/heartbeat"):
                return TransportResponse(status=202, headers={}, body=b"")
            if req.url.endswith("/api/jobs/123/progress"):
                return TransportResponse(status=202, headers={}, body=b"")
            seen["ack_headers"] = req.headers
            seen["ack_url"] = req.url
            return TransportResponse(status=202, headers={}, body=b"")

        transport = InMemoryTransport(handler)
        client = OrchestratorClient(
            Config(base_url="http://orch/", api_key="t", tenant_id="tenant-a", project_id="project-1", metrics=Metrics()),
            transport=transport,
        )
        claim = asyncio.run(client.claim(ClaimJobRequest(worker_id="w1", capabilities=["scan"])))
        self.assertEqual(claim.job_id, "123")
        asyncio.run(client.ack(AckJobRequest(job_id="123", lease_id="l1", status="succeeded")))
        asyncio.run(client.heartbeat(job_id="123", lease_id="l1"))
        asyncio.run(client.progress(job_id="123", lease_id="l1", pct=50, message="halfway"))
        headers = seen["claim_headers"]
        self.assertEqual(headers["Authorization"], "Bearer t")
        self.assertEqual(headers["X-StellaOps-Tenant"], "tenant-a")
        self.assertEqual(headers["X-StellaOps-Project"], "project-1")
        self.assertIn("/api/jobs/lease", seen["claim_url"])
        self.assertEqual(metric_calls["claimed"], 1)
        self.assertEqual(metric_calls["ack"], 1)

    def test_missing_worker_rejected(self):
        client = OrchestratorClient(Config(base_url="http://orch"))
        with self.assertRaises(ValueError):
            asyncio.run(client.claim(ClaimJobRequest(worker_id="")))

    def test_publish_artifact(self):
        storage = InMemoryStorage()
        result = asyncio.run(
            publish_artifact(
                storage=storage,
                job_id="j1",
                lease_id="l1",
                object_key="artifacts/j1/out.txt",
                content=b"hello",
                content_type="text/plain",
                artifact_type="log",
                idempotency_key="idem-1",
            )
        )
        self.assertEqual(result.size, 5)
        self.assertEqual(len(storage.calls), 1)
        key, data, metadata = storage.calls[0]
        self.assertEqual(key, "artifacts/j1/out.txt")
        self.assertEqual(data, b"hello")
        self.assertEqual(metadata["x-idempotency-key"], "idem-1")

    def test_classify_status(self):
        code, retry = classify_status(500)
        self.assertEqual(code, ErrorCode.TEMPORARY)
        self.assertTrue(retry)
        code, retry = classify_status(404)
        self.assertEqual(code, ErrorCode.PERMANENT)
        self.assertFalse(retry)

    def test_execute_range_and_watermark(self):
        r = Range(start=dt.datetime(2025, 11, 15), end=dt.datetime(2025, 11, 17))
        hits = []

        async def fn(ts: dt.datetime):
            hits.append(ts.date())

        asyncio.run(execute_range(r, dt.timedelta(days=1), fn))
        self.assertEqual(len(hits), 3)
        with self.assertRaises(ValueError):
            Range(start=r.end, end=r.start - dt.timedelta(days=1)).validate()
        wm = WatermarkHandshake(expected="w1", current="w2")
        with self.assertRaises(ValueError):
            wm.validate()

    def test_verify_and_publish_dedupe(self):
        storage = InMemoryStorage()
        dedupe = Deduper()
        dedupe.seen("idem-1")  # pre-seed so the next publish with this key is a duplicate
        with self.assertRaises(ValueError):
            asyncio.run(
                verify_and_publish_artifact(
                    storage=storage,
                    wm=WatermarkHandshake(expected="w", current="w"),
                    dedupe=dedupe,
                    job_id="j",
                    lease_id="l",
                    object_key="k",
                    content=b"",
                    idempotency_key="idem-1",
                )
            )


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,62 @@
from __future__ import annotations
import asyncio
from dataclasses import dataclass
from typing import Awaitable, Callable, Dict, Optional
import json
import urllib.request
@dataclass
class TransportRequest:
    """HTTP request handed to a Transport implementation."""

    method: str
    url: str
    headers: Dict[str, str]
    body: Optional[bytes]


@dataclass
class TransportResponse:
    """HTTP response returned by a Transport implementation."""

    status: int
    headers: Dict[str, str]
    body: bytes


class Transport:
    """Abstract transport interface for HTTP requests."""

    async def execute(self, request: TransportRequest) -> TransportResponse:  # pragma: no cover - interface
        raise NotImplementedError


class _StdlibTransport(Transport):
    """Dependency-free transport built on urllib, executed in a worker thread."""

    def __init__(self, *, timeout: float = 10.0):
        self._timeout = timeout

    async def execute(self, request: TransportRequest) -> TransportResponse:
        def _do() -> TransportResponse:
            from urllib.error import HTTPError  # local import: only needed on this path

            req = urllib.request.Request(
                request.url, data=request.body, method=request.method, headers=request.headers
            )
            try:
                with urllib.request.urlopen(req, timeout=self._timeout) as resp:  # nosec B310: controlled endpoint
                    return TransportResponse(
                        status=resp.status,
                        headers=dict(resp.headers.items()),
                        body=resp.read(),
                    )
            except HTTPError as err:
                # Fix: urllib raises HTTPError on 4xx/5xx, so callers that check
                # resp.status >= 300 never saw those responses; convert the error
                # into a TransportResponse so status-based handling works.
                return TransportResponse(
                    status=err.code,
                    headers=dict(err.headers.items()) if err.headers else {},
                    body=err.read(),
                )

        return await asyncio.to_thread(_do)


class InMemoryTransport(Transport):
    """Simple stub transport for tests that returns a prepared response."""

    def __init__(self, handler: Callable[[TransportRequest], TransportResponse]):
        self._handler = handler

    async def execute(self, request: TransportRequest) -> TransportResponse:
        return self._handler(request)


def default_transport() -> Transport:
    """Return the stdlib-based transport used when Config has none set."""
    return _StdlibTransport()