feat(logging): structured TRACE events and end-to-end request correlation

Add core/trace.py with trace_event, traced_async_stream, and payload snapshots.
Merge TRACE fields into JSON logs; promote claude_session_id, http path/method.
Instrument API, messaging/CLI, and OpenAI-compat/native provider paths.
Harden log sink with enqueue and stdlib intercept re-entrancy guard.
Document behavior in .env.example and README; extend tests.
This commit is contained in:
Alishahryar1
2026-05-10 18:24:48 -07:00
parent 1e97dff214
commit 29e7714337
18 changed files with 646 additions and 140 deletions
+23 -8
View File
@@ -13,6 +13,7 @@ from starlette.types import Receive, Scope, Send
from config.logging_config import configure_logging
from config.settings import get_settings
from core.trace import extract_claude_session_id_from_headers, trace_event
from providers.exceptions import ProviderError
from .admin_routes import router as admin_router
@@ -95,6 +96,18 @@ def create_app(*, lifespan_enabled: bool = True) -> FastAPI:
app_kwargs["lifespan"] = lifespan
app = FastAPI(**app_kwargs)
@app.middleware("http")
async def trace_http_correlation(request: Request, call_next):
"""Attach HTTP identifiers and optional Claude session id to logs."""
# Purpose: registered as an "http" middleware on `app`; for each request it
# binds the HTTP method, URL path, and a Claude session id into the logger
# context, then forwards the request down the stack via `call_next`.
#
# The session id is derived from the incoming request headers by the
# core.trace helper.
# NOTE(review): presumably this is None when the client sends no session
# header (the docstring calls it "optional") — confirm against core/trace.py.
claude_sid = extract_claude_session_id_from_headers(request.headers)
# These three keyword fields are bound for the duration of the `with` block.
# NOTE(review): `logger` is presumably loguru's logger (its import is not
# visible in this hunk) — confirm.
with logger.contextualize(
http_method=request.method,
http_path=request.url.path,
claude_session_id=claude_sid,
):
# Hand the request to the next middleware / route handler.
response = await call_next(request)
# NOTE(review): the bound fields are guaranteed only while `call_next` is
# awaited; whether log lines emitted later (e.g. while a streaming body is
# being sent) still carry them cannot be determined from here — confirm.
return response
# Register routes
app.include_router(admin_router)
app.include_router(router)
@@ -111,14 +124,16 @@ def create_app(*, lifespan_enabled: bool = True) -> FastAPI:
message_summary, tool_names = summarize_request_validation_body(body)
logger.debug(
"Request validation failed: path={} query={} error_locs={} error_types={} message_summary={} tool_names={}",
request.url.path,
str(request.url.query),
[list(error.get("loc", ())) for error in exc.errors()],
[str(error.get("type", "")) for error in exc.errors()],
message_summary,
tool_names,
trace_event(
stage="ingress",
event="server.request.validation_failed",
source="api",
path=request.url.path,
query=dict(request.query_params),
error_locs=[list(error.get("loc", ())) for error in exc.errors()],
error_types=[str(error.get("type", "")) for error in exc.errors()],
message_summary=message_summary,
tool_names=tool_names,
)
return await request_validation_exception_handler(request, exc)
+8
View File
@@ -5,6 +5,7 @@ from loguru import logger
from config.settings import Settings
from core.anthropic import get_token_count
from core.trace import trace_event
from providers.registry import ProviderRegistry
from . import dependencies
@@ -231,6 +232,7 @@ async def list_models(
_auth=Depends(require_api_key),
):
"""List the model ids this proxy advertises to Claude-compatible clients."""
trace_event(stage="ingress", event="api.models.list", source="api")
registry = getattr(request.app.state, "provider_registry", None)
provider_registry = registry if isinstance(registry, ProviderRegistry) else None
return _build_models_list_response(settings, provider_registry)
@@ -250,5 +252,11 @@ async def stop_cli(request: Request, _auth=Depends(require_api_key)):
raise HTTPException(status_code=503, detail="Messaging system not initialized")
count = await handler.stop_all_tasks()
trace_event(
stage="ingress",
event="api.cli.stop_via_handler",
source="api",
cancelled_nodes=count,
)
logger.info("STOP_CLI: source=handler cancelled_count={}", count)
return {"status": "stopped", "cancelled_count": count}
+78 -27
View File
@@ -14,6 +14,7 @@ from loguru import logger
from config.settings import Settings
from core.anthropic import get_token_count, get_user_facing_error_message
from core.anthropic.sse import ANTHROPIC_SSE_RESPONSE_HEADERS
from core.trace import api_messages_request_snapshot, trace_event, traced_async_stream
from providers.base import BaseProvider
from providers.exceptions import InvalidRequestError, ProviderError
@@ -118,7 +119,12 @@ class ClaudeProxyService:
input_tokens = self._token_counter(
routed.request.messages, routed.request.system, routed.request.tools
)
logger.info("Optimization: Handling Anthropic web server tool")
trace_event(
stage="routing",
event="api.optimization.web_server_tool",
source="api",
model=routed.request.model,
)
egress = WebFetchEgressPolicy(
allow_private_network_targets=self._settings.web_fetch_allow_private_networks,
allowed_schemes=self._settings.web_fetch_allowed_scheme_set(),
@@ -134,6 +140,12 @@ class ClaudeProxyService:
optimized = try_optimizations(routed.request, self._settings)
if optimized is not None:
trace_event(
stage="routing",
event="api.optimization.short_circuit",
source="api",
model=routed.request.model,
)
return optimized
logger.debug("No optimization matched, routing to provider")
@@ -143,29 +155,57 @@ class ClaudeProxyService:
thinking_enabled=routed.resolved.thinking_enabled,
)
request_id = f"req_{uuid.uuid4().hex[:12]}"
logger.info(
"API_REQUEST: request_id={} model={} messages={}",
request_id,
routed.request.model,
len(routed.request.messages),
trace_event(
stage="routing",
event="api.route.resolved",
source="api",
provider_id=routed.resolved.provider_id,
provider_model=routed.resolved.provider_model,
provider_model_ref=routed.resolved.provider_model_ref,
gateway_model=routed.request.model,
thinking_enabled=routed.resolved.thinking_enabled,
)
if self._settings.log_raw_api_payloads:
logger.debug(
"FULL_PAYLOAD [{}]: {}", request_id, routed.request.model_dump()
request_id = f"req_{uuid.uuid4().hex[:12]}"
with logger.contextualize(request_id=request_id):
trace_event(
stage="ingress",
event="api.request.received",
source="api",
message_count=len(routed.request.messages),
snapshot=api_messages_request_snapshot(routed.request),
)
input_tokens = self._token_counter(
routed.request.messages, routed.request.system, routed.request.tools
)
return anthropic_sse_streaming_response(
provider.stream_response(
routed.request,
input_tokens=input_tokens,
request_id=request_id,
thinking_enabled=routed.resolved.thinking_enabled,
),
)
if self._settings.log_raw_api_payloads:
logger.debug(
"FULL_PAYLOAD [{}]: {}", request_id, routed.request.model_dump()
)
input_tokens = self._token_counter(
routed.request.messages,
routed.request.system,
routed.request.tools,
)
streamed = traced_async_stream(
provider.stream_response(
routed.request,
input_tokens=input_tokens,
request_id=request_id,
thinking_enabled=routed.resolved.thinking_enabled,
),
stage="egress",
source="api",
complete_event="api.response.stream_completed",
interrupted_event="api.response.stream_interrupted",
chunk_event=None,
extra={
"request_id": request_id,
"provider_id": routed.resolved.provider_id,
"gateway_model": routed.request.model,
},
)
return anthropic_sse_streaming_response(streamed)
except ProviderError:
raise
@@ -188,12 +228,23 @@ class ClaudeProxyService:
tokens = self._token_counter(
routed.request.messages, routed.request.system, routed.request.tools
)
logger.info(
"COUNT_TOKENS: request_id={} model={} messages={} input_tokens={}",
request_id,
routed.request.model,
len(routed.request.messages),
tokens,
trace_event(
stage="routing",
event="api.route.resolved",
source="api",
kind="count_tokens",
provider_id=routed.resolved.provider_id,
provider_model=routed.resolved.provider_model,
provider_model_ref=routed.resolved.provider_model_ref,
gateway_model=routed.request.model,
)
trace_event(
stage="ingress",
event="api.count_tokens.completed",
source="api",
message_count=len(routed.request.messages),
input_tokens=tokens,
snapshot=api_messages_request_snapshot(routed.request),
)
return TokenCountResponse(input_tokens=tokens)
except ProviderError: