diff --git a/.env.example b/.env.example index 8699ce1..79af59d 100644 --- a/.env.example +++ b/.env.example @@ -49,6 +49,8 @@ FCC_SMOKE_MODEL_LLAMACPP= FCC_SMOKE_MODEL_OLLAMA= FCC_SMOKE_MODEL_KIMI= FCC_SMOKE_MODEL_WAFER= +FCC_SMOKE_NIM_MODELS= +FCC_SMOKE_NIM_EXTRA_MODELS= # Thinking output diff --git a/providers/nvidia_nim/client.py b/providers/nvidia_nim/client.py index ef06c10..630d810 100644 --- a/providers/nvidia_nim/client.py +++ b/providers/nvidia_nim/client.py @@ -12,10 +12,12 @@ from providers.defaults import NVIDIA_NIM_DEFAULT_BASE from providers.openai_compat import OpenAIChatTransport from .request import ( + body_without_nim_tool_argument_aliases, build_request_body, clone_body_without_chat_template, clone_body_without_reasoning_budget, clone_body_without_reasoning_content, + nim_tool_argument_aliases_from_body, ) @@ -41,6 +43,14 @@ class NvidiaNimProvider(OpenAIChatTransport): thinking_enabled=self._is_thinking_enabled(request, thinking_enabled), ) + def _prepare_create_body(self, body: dict[str, Any]) -> dict[str, Any]: + """Strip private request metadata before calling NVIDIA NIM.""" + return body_without_nim_tool_argument_aliases(body) + + def _tool_argument_aliases(self, body: dict[str, Any]) -> dict[str, dict[str, str]]: + """Return NIM tool argument aliases captured while building this request.""" + return nim_tool_argument_aliases_from_body(body) + def _get_retry_request_body(self, error: Exception, body: dict) -> dict | None: """Retry once with a downgraded body when NIM rejects a known field.""" status_code = getattr(error, "status_code", None) diff --git a/providers/nvidia_nim/request.py b/providers/nvidia_nim/request.py index d04b622..b860968 100644 --- a/providers/nvidia_nim/request.py +++ b/providers/nvidia_nim/request.py @@ -34,6 +34,9 @@ _SCHEMA_LIST_KEYS = frozenset({"allOf", "anyOf", "oneOf", "prefixItems"}) _SCHEMA_MAP_KEYS = frozenset( {"properties", "patternProperties", "$defs", "definitions", "dependentSchemas"} ) 
+NIM_TOOL_ARGUMENT_ALIASES_KEY = "_fcc_nim_tool_argument_aliases" +_NIM_TOOL_PARAMETER_ALIAS_PREFIX = "_fcc_arg_" +_NIM_UNSAFE_TOOL_PARAMETER_NAMES = frozenset({"type"}) def _clone_strip_extra_body( @@ -123,12 +126,135 @@ def _sanitize_nim_schema_node(value: Any) -> tuple[bool, Any]: return True, value +def _needs_nim_tool_parameter_alias(name: str) -> bool: + return name in _NIM_UNSAFE_TOOL_PARAMETER_NAMES + + +def _make_nim_tool_parameter_alias(name: str, reserved: set[str]) -> str: + safe_tail = "".join( + character if character.isalnum() or character == "_" else "_" + for character in name + ).strip("_") + if not safe_tail: + safe_tail = "arg" + candidate = f"{_NIM_TOOL_PARAMETER_ALIAS_PREFIX}{safe_tail}" + alias = candidate + suffix = 2 + while alias in reserved: + alias = f"{candidate}_{suffix}" + suffix += 1 + reserved.add(alias) + return alias + + +def _collect_nim_tool_property_names(value: Any) -> set[str]: + names: set[str] = set() + if isinstance(value, dict): + properties = value.get("properties") + if isinstance(properties, dict): + for property_name, property_schema in properties.items(): + if isinstance(property_name, str): + names.add(property_name) + names.update(_collect_nim_tool_property_names(property_schema)) + for key, item in value.items(): + if key != "properties": + names.update(_collect_nim_tool_property_names(item)) + elif isinstance(value, list): + for item in value: + names.update(_collect_nim_tool_property_names(item)) + return names + + +def _alias_nim_schema_property_names( + value: Any, + *, + reserved: set[str], + alias_to_original: dict[str, str], + original_to_alias: dict[str, str], +) -> Any: + if isinstance(value, list): + return [ + _alias_nim_schema_property_names( + item, + reserved=reserved, + alias_to_original=alias_to_original, + original_to_alias=original_to_alias, + ) + for item in value + ] + if not isinstance(value, dict): + return value + + local_aliases: dict[str, str] = {} + aliased_value: dict[str, Any] = {} + 
properties = value.get("properties") + if isinstance(properties, dict): + aliased_properties: dict[str, Any] = {} + for property_name, property_schema in properties.items(): + aliased_schema = _alias_nim_schema_property_names( + property_schema, + reserved=reserved, + alias_to_original=alias_to_original, + original_to_alias=original_to_alias, + ) + if isinstance(property_name, str) and _needs_nim_tool_parameter_alias( + property_name + ): + alias = original_to_alias.get(property_name) + if alias is None: + alias = _make_nim_tool_parameter_alias(property_name, reserved) + alias_to_original[alias] = property_name + original_to_alias[property_name] = alias + local_aliases[property_name] = alias + aliased_properties[alias] = aliased_schema + else: + aliased_properties[property_name] = aliased_schema + aliased_value["properties"] = aliased_properties + + for key, item in value.items(): + if key == "properties": + continue + if key == "required" and isinstance(item, list): + aliased_value[key] = [ + local_aliases.get(required_item, required_item) + if isinstance(required_item, str) + else required_item + for required_item in item + ] + continue + aliased_value[key] = _alias_nim_schema_property_names( + item, + reserved=reserved, + alias_to_original=alias_to_original, + original_to_alias=original_to_alias, + ) + return aliased_value + + +def _alias_nim_tool_parameters( + parameters: dict[str, Any], +) -> tuple[dict[str, Any], dict[str, str]]: + alias_to_original: dict[str, str] = {} + original_to_alias: dict[str, str] = {} + reserved = _collect_nim_tool_property_names(parameters) + aliased_parameters = _alias_nim_schema_property_names( + parameters, + reserved=reserved, + alias_to_original=alias_to_original, + original_to_alias=original_to_alias, + ) + if not alias_to_original: + return parameters, {} + return aliased_parameters, alias_to_original + + def _sanitize_nim_tool_schemas(body: dict[str, Any]) -> None: """Sanitize only tool parameter schemas, preserving tool 
calls/history.""" tools = body.get("tools") if not isinstance(tools, list): return + tool_argument_aliases: dict[str, dict[str, str]] = {} sanitized_tools: list[Any] = [] for tool in tools: if not isinstance(tool, dict): @@ -141,11 +267,52 @@ def _sanitize_nim_tool_schemas(body: dict[str, Any]) -> None: parameters = function.get("parameters") if isinstance(parameters, dict): _, sanitized_parameters = _sanitize_nim_schema_node(parameters) + sanitized_parameters, argument_aliases = _alias_nim_tool_parameters( + sanitized_parameters + ) sanitized_function["parameters"] = sanitized_parameters + tool_name = function.get("name") + if argument_aliases and isinstance(tool_name, str) and tool_name: + tool_argument_aliases[tool_name] = argument_aliases sanitized_tool["function"] = sanitized_function sanitized_tools.append(sanitized_tool) body["tools"] = sanitized_tools + if tool_argument_aliases: + body[NIM_TOOL_ARGUMENT_ALIASES_KEY] = tool_argument_aliases + else: + body.pop(NIM_TOOL_ARGUMENT_ALIASES_KEY, None) + + +def nim_tool_argument_aliases_from_body( + body: dict[str, Any], +) -> dict[str, dict[str, str]]: + """Return validated private NIM tool argument aliases from a built body.""" + raw_aliases = body.get(NIM_TOOL_ARGUMENT_ALIASES_KEY) + if not isinstance(raw_aliases, dict): + return {} + + aliases: dict[str, dict[str, str]] = {} + for tool_name, tool_aliases in raw_aliases.items(): + if not isinstance(tool_name, str) or not isinstance(tool_aliases, dict): + continue + sanitized_aliases = { + alias: original + for alias, original in tool_aliases.items() + if isinstance(alias, str) and isinstance(original, str) + } + if sanitized_aliases: + aliases[tool_name] = sanitized_aliases + return aliases + + +def body_without_nim_tool_argument_aliases(body: dict[str, Any]) -> dict[str, Any]: + """Return a request body with private alias metadata stripped before upstream I/O.""" + if NIM_TOOL_ARGUMENT_ALIASES_KEY not in body: + return body + upstream_body = dict(body) + 
upstream_body.pop(NIM_TOOL_ARGUMENT_ALIASES_KEY, None) + return upstream_body def _set_extra( diff --git a/providers/openai_compat.py b/providers/openai_compat.py index 9ebcf03..071b827 100644 --- a/providers/openai_compat.py +++ b/providers/openai_compat.py @@ -128,11 +128,20 @@ class OpenAIChatTransport(BaseProvider): """Return a modified request body for one retry, or None.""" return None + def _prepare_create_body(self, body: dict[str, Any]) -> dict[str, Any]: + """Return the body passed to the upstream OpenAI-compatible client.""" + return body + + def _tool_argument_aliases(self, body: dict[str, Any]) -> dict[str, dict[str, str]]: + """Return provider-specific per-tool argument aliases for this request.""" + return {} + async def _create_stream(self, body: dict) -> tuple[Any, dict]: """Create a streaming chat completion, optionally retrying once.""" try: + create_body = self._prepare_create_body(body) stream = await self._global_rate_limiter.execute_with_retry( - self._client.chat.completions.create, **body, stream=True + self._client.chat.completions.create, **create_body, stream=True ) return stream, body except Exception as error: @@ -140,13 +149,49 @@ class OpenAIChatTransport(BaseProvider): if retry_body is None: raise + create_retry_body = self._prepare_create_body(retry_body) stream = await self._global_rate_limiter.execute_with_retry( - self._client.chat.completions.create, **retry_body, stream=True + self._client.chat.completions.create, **create_retry_body, stream=True ) return stream, retry_body + def _restore_aliased_tool_arguments( + self, argument_json: str, aliases: dict[str, str] + ) -> str | None: + try: + parsed = json.loads(argument_json) + except json.JSONDecodeError: + return None + if not isinstance(parsed, dict): + return argument_json + restored = self._restore_aliased_tool_argument_value(parsed, aliases) + return json.dumps(restored) + + def _restore_aliased_tool_argument_value( + self, value: Any, aliases: dict[str, str] + ) -> Any: 
+ if isinstance(value, dict): + return { + aliases.get(key, key): self._restore_aliased_tool_argument_value( + item, aliases + ) + for key, item in value.items() + } + if isinstance(value, list): + return [ + self._restore_aliased_tool_argument_value(item, aliases) + for item in value + ] + return value + def _emit_tool_arg_delta( - self, sse: SSEBuilder, tc_index: int, args: str + self, + sse: SSEBuilder, + tc_index: int, + args: str, + *, + tool_argument_aliases: dict[str, dict[str, str]] | None = None, + tool_argument_alias_buffers: dict[int, str] | None = None, ) -> Iterator[str]: """Emit one argument fragment for a started tool block (Task buffer or raw JSON).""" if not args: @@ -159,9 +204,34 @@ class OpenAIChatTransport(BaseProvider): if parsed is not None: yield sse.emit_tool_delta(tc_index, json.dumps(parsed)) return + aliases = ( + tool_argument_aliases.get(state.name, {}) if tool_argument_aliases else {} + ) + if aliases: + if tool_argument_alias_buffers is None: + restored = self._restore_aliased_tool_arguments(args, aliases) + if restored is not None: + yield sse.emit_tool_delta(tc_index, restored) + return + + buffered_args = tool_argument_alias_buffers.get(tc_index, "") + args + restored = self._restore_aliased_tool_arguments(buffered_args, aliases) + if restored is None: + tool_argument_alias_buffers[tc_index] = buffered_args + return + tool_argument_alias_buffers.pop(tc_index, None) + yield sse.emit_tool_delta(tc_index, restored) + return yield sse.emit_tool_delta(tc_index, args) - def _process_tool_call(self, tc: dict, sse: SSEBuilder) -> Iterator[str]: + def _process_tool_call( + self, + tc: dict, + sse: SSEBuilder, + *, + tool_argument_aliases: dict[str, dict[str, str]] | None = None, + tool_argument_alias_buffers: dict[int, str] | None = None, + ) -> Iterator[str]: """Process a single tool call delta and yield SSE events.""" tc_index = tc.get("index", 0) if tc_index < 0: @@ -193,7 +263,13 @@ class OpenAIChatTransport(BaseProvider): if 
state.pre_start_args: pre = state.pre_start_args state.pre_start_args = "" - yield from self._emit_tool_arg_delta(sse, tc_index, pre) + yield from self._emit_tool_arg_delta( + sse, + tc_index, + pre, + tool_argument_aliases=tool_argument_aliases, + tool_argument_alias_buffers=tool_argument_alias_buffers, + ) state = sse.blocks.tool_states.get(tc_index) if not arguments: @@ -204,13 +280,43 @@ class OpenAIChatTransport(BaseProvider): state.pre_start_args += arguments return - yield from self._emit_tool_arg_delta(sse, tc_index, arguments) + yield from self._emit_tool_arg_delta( + sse, + tc_index, + arguments, + tool_argument_aliases=tool_argument_aliases, + tool_argument_alias_buffers=tool_argument_alias_buffers, + ) def _flush_task_arg_buffers(self, sse: SSEBuilder) -> Iterator[str]: """Emit buffered Task args as a single JSON delta (best-effort).""" for tool_index, out in sse.blocks.flush_task_arg_buffers(): yield sse.emit_tool_delta(tool_index, out) + def _flush_tool_argument_alias_buffers( + self, + sse: SSEBuilder, + tool_argument_aliases: dict[str, dict[str, str]], + tool_argument_alias_buffers: dict[int, str], + ) -> Iterator[str]: + """Emit remaining aliased tool args without losing data on malformed JSON.""" + for tool_index, buffered_args in list(tool_argument_alias_buffers.items()): + if not buffered_args: + tool_argument_alias_buffers.pop(tool_index, None) + continue + state = sse.blocks.tool_states.get(tool_index) + if state is None or state.name == "Task": + continue + aliases = tool_argument_aliases.get(state.name, {}) + if not aliases: + continue + restored = self._restore_aliased_tool_arguments(buffered_args, aliases) + yield sse.emit_tool_delta( + tool_index, + restored if restored is not None else buffered_args, + ) + tool_argument_alias_buffers.pop(tool_index, None) + async def stream_response( self, request: Any, @@ -262,10 +368,13 @@ class OpenAIChatTransport(BaseProvider): heuristic_parser = HeuristicToolParser() finish_reason = None usage_info 
= None + tool_argument_aliases: dict[str, dict[str, str]] = {} + tool_argument_alias_buffers: dict[int, str] = {} async with self._global_rate_limiter.concurrency_slot(): try: stream, body = await self._create_stream(body) + tool_argument_aliases = self._tool_argument_aliases(body) async for chunk in stream: if getattr(chunk, "usage", None): usage_info = chunk.usage @@ -335,7 +444,12 @@ class OpenAIChatTransport(BaseProvider): "arguments": tc.function.arguments, }, } - for event in self._process_tool_call(tc_info, sse): + for event in self._process_tool_call( + tc_info, + sse, + tool_argument_aliases=tool_argument_aliases, + tool_argument_alias_buffers=tool_argument_alias_buffers, + ): yield event except asyncio.CancelledError, GeneratorExit: @@ -409,6 +523,11 @@ class OpenAIChatTransport(BaseProvider): yield event yield sse.emit_text_delta(" ") + for event in self._flush_tool_argument_alias_buffers( + sse, tool_argument_aliases, tool_argument_alias_buffers + ): + yield event + for event in self._flush_task_arg_buffers(sse): yield event diff --git a/smoke/README.md b/smoke/README.md index c6f5f79..e0208b4 100644 --- a/smoke/README.md +++ b/smoke/README.md @@ -58,10 +58,11 @@ Default targets do not send real bot messages or load voice backends: | `llamacpp` | local `/models` plus native `/messages` through proxy | running llama-server | | `ollama` | local `/api/tags` plus native Anthropic messages through proxy | running Ollama server | -Side-effectful targets are opt-in: +Heavy/side-effectful targets are opt-in: | Target | Product scenarios | Required environment | | --- | --- | --- | +| `nvidia_nim_cli` | Claude Code CLI feature matrix across NIM models | `NVIDIA_NIM_API_KEY`, Claude CLI | | `telegram` | getMe, send, edit, delete, optional manual inbound | token and chat/user ID | | `discord` | channel access, send, edit, delete, optional manual inbound | token and channel ID | | `voice` | generated WAV through local Whisper or NVIDIA NIM transcription | 
`VOICE_NOTE_ENABLED=true`, `FCC_SMOKE_RUN_VOICE=1` | @@ -88,6 +89,13 @@ $env:FCC_SMOKE_RUN_VOICE = "1" uv run pytest smoke/product -n 0 -s --tb=short ``` +```powershell +$env:FCC_LIVE_SMOKE = "1" +$env:FCC_SMOKE_TARGETS = "nvidia_nim_cli" +$env:FCC_SMOKE_NIM_MODELS = "z-ai/glm-5.1,moonshotai/kimi-k2.6,minimaxai/minimax-m2.7,nvidia/nemotron-3-super-120b-a12b,deepseek-ai/deepseek-v4-pro,deepseek-ai/deepseek-v4-flash" +uv run pytest smoke/product -n 0 -s --tb=short +``` + ```powershell $env:FCC_LIVE_SMOKE = "1" $env:FCC_SMOKE_TARGETS = "messaging,config,extensibility" @@ -106,6 +114,10 @@ uv run pytest smoke/product -n 0 -s --tb=short `FCC_SMOKE_MODEL_LLAMACPP`, `FCC_SMOKE_MODEL_OLLAMA`: optional per-provider smoke model overrides. Values may include the provider prefix or just the model name for that provider. +- `FCC_SMOKE_NIM_MODELS`: optional comma-separated NVIDIA NIM CLI matrix models + that replace the default characterization set. Leave it unset to keep the defaults; a value that is set but lists no models raises `ValueError`. +- `FCC_SMOKE_NIM_EXTRA_MODELS`: optional comma-separated NVIDIA NIM CLI matrix + models appended to the default or replacement set. - `FCC_SMOKE_TIMEOUT_S`: per-request/subprocess timeout, default `45`. - `FCC_SMOKE_CLAUDE_BIN`: Claude CLI executable name, default `claude`. - `FCC_SMOKE_TELEGRAM_CHAT_ID`: Telegram chat/user ID for send/edit/delete. @@ -129,10 +141,15 @@ names contain `KEY`, `TOKEN`, `SECRET`, `WEBHOOK`, or `AUTH`. opt-in flag is absent. - `upstream_unavailable`: a real provider, bot API, or local model server is not reachable. +- `probe_timeout`: the smoke driver reached the target, but the CLI/probe did + not complete within the smoke timeout. - `product_failure`: the app accepted the scenario but returned the wrong shape, crashed, leaked state, or violated the product contract. - `harness_bug`: the smoke test or driver made an invalid assumption. +- `target_disabled`: skipped because `FCC_SMOKE_TARGETS` intentionally selected + a different target. -`product_failure` and `harness_bug` are failures. 
`missing_env` and -`upstream_unavailable` are skips except when the user explicitly selected a -provider in `FCC_SMOKE_PROVIDER_MATRIX`; selected-but-missing providers fail. +`product_failure` and `harness_bug` are failures. `missing_env`, +`upstream_unavailable`, and `probe_timeout` are skips except when the user +explicitly selected a provider in `FCC_SMOKE_PROVIDER_MATRIX`; +selected-but-missing providers fail. diff --git a/smoke/capabilities.py b/smoke/capabilities.py index 4b0a5a2..2a3efbe 100644 --- a/smoke/capabilities.py +++ b/smoke/capabilities.py @@ -411,7 +411,7 @@ CAPABILITY_CONTRACTS: tuple[CapabilityContract, ...] = ( "stream-json events and session id mapping", "stderr/error event and process cleanup", ("tests/cli/test_cli.py",), - ("test_claude_cli_prompt_when_available",), + ("test_claude_cli_prompt_when_available", "test_nvidia_nim_cli_matrix_e2e"), ), CapabilityContract( "extensibility", diff --git a/smoke/features.py b/smoke/features.py index fa0bc18..4bb95fa 100644 --- a/smoke/features.py +++ b/smoke/features.py @@ -72,10 +72,11 @@ FEATURE_INVENTORY: tuple[FeatureCoverage, ...] = ( ( "test_api_basic_conversation_e2e", "test_claude_cli_adaptive_thinking_e2e", + "test_nvidia_nim_cli_matrix_e2e", "test_vscode_protocol_e2e", "test_jetbrains_protocol_e2e", ), - ("api", "cli", "clients"), + ("api", "cli", "clients", "nvidia_nim_cli"), ("configured provider", "FCC_SMOKE_CLAUDE_BIN for real Claude CLI"), "skip real CLI when binary is absent; configured providers must pass", ), @@ -384,9 +385,10 @@ FEATURE_INVENTORY: tuple[FeatureCoverage, ...] 
= ( ( "test_claude_cli_adaptive_thinking_e2e", "test_claude_cli_multiturn_tool_protocol_e2e", + "test_nvidia_nim_cli_matrix_e2e", ), - ("cli",), - ("FCC_SMOKE_CLAUDE_BIN", "configured provider"), + ("cli", "nvidia_nim_cli"), + ("FCC_SMOKE_CLAUDE_BIN", "configured provider", "NVIDIA_NIM_API_KEY"), "skip only when Claude CLI binary is absent", ), FeatureCoverage( diff --git a/smoke/lib/config.py b/smoke/lib/config.py index a2dacd5..a5947cb 100644 --- a/smoke/lib/config.py +++ b/smoke/lib/config.py @@ -28,9 +28,11 @@ DEFAULT_TARGETS = frozenset( } ) SIDE_EFFECT_TARGETS = frozenset({"discord", "telegram", "voice"}) -ALL_TARGETS = DEFAULT_TARGETS | SIDE_EFFECT_TARGETS +OPT_IN_TARGETS = frozenset({"nvidia_nim_cli"}) +ALL_TARGETS = DEFAULT_TARGETS | SIDE_EFFECT_TARGETS | OPT_IN_TARGETS TARGET_ALIASES = { "contract": "api", + "nim_cli": "nvidia_nim_cli", "optimizations": "api", "thinking": "providers", "vscode": "clients", @@ -47,6 +49,15 @@ PROVIDER_SMOKE_DEFAULT_MODELS: dict[str, str] = { "wafer": "wafer/DeepSeek-V4-Pro", } +NVIDIA_NIM_CLI_DEFAULT_MODELS: tuple[str, ...] 
= ( + "z-ai/glm-5.1", + "moonshotai/kimi-k2.6", + "minimaxai/minimax-m2.7", + "nvidia/nemotron-3-super-120b-a12b", + "deepseek-ai/deepseek-v4-pro", + "deepseek-ai/deepseek-v4-flash", +) + TARGET_REQUIRED_ENV: dict[str, tuple[str, ...]] = { "api": (), @@ -62,6 +73,10 @@ TARGET_REQUIRED_ENV: dict[str, tuple[str, ...]] = { "lmstudio": ("LM_STUDIO_BASE_URL with a running LM Studio server",), "llamacpp": ("LLAMACPP_BASE_URL with a running llama-server",), "ollama": ("OLLAMA_BASE_URL with a running Ollama server",), + "nvidia_nim_cli": ( + "NVIDIA_NIM_API_KEY", + "FCC_SMOKE_CLAUDE_BIN or claude on PATH", + ), "telegram": ( "TELEGRAM_BOT_TOKEN", "ALLOWED_TELEGRAM_USER_ID or FCC_SMOKE_TELEGRAM_CHAT_ID", @@ -161,6 +176,13 @@ class SmokeConfig: ) return models + def nvidia_nim_cli_models(self) -> list[ProviderModel]: + """Return the NVIDIA NIM models for Claude Code CLI characterization.""" + return [ + ProviderModel(provider="nvidia_nim", full_model=full_model, source=source) + for full_model, source in nvidia_nim_cli_model_refs().items() + ] + def _include_provider_in_smoke( self, provider: str, mapped_providers: set[str] ) -> bool: @@ -197,6 +219,12 @@ def _parse_csv(raw: str | None) -> frozenset[str]: return frozenset(part.strip() for part in raw.split(",") if part.strip()) +def _parse_csv_ordered(raw: str | None) -> tuple[str, ...]: + if not raw: + return () + return tuple(part.strip() for part in raw.split(",") if part.strip()) + + def _parse_targets(raw: str | None) -> frozenset[str]: if not raw: return DEFAULT_TARGETS @@ -237,6 +265,36 @@ def _normalize_provider_model(provider: str, raw_model: str) -> str: return f"{provider}/{model}" +def nvidia_nim_cli_model_refs( + env: Mapping[str, str] | None = None, +) -> dict[str, str]: + """Return normalized NIM CLI matrix model refs in deterministic order. + + Values are returned as ``full_model -> source`` so callers can preserve both + de-duplicated order and provenance in reports. 
+ """ + source = env if env is not None else os.environ + explicit_models = _parse_csv_ordered(source.get("FCC_SMOKE_NIM_MODELS")) + extra_models = _parse_csv_ordered(source.get("FCC_SMOKE_NIM_EXTRA_MODELS")) + + if "FCC_SMOKE_NIM_MODELS" in source and not explicit_models: + raise ValueError("FCC_SMOKE_NIM_MODELS must list at least one model") + + models: list[tuple[str, str]] = [] + base_models = explicit_models or NVIDIA_NIM_CLI_DEFAULT_MODELS + base_source = ( + "FCC_SMOKE_NIM_MODELS" if explicit_models else "nvidia_nim_cli_default" + ) + models.extend((model, base_source) for model in base_models) + models.extend((model, "FCC_SMOKE_NIM_EXTRA_MODELS") for model in extra_models) + + normalized: dict[str, str] = {} + for raw_model, model_source in models: + full_model = _normalize_provider_model("nvidia_nim", raw_model) + normalized.setdefault(full_model, model_source) + return normalized + + def auth_headers(token: str | None = None) -> dict[str, str]: settings = get_settings() resolved = token if token is not None else settings.anthropic_auth_token diff --git a/smoke/lib/nvidia_nim_cli.py b/smoke/lib/nvidia_nim_cli.py new file mode 100644 index 0000000..30a85a5 --- /dev/null +++ b/smoke/lib/nvidia_nim_cli.py @@ -0,0 +1,350 @@ +"""Claude Code CLI characterization helpers for NVIDIA NIM smoke tests.""" + +from __future__ import annotations + +import json +import os +import re +import subprocess +import time +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any + +from smoke.lib.config import SmokeConfig, redacted +from smoke.lib.server import RunningServer + +REGRESSION_CLASSIFICATIONS = frozenset({"harness_bug", "product_failure"}) + +_HTTP_REGRESSION_PATTERNS = ( + r'POST /v1/messages[^"\n]* HTTP/1\.1" 4(?!01|03|04|08|09)\d\d', + r'POST /v1/messages[^"\n]* HTTP/1\.1" 5\d\d', +) +_UPSTREAM_UNAVAILABLE_MARKERS = ( + "upstream_unavailable", + "readtimeout", + "connecterror", + "connection refused", + "timed out", + "rate 
limit", + "429", + "overloaded", + "capacity", + "upstream provider", +) +_MISSING_ENV_MARKERS = ( + "api key", + "not logged in", + "authentication", + "permission denied", +) + + +@dataclass(frozen=True, slots=True) +class ClaudeCliRun: + command: tuple[str, ...] + returncode: int | None + stdout: str + stderr: str + duration_s: float + timed_out: bool = False + + @property + def combined_output(self) -> str: + return f"{self.stdout}\n{self.stderr}" + + +@dataclass(frozen=True, slots=True) +class NimCliMatrixOutcome: + model: str + full_model: str + source: str + feature: str + outcome: str + classification: str + duration_s: float + cli_returncode: int | None + token_evidence: dict[str, Any] + request_count: int + log_path: str + stdout_excerpt: str + stderr_excerpt: str + log_excerpt: str + + +def run_claude_cli( + *, + claude_bin: str, + server: RunningServer, + config: SmokeConfig, + cwd: Path, + prompt: str, + tools: str | None, + extra_args: tuple[str, ...] = (), + session_id: str | None = None, + resume_session_id: str | None = None, + no_session_persistence: bool = True, +) -> ClaudeCliRun: + """Run Claude Code CLI against the local smoke proxy.""" + cwd.mkdir(parents=True, exist_ok=True) + + cmd: list[str] = [claude_bin, "--bare"] + if resume_session_id: + cmd.extend(["--resume", resume_session_id]) + if session_id: + cmd.extend(["--session-id", session_id]) + cmd.extend( + [ + "--output-format", + "stream-json", + "--include-partial-messages", + "--verbose", + "--permission-mode", + "bypassPermissions", + "--dangerously-skip-permissions", + "--model", + "sonnet", + ] + ) + if no_session_persistence: + cmd.append("--no-session-persistence") + if tools is not None: + cmd.extend(["--tools", tools]) + if tools: + cmd.extend(["--allowedTools", tools]) + cmd.extend(extra_args) + cmd.extend(["-p", prompt]) + + env = os.environ.copy() + env["ANTHROPIC_BASE_URL"] = server.base_url + env["ANTHROPIC_API_URL"] = f"{server.base_url}/v1" + 
env.setdefault("ANTHROPIC_API_KEY", "sk-smoke-proxy") + if config.settings.anthropic_auth_token: + env["ANTHROPIC_AUTH_TOKEN"] = config.settings.anthropic_auth_token + env["TERM"] = "dumb" + env["NO_COLOR"] = "1" + env["PYTHONIOENCODING"] = "utf-8" + + started = time.monotonic() + try: + result = subprocess.run( + cmd, + cwd=cwd, + env=env, + capture_output=True, + text=True, + timeout=config.timeout_s, + check=False, + ) + except subprocess.TimeoutExpired as exc: + return ClaudeCliRun( + command=tuple(cmd), + returncode=None, + stdout=_coerce_timeout_text(exc.stdout), + stderr=_coerce_timeout_text(exc.stderr), + duration_s=time.monotonic() - started, + timed_out=True, + ) + + return ClaudeCliRun( + command=tuple(cmd), + returncode=result.returncode, + stdout=result.stdout, + stderr=result.stderr, + duration_s=time.monotonic() - started, + ) + + +def read_log_offset(log_path: Path) -> int: + """Return the current text length of a smoke server log.""" + if not log_path.is_file(): + return 0 + return len(log_path.read_text(encoding="utf-8", errors="replace")) + + +def read_log_delta(log_path: Path, offset: int) -> str: + """Return smoke server log text written after ``offset``.""" + if not log_path.is_file(): + return "" + text = log_path.read_text(encoding="utf-8", errors="replace") + return text[offset:] + + +def token_evidence( + *, + feature: str, + marker: str, + run: ClaudeCliRun, + log_delta: str, +) -> dict[str, Any]: + """Collect compact evidence for a CLI feature probe.""" + combined = f"{run.combined_output}\n{log_delta}" + lower = combined.lower() + return { + "feature": feature, + "marker_present": bool(marker and marker in combined), + "thinking_delta_count": combined.count("thinking_delta"), + "tool_use_count": combined.count('"tool_use"'), + "tool_result_count": combined.count('"tool_result"'), + "task_tool_count": combined.count('"name": "Task"') + + combined.count('"name":"Task"'), + "run_in_background_false": "run_in_background" in combined and 
"false" in lower, + "compact_boundary": "compact_boundary" in combined, + "compact_metadata": "compact_metadata" in combined, + "http_422": 'HTTP/1.1" 422' in combined, + "http_500": bool(re.search(r'HTTP/1\.1" 5\d\d', combined)), + "timed_out": run.timed_out, + } + + +def classify_probe( + *, + run: ClaudeCliRun, + log_delta: str, + marker: str, + requires_tool_result: bool = False, + requires_task: bool = False, + requires_compact: bool = False, +) -> tuple[str, str]: + """Classify a probe without failing compatibility characterization failures.""" + combined = f"{run.combined_output}\n{log_delta}" + lower = combined.lower() + + if _has_proxy_regression(log_delta): + return "failed", "product_failure" + if run.returncode != 0 and any( + marker_text in lower for marker_text in _MISSING_ENV_MARKERS + ): + return "skipped", "missing_env" + if run.timed_out: + return "failed", "probe_timeout" + + marker_ok = not marker or marker in combined + tool_ok = not requires_tool_result or '"tool_result"' in combined + task_ok = not requires_task or ( + ('"name": "Task"' in combined or '"name":"Task"' in combined) + and "run_in_background" in combined + and "false" in lower + ) + compact_ok = not requires_compact or ( + "compact_boundary" in combined + or "compact_metadata" in combined + or "/compact" in combined + or "compact" in lower + ) + cli_ok = run.returncode == 0 + + if cli_ok and marker_ok and tool_ok and task_ok and compact_ok: + return "passed", "passed" + if any(marker_text in lower for marker_text in _UPSTREAM_UNAVAILABLE_MARKERS): + return "failed", "upstream_unavailable" + if not _has_proxy_request(log_delta): + return "failed", "harness_bug" + return "failed", "model_feature_failure" + + +def make_outcome( + *, + model: str, + full_model: str, + source: str, + feature: str, + marker: str, + run: ClaudeCliRun, + log_delta: str, + log_path: Path, + requires_tool_result: bool = False, + requires_task: bool = False, + requires_compact: bool = False, +) -> 
NimCliMatrixOutcome: + """Build one report outcome from a CLI run and its server log delta.""" + outcome, classification = classify_probe( + run=run, + log_delta=log_delta, + marker=marker, + requires_tool_result=requires_tool_result, + requires_task=requires_task, + requires_compact=requires_compact, + ) + evidence = token_evidence( + feature=feature, + marker=marker, + run=run, + log_delta=log_delta, + ) + return NimCliMatrixOutcome( + model=model, + full_model=full_model, + source=source, + feature=feature, + outcome=outcome, + classification=classification, + duration_s=round(run.duration_s, 3), + cli_returncode=run.returncode, + token_evidence=evidence, + request_count=_request_count(log_delta), + log_path=str(log_path), + stdout_excerpt=_excerpt(run.stdout), + stderr_excerpt=_excerpt(run.stderr), + log_excerpt=_excerpt(log_delta), + ) + + +def write_matrix_report( + config: SmokeConfig, + outcomes: list[NimCliMatrixOutcome], +) -> Path: + """Write the NVIDIA NIM CLI compatibility matrix report.""" + config.results_dir.mkdir(parents=True, exist_ok=True) + path = ( + config.results_dir + / f"nvidia-nim-cli-matrix-{config.worker_id}-{int(time.time())}.json" + ) + payload = { + "started_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), + "worker_id": config.worker_id, + "target": "nvidia_nim_cli", + "models": sorted({outcome.full_model for outcome in outcomes}), + "outcomes": [asdict(outcome) for outcome in outcomes], + } + path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") + return path + + +def regression_failures(outcomes: list[NimCliMatrixOutcome]) -> list[str]: + """Return report lines for classifications that should fail pytest.""" + return [ + f"{outcome.full_model} {outcome.feature}: {outcome.classification}" + for outcome in outcomes + if outcome.classification in REGRESSION_CLASSIFICATIONS + ] + + +def _has_proxy_regression(log_delta: str) -> bool: + if "CREATE_MESSAGE_ERROR" in log_delta: + return True + return 
def _excerpt(value: str, *, max_chars: int = 2400) -> str:
    """Return the redacted tail of ``value``, at most ``max_chars`` long.

    Redaction runs on the full text *before* truncation. Slicing first could
    cut a secret in two at the excerpt boundary; the surviving fragment would
    then no longer match whatever ``redacted`` looks for and would be written
    to the report verbatim.

    NOTE(review): ``redacted`` is defined outside this block; this assumes it
    masks secrets wherever they occur in the text — confirm.

    Args:
        value: Raw text (CLI output or server log delta).
        max_chars: Maximum length of the returned excerpt. A non-positive
            budget yields an empty string.

    Returns:
        The whole redacted text when it fits, otherwise its last
        ``max_chars`` characters.
    """
    if max_chars <= 0:
        # ``text[-0:]`` is the entire string, so handle this case explicitly.
        return ""
    cleaned = redacted(value)
    if len(cleaned) <= max_chars:
        return cleaned
    return cleaned[-max_chars:]
def test_nvidia_nim_cli_matrix_e2e(smoke_config: SmokeConfig, tmp_path: Path) -> None:
    """Run every CLI probe against every configured NIM model and report.

    The test is skipped when the provider is not configured, the Claude CLI
    binary cannot be found, or no models are configured. One smoke server is
    started per model; all probes for that model share it. A matrix report is
    written at the end and the test fails only for outcomes that
    ``regression_failures`` returns.
    """
    if not smoke_config.has_provider_configuration("nvidia_nim"):
        pytest.skip("missing_env: NVIDIA_NIM_API_KEY is not configured")

    claude_bin = shutil.which(smoke_config.claude_bin)
    if not claude_bin:
        pytest.skip(f"missing_env: Claude CLI not found: {smoke_config.claude_bin}")

    provider_models = smoke_config.nvidia_nim_cli_models()
    if not provider_models:
        pytest.skip("missing_env: no NVIDIA NIM CLI smoke models configured")

    # Probes run in this order for each model.
    probes = (
        _basic_text,
        _thinking,
        _tool_use_roundtrip,
        _interleaved_thinking_tool,
        _subagent_task,
        _compact_command,
    )
    shared_env = {
        "MESSAGING_PLATFORM": "none",
        "ENABLE_MODEL_THINKING": "true",
        "LOG_RAW_API_PAYLOADS": "true",
        "LOG_RAW_SSE_EVENTS": "true",
    }

    outcomes: list[NimCliMatrixOutcome] = []
    for provider_model in provider_models:
        model_slug = _slug(provider_model.model_name)
        driver = SmokeServerDriver(
            smoke_config,
            name=f"product-nvidia-nim-cli-{model_slug}",
            env_overrides={"MODEL": provider_model.full_model, **shared_env},
        )
        with driver.run() as server:
            model_dir = tmp_path / model_slug
            for probe in probes:
                outcomes.append(
                    probe(claude_bin, server, smoke_config, provider_model, model_dir)
                )

    report_path = write_matrix_report(smoke_config, outcomes)
    failures = regression_failures(outcomes)
    header = f"NVIDIA NIM CLI matrix regressions written to {report_path}:\n"
    assert not failures, header + "\n".join(failures)
def _thinking(
    claude_bin: str,
    server: RunningServer,
    smoke_config: SmokeConfig,
    provider_model: ProviderModel,
    model_dir: Path,
) -> NimCliMatrixOutcome:
    """Probe a plain reply requested with ``--effort high`` and no tools.

    The model is asked to think privately and then echo a freshly generated
    marker; the outcome is produced by ``_run_probe``.
    """
    token = _marker("THINK")
    instruction = (
        "Think privately about the request, then reply with exactly "
        f"{token} and no other text."
    )
    high_effort = ("--effort", "high")
    return _run_probe(
        claude_bin=claude_bin,
        server=server,
        smoke_config=smoke_config,
        provider_model=provider_model,
        workspace=model_dir / "thinking",
        feature="thinking",
        marker=token,
        prompt=instruction,
        tools="",
        extra_args=high_effort,
    )
def _subagent_task(
    claude_bin: str,
    server: RunningServer,
    smoke_config: SmokeConfig,
    provider_model: ProviderModel,
    model_dir: Path,
) -> NimCliMatrixOutcome:
    """Probe delegation to a custom subagent through the Task tool.

    A marker is written to ``smoke-subagent.txt`` in the probe workspace and a
    ``smoke_reader`` agent is registered via ``--agents``. The model is asked
    to have that subagent read the file and to echo the token it returns.

    The probe passes ``requires_task=True`` so that classification also asks
    for Task-call evidence. Both ``Task`` and ``Read`` are allowed tools, so
    without that flag a model that reads the file directly, never delegating,
    would still satisfy the marker and ``tool_result`` checks and be reported
    as passing the subagent feature.
    """
    marker = _marker("TASK")
    workspace = model_dir / "subagent_task"
    workspace.mkdir(parents=True, exist_ok=True)
    (workspace / "smoke-subagent.txt").write_text(marker, encoding="utf-8")

    # Agent definition handed to the CLI as a JSON string.
    agents = json.dumps(
        {
            "smoke_reader": {
                "description": "Reads one requested file and returns its token.",
                "prompt": (
                    "Read the requested file with Read and return only the token "
                    "inside it."
                ),
            }
        }
    )
    return _run_probe(
        claude_bin=claude_bin,
        server=server,
        smoke_config=smoke_config,
        provider_model=provider_model,
        workspace=workspace,
        feature="subagent_task",
        marker=marker,
        prompt=(
            "Use the smoke_reader subagent with Task to read smoke-subagent.txt. "
            "Reply with exactly the token the subagent returns and no other text."
        ),
        tools="Task,Read",
        extra_args=("--agents", agents),
        requires_tool_result=True,
        requires_task=True,
    )
= (), + requires_tool_result: bool = False, + requires_task: bool = False, +) -> NimCliMatrixOutcome: + offset = read_log_offset(server.log_path) + run = run_claude_cli( + claude_bin=claude_bin, + server=server, + config=smoke_config, + cwd=workspace, + prompt=prompt, + tools=tools, + extra_args=extra_args, + ) + log_delta = read_log_delta(server.log_path, offset) + return make_outcome( + model=provider_model.model_name, + full_model=provider_model.full_model, + source=provider_model.source, + feature=feature, + marker=marker, + run=run, + log_delta=log_delta, + log_path=server.log_path, + requires_tool_result=requires_tool_result, + requires_task=requires_task, + ) + + +def _marker(prefix: str) -> str: + return f"FCC_NIM_{prefix}_{uuid.uuid4().hex[:8].upper()}" + + +def _slug(value: str) -> str: + return "".join(char if char.isalnum() else "-" for char in value).strip("-") diff --git a/tests/contracts/test_nvidia_nim_cli_matrix.py b/tests/contracts/test_nvidia_nim_cli_matrix.py new file mode 100644 index 0000000..70a622d --- /dev/null +++ b/tests/contracts/test_nvidia_nim_cli_matrix.py @@ -0,0 +1,196 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from config.settings import Settings +from smoke.lib.config import DEFAULT_TARGETS, SmokeConfig +from smoke.lib.nvidia_nim_cli import ( + ClaudeCliRun, + make_outcome, + regression_failures, + write_matrix_report, +) + + +def _smoke_config(tmp_path: Path) -> SmokeConfig: + return SmokeConfig( + root=tmp_path, + results_dir=tmp_path / ".smoke-results", + live=False, + interactive=False, + targets=DEFAULT_TARGETS, + provider_matrix=frozenset(), + timeout_s=45.0, + prompt="Reply with exactly: FCC_SMOKE_PONG", + claude_bin="claude", + worker_id="test-worker", + settings=Settings.model_construct(anthropic_auth_token=""), + ) + + +def test_nvidia_nim_cli_matrix_report_shape_and_redaction( + tmp_path: Path, monkeypatch +) -> None: + monkeypatch.setenv("NVIDIA_NIM_API_KEY", "secret-nim-key") + 
def test_nvidia_nim_cli_matrix_regression_detection(tmp_path: Path) -> None:
    """An HTTP 500 in the server log is classified as a product failure.

    The CLI itself exits with 0 and prints nothing; only the log delta carries
    the failure, and the outcome must be listed by ``regression_failures``.
    """
    full_model = "nvidia_nim/z-ai/glm-5.1"
    feature = "basic_text"
    silent_success = ClaudeCliRun(
        command=("claude", "-p", "x"),
        returncode=0,
        stdout="",
        stderr="",
        duration_s=0.1,
    )
    server_log = 'POST /v1/messages HTTP/1.1" 500 Internal Server Error'

    outcome = make_outcome(
        model="z-ai/glm-5.1",
        full_model=full_model,
        source="nvidia_nim_cli_default",
        feature=feature,
        marker="FCC_NIM_BASIC",
        run=silent_success,
        log_delta=server_log,
        log_path=tmp_path / "server.log",
    )

    assert outcome.classification == "product_failure"
    expected_lines = ["nvidia_nim/z-ai/glm-5.1 basic_text: product_failure"]
    assert regression_failures([outcome]) == expected_lines
def test_nvidia_nim_cli_timeout_is_not_model_missing(
    tmp_path: Path,
) -> None:
    """A timed-out run is classified as ``probe_timeout``.

    The run has no return code and its partial output already contains the
    marker; the timeout must still be recorded in the token evidence and must
    not count as a regression.
    """
    partial_output = (
        '{"type":"assistant","content":[{"type":"text","text":"FCC_NIM_TOOL"}]}'
    )
    timed_out_run = ClaudeCliRun(
        command=("claude", "-p", "x"),
        returncode=None,
        stdout=partial_output,
        stderr="",
        duration_s=45.0,
        timed_out=True,
    )
    request_log = "API_REQUEST: request_id=req_1 model=z-ai/glm-5.1 messages=2"

    outcome = make_outcome(
        model="z-ai/glm-5.1",
        full_model="nvidia_nim/z-ai/glm-5.1",
        source="nvidia_nim_cli_default",
        feature="tool_use_roundtrip",
        marker="FCC_NIM_TOOL",
        run=timed_out_run,
        log_delta=request_log,
        log_path=tmp_path / "server.log",
    )

    assert outcome.classification == "probe_timeout"
    assert outcome.token_evidence["timed_out"] is True
    assert regression_failures([outcome]) == []
def test_nvidia_nim_cli_is_opt_in_smoke_target() -> None:
    """``nvidia_nim_cli`` is registered as a target but is not a default one."""
    target = "nvidia_nim_cli"

    assert target not in DEFAULT_TARGETS
    # It must still appear in every other target registry.
    for registry in (OPT_IN_TARGETS, ALL_TARGETS, TARGET_REQUIRED_ENV):
        assert target in registry
def test_nvidia_nim_cli_models_reject_empty_override() -> None:
    """An override made only of separators and blanks raises ``ValueError``.

    The error message must name the offending environment variable.
    """
    variable = "FCC_SMOKE_NIM_MODELS"
    raised = None
    try:
        nvidia_nim_cli_model_refs({variable: " , "})
    except ValueError as exc:
        raised = exc

    if raised is None:
        raise AssertionError("expected empty NVIDIA NIM CLI model override to fail")
    assert "FCC_SMOKE_NIM_MODELS" in str(raised)
smoke.lib.report import classify_outcome from smoke.lib.report_summary import format_summary, summarize_reports @@ -32,3 +33,13 @@ def test_smoke_report_summary_counts_regression_classes(tmp_path: Path) -> None: assert summary.classifications["product_failure"] == 1 assert summary.has_regression assert "status=regression" in format_summary(summary) + + +def test_target_disabled_skip_is_not_missing_env() -> None: + classification = classify_outcome( + nodeid="smoke/product/test_api_product_live.py::test_api_basic_conversation_e2e", + outcome="skipped", + detail="Skipped: smoke target disabled: api", + ) + + assert classification == "target_disabled" diff --git a/tests/providers/test_nvidia_nim.py b/tests/providers/test_nvidia_nim.py index 4314207..643a2d7 100644 --- a/tests/providers/test_nvidia_nim.py +++ b/tests/providers/test_nvidia_nim.py @@ -8,6 +8,7 @@ from httpx import Request, Response from config.nim import NimSettings from providers.defaults import NVIDIA_NIM_DEFAULT_BASE from providers.nvidia_nim import NvidiaNimProvider +from providers.nvidia_nim.request import NIM_TOOL_ARGUMENT_ALIASES_KEY # Mock data classes @@ -47,6 +48,46 @@ class MockRequest: setattr(self, k, v) +def _input_json_deltas(events): + deltas = [] + for event in events: + if "event: content_block_delta" not in event: + continue + for line in event.splitlines(): + if not line.startswith("data: "): + continue + payload = json.loads(line[6:]) + delta = payload.get("delta", {}) + if delta.get("type") == "input_json_delta": + deltas.append(delta.get("partial_json", "")) + return deltas + + +def _tool_call_chunk( + *, + name, + arguments, + tool_id="call_1", + index=0, + finish_reason=None, +): + mock_tc = MagicMock() + mock_tc.index = index + mock_tc.id = tool_id + mock_tc.function.name = name + mock_tc.function.arguments = arguments + + mock_chunk = MagicMock() + mock_chunk.choices = [ + MagicMock( + delta=MagicMock(content=None, reasoning_content="", tool_calls=[mock_tc]), + 
@pytest.mark.asyncio
async def test_stream_response_restores_aliased_tool_arguments(nim_provider):
    """Aliased tool arguments come back under their original names.

    The upstream request must carry the aliased ``type`` property and no
    private alias metadata; the emitted ``input_json_delta`` must use the
    original ``type`` key again.
    """
    grep_schema = {
        "type": "object",
        "properties": {
            "pattern": {"type": "string"},
            "-A": {"type": "number"},
            "type": {"type": "string"},
        },
        "required": ["pattern"],
    }
    req = MockRequest(tools=[MockTool("Grep", "Search file contents", grep_schema)])
    upstream_arguments = json.dumps(
        {"pattern": "needle", "-A": 2, "_fcc_arg_type": "py"}
    )
    tool_chunk = _tool_call_chunk(name="Grep", arguments=upstream_arguments)

    async def single_chunk_stream():
        yield tool_chunk

    completions = nim_provider._client.chat.completions
    with patch.object(completions, "create", new_callable=AsyncMock) as mock_create:
        mock_create.return_value = single_chunk_stream()

        events = []
        async for event in nim_provider.stream_response(req):
            events.append(event)

    # What was sent upstream.
    await_args = mock_create.await_args
    assert await_args is not None
    create_kwargs = await_args.kwargs
    assert NIM_TOOL_ARGUMENT_ALIASES_KEY not in create_kwargs
    properties = create_kwargs["tools"][0]["function"]["parameters"]["properties"]
    assert "-A" in properties
    assert "type" not in properties
    assert "_fcc_arg_A" not in properties
    assert "_fcc_arg_type" in properties

    # What was emitted to the client.
    deltas = _input_json_deltas(events)
    assert len(deltas) == 1
    only_delta = deltas[0]
    assert json.loads(only_delta) == {"pattern": "needle", "-A": 2, "type": "py"}
    assert "_fcc_arg_type" not in only_delta
@pytest.mark.asyncio
async def test_stream_response_task_tool_still_forces_background_false(nim_provider):
    """A Task call streamed with ``run_in_background`` true is emitted as false."""
    task_schema = {
        "type": "object",
        "properties": {
            "description": {"type": "string"},
            "prompt": {"type": "string"},
            "run_in_background": {"type": "boolean"},
        },
        "required": ["description", "prompt"],
    }
    req = MockRequest(tools=[MockTool("Task", "Run a subagent", task_schema)])
    upstream_arguments = json.dumps(
        {
            "description": "Inspect",
            "prompt": "Read the marker",
            "run_in_background": True,
        }
    )
    task_chunk = _tool_call_chunk(
        name="Task",
        arguments=upstream_arguments,
        tool_id="call_task",
    )

    async def single_chunk_stream():
        yield task_chunk

    completions = nim_provider._client.chat.completions
    with patch.object(completions, "create", new_callable=AsyncMock) as mock_create:
        mock_create.return_value = single_chunk_stream()

        events = []
        async for event in nim_provider.stream_response(req):
            events.append(event)

    deltas = _input_json_deltas(events)
    assert len(deltas) == 1
    emitted_arguments = json.loads(deltas[0])
    assert emitted_arguments["run_in_background"] is False
"string", "description": "Glob to filter files"}, + "output_mode": { + "type": "string", + "enum": ["content", "files_with_matches", "count"], + }, + "-A": {"type": "number", "description": "Lines after match"}, + "-B": {"type": "number", "description": "Lines before match"}, + "-C": {"type": "number", "description": "Lines around match"}, + "-i": {"type": "boolean", "description": "Case insensitive"}, + "-n": {"type": "boolean", "description": "Show line numbers"}, + "type": {"type": "string", "description": "File type to search"}, + }, + "additionalProperties": False, + "required": ["pattern"], +} + @pytest.fixture def req(): @@ -121,6 +147,129 @@ class TestBuildRequestBody: assert tool_schema["additionalProperties"] is False assert tool_schema["properties"]["nested"]["additionalProperties"] is False + def test_grep_schema_type_parameter_is_aliased_without_mutating_request(self, req): + tool_schema = deepcopy(GREP_SCHEMA_FROM_SERVER_LOG) + tool_schema["properties"]["_fcc_arg_type"] = { + "type": "string", + "description": "Existing safe property that collides with the alias", + } + tool_schema["required"] = ["pattern", "-A", "_fcc_arg_type"] + original_schema = deepcopy(tool_schema) + req.tools = [ + SimpleNamespace( + name="Grep", + description="Search file contents", + input_schema=tool_schema, + ) + ] + + body = build_request_body(req, NimSettings(), thinking_enabled=False) + + parameters = body["tools"][0]["function"]["parameters"] + properties = parameters["properties"] + aliases = body[NIM_TOOL_ARGUMENT_ALIASES_KEY]["Grep"] + assert "additionalProperties" not in parameters + assert properties["-A"] == original_schema["properties"]["-A"] + assert properties["-B"] == original_schema["properties"]["-B"] + assert properties["-C"] == original_schema["properties"]["-C"] + assert properties["-i"] == original_schema["properties"]["-i"] + assert properties["-n"] == original_schema["properties"]["-n"] + assert "type" not in properties + assert properties["pattern"] 
def test_safe_tool_schema_does_not_add_alias_metadata(self, req):
    """A schema without unsafe property names passes through untouched.

    No alias metadata key may appear in the request body, and the tool's
    properties and required list must equal the input schema's.
    """
    safe_properties = {
        "pattern": {"type": "string"},
        "path": {"type": "string"},
        "output_mode": {"type": "string", "enum": ["content", "count"]},
    }
    tool_schema = {
        "type": "object",
        "properties": safe_properties,
        "required": ["pattern"],
    }
    glob_tool = SimpleNamespace(
        name="Glob",
        description="Find files",
        input_schema=tool_schema,
    )
    req.tools = [glob_tool]

    body = build_request_body(req, NimSettings(), thinking_enabled=False)

    assert NIM_TOOL_ARGUMENT_ALIASES_KEY not in body
    sent_parameters = body["tools"][0]["function"]["parameters"]
    assert sent_parameters["properties"] == tool_schema["properties"]
    assert sent_parameters["required"] == ["pattern"]
def test_private_alias_metadata_is_stripped_without_mutating_body(self):
    """Stripping alias metadata yields a clean copy and leaves the input intact."""

    def grep_aliases():
        # Fresh literal on every call so the comparisons below never share
        # an object with the body under test.
        return {"Grep": {"_fcc_arg_A": "-A"}}

    body = {"model": "test", NIM_TOOL_ARGUMENT_ALIASES_KEY: grep_aliases()}

    upstream_body = body_without_nim_tool_argument_aliases(body)

    assert NIM_TOOL_ARGUMENT_ALIASES_KEY not in upstream_body
    assert body[NIM_TOOL_ARGUMENT_ALIASES_KEY] == grep_aliases()
    assert nim_tool_argument_aliases_from_body(body) == grep_aliases()