Fix live provider smoke defaults

Update live smoke model defaults for NIM, OpenRouter, and Gemini; normalize tool-call indexes; downgrade DeepSeek forced tool_choice; and add coverage for the provider smoke fixes.
This commit is contained in:
Alishahryar1
2026-05-31 13:02:15 -07:00
parent e4d6dc1f94
commit d501e5223a
21 changed files with 187 additions and 40 deletions
+4 -5
View File
@@ -127,7 +127,7 @@ Browse models at [build.nvidia.com](https://build.nvidia.com/explore/discover).
Get a key at [openrouter.ai/keys](https://openrouter.ai/keys).
In the Admin UI, paste it into `OPENROUTER_API_KEY`, then set `MODEL` to an OpenRouter slug such as `open_router/stepfun/step-3.5-flash:free`.
In the Admin UI, paste it into `OPENROUTER_API_KEY`, then set `MODEL` to an OpenRouter slug such as `open_router/openrouter/free`.
Browse [all models](https://openrouter.ai/models) or [free models](https://openrouter.ai/collections/free-models).
@@ -135,14 +135,13 @@ Browse [all models](https://openrouter.ai/models) or [free models](https://openr
Get a Gemini API key at [Google AI Studio](https://aistudio.google.com/apikey) (see Google's [Gemini OpenAI compatibility](https://ai.google.dev/gemini-api/docs/openai) docs).
In the Admin UI, paste it into `GEMINI_API_KEY`, then set `MODEL` to a Gemini model slug such as `gemini/gemini-2.5-flash` or `gemini/gemini-3.1-flash-lite`.
In the Admin UI, paste it into `GEMINI_API_KEY`, then set `MODEL` to a Gemini model slug such as `gemini/models/gemini-3.1-flash-lite`.
The Gemini API exposes an OpenAI-compatible endpoint at `https://generativelanguage.googleapis.com/v1beta/openai/`. Free tier quotas are per-model; prompts may be used to improve Google's products outside the UK/CH/EEA/EU unless your account region says otherwise—see Google's terms.
Popular examples:
- `gemini/gemini-2.5-flash`
- `gemini/gemini-3.1-flash-lite`
- `gemini/models/gemini-3.1-flash-lite`
### 4. [DeepSeek](https://platform.deepseek.com/)
@@ -305,7 +304,7 @@ In the Admin UI, keep or update `OLLAMA_BASE_URL`, then set `MODEL` to the same
Each model tier can use a different provider by setting `MODEL_OPUS`, `MODEL_SONNET`, and `MODEL_HAIKU` in the Admin UI. Leave a tier blank to inherit `MODEL`.
For example, you can route Opus to `nvidia_nim/moonshotai/kimi-k2.5`, Sonnet to `open_router/deepseek/deepseek-r1-0528:free`, Haiku to `lmstudio/unsloth/GLM-4.7-Flash-GGUF`, and keep the fallback `MODEL` on `zai/glm-5.1`.
For example, you can route Opus to `nvidia_nim/moonshotai/kimi-k2.6`, Sonnet to `open_router/openrouter/free`, Haiku to `lmstudio/qwen3.5-coder`, and keep the fallback `MODEL` on `zai/glm-5.1`.
## Connect Claude Code
+1 -1
View File
@@ -412,7 +412,7 @@ FIELDS: tuple[ConfigFieldSpec, ...] = (
"Default Model",
"models",
settings_attr="model",
default="nvidia_nim/z-ai/glm4.7",
default="nvidia_nim/nvidia/nemotron-3-super-120b-a12b",
description="Fallback provider/model route for all Claude model names.",
),
ConfigFieldSpec(
+4 -3
View File
@@ -1,7 +1,9 @@
"""FastAPI application factory and configuration."""
import os
import traceback
from contextlib import asynccontextmanager
from pathlib import Path
from typing import Any
from fastapi import FastAPI, Request
@@ -85,9 +87,8 @@ class GracefulLifespanApp:
def create_app(*, lifespan_enabled: bool = True) -> FastAPI:
"""Create and configure the FastAPI application."""
settings = get_settings()
configure_logging(
server_log_path(), verbose_third_party=settings.log_raw_api_payloads
)
log_path = Path(os.getenv("LOG_FILE", server_log_path()))
configure_logging(log_path, verbose_third_party=settings.log_raw_api_payloads)
app_kwargs: dict[str, Any] = {
"title": "Claude Code Proxy",
+1 -1
View File
@@ -180,7 +180,7 @@ class Settings(BaseSettings):
# ==================== Model ====================
# All Claude model requests are mapped to this single model (fallback)
# Format: provider_type/model/name
model: str = "nvidia_nim/z-ai/glm4.7"
model: str = "nvidia_nim/nvidia/nemotron-3-super-120b-a12b"
# Per-model overrides (optional, falls back to MODEL)
# Each can use a different provider
+17
View File
@@ -395,6 +395,7 @@ def build_request_body(request_data: Any, *, thinking_enabled: bool) -> dict:
data["messages"] = _strip_unsupported_attachment_blocks(data["messages"])
_validate_deepseek_native_request_dict(data)
data.pop("extra_body", None)
_downgrade_forced_tool_choice(data)
has_tool_history = _has_tool_history(data)
has_replayable_tool_thinking = _has_replayable_tool_thinking(data)
@@ -456,3 +457,19 @@ def build_request_body(request_data: Any, *, thinking_enabled: bool) -> dict:
len(data.get("tools", [])),
)
return data
def _downgrade_forced_tool_choice(data: dict[str, Any]) -> None:
    """Rewrite a forced, named ``tool_choice`` in ``data`` to ``auto``.

    Only the ``{"type": "tool", "name": <str>}`` shape is touched; a missing
    or non-dict ``tool_choice``, any other ``type``, or a non-string ``name``
    leaves ``data`` unchanged. The request dict is mutated in place and the
    replaced tool name is logged at debug level.
    """
    choice = data.get("tool_choice")
    forced_name = choice.get("name") if isinstance(choice, dict) else None
    is_forced_named_tool = (
        isinstance(choice, dict)
        and choice.get("type") == "tool"
        and isinstance(forced_name, str)
    )
    if not is_forced_named_tool:
        return

    logger.debug(
        "DEEPSEEK_REQUEST: downgrading forced tool_choice to auto for unsupported "
        "native request shape tool={}",
        forced_name,
    )
    data["tool_choice"] = {"type": "auto"}
+2 -1
View File
@@ -234,7 +234,8 @@ class OpenAIChatTransport(BaseProvider):
tool_argument_alias_buffers: dict[int, str] | None = None,
) -> Iterator[str]:
"""Process a single tool call delta and yield SSE events."""
tc_index = tc.get("index", 0)
raw_index = tc.get("index", 0)
tc_index = raw_index if isinstance(raw_index, int) else 0
if tc_index < 0:
tc_index = len(sse.blocks.tool_states)
+13 -2
View File
@@ -31,6 +31,8 @@ _UPSTREAM_UNAVAILABLE_MARKERS = (
"overloaded",
"capacity",
"upstream provider",
"provider api request failed",
"httpstatuserror",
)
_HTTP_429_PATTERNS = (
r'HTTP/1\.[01]" 429\b',
@@ -694,7 +696,15 @@ def _has_proxy_regression(log_delta: str) -> bool:
def _has_proxy_request(log_delta: str) -> bool:
return "POST /v1/messages" in log_delta or "API_REQUEST:" in log_delta
return (
"POST /v1/messages" in log_delta
or "API_REQUEST:" in log_delta
or '"event": "api.request.received"' in log_delta
or (
'"http_method": "POST"' in log_delta
and '"http_path": "/v1/messages"' in log_delta
)
)
def _tool_catalog_has(log_delta: str, tool_name: str) -> bool:
@@ -755,7 +765,8 @@ def _has_upstream_unavailable_text(text: str) -> bool:
def _request_count(log_delta: str) -> int:
access_log_count = log_delta.count("POST /v1/messages")
service_log_count = log_delta.count("API_REQUEST:")
return max(access_log_count, service_log_count)
structured_log_count = log_delta.count('"event": "api.request.received"')
return max(access_log_count, service_log_count, structured_log_count)
def _marker(scope: str, prefix: str) -> str:
+3 -3
View File
@@ -42,8 +42,8 @@ TARGET_ALIASES = {
SECRET_KEY_PARTS = ("KEY", "TOKEN", "SECRET", "WEBHOOK", "AUTH")
PROVIDER_SMOKE_DEFAULT_MODELS: dict[str, str] = {
"nvidia_nim": "nvidia_nim/z-ai/glm4.7",
"open_router": "open_router/stepfun/step-3.5-flash:free",
"nvidia_nim": "nvidia_nim/nvidia/nemotron-3-super-120b-a12b",
"open_router": "open_router/openrouter/free",
"mistral": "mistral/devstral-small-latest",
"mistral_codestral": "mistral_codestral/codestral-latest",
"deepseek": "deepseek/deepseek-v4-pro",
@@ -54,7 +54,7 @@ PROVIDER_SMOKE_DEFAULT_MODELS: dict[str, str] = {
"opencode": "opencode/gpt-5.3-codex",
"opencode_go": "opencode_go/minimax-m2.7",
"zai": "zai/glm-5.1",
"gemini": "gemini/gemini-2.5-flash",
"gemini": "gemini/models/gemini-3.1-flash-lite",
"groq": "groq/llama-3.3-70b-versatile",
"cerebras": "cerebras/llama3.1-8b",
}
+1 -1
View File
@@ -31,7 +31,7 @@ def test_fcc_init_scaffolds_user_config(
check=False,
)
assert result.returncode == 0, result.stderr or result.stdout
assert (tmp_path / ".config" / "free-claude-code" / ".env").is_file()
assert (tmp_path / ".fcc" / ".env").is_file()
def test_free_claude_code_entrypoint_starts_server(smoke_config: SmokeConfig) -> None:
+9 -2
View File
@@ -9,7 +9,10 @@ from core.anthropic.stream_contracts import (
from smoke.lib.config import SmokeConfig
from smoke.lib.http import collect_message_stream, message_payload
from smoke.lib.server import start_server
from smoke.lib.skips import skip_if_upstream_unavailable_events
from smoke.lib.skips import (
skip_if_upstream_unavailable_events,
skip_if_upstream_unavailable_exception,
)
pytestmark = [pytest.mark.live, pytest.mark.smoke_target("tools")]
@@ -50,7 +53,11 @@ def test_live_tool_use_when_configured_model_supports_tools(
},
name="tools",
) as server:
events = collect_message_stream(server, payload, smoke_config)
try:
events = collect_message_stream(server, payload, smoke_config)
except Exception as exc:
skip_if_upstream_unavailable_exception(exc)
raise
skip_if_upstream_unavailable_events(events)
assert_anthropic_stream_contract(events)
assert has_tool_use(events), "model did not emit a tool_use block"
@@ -30,7 +30,7 @@ def test_entrypoint_init_e2e(smoke_config: SmokeConfig, tmp_path: Path) -> None:
check=False,
)
assert result.returncode == 0, result.stderr or result.stdout
env_file = tmp_path / ".config" / "free-claude-code" / ".env"
env_file = tmp_path / ".fcc" / ".env"
assert env_file.is_file()
assert env_file.read_text(encoding="utf-8").strip()
@@ -91,8 +91,9 @@ async def test_cli_session_stop_kills_child_e2e(tmp_path: Path) -> None:
process.wait = AsyncMock(side_effect=[asyncio.TimeoutError, 0])
session.process = process
stopped = await session.stop()
with patch("cli.session.kill_pid_tree_best_effort") as kill_tree:
stopped = await session.stop()
assert stopped is True
process.terminate.assert_called_once()
kill_tree.assert_called_once_with(process.pid)
process.kill.assert_called_once()
@@ -138,19 +138,28 @@ def test_proxy_timeout_config_e2e(smoke_config: SmokeConfig, tmp_path) -> None:
@pytest.mark.smoke_target("extensibility")
def test_provider_registry_e2e() -> None:
    """Every registered provider descriptor yields a config with URL and key.

    Settings input is derived from the descriptors themselves: each provider
    that declares a credential attribute gets a ``<provider_id>-key``
    placeholder, and each one that declares a base-URL attribute with a
    non-empty default gets that default.
    """
    settings_kwargs: dict[str, str] = {}
    for descriptor in PROVIDER_DESCRIPTORS.values():
        if descriptor.credential_attr is not None:
            settings_kwargs[_settings_init_key(descriptor.credential_attr)] = (
                f"{descriptor.provider_id}-key"
            )
        if descriptor.base_url_attr is not None and descriptor.default_base_url:
            settings_kwargs[_settings_init_key(descriptor.base_url_attr)] = (
                descriptor.default_base_url
            )
    settings = Settings.model_validate(settings_kwargs)

    for descriptor in PROVIDER_DESCRIPTORS.values():
        config = build_provider_config(descriptor, settings)
        assert config.base_url
        assert config.api_key
def _settings_init_key(field_name: str) -> str:
    """Return the input key to use for ``field_name`` when building Settings.

    The field's ``validation_alias`` is used when it is a plain string;
    otherwise (no alias, or a non-string alias object) the field name itself
    is returned.
    """
    validation_alias = Settings.model_fields[field_name].validation_alias
    if isinstance(validation_alias, str):
        return validation_alias
    return field_name
@pytest.mark.smoke_target("extensibility")
def test_platform_factory_e2e() -> None:
assert create_messaging_platform("not-a-platform") is None
+2 -2
View File
@@ -219,7 +219,7 @@ def test_admin_apply_writes_gemini_key_and_masks_preview(monkeypatch, tmp_path):
"/admin/api/config/apply",
json={
"values": {
"MODEL": "gemini/gemini-2.5-flash",
"MODEL": "gemini/models/gemini-3.1-flash-lite",
"GEMINI_API_KEY": "gm-secret",
}
},
@@ -231,7 +231,7 @@ def test_admin_apply_writes_gemini_key_and_masks_preview(monkeypatch, tmp_path):
assert "GEMINI_API_KEY=********" in body["env_preview"]
env_file = tmp_path / ".fcc" / ".env"
text = env_file.read_text(encoding="utf-8")
assert "MODEL=gemini/gemini-2.5-flash" in text
assert "MODEL=gemini/models/gemini-3.1-flash-lite" in text
assert "GEMINI_API_KEY=gm-secret" in text
+23
View File
@@ -644,3 +644,26 @@ def test_create_app_writes_server_log_under_fcc_home(monkeypatch, tmp_path):
assert canonical_log.is_file()
assert "canonical log path test" in canonical_log.read_text(encoding="utf-8")
assert not (run_dir / "logs" / "server.log").exists()
def test_create_app_honors_process_log_file_override(monkeypatch, tmp_path):
    """A ``LOG_FILE`` environment override decides where ``create_app`` logs."""
    from loguru import logger

    import config.logging_config as logging_config_mod
    from api.app import create_app
    from config.paths import server_log_path

    override_path = tmp_path / "smoke" / "server.log"
    # HOME/USERPROFILE point at tmp_path; presumably this keeps the default
    # server log location inside the sandbox -- confirm against config.paths.
    for env_name, env_value in (
        ("HOME", str(tmp_path)),
        ("USERPROFILE", str(tmp_path)),
        ("LOG_FILE", str(override_path)),
    ):
        monkeypatch.setenv(env_name, env_value)
    # NOTE(review): looks like this flag guards one-time logging setup;
    # resetting it should let create_app configure logging again -- confirm.
    monkeypatch.setattr(logging_config_mod, "_configured", False)

    create_app(lifespan_enabled=False)
    logger.info("process log path test")
    logger.complete()

    assert override_path.is_file()
    logged_text = override_path.read_text(encoding="utf-8")
    assert "process log path test" in logged_text
    assert not server_log_path().exists()
+7 -3
View File
@@ -32,7 +32,7 @@ class TestSettings:
monkeypatch.delenv("HTTP_CONNECT_TIMEOUT", raising=False)
monkeypatch.setitem(Settings.model_config, "env_file", ())
settings = Settings()
assert settings.model == "nvidia_nim/z-ai/glm4.7"
assert settings.model == "nvidia_nim/nvidia/nemotron-3-super-120b-a12b"
assert isinstance(settings.provider_rate_limit, int)
assert isinstance(settings.provider_rate_window, int)
assert isinstance(settings.nim.temperature, float)
@@ -770,7 +770,10 @@ class TestPerModelMapping:
assert Settings.parse_provider_type("llamacpp/model") == "llamacpp"
assert Settings.parse_provider_type("ollama/llama3.1") == "ollama"
assert Settings.parse_provider_type("wafer/DeepSeek-V4-Pro") == "wafer"
assert Settings.parse_provider_type("gemini/gemini-2.5-flash") == "gemini"
assert (
Settings.parse_provider_type("gemini/models/gemini-3.1-flash-lite")
== "gemini"
)
assert Settings.parse_provider_type("groq/llama-3.3-70b-versatile") == "groq"
assert Settings.parse_provider_type("cerebras/llama3.1-8b") == "cerebras"
@@ -793,7 +796,8 @@ class TestPerModelMapping:
assert Settings.parse_model_name("ollama/llama3.1") == "llama3.1"
assert Settings.parse_model_name("wafer/DeepSeek-V4-Pro") == "DeepSeek-V4-Pro"
assert (
Settings.parse_model_name("gemini/gemini-2.5-flash") == "gemini-2.5-flash"
Settings.parse_model_name("gemini/models/gemini-3.1-flash-lite")
== "models/gemini-3.1-flash-lite"
)
assert (
Settings.parse_model_name("groq/llama-3.3-70b-versatile")
@@ -331,6 +331,36 @@ def test_cli_matrix_agent_prompt_text_without_tool_evidence_does_not_pass(
assert outcome.token_evidence["agent_tool_count"] == 0
def test_cli_matrix_structured_provider_error_is_upstream_unavailable(
    tmp_path: Path,
) -> None:
    """A structured provider error is classified as upstream_unavailable.

    The CLI run exits 0 but prints the generic provider failure message, and
    the log delta contains only structured JSON records; the outcome must
    still report exactly one request.
    """
    structured_log = (
        '{"event": "api.request.received", "http_method": "POST", '
        '"http_path": "/v1/messages"}\n'
        '{"event": "provider.response.error", "exc_type": "HTTPStatusError"}'
    )
    failed_run = ClaudeCliRun(
        command=("claude", "-p", "x"),
        returncode=0,
        stdout="Provider API request failed. (request_id=req_123)",
        stderr="",
        duration_s=0.1,
    )

    outcome = make_outcome(
        model="minimax/minimax-m2.5:free",
        full_model="open_router/minimax/minimax-m2.5:free",
        source="openrouter_free_cli_default",
        feature="tool_use_roundtrip",
        marker="FCC_OPENROUTER_FREE_TOOL",
        run=failed_run,
        log_delta=structured_log,
        log_path=tmp_path / "server.log",
        requires_tool_result=True,
    )

    assert outcome.classification == "upstream_unavailable"
    assert outcome.request_count == 1
def test_nvidia_nim_cli_timeout_is_not_model_missing(
tmp_path: Path,
) -> None:
+4 -2
View File
@@ -147,7 +147,9 @@ def test_provider_smoke_model_override_accepts_model_name_without_prefix(
def test_provider_smoke_model_override_accepts_owner_model_name(
monkeypatch,
) -> None:
monkeypatch.setenv("FCC_SMOKE_MODEL_NVIDIA_NIM", "z-ai/glm4.7")
monkeypatch.setenv(
"FCC_SMOKE_MODEL_NVIDIA_NIM", "nvidia/nemotron-3-super-120b-a12b"
)
config = _smoke_config(
settings=_settings(
model="deepseek/deepseek-chat",
@@ -160,7 +162,7 @@ def test_provider_smoke_model_override_accepts_owner_model_name(
models = config.provider_smoke_models()
assert models[0].full_model == "nvidia_nim/z-ai/glm4.7"
assert models[0].full_model == "nvidia_nim/nvidia/nemotron-3-super-120b-a12b"
assert models[0].source == "FCC_SMOKE_MODEL_NVIDIA_NIM"
+25
View File
@@ -154,6 +154,31 @@ def test_build_request_body_tool_choice_keeps_thinking(deepseek_provider):
assert body["tool_choice"] == {"type": "auto"}
def test_build_request_body_forced_tool_choice_downgrades_to_auto(
    deepseek_provider,
):
    """Forcing a named tool becomes ``auto`` while thinking is preserved."""
    read_tool = {
        "name": "Read",
        "description": "Read a file",
        "input_schema": {"type": "object", "properties": {}},
    }
    payload = {
        "model": "m",
        "messages": [{"role": "user", "content": "x"}],
        "tool_choice": {"type": "tool", "name": "Read"},
        "tools": [read_tool],
        "thinking": {"type": "enabled", "budget_tokens": 2000},
    }

    body = deepseek_provider._build_request_body(
        MessagesRequest.model_validate(payload)
    )

    assert body["thinking"] == {"type": "enabled", "budget_tokens": 2000}
    assert body["tool_choice"] == {"type": "auto"}
def test_build_request_body_respects_global_thinking_disable():
provider = DeepSeekProvider(
ProviderConfig(
+2 -2
View File
@@ -17,7 +17,7 @@ class MockMessage:
class MockRequest:
def __init__(self, **kwargs):
self.model = "gemini-2.5-flash"
self.model = "models/gemini-3.1-flash-lite"
self.messages = [MockMessage("user", "Hello")]
self.max_tokens = 100
self.temperature = 0.5
@@ -97,7 +97,7 @@ def test_build_request_body_basic(gemini_provider):
req = MockRequest()
body = gemini_provider._build_request_body(req)
assert body["model"] == "gemini-2.5-flash"
assert body["model"] == "models/gemini-3.1-flash-lite"
assert body["messages"][0]["role"] == "system"
assert "reasoning_effort" not in body
eb = body.get("extra_body")
+2 -2
View File
@@ -25,7 +25,7 @@ class MockMessage:
class MockRequest:
def __init__(self, **kwargs):
self.model = "stepfun/step-3.5-flash:free"
self.model = "moonshotai/kimi-k2.6:free"
self.messages = [MockMessage("user", "Hello")]
self.max_tokens = 100
self.temperature = 0.5
@@ -132,7 +132,7 @@ def test_build_request_body_is_native_anthropic(open_router_provider):
req = MockRequest()
body = open_router_provider._build_request_body(req)
assert body["model"] == "stepfun/step-3.5-flash:free"
assert body["model"] == "moonshotai/kimi-k2.6:free"
assert body["temperature"] == 0.5
assert body["stream"] is True
assert body["messages"] == [{"role": "user", "content": "Hello"}]
+17
View File
@@ -708,6 +708,23 @@ class TestProcessToolCall:
# Should not crash, should still emit events
assert len(events) > 0
def test_none_tool_index_defaults_to_zero(self):
    """A tool-call delta whose ``index`` is ``None`` still emits a tool_use.

    Covers providers (Gemini, per the original test note) that may stream
    tool_call deltas with a null index.
    """
    provider = _make_provider()
    from core.anthropic import SSEBuilder

    builder = SSEBuilder("msg_test", "test-model")
    null_index_call = {
        "index": None,
        "id": "call_none",
        "function": {"name": "test", "arguments": "{}"},
    }

    rendered = "".join(provider._process_tool_call(null_index_call, builder))

    assert "tool_use" in rendered
    assert "call_none" in rendered
def test_tool_args_emitted_as_delta(self):
"""Arguments are emitted as input_json_delta events."""
provider = _make_provider()