Add per-model thinking toggles

This commit is contained in:
Alishahryar1
2026-04-24 00:14:49 -07:00
parent 462a9430bb
commit 1f12a33dd7
14 changed files with 271 additions and 88 deletions
+6 -3
View File
@@ -28,9 +28,12 @@ MODEL="nvidia_nim/z-ai/glm4.7"
# Thinking output
# Global switch for provider reasoning requests and Claude thinking blocks.
# Set false to suppress thinking across NIM, OpenRouter, LM Studio, and llama.cpp.
ENABLE_THINKING=true
# Per-Claude-model-family switches for provider reasoning requests and Claude thinking blocks.
# MODEL_ENABLE_THINKING applies when the incoming Claude model is not Opus, Sonnet, or Haiku.
OPUS_ENABLE_THINKING=true
SONNET_ENABLE_THINKING=true
HAIKU_ENABLE_THINKING=true
MODEL_ENABLE_THINKING=true
# Provider config
+11 -5
View File
@@ -81,8 +81,11 @@ MODEL_SONNET="nvidia_nim/moonshotai/kimi-k2-thinking"
MODEL_HAIKU="nvidia_nim/stepfun-ai/step-3.5-flash"
MODEL="nvidia_nim/z-ai/glm4.7" # fallback
# Global switch for provider reasoning requests and Claude thinking blocks.
ENABLE_THINKING=true
# Per-Claude-model-family switches for provider reasoning requests and Claude thinking blocks.
OPUS_ENABLE_THINKING=true
SONNET_ENABLE_THINKING=true
HAIKU_ENABLE_THINKING=true
MODEL_ENABLE_THINKING=true
```
</details>
@@ -158,7 +161,7 @@ MODEL="nvidia_nim/z-ai/glm4.7" # fallback
</details>
> Migration: `NIM_ENABLE_THINKING` and `ENABLE_THINKING` were both removed in this release, so the earlier advice to rename `NIM_ENABLE_THINKING` to `ENABLE_THINKING` no longer applies. Use `MODEL_ENABLE_THINKING` for fallback behavior, or `OPUS_ENABLE_THINKING`, `SONNET_ENABLE_THINKING`, and `HAIKU_ENABLE_THINKING` for family-specific control.
<details>
<summary><b>Optional Authentication</b> (restrict access to your proxy)</summary>
@@ -321,7 +324,7 @@ free-claude-code # starts the server
- **Per-model routing**: Opus / Sonnet / Haiku requests resolve to their model-specific backend, with `MODEL` as fallback
- **Request optimization**: 5 categories of trivial requests (quota probes, title generation, prefix detection, suggestions, filepath extraction) are intercepted and responded to locally without using API quota
- **Format translation**: Requests are translated from Anthropic format to the provider's OpenAI-compatible format and streamed back
- **Thinking tokens**: `<think>` tags and `reasoning_content` fields are converted into native Claude thinking blocks when the matching `*_ENABLE_THINKING` flag (`OPUS_`, `SONNET_`, `HAIKU_`, or the fallback `MODEL_`) is true; the single `ENABLE_THINKING` switch no longer exists
The proxy also exposes Claude-compatible probe routes: `GET /v1/models`, `POST /v1/messages`, `POST /v1/messages/count_tokens`, plus `HEAD`/`OPTIONS` support for the common probe endpoints.
@@ -507,7 +510,10 @@ Configure via `WHISPER_DEVICE` (`cpu` | `cuda` | `nvidia_nim`) and `WHISPER_MODE
| `MODEL_SONNET` | Model for Claude Sonnet requests (falls back to `MODEL`) | `open_router/arcee-ai/trinity-large-preview:free` |
| `MODEL_HAIKU` | Model for Claude Haiku requests (falls back to `MODEL`) | `open_router/stepfun/step-3.5-flash:free` |
| `NVIDIA_NIM_API_KEY` | NVIDIA API key | required for NIM |
| `ENABLE_THINKING` | Global switch for provider reasoning requests and Claude thinking blocks. Set `false` to hide thinking across all providers. | `true` |
| `OPUS_ENABLE_THINKING` | Thinking switch for Claude Opus requests | `true` |
| `SONNET_ENABLE_THINKING` | Thinking switch for Claude Sonnet requests | `true` |
| `HAIKU_ENABLE_THINKING` | Thinking switch for Claude Haiku requests | `true` |
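| `MODEL_ENABLE_THINKING` | Thinking switch for requests whose incoming Claude model name is not Opus, Sonnet, or Haiku. The flag is chosen from the requested Claude model name, not from which `MODEL_*` backend serves the request. | `true` |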
| `OPENROUTER_API_KEY` | OpenRouter API key | required for OpenRouter |
| `DEEPSEEK_API_KEY` | DeepSeek API key | required for DeepSeek |
| `LM_STUDIO_BASE_URL` | LM Studio server URL | `http://localhost:1234/v1` |
+21 -35
View File
@@ -29,6 +29,22 @@ def _get_proxy_value(settings: Settings, attr_name: str) -> str:
return value if isinstance(value, str) else ""
def _provider_config_kwargs(settings: Settings) -> dict:
    """Collect the keyword arguments every ``ProviderConfig`` is built with.

    Args:
        settings: Application settings carrying the provider limits, HTTP
            timeouts, and per-model-family thinking toggles.

    Returns:
        A dict meant to be splatted into ``ProviderConfig(...)`` next to the
        provider-specific ``api_key`` / ``base_url`` / ``proxy`` arguments.
    """
    # Limits and timeouts are stored under different names on Settings
    # (``provider_*``) than on the config, so they are mapped explicitly.
    shared = dict(
        rate_limit=settings.provider_rate_limit,
        rate_window=settings.provider_rate_window,
        max_concurrency=settings.provider_max_concurrency,
        http_read_timeout=settings.http_read_timeout,
        http_write_timeout=settings.http_write_timeout,
        http_connect_timeout=settings.http_connect_timeout,
    )
    # The thinking toggles keep the same name on both sides.
    for flag_name in (
        "opus_enable_thinking",
        "sonnet_enable_thinking",
        "haiku_enable_thinking",
        "model_enable_thinking",
    ):
        shared[flag_name] = getattr(settings, flag_name)
    return shared
def _create_provider_for_type(provider_type: str, settings: Settings) -> BaseProvider:
"""Construct and return a new provider instance for the given provider type."""
_proxy_map = {
@@ -48,14 +64,8 @@ def _create_provider_for_type(provider_type: str, settings: Settings) -> BasePro
config = ProviderConfig(
api_key=settings.nvidia_nim_api_key,
base_url=NVIDIA_NIM_BASE_URL,
rate_limit=settings.provider_rate_limit,
rate_window=settings.provider_rate_window,
max_concurrency=settings.provider_max_concurrency,
http_read_timeout=settings.http_read_timeout,
http_write_timeout=settings.http_write_timeout,
http_connect_timeout=settings.http_connect_timeout,
enable_thinking=settings.enable_thinking,
proxy=proxy,
**_provider_config_kwargs(settings),
)
return NvidiaNimProvider(config, nim_settings=settings.nim)
if provider_type == "open_router":
@@ -67,14 +77,8 @@ def _create_provider_for_type(provider_type: str, settings: Settings) -> BasePro
config = ProviderConfig(
api_key=settings.open_router_api_key,
base_url=OPENROUTER_BASE_URL,
rate_limit=settings.provider_rate_limit,
rate_window=settings.provider_rate_window,
max_concurrency=settings.provider_max_concurrency,
http_read_timeout=settings.http_read_timeout,
http_write_timeout=settings.http_write_timeout,
http_connect_timeout=settings.http_connect_timeout,
enable_thinking=settings.enable_thinking,
proxy=proxy,
**_provider_config_kwargs(settings),
)
return OpenRouterProvider(config)
if provider_type == "deepseek":
@@ -86,41 +90,23 @@ def _create_provider_for_type(provider_type: str, settings: Settings) -> BasePro
config = ProviderConfig(
api_key=settings.deepseek_api_key,
base_url=DEEPSEEK_BASE_URL,
rate_limit=settings.provider_rate_limit,
rate_window=settings.provider_rate_window,
max_concurrency=settings.provider_max_concurrency,
http_read_timeout=settings.http_read_timeout,
http_write_timeout=settings.http_write_timeout,
http_connect_timeout=settings.http_connect_timeout,
enable_thinking=settings.enable_thinking,
**_provider_config_kwargs(settings),
)
return DeepSeekProvider(config)
if provider_type == "lmstudio":
config = ProviderConfig(
api_key="lm-studio",
base_url=settings.lm_studio_base_url,
rate_limit=settings.provider_rate_limit,
rate_window=settings.provider_rate_window,
max_concurrency=settings.provider_max_concurrency,
http_read_timeout=settings.http_read_timeout,
http_write_timeout=settings.http_write_timeout,
http_connect_timeout=settings.http_connect_timeout,
enable_thinking=settings.enable_thinking,
proxy=proxy,
**_provider_config_kwargs(settings),
)
return LMStudioProvider(config)
if provider_type == "llamacpp":
config = ProviderConfig(
api_key="llamacpp",
base_url=settings.llamacpp_base_url,
rate_limit=settings.provider_rate_limit,
rate_window=settings.provider_rate_window,
max_concurrency=settings.provider_max_concurrency,
http_read_timeout=settings.http_read_timeout,
http_write_timeout=settings.http_write_timeout,
http_connect_timeout=settings.http_connect_timeout,
enable_thinking=settings.enable_thinking,
proxy=proxy,
**_provider_config_kwargs(settings),
)
return LlamaCppProvider(config)
logger.error(
+44 -14
View File
@@ -56,21 +56,29 @@ def _env_file_contains_key(path: Path, key: str) -> bool:
def _removed_env_var_message(model_config: Mapping[str, Any]) -> str | None:
    """Return a migration error for removed env vars, if present.

    Each removed variable is looked up first in the process environment and
    then in every env file configured for the settings model. The first hit
    wins and produces a message naming the removed variable together with
    its replacement guidance.

    Args:
        model_config: Settings model configuration, passed through to
            ``_configured_env_files`` to find the env files to scan.

    Returns:
        The migration message for the first removed variable found, or
        ``None`` when none of them is set anywhere.
    """
    # Removed variable -> guidance text. ENABLE_THINKING is itself removed,
    # so nothing here may recommend renaming a variable to ENABLE_THINKING.
    removed_vars = {
        "NIM_ENABLE_THINKING": (
            "Use MODEL_ENABLE_THINKING for fallback behavior, or "
            "OPUS_ENABLE_THINKING, SONNET_ENABLE_THINKING, and "
            "HAIKU_ENABLE_THINKING for family-specific control."
        ),
        "ENABLE_THINKING": (
            "Rename it to MODEL_ENABLE_THINKING for fallback behavior, or "
            "use OPUS_ENABLE_THINKING, SONNET_ENABLE_THINKING, and "
            "HAIKU_ENABLE_THINKING for family-specific control."
        ),
    }

    for removed_key, replacement in removed_vars.items():
        # Process environment first: no file access needed for this check.
        if removed_key in os.environ:
            return f"{removed_key} has been removed in this release. {replacement}"

        for env_file in _configured_env_files(model_config):
            if _env_file_contains_key(env_file, removed_key):
                # Name the file so the user knows where to edit.
                return (
                    f"{removed_key} has been removed in this release. "
                    f"{replacement} Found in {env_file}."
                )

    return None
@@ -130,7 +138,18 @@ class Settings(BaseSettings):
provider_max_concurrency: int = Field(
default=5, validation_alias="PROVIDER_MAX_CONCURRENCY"
)
enable_thinking: bool = Field(default=True, validation_alias="ENABLE_THINKING")
opus_enable_thinking: bool = Field(
default=True, validation_alias="OPUS_ENABLE_THINKING"
)
sonnet_enable_thinking: bool = Field(
default=True, validation_alias="SONNET_ENABLE_THINKING"
)
haiku_enable_thinking: bool = Field(
default=True, validation_alias="HAIKU_ENABLE_THINKING"
)
model_enable_thinking: bool = Field(
default=True, validation_alias="MODEL_ENABLE_THINKING"
)
# ==================== HTTP Client Timeouts ====================
http_read_timeout: float = Field(
@@ -287,6 +306,17 @@ class Settings(BaseSettings):
return self.model_sonnet
return self.model
def resolve_thinking_enabled(self, claude_model_name: str) -> bool:
    """Pick the thinking toggle that applies to a Claude model name.

    The name is matched case-insensitively by substring. Families are tried
    in the order Opus, Haiku, Sonnet; a name matching none of them uses the
    fallback ``model_enable_thinking`` flag.

    Args:
        claude_model_name: Model name from the incoming Claude request.

    Returns:
        The configured thinking flag for the matched family.
    """
    lowered = claude_model_name.lower()
    # Tuple order is the match priority when a name contains several markers.
    family_flags = (
        ("opus", "opus_enable_thinking"),
        ("haiku", "haiku_enable_thinking"),
        ("sonnet", "sonnet_enable_thinking"),
    )
    for marker, flag_attr in family_flags:
        if marker in lowered:
            return getattr(self, flag_attr)
    return self.model_enable_thinking
@staticmethod
def parse_provider_type(model_string: str) -> str:
"""Extract provider type from any 'provider/model' string."""
+20 -2
View File
@@ -22,7 +22,10 @@ class ProviderConfig(BaseModel):
http_read_timeout: float = 300.0
http_write_timeout: float = 10.0
http_connect_timeout: float = 2.0
enable_thinking: bool = True
opus_enable_thinking: bool = True
sonnet_enable_thinking: bool = True
haiku_enable_thinking: bool = True
model_enable_thinking: bool = True
proxy: str = ""
@@ -40,7 +43,22 @@ class BaseProvider(ABC):
if thinking is not None and hasattr(thinking, "enabled")
else True
)
return self._config.enable_thinking and request_enabled
request_model = getattr(request, "original_model", None) or getattr(
request, "model", ""
)
model_enabled = self._resolve_model_thinking_enabled(str(request_model))
return model_enabled and request_enabled
def _resolve_model_thinking_enabled(self, model: str) -> bool:
    """Look up the provider-config thinking flag for a model name.

    Matching is a case-insensitive substring test, tried in the order
    Opus, Haiku, Sonnet. Any other name uses ``model_enable_thinking``.

    Args:
        model: Model name taken from the request.

    Returns:
        The thinking flag stored on this provider's config for that family.
    """
    config = self._config
    needle = model.lower()
    if "opus" in needle:
        enabled = config.opus_enable_thinking
    elif "haiku" in needle:
        enabled = config.haiku_enable_thinking
    elif "sonnet" in needle:
        enabled = config.sonnet_enable_thinking
    else:
        # Not a recognised family: use the fallback toggle.
        enabled = config.model_enable_thinking
    return enabled
@abstractmethod
async def cleanup(self) -> None:
+29 -6
View File
@@ -32,7 +32,10 @@ def _make_mock_settings(**overrides):
mock.http_read_timeout = 300.0
mock.http_write_timeout = 10.0
mock.http_connect_timeout = 2.0
mock.enable_thinking = True
mock.opus_enable_thinking = True
mock.sonnet_enable_thinking = True
mock.haiku_enable_thinking = True
mock.model_enable_thinking = True
for key, value in overrides.items():
setattr(mock, key, value)
return mock
@@ -134,7 +137,7 @@ async def test_get_provider_deepseek():
assert isinstance(provider, DeepSeekProvider)
assert provider._base_url == "https://api.deepseek.com"
assert provider._api_key == "test_deepseek_key"
assert provider._config.enable_thinking is True
assert provider._config.model_enable_thinking is True
@pytest.mark.asyncio
@@ -152,18 +155,38 @@ async def test_get_provider_deepseek_uses_fixed_base_url():
@pytest.mark.asyncio
async def test_get_provider_deepseek_passes_model_enable_thinking():
    """DeepSeek provider receives the fallback thinking toggle."""
    with patch("api.dependencies.get_settings") as mock_settings:
        # Only the fallback flag is overridden; ProviderConfig has no single
        # ``enable_thinking`` field any more, so nothing here refers to it.
        mock_settings.return_value = _make_mock_settings(
            provider_type="deepseek",
            model_enable_thinking=False,
        )

        provider = get_provider()

        assert isinstance(provider, DeepSeekProvider)
        assert provider._config.model_enable_thinking is False
@pytest.mark.asyncio
async def test_get_provider_passes_per_model_thinking_flags():
    """Every per-model thinking toggle reaches the provider config unchanged."""
    # Mixed values so that a swapped or dropped flag is detected.
    flags = {
        "opus_enable_thinking": False,
        "sonnet_enable_thinking": True,
        "haiku_enable_thinking": False,
        "model_enable_thinking": True,
    }
    with patch("api.dependencies.get_settings") as mock_settings:
        mock_settings.return_value = _make_mock_settings(**flags)

        provider = get_provider()

        assert isinstance(provider, NvidiaNimProvider)
        for flag_name, expected in flags.items():
            assert getattr(provider._config, flag_name) is expected
@pytest.mark.asyncio
+46 -6
View File
@@ -29,7 +29,10 @@ class TestSettings:
assert isinstance(settings.provider_rate_window, int)
assert isinstance(settings.nim.temperature, float)
assert isinstance(settings.fast_prefix_detection, bool)
assert isinstance(settings.enable_thinking, bool)
assert isinstance(settings.opus_enable_thinking, bool)
assert isinstance(settings.sonnet_enable_thinking, bool)
assert isinstance(settings.haiku_enable_thinking, bool)
assert isinstance(settings.model_enable_thinking, bool)
assert settings.http_read_timeout == 120.0
def test_get_settings_cached(self):
@@ -110,20 +113,34 @@ class TestSettings:
settings = Settings()
assert settings.http_connect_timeout == 5.0
def test_per_model_thinking_from_env(self, monkeypatch):
    """Per-model thinking env vars are loaded into settings."""
    from config.settings import Settings

    # ENABLE_THINKING is deliberately not set: it is a removed variable and
    # makes Settings() fail with a migration error instead of loading.
    monkeypatch.setenv("OPUS_ENABLE_THINKING", "false")
    monkeypatch.setenv("SONNET_ENABLE_THINKING", "true")
    monkeypatch.setenv("HAIKU_ENABLE_THINKING", "false")
    monkeypatch.setenv("MODEL_ENABLE_THINKING", "true")
    settings = Settings()
    assert settings.opus_enable_thinking is False
    assert settings.sonnet_enable_thinking is True
    assert settings.haiku_enable_thinking is False
    assert settings.model_enable_thinking is True
def test_removed_nim_enable_thinking_raises(self, monkeypatch):
    """NIM_ENABLE_THINKING now fails fast with a migration message."""
    from config.settings import Settings

    monkeypatch.setenv("NIM_ENABLE_THINKING", "false")
    # The message must point at the per-model replacements; the old
    # "Rename it to ENABLE_THINKING" wording no longer exists.
    with pytest.raises(ValidationError, match="MODEL_ENABLE_THINKING"):
        Settings()
def test_removed_enable_thinking_raises(self, monkeypatch):
    """Setting the removed ENABLE_THINKING variable aborts settings loading."""
    from config.settings import Settings

    monkeypatch.setenv("ENABLE_THINKING", "false")
    with pytest.raises(ValidationError) as raised:
        Settings()
    # Same check as ``match=``: the replacement name appears in the error text.
    assert "MODEL_ENABLE_THINKING" in str(raised.value)
@@ -494,6 +511,29 @@ class TestPerModelMapping:
s.model_opus = "open_router/opus-model"
assert s.resolve_model("Claude-OPUS-4") == "open_router/opus-model"
def test_resolve_thinking_enabled_per_model_family(self):
    """Each Claude family name resolves to its own thinking flag."""
    from config.settings import Settings

    settings = Settings()
    # Alternate the values so neighbouring families cannot be confused.
    for flag_name, value in (
        ("opus_enable_thinking", False),
        ("sonnet_enable_thinking", True),
        ("haiku_enable_thinking", False),
        ("model_enable_thinking", True),
    ):
        setattr(settings, flag_name, value)

    expectations = (
        ("claude-opus-4-20250514", False),
        ("claude-sonnet-4-20250514", True),
        ("claude-haiku-4-20250514", False),
        ("claude-2.1", True),  # no family marker -> fallback flag
    )
    for model_name, expected in expectations:
        assert settings.resolve_thinking_enabled(model_name) is expected
def test_resolve_thinking_enabled_case_insensitive(self):
    """A mixed-case model name still maps to its family flag."""
    from config.settings import Settings

    settings = Settings()
    settings.opus_enable_thinking = False
    resolved = settings.resolve_thinking_enabled("Claude-OPUS-4")
    assert resolved is False
def test_parse_provider_type(self):
"""parse_provider_type extracts provider from model string."""
from config.settings import Settings
+78
View File
@@ -0,0 +1,78 @@
from providers.base import BaseProvider, ProviderConfig
class DummyProvider(BaseProvider):
    """Smallest concrete BaseProvider, used only to reach its shared helpers."""

    async def cleanup(self) -> None:
        """Nothing to release."""
        return None

    async def stream_response(self, request, input_tokens=0, *, request_id=None):
        """Async generator that produces no items."""
        return
        # Unreachable, but the ``yield`` keeps this function an async generator.
        yield ""
class DummyThinking:
    """Stand-in for a request's ``thinking`` object; carries only ``enabled``."""

    def __init__(self, enabled: bool) -> None:
        self.enabled = enabled
class DummyRequest:
    """Stand-in request exposing the attributes the thinking check reads."""

    def __init__(
        self,
        *,
        model: str,
        original_model: str | None = None,
        thinking: DummyThinking | None = None,
    ) -> None:
        # ``model`` is the request's model name; ``original_model`` is read
        # first by the thinking check when it is set.
        self.model, self.original_model = model, original_model
        self.thinking = thinking
def test_is_thinking_enabled_uses_original_model_family():
    """The original Claude model name, not the provider model, picks the flag."""
    config = ProviderConfig(
        api_key="test",
        opus_enable_thinking=False,
        model_enable_thinking=True,
    )
    provider = DummyProvider(config)
    # ``model`` alone would hit the enabled fallback flag; ``original_model``
    # names an Opus model, whose flag is disabled.
    request = DummyRequest(
        model="provider-model",
        original_model="claude-opus-4-20250514",
        thinking=DummyThinking(True),
    )

    result = provider._is_thinking_enabled(request)

    assert result is False
def test_is_thinking_enabled_falls_back_to_request_model():
    """Without an original model, the request's own model name picks the flag."""
    config = ProviderConfig(
        api_key="test",
        sonnet_enable_thinking=False,
        model_enable_thinking=True,
    )
    provider = DummyProvider(config)
    # No ``original_model`` and no ``thinking`` object are supplied.
    request = DummyRequest(model="claude-sonnet-4-20250514")

    result = provider._is_thinking_enabled(request)

    assert result is False
def test_is_thinking_enabled_unknown_model_uses_fallback_flag():
    """A model name with no family marker is governed by the fallback flag."""
    config = ProviderConfig(api_key="test", model_enable_thinking=False)
    provider = DummyProvider(config)
    request = DummyRequest(model="provider-model")

    result = provider._is_thinking_enabled(request)

    assert result is False
def test_is_thinking_enabled_respects_request_disable():
    """A request that disables thinking wins over an enabled family flag."""
    config = ProviderConfig(api_key="test", opus_enable_thinking=True)
    provider = DummyProvider(config)
    request = DummyRequest(
        model="provider-model",
        original_model="claude-opus-4-20250514",
        thinking=DummyThinking(False),
    )

    result = provider._is_thinking_enabled(request)

    assert result is False
+5 -6
View File
@@ -44,7 +44,6 @@ def deepseek_config():
base_url=DEEPSEEK_BASE_URL,
rate_limit=10,
rate_window=60,
enable_thinking=True,
)
@@ -86,15 +85,15 @@ def test_build_request_body_enables_thinking_for_chat_model(deepseek_provider):
assert body["messages"][0]["role"] == "system"
def test_build_request_body_global_disable_blocks_request_thinking():
"""Global disable suppresses provider-side thinking even if the request enables it."""
def test_build_request_body_model_disable_blocks_request_thinking():
"""Model disable suppresses provider-side thinking even if the request enables it."""
provider = DeepSeekProvider(
ProviderConfig(
api_key="test_deepseek_key",
base_url=DEEPSEEK_BASE_URL,
rate_limit=10,
rate_window=60,
enable_thinking=False,
model_enable_thinking=False,
)
)
req = MockRequest(model="deepseek-chat")
@@ -103,8 +102,8 @@ def test_build_request_body_global_disable_blocks_request_thinking():
assert "extra_body" not in body or "thinking" not in body["extra_body"]
def test_build_request_body_request_disable_blocks_global_thinking(deepseek_provider):
"""Request-level disable suppresses provider-side thinking when global is enabled."""
def test_build_request_body_request_disable_blocks_model_thinking(deepseek_provider):
"""Request-level disable suppresses provider-side thinking when model is enabled."""
req = MockRequest(model="deepseek-chat")
req.thinking.enabled = False
body = deepseek_provider._build_request_body(req)
+2 -2
View File
@@ -111,9 +111,9 @@ def test_init_base_url_strips_trailing_slash():
@pytest.mark.asyncio
async def test_stream_response_omits_thinking_when_globally_disabled(llamacpp_config):
async def test_stream_response_omits_thinking_when_model_disabled(llamacpp_config):
provider = LlamaCppProvider(
llamacpp_config.model_copy(update={"enable_thinking": False})
llamacpp_config.model_copy(update={"model_enable_thinking": False})
)
req = MockRequest()
+2 -2
View File
@@ -111,9 +111,9 @@ def test_init_base_url_strips_trailing_slash():
@pytest.mark.asyncio
async def test_stream_response_omits_thinking_when_globally_disabled(lmstudio_config):
async def test_stream_response_omits_thinking_when_model_disabled(lmstudio_config):
provider = LMStudioProvider(
lmstudio_config.model_copy(update={"enable_thinking": False})
lmstudio_config.model_copy(update={"model_enable_thinking": False})
)
req = MockRequest()
+3 -3
View File
@@ -121,13 +121,13 @@ async def test_build_request_body(provider_config):
@pytest.mark.asyncio
async def test_build_request_body_omits_reasoning_when_globally_disabled(
async def test_build_request_body_omits_reasoning_when_model_disabled(
provider_config,
):
from config.nim import NimSettings
provider = NvidiaNimProvider(
provider_config.model_copy(update={"enable_thinking": False}),
provider_config.model_copy(update={"model_enable_thinking": False}),
nim_settings=NimSettings(),
)
req = MockRequest()
@@ -244,7 +244,7 @@ async def test_stream_response_suppresses_thinking_when_disabled(provider_config
from config.nim import NimSettings
provider = NvidiaNimProvider(
provider_config.model_copy(update={"enable_thinking": False}),
provider_config.model_copy(update={"model_enable_thinking": False}),
nim_settings=NimSettings(),
)
req = MockRequest()
+3 -3
View File
@@ -105,9 +105,9 @@ def test_build_request_body_has_reasoning_extra(open_router_provider):
assert body["extra_body"]["reasoning"]["enabled"] is True
def test_build_request_body_omits_reasoning_when_globally_disabled(open_router_config):
def test_build_request_body_omits_reasoning_when_model_disabled(open_router_config):
provider = OpenRouterProvider(
open_router_config.model_copy(update={"enable_thinking": False})
open_router_config.model_copy(update={"model_enable_thinking": False})
)
req = MockRequest()
body = provider._build_request_body(req)
@@ -228,7 +228,7 @@ async def test_stream_response_reasoning_content(open_router_provider):
@pytest.mark.asyncio
async def test_stream_response_suppresses_reasoning_when_disabled(open_router_config):
provider = OpenRouterProvider(
open_router_config.model_copy(update={"enable_thinking": False})
open_router_config.model_copy(update={"model_enable_thinking": False})
)
req = MockRequest()
+1 -1
View File
@@ -46,7 +46,7 @@ def _make_provider_with_thinking_enabled(enabled: bool):
base_url="https://test.api.nvidia.com/v1",
rate_limit=10,
rate_window=60,
enable_thinking=enabled,
model_enable_thinking=enabled,
)
return NvidiaNimProvider(config, nim_settings=NimSettings())