feat: add Harbor eval runner (#9138)

This commit is contained in:
Lucas Kim
2026-05-20 13:03:01 -07:00
committed by GitHub
parent 37f4e44ae2
commit 49f1adedd3
10 changed files with 1076 additions and 0 deletions
+4
View File
@@ -0,0 +1,4 @@
.runs/
.venv/
.pytest_cache/
uv.lock
+141
View File
@@ -0,0 +1,141 @@
# Harbor
This directory contains a developer tool for running Harbor benchmark datasets
with Goose.
The runner takes a prebuilt Goose executable, writes a Harbor job config, and
runs Harbor with the local `goose_harbor` adapter.
## Requirements
- `uv`
- `harbor`
- Docker, for Docker-backed Harbor datasets
- A Goose executable compatible with the benchmark task environment
Dependencies are declared in `pyproject.toml`. `uv` resolves them from the
developer's configured package index.
## Run A Task
```bash
uv run --project evals/harbor evals/harbor/run \
--goose-binary ./target/x86_64-unknown-linux-gnu/release/goose \
--goose-profile ~/.config/goose-benchmark \
--dataset terminal-bench/terminal-bench-2 \
--model databricks/<model-name> \
--task terminal-bench/fix-git \
--trials 1 \
--concurrency 1
```
Use `--dry-run` to write the Harbor config without starting the benchmark:
```bash
uv run --project evals/harbor evals/harbor/run \
--goose-binary ./target/x86_64-unknown-linux-gnu/release/goose \
--goose-profile ~/.config/goose-benchmark \
--dataset terminal-bench/terminal-bench-2 \
--model databricks/<model-name> \
--task terminal-bench/fix-git \
--dry-run
```
Outputs default to:
```text
evals/harbor/.runs/configs/
evals/harbor/.runs/jobs/
```
Override them with `--config-dir` and `--jobs-dir`.
## Goose Executable
`--goose-binary` must point to a Goose executable that can run inside the
benchmark task container. The runner does not build Goose for you; it uploads
the executable you provide into each task container and runs that copy.
For Terminal-Bench 2.0, use a Linux amd64 Goose binary.
On Linux:
```bash
cargo build --release -p goose-cli --bin goose
uv run --project evals/harbor evals/harbor/run --goose-binary ./target/release/goose ...
```
On macOS or Windows, use a cross-compiled Linux amd64 binary. Prefer a binary
built for benchmark/container use. In particular, a Goose CLI binary without
local inference is usually the best fit for Harbor runs because local inference
pulls in runtime dependencies that may not exist in benchmark task images.
When using a GitHub release binary for Terminal-Bench, use the standard Linux
amd64 artifact, not the Vulkan artifact.
Some Linux release binaries still require GCC's OpenMP runtime, packaged as
`libgomp1` on Debian and Ubuntu. If the binary fails to start with a missing
`libgomp.so.1` error, rerun with:
```bash
uv run --project evals/harbor evals/harbor/run \
--goose-binary ./goose \
--goose-profile ~/.config/goose-benchmark \
--dataset terminal-bench/terminal-bench-2 \
--model databricks/<model-name> \
--install-goose-runtime-deps
```
This installs only the minimal known Goose runtime dependency, currently
`libgomp1`, inside each Debian/Ubuntu task container before Goose starts. Leave
it off when the provided Goose executable can start in the task container
without extra OS packages.
For local models, prefer running Ollama or llama.cpp outside the task container
and configuring Goose to call that server through its normal provider/profile
configuration. Avoid running local inference inside each benchmark task
container unless you have specifically built and verified a compatible Goose
binary for that environment.
## Goose Profile
Pass `--goose-profile` to copy an explicit Goose profile into each benchmark
task container. The path can be either:
- a `GOOSE_PATH_ROOT` directory with `config/`, `data/`, and `state/`
- a Goose config directory containing `config.yaml`
The adapter sets `GOOSE_PATH_ROOT` inside the container after copying the
profile. `--model provider/model` still selects the provider and model for the
benchmark run.
If the profile contains `secrets.yaml`, that file will be copied into arbitrary
benchmark task containers. Prefer benchmark-scoped or disposable credentials.
## Local Models
For local models, prefer running the model server on the host and configuring
the benchmark profile to reach it from the task container. This keeps model
loading and hardware acceleration outside Docker while Goose runs inside the
benchmark environment.
For example, an Ollama profile can set:
```yaml
GOOSE_PROVIDER: ollama
GOOSE_MODEL: qwen3.6:27b
OLLAMA_HOST: http://host.docker.internal:11434
```
Then run with `--goose-profile` pointing at that profile and `--model
ollama/qwen3.6:27b`.
Running Goose's built-in local inference inside the benchmark container is less
portable: the model file, CPU/GPU support, target architecture, and container
runtime all have to line up.
## Tests
```bash
uv run --project evals/harbor pytest evals/harbor/tests
```
+1
View File
@@ -0,0 +1 @@
+231
View File
@@ -0,0 +1,231 @@
from __future__ import annotations
import os
import shlex
from pathlib import Path
from tempfile import TemporaryDirectory
from harbor.agents.installed.base import with_prompt_template
from harbor.agents.installed.goose import Goose
from harbor.environments.base import BaseEnvironment
from harbor.models.agent.context import AgentContext
CONTAINER_GOOSE_PATH_ROOT = "/installed-agent/goose-profile"
CONTAINER_RECIPE_PATH = "/installed-agent/harbor-recipe.yaml"
CONTAINER_CA_BUNDLE_PATH = "/installed-agent/ca-certificates.crt"
class GooseBinaryAgent(Goose):
    """Run a caller-provided Goose binary in the benchmark environment.

    ``install`` uploads the executable and the Goose profile into the task
    container; ``run`` uploads a recipe file and executes ``goose run`` against
    it as the agent user.
    """

    def __init__(
        self,
        *args,
        goose_binary: str,
        goose_profile: str,
        install_goose_runtime_deps: bool = False,
        **kwargs,
    ):
        """Record the host-side paths and options used by ``install``/``run``.

        Args:
            *args: Forwarded unchanged to the base ``Goose`` agent.
            goose_binary: Host path of the Goose executable to upload.
            goose_profile: Host path of the Goose profile directory to upload.
            install_goose_runtime_deps: When true, ``install`` runs the
                ``apt-get`` based ``libgomp1`` installation step.
            **kwargs: Forwarded unchanged to the base ``Goose`` agent.
        """
        super().__init__(*args, **kwargs)
        self.goose_binary = Path(goose_binary).expanduser().resolve()
        self.goose_profile = Path(goose_profile).expanduser().resolve()
        self.install_goose_runtime_deps = install_goose_runtime_deps
        # Set by _ensure_ca_bundle() only when a host CA bundle was uploaded.
        self.ca_bundle_env_path: str | None = None

    @staticmethod
    def name() -> str:
        """Return the identifier of this agent."""
        return "goose-binary"

    def get_version_command(self) -> str | None:
        """Return the shell command that prints the uploaded binary's version."""
        return "/installed-agent/goose --version"

    def _profile_source_target(self) -> tuple[Path, str]:
        """Return ``(host_source_dir, container_target_dir)`` for the profile.

        A directory that directly contains ``config.yaml`` is treated as a
        config directory and uploaded to ``<root>/config``; any other directory
        is uploaded as the profile root itself.

        Raises:
            FileNotFoundError: If the profile path is not a directory.
        """
        if not self.goose_profile.is_dir():
            raise FileNotFoundError(f"Goose profile does not exist: {self.goose_profile}")
        if (self.goose_profile / "config.yaml").is_file():
            return self.goose_profile, f"{CONTAINER_GOOSE_PATH_ROOT}/config"
        return self.goose_profile, CONTAINER_GOOSE_PATH_ROOT

    def _run_env(self) -> dict[str, str]:
        """Build the environment variables passed to commands in ``run``.

        Raises:
            ValueError: If ``self.model_name`` is empty or has no ``/``.
        """
        if not self.model_name or "/" not in self.model_name:
            raise ValueError("Model name must be in the format provider/model_name")
        # Split on the first slash only so model names may contain slashes.
        provider, model = self.model_name.split("/", 1)
        env = {
            "GOOSE_MODEL": model,
            "GOOSE_PROVIDER": provider,
            "GOOSE_TELEMETRY_ENABLED": "false",
            "GOOSE_TELEMETRY_OFF": "true",
            "CONFIGURE": "false",
            "GOOSE_PATH_ROOT": CONTAINER_GOOSE_PATH_ROOT,
            "GOOSE_DISABLE_KEYRING": "true",
        }
        if self.ca_bundle_env_path:
            env["SSL_CERT_FILE"] = self.ca_bundle_env_path
        return env

    def _host_ca_bundle(self) -> Path:
        """Locate a CA bundle file on the host.

        Environment variables are consulted first, then a fixed list of
        well-known filesystem locations.

        Raises:
            FileNotFoundError: If no candidate points at an existing file.
        """
        candidates = [
            "SSL_CERT_FILE",
            "REQUESTS_CA_BUNDLE",
            "CURL_CA_BUNDLE",
        ]
        for env_var in candidates:
            value = os.environ.get(env_var)
            if value and Path(value).expanduser().is_file():
                return Path(value).expanduser().resolve()
        for path in [
            Path("/etc/ssl/certs/ca-certificates.crt"),
            Path("/etc/ssl/cert.pem"),
            Path("/opt/homebrew/etc/ca-certificates/cert.pem"),
        ]:
            if path.is_file():
                return path.resolve()
        raise FileNotFoundError("Could not find a host CA bundle to copy into the task container")

    async def _ensure_ca_bundle(self, environment: BaseEnvironment) -> None:
        """Upload a host CA bundle when the container reports none.

        The container is probed for ``/etc/ssl/certs/ca-certificates.crt``;
        only an explicit ``missing`` answer triggers the upload, after which
        ``ca_bundle_env_path`` is set so ``_run_env`` exports ``SSL_CERT_FILE``.
        """
        result = await self.exec_as_root(
            environment,
            command=(
                "if [ -r /etc/ssl/certs/ca-certificates.crt ]; "
                "then echo present; else echo missing; fi"
            ),
            timeout_sec=10,
        )
        # Treat absent stdout like empty output instead of raising on None.
        if (result.stdout or "").strip() != "missing":
            return
        await environment.upload_file(self._host_ca_bundle(), CONTAINER_CA_BUNDLE_PATH)
        await self.exec_as_root(
            environment,
            command=f"chmod 644 {shlex.quote(CONTAINER_CA_BUNDLE_PATH)}",
            timeout_sec=10,
        )
        self.ca_bundle_env_path = CONTAINER_CA_BUNDLE_PATH

    async def _install_goose_runtime_deps(self, environment: BaseEnvironment) -> None:
        """Install ``libgomp1`` with ``apt-get`` inside the container as root."""
        await self.exec_as_root(
            environment,
            command=(
                "command -v apt-get >/dev/null 2>&1 || "
                # A brace group runs in the current shell, so `exit 1` really
                # stops the script; a parenthesised subshell would only exit
                # itself and fall through to `apt-get update`.
                "{ echo 'install_goose_runtime_deps requires apt-get in the task container' >&2; exit 1; }; "
                "apt-get update && "
                "DEBIAN_FRONTEND=noninteractive apt-get install -y libgomp1"
            ),
            timeout_sec=300,
        )

    def _build_register_skills_command(self) -> str | None:
        """Return a shell command copying skills into the profile, or ``None``.

        ``None`` is returned when ``self.skills_dir`` is falsy. The copy is
        best-effort: errors are silenced and the command always succeeds.
        """
        if not self.skills_dir:
            return None
        skills_target = f"{CONTAINER_GOOSE_PATH_ROOT}/config/skills"
        return (
            f"mkdir -p {shlex.quote(skills_target)} && "
            f"cp -r {shlex.quote(self.skills_dir)}/* "
            f"{shlex.quote(skills_target)}/ 2>/dev/null || true"
        )

    async def _agent_uid_gid(self, environment: BaseEnvironment) -> tuple[str, str]:
        """Return the agent user's ``(uid, gid)`` as reported by ``id``.

        Raises:
            RuntimeError: If fewer than two non-empty lines are printed.
        """
        result = await self.exec_as_agent(
            environment,
            command="id -u && id -g",
            timeout_sec=10,
        )
        # Treat absent stdout like empty output so the RuntimeError below is
        # raised instead of an AttributeError.
        output = result.stdout or ""
        ids = [line.strip() for line in output.splitlines() if line.strip()]
        if len(ids) < 2:
            raise RuntimeError(f"Could not determine agent uid/gid: {result.stdout!r}")
        return ids[0], ids[1]

    async def _chown_to_agent_user(
        self,
        environment: BaseEnvironment,
        path: str,
        *,
        recursive: bool = False,
    ) -> None:
        """Change ownership of ``path`` in the container to the agent user.

        Args:
            environment: Environment in which the commands are executed.
            path: Container path to re-own.
            recursive: When true, pass ``-R`` to ``chown``.
        """
        uid, gid = await self._agent_uid_gid(environment)
        recursive_flag = "-R " if recursive else ""
        await self.exec_as_root(
            environment,
            command=(
                f"chown {recursive_flag}{shlex.quote(uid)}:{shlex.quote(gid)} "
                f"{shlex.quote(path)}"
            ),
        )

    async def install(self, environment: BaseEnvironment) -> None:
        """Upload the Goose binary and profile and verify the binary starts.

        Raises:
            FileNotFoundError: If the Goose binary or profile is missing on
                the host.
        """
        if not self.goose_binary.is_file():
            raise FileNotFoundError(f"Goose binary does not exist: {self.goose_binary}")
        await environment.upload_file(self.goose_binary, "/installed-agent/goose")
        await self.exec_as_root(environment, command="chmod 755 /installed-agent/goose")
        if self.install_goose_runtime_deps:
            await self._install_goose_runtime_deps(environment)
        await self._ensure_ca_bundle(environment)
        source, target = self._profile_source_target()
        await self.exec_as_root(environment, command=f"mkdir -p {shlex.quote(target)}")
        await environment.upload_dir(source, target)
        # The whole profile root is re-owned, not just the upload target, so a
        # config-directory upload also leaves the parent usable by the agent.
        await self._chown_to_agent_user(
            environment, CONTAINER_GOOSE_PATH_ROOT, recursive=True
        )
        await self.exec_as_agent(
            environment,
            command=(
                "mkdir -p ~/.local/bin && "
                "ln -sf /installed-agent/goose ~/.local/bin/goose && "
                "~/.local/bin/goose --version"
            ),
            env={
                "GOOSE_DISABLE_KEYRING": "true",
                "GOOSE_TELEMETRY_ENABLED": "false",
                "GOOSE_TELEMETRY_OFF": "true",
                "CONFIGURE": "false",
            },
            timeout_sec=30,
        )

    @with_prompt_template
    async def run(
        self,
        instruction: str,
        environment: BaseEnvironment,
        context: AgentContext,
    ) -> None:
        """Upload a recipe for ``instruction`` and run Goose against it.

        Args:
            instruction: Task text turned into a recipe by the base class's
                ``_create_recipe_yaml``.
            environment: Environment in which the commands are executed.
            context: Agent context supplied by the caller; not read here.

        Raises:
            ValueError: If the model name is not in ``provider/model`` form.
        """
        env = self._run_env()
        recipe_yaml = self._create_recipe_yaml(instruction)
        skills_command = self._build_register_skills_command()
        if skills_command:
            await self.exec_as_agent(
                environment,
                command=skills_command,
                env=env,
                timeout_sec=10,
            )
        # The recipe is uploaded as a file rather than inlined in the shell
        # command, so the instruction text never needs shell quoting.
        with TemporaryDirectory() as tmp_dir:
            recipe_path = Path(tmp_dir) / "harbor-recipe.yaml"
            # Explicit UTF-8 keeps non-ASCII instructions intact regardless of
            # the host's locale encoding.
            recipe_path.write_text(recipe_yaml, encoding="utf-8")
            await environment.upload_file(recipe_path, CONTAINER_RECIPE_PATH)
        await self._chown_to_agent_user(environment, CONTAINER_RECIPE_PATH)
        cli_flags = self.build_cli_flags()
        await self.exec_as_agent(
            environment,
            command=(
                'export PATH="$HOME/.local/bin:$PATH" && '
                f"goose run --recipe {shlex.quote(CONTAINER_RECIPE_PATH)} "
                "--output-format stream-json "
                + ((cli_flags + " ") if cli_flags else "")
                + "2>&1 | stdbuf -oL tee /logs/agent/goose.txt"
            ),
            env=env,
        )
+207
View File
@@ -0,0 +1,207 @@
from __future__ import annotations
import argparse
import json
import os
import re
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from typing import Any
HARBOR_AGENT_IMPORT_PATH = "goose_harbor.goose_binary:GooseBinaryAgent"
def harbor_dir() -> Path:
    """Return the parent of the directory that contains this module."""
    module_file = Path(__file__).resolve()
    return module_file.parents[1]
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the Harbor runner.

    Output directories default to ``.runs/jobs`` and ``.runs/configs`` under
    ``harbor_dir()``.
    """
    runs_root = harbor_dir() / ".runs"
    profile_help = (
        "Goose profile directory to copy into the benchmark container. "
        "Accepts either a GOOSE_PATH_ROOT-style directory or a config directory "
        "containing config.yaml."
    )
    runtime_deps_help = (
        "Install minimal OS runtime dependencies required by some Goose release "
        "binaries inside Debian/Ubuntu task containers."
    )

    parser = argparse.ArgumentParser(
        description="Run a Harbor dataset with a caller-provided Goose binary.",
    )

    # Required inputs.
    parser.add_argument("--goose-binary", required=True, type=Path)
    parser.add_argument("--goose-profile", required=True, type=Path, help=profile_help)
    parser.add_argument("--dataset", required=True)
    parser.add_argument("--model", required=True)

    # Run shaping; --task may be repeated and accumulates into ``tasks``.
    parser.add_argument("--task", action="append", default=[], dest="tasks")
    parser.add_argument("--trials", type=int, default=1)
    parser.add_argument("--concurrency", type=int, default=1)
    parser.add_argument("--max-turns", type=int)

    # Output locations and naming.
    parser.add_argument("--jobs-dir", type=Path, default=runs_root / "jobs")
    parser.add_argument("--config-dir", type=Path, default=runs_root / "configs")
    parser.add_argument("--job-name")

    # Boolean switches.
    parser.add_argument("--force-build", action="store_true")
    parser.add_argument(
        "--install-goose-runtime-deps",
        action="store_true",
        help=runtime_deps_help,
    )
    parser.add_argument("--dry-run", action="store_true")
    return parser
def pythonpath_with_harbor() -> str:
    """Return a ``PYTHONPATH`` value with ``harbor_dir()`` placed first.

    Any non-empty ``PYTHONPATH`` already in the environment is kept after it,
    separated by ``os.pathsep``.
    """
    root = str(harbor_dir())
    inherited = os.environ.get("PYTHONPATH", "")
    if not inherited:
        return root
    return os.pathsep.join((root, inherited))
def dataset_config(dataset_ref: str, tasks: list[str]) -> dict[str, Any]:
    """Translate a dataset reference and task list into a dataset entry.

    The text after the last ``@`` (if any) is stored under ``ref`` when the
    name part contains ``/`` and under ``version`` otherwise. ``task_names`` is
    only included when ``tasks`` is non-empty.
    """
    base, at_sign, suffix = dataset_ref.rpartition("@")
    entry: dict[str, Any]
    if not at_sign:
        # No "@" present: the whole reference is the dataset name.
        entry = {"name": dataset_ref}
    else:
        suffix_key = "ref" if "/" in base else "version"
        entry = {"name": base, suffix_key: suffix}
    if tasks:
        entry["task_names"] = tasks
    return entry
def package_index_env() -> dict[str, str]:
    """Return package-index variables mirrored from the host environment.

    The first non-empty value among ``UV_DEFAULT_INDEX``, ``PIP_INDEX_URL`` and
    ``UV_INDEX_URL`` (checked in that order) is assigned to all three names. An
    empty dict is returned when none of them is set.
    """
    for key in ("UV_DEFAULT_INDEX", "PIP_INDEX_URL", "UV_INDEX_URL"):
        url = os.environ.get(key)
        if url:
            break
    else:
        return {}
    return dict.fromkeys(("PIP_INDEX_URL", "UV_DEFAULT_INDEX", "UV_INDEX_URL"), url)
def default_job_name(model: str, dataset: str) -> str:
    """Build ``goose-<dataset>-<model>-<timestamp>`` from sanitized inputs.

    Runs of characters outside ``A-Za-z0-9._-`` collapse to a single hyphen and
    leading/trailing hyphens are trimmed; the timestamp is the local time.
    """

    def slug(text: str) -> str:
        return re.sub(r"[^A-Za-z0-9._-]+", "-", text).strip("-")

    stamp = f"{datetime.now():%Y-%m-%d__%H-%M-%S}"
    return "-".join(("goose", slug(dataset), slug(model), stamp))
def validate_job_name(job_name: str) -> str:
    """Return ``job_name`` unchanged if it is a safe job name.

    A valid name starts with an ASCII letter or digit and continues with
    letters, digits, dots, underscores, or hyphens only.

    Raises:
        ValueError: If ``job_name`` does not match that pattern.
    """
    # fullmatch is required here: with re.match, a trailing "$" also matches
    # just before a final newline, so "job\n" would be accepted.
    if not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]*", job_name):
        raise ValueError(
            "Job name must start with a letter or number and contain only "
            "letters, numbers, dots, underscores, and hyphens"
        )
    return job_name
def build_harbor_config(args: argparse.Namespace) -> dict[str, Any]:
    """Validate parsed arguments and build the Harbor job configuration.

    Args:
        args: Namespace produced by the parser from ``build_parser``.

    Returns:
        A JSON-serializable dict describing the job, environment, verifier,
        agent, and dataset.

    Raises:
        ValueError: If the model is not in ``provider/model`` form with both
            parts non-empty, if trials or concurrency are below 1, if the Goose
            binary or profile path is invalid, or if an explicit job name is
            rejected by ``validate_job_name``.
    """
    goose_binary = args.goose_binary.expanduser().resolve()
    goose_profile = args.goose_profile.expanduser().resolve()

    # Split on the first slash only; both halves must be non-empty, otherwise
    # the agent would be started with an empty provider or model.
    provider, _, model = args.model.partition("/")
    if not provider or not model:
        raise ValueError(
            "Model must be in provider/model form, for example databricks/my-model"
        )
    if args.trials < 1:
        raise ValueError("--trials must be at least 1")
    if args.concurrency < 1:
        raise ValueError("--concurrency must be at least 1")
    if not goose_binary.is_file():
        raise ValueError(
            f"--goose-binary does not exist or is not a file: {args.goose_binary}"
        )
    if not goose_profile.is_dir():
        raise ValueError(
            "--goose-profile does not exist or is not a directory: "
            f"{args.goose_profile}"
        )

    agent_kwargs: dict[str, Any] = {
        "goose_binary": str(goose_binary),
        "goose_profile": str(goose_profile),
    }
    # Optional kwargs are only emitted when explicitly requested.
    if args.install_goose_runtime_deps:
        agent_kwargs["install_goose_runtime_deps"] = True
    if args.max_turns is not None:
        agent_kwargs["max_turns"] = args.max_turns

    index_env = package_index_env()
    job_name = (
        validate_job_name(args.job_name)
        if args.job_name
        else default_job_name(args.model, args.dataset)
    )
    return {
        "job_name": job_name,
        "jobs_dir": str(args.jobs_dir.expanduser()),
        "n_attempts": args.trials,
        "n_concurrent_trials": args.concurrency,
        "environment": {
            "type": "docker",
            "force_build": args.force_build,
            "delete": True,
            "env": index_env,
        },
        "verifier": {"env": index_env},
        "agents": [
            {
                "import_path": HARBOR_AGENT_IMPORT_PATH,
                "model_name": args.model,
                "kwargs": agent_kwargs,
            }
        ],
        "datasets": [dataset_config(args.dataset, args.tasks)],
    }
def run_harbor(command: list[str]) -> int:
    """Run ``command`` with ``PYTHONPATH`` overridden and return its exit code.

    The child inherits the current environment except for ``PYTHONPATH``, which
    is replaced by ``pythonpath_with_harbor()``. A non-zero exit status is
    returned rather than raised.
    """
    child_env = os.environ.copy()
    child_env.update(PYTHONPATH=pythonpath_with_harbor())
    return subprocess.run(command, env=child_env, check=False).returncode
def main(argv: list[str] | None = None) -> int:
    """Write the Harbor config and, unless ``--dry-run`` is set, run Harbor.

    Returns:
        ``2`` when building or writing the config fails, ``0`` after a dry run,
        ``127`` when the ``harbor`` executable cannot be found, and otherwise
        the exit code returned by ``run_harbor``.
    """
    options = build_parser().parse_args(argv)

    try:
        harbor_config = build_harbor_config(options)
        target_dir = options.config_dir.expanduser()
        target_dir.mkdir(parents=True, exist_ok=True)
        config_path = target_dir / f"{harbor_config['job_name']}.json"
        config_path.write_text(json.dumps(harbor_config, indent=2) + "\n")
        harbor_command = ["harbor", "run", "-c", str(config_path)]
    except Exception as failure:
        # Top-level boundary: report any setup failure as a usage-style error.
        print(f"error: {failure}", file=sys.stderr)
        return 2

    print(f"Wrote Harbor config: {config_path}")
    print(f"Jobs directory: {harbor_config['jobs_dir']}")
    print(f"PYTHONPATH: {pythonpath_with_harbor()}")
    print(f"Command: {' '.join(harbor_command)}")

    if not options.dry_run:
        try:
            return run_harbor(harbor_command)
        except FileNotFoundError:
            print("error: `harbor` was not found on PATH", file=sys.stderr)
            return 127
    return 0
# Script entry point: exit the interpreter with the status returned by main().
if __name__ == "__main__":
    sys.exit(main())
+13
View File
@@ -0,0 +1,13 @@
[project]
name = "goose-harbor-eval"
version = "0.1.0"
description = "Goose eval tooling for Harbor benchmark datasets"
requires-python = ">=3.12"
dependencies = [
"harbor==0.6.4",
]
[dependency-groups]
dev = [
"pytest>=8.4.0",
]
+12
View File
@@ -0,0 +1,12 @@
#!/usr/bin/env sh
# Launch the Harbor runner with this script's directory first on PYTHONPATH.
set -eu

# Absolute directory of this script. CDPATH is cleared for the `cd` so the
# lookup cannot be redirected and `cd` prints nothing extra.
script_dir=$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)

# Prepend the script directory; the ":<old value>" suffix is only added when
# PYTHONPATH is already set and non-empty.
PYTHONPATH="$script_dir${PYTHONPATH:+:$PYTHONPATH}"
export PYTHONPATH

exec python3 "$script_dir/goose_harbor/runner.py" "$@"
+7
View File
@@ -0,0 +1,7 @@
from __future__ import annotations
import sys
from pathlib import Path
# Parent of the directory containing this file; it is put first on sys.path so
# test modules can import packages located there.
ROOT = Path(__file__).resolve().parents[1]
sys.path[:0] = [str(ROOT)]
+315
View File
@@ -0,0 +1,315 @@
from __future__ import annotations
import asyncio
from pathlib import Path
import pytest
from goose_harbor.goose_binary import GooseBinaryAgent
from goose_harbor.goose_binary import CONTAINER_CA_BUNDLE_PATH
from goose_harbor.goose_binary import CONTAINER_RECIPE_PATH
from goose_harbor.goose_binary import CONTAINER_GOOSE_PATH_ROOT
class ExecResult:
    """Test double for a command result that always reports exit status 0."""

    def __init__(self, stdout: str = "goose 1.0.0") -> None:
        """Store ``stdout``; the return code is 0 and stderr is empty."""
        self.return_code, self.stdout, self.stderr = 0, stdout, ""
class FakeEnvironment:
    """In-memory environment double that records uploads and commands."""

    def __init__(self) -> None:
        """Start with empty recordings and a container that has a system CA."""
        self.uploads: list[tuple[Path, str]] = []
        self.dir_uploads: list[tuple[Path, str]] = []
        self.commands: list[dict[str, object]] = []
        self.default_user: str | int | None = None
        # Controls the answer given to the CA bundle probe in exec().
        self.has_system_ca = True

    async def upload_file(self, source_path: Path | str, target_path: str) -> None:
        """Record a file upload as ``(Path(source), target)``."""
        record = (Path(source_path), target_path)
        self.uploads.append(record)

    async def upload_dir(self, source_dir: Path | str, target_dir: str) -> None:
        """Record a directory upload as ``(Path(source), target)``."""
        record = (Path(source_dir), target_dir)
        self.dir_uploads.append(record)

    async def exec(
        self,
        command: str,
        cwd: str | None = None,
        env: dict[str, str] | None = None,
        timeout_sec: int | None = None,
        user: str | int | None = None,
    ) -> ExecResult:
        """Record the call and return a canned result chosen by ``command``.

        The uid/gid query yields ``1000`` twice, the CA bundle probe answers
        according to ``has_system_ca``, and everything else gets the default
        ``ExecResult``.
        """
        call = dict(
            command=command,
            cwd=cwd,
            env=env,
            timeout_sec=timeout_sec,
            user=user,
        )
        self.commands.append(call)

        if "id -u && id -g" in command:
            return ExecResult("1000\n1000\n")

        is_ca_probe = "ca-certificates.crt" in command and "echo present" in command
        if not is_ca_probe:
            return ExecResult()
        answer = "present\n" if self.has_system_ca else "missing\n"
        return ExecResult(answer)
@pytest.fixture
def goose_binary(tmp_path: Path) -> Path:
    """Create a placeholder ``goose`` file in ``tmp_path`` and return its path."""
    binary = tmp_path.joinpath("goose")
    binary.write_text("#!/bin/sh\n")
    return binary
@pytest.fixture
def goose_profile(tmp_path: Path) -> Path:
    """Create a profile root holding ``config/config.yaml`` and return it."""
    profile_root = tmp_path / "profile"
    config_dir = profile_root / "config"
    config_dir.mkdir(parents=True)
    (config_dir / "config.yaml").write_text("GOOSE_PROVIDER: databricks\n")
    return profile_root
def test_install_uploads_binary_and_profile(
    goose_binary: Path,
    goose_profile: Path,
    tmp_path: Path,
) -> None:
    """Install uploads the binary, links it, and uploads the profile root."""

    async def scenario() -> FakeEnvironment:
        fake_env = FakeEnvironment()
        agent = GooseBinaryAgent(
            logs_dir=tmp_path,
            model_name="databricks/model",
            goose_binary=str(goose_binary),
            goose_profile=str(goose_profile),
        )
        await agent.install(fake_env)
        return fake_env

    environment = asyncio.run(scenario())

    expected_file_uploads = [(goose_binary.resolve(), "/installed-agent/goose")]
    expected_dir_uploads = [(goose_profile.resolve(), "/installed-agent/goose-profile")]
    issued = "\n".join(str(entry["command"]) for entry in environment.commands)

    assert environment.uploads == expected_file_uploads
    assert "chmod 755 /installed-agent/goose" in issued
    assert "ln -sf /installed-agent/goose ~/.local/bin/goose" in issued
    assert environment.dir_uploads == expected_dir_uploads
def test_install_uploads_config_directory_profile(
    goose_binary: Path,
    tmp_path: Path,
) -> None:
    """A directory holding ``config.yaml`` is uploaded to the ``config`` subdir."""
    config_dir = tmp_path / "config"

    async def run_test() -> FakeEnvironment:
        config_dir.mkdir()
        (config_dir / "config.yaml").write_text("GOOSE_PROVIDER: databricks\n")
        agent = GooseBinaryAgent(
            logs_dir=tmp_path,
            model_name="databricks/model",
            goose_binary=str(goose_binary),
            goose_profile=str(config_dir),
        )
        environment = FakeEnvironment()
        await agent.install(environment)
        return environment

    environment = asyncio.run(run_test())
    # The agent resolves the profile path, so the expectation is resolved too;
    # otherwise the comparison breaks when tmp_path goes through a symlink.
    assert environment.dir_uploads == [
        (config_dir.resolve(), "/installed-agent/goose-profile/config")
    ]
def test_install_chowns_uploaded_profile_when_agent_user_is_image_default(
    goose_binary: Path,
    goose_profile: Path,
    tmp_path: Path,
) -> None:
    """Install queries the agent uid/gid and recursively chowns the profile."""

    async def scenario() -> FakeEnvironment:
        fake_env = FakeEnvironment()
        agent = GooseBinaryAgent(
            logs_dir=tmp_path,
            model_name="databricks/model",
            goose_binary=str(goose_binary),
            goose_profile=str(goose_profile),
        )
        await agent.install(fake_env)
        return fake_env

    environment = asyncio.run(scenario())
    issued = [str(entry["command"]) for entry in environment.commands]

    id_queries = [text for text in issued if "id -u && id -g" in text]
    chown_calls = [
        text
        for text in issued
        if "chown -R 1000:1000 /installed-agent/goose-profile" in text
    ]
    assert id_queries
    assert chown_calls
def test_install_can_install_goose_runtime_deps(
    goose_binary: Path,
    goose_profile: Path,
    tmp_path: Path,
) -> None:
    """With the flag enabled, install issues the ``libgomp1`` apt-get command."""

    async def scenario() -> FakeEnvironment:
        fake_env = FakeEnvironment()
        agent = GooseBinaryAgent(
            logs_dir=tmp_path,
            model_name="databricks/model",
            goose_binary=str(goose_binary),
            goose_profile=str(goose_profile),
            install_goose_runtime_deps=True,
        )
        await agent.install(fake_env)
        return fake_env

    environment = asyncio.run(scenario())
    matching = [
        entry
        for entry in environment.commands
        if "apt-get install -y libgomp1" in str(entry["command"])
    ]
    assert matching
def test_missing_container_ca_bundle_is_uploaded_and_used(
    goose_binary: Path,
    goose_profile: Path,
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A missing container CA bundle is uploaded and exported to the run."""

    async def scenario() -> FakeEnvironment:
        # Point the host lookup at a throwaway bundle via SSL_CERT_FILE.
        bundle = tmp_path / "cert.pem"
        bundle.write_text("test cert\n")
        monkeypatch.setenv("SSL_CERT_FILE", str(bundle))
        agent = GooseBinaryAgent(
            logs_dir=tmp_path,
            model_name="databricks/model",
            goose_binary=str(goose_binary),
            goose_profile=str(goose_profile),
        )
        fake_env = FakeEnvironment()
        fake_env.has_system_ca = False
        await agent.install(fake_env)
        await agent.run("fix the repo", fake_env, object())
        return fake_env

    environment = asyncio.run(scenario())

    upload_targets = [target for _, target in environment.uploads]
    final_call = environment.commands[-1]
    assert CONTAINER_CA_BUNDLE_PATH in upload_targets
    assert final_call["env"]["SSL_CERT_FILE"] == CONTAINER_CA_BUNDLE_PATH
def test_run_uses_profile_without_keyring_or_provider_env_forwarding(
    goose_binary: Path,
    tmp_path: Path,
) -> None:
    """The run env points at the uploaded profile and forwards no token."""

    async def scenario() -> FakeEnvironment:
        profile_root = tmp_path / "profile"
        config_dir = profile_root / "config"
        config_dir.mkdir(parents=True)
        (config_dir / "config.yaml").write_text("GOOSE_PROVIDER: databricks\n")
        agent = GooseBinaryAgent(
            logs_dir=tmp_path,
            model_name="databricks/model",
            goose_binary=str(goose_binary),
            goose_profile=str(profile_root),
        )
        fake_env = FakeEnvironment()
        await agent.run("fix the repo", fake_env, object())
        return fake_env

    environment = asyncio.run(scenario())

    # The last recorded command is the one inspected for its environment.
    run_env = environment.commands[-1]["env"]
    assert isinstance(run_env, dict)
    assert run_env["GOOSE_PATH_ROOT"] == "/installed-agent/goose-profile"
    assert run_env["GOOSE_DISABLE_KEYRING"] == "true"
    assert "DATABRICKS_TOKEN" not in run_env
def test_run_uploads_recipe_file_instead_of_heredoc(
    goose_binary: Path,
    goose_profile: Path,
    tmp_path: Path,
) -> None:
    """The recipe is uploaded as a file; no command embeds an ``EOF`` heredoc."""

    async def scenario() -> FakeEnvironment:
        fake_env = FakeEnvironment()
        agent = GooseBinaryAgent(
            logs_dir=tmp_path,
            model_name="databricks/model",
            goose_binary=str(goose_binary),
            goose_profile=str(goose_profile),
        )
        # The instruction contains a bare EOF line on purpose.
        await agent.run("line before\nEOF\nline after", fake_env, object())
        return fake_env

    environment = asyncio.run(scenario())

    issued = [str(entry["command"]) for entry in environment.commands]
    upload_targets = [target for _, target in environment.uploads]
    recipe_invocation = f"goose run --recipe {CONTAINER_RECIPE_PATH}"

    assert not any("<< 'EOF'" in text for text in issued)
    assert CONTAINER_RECIPE_PATH in upload_targets
    assert any(recipe_invocation in text for text in issued)
def test_run_copies_skills_into_isolated_profile(
    goose_binary: Path,
    goose_profile: Path,
    tmp_path: Path,
) -> None:
    """Skills are copied under the uploaded profile, not ``~/.config/goose``."""

    async def scenario() -> FakeEnvironment:
        skills_dir = tmp_path / "skills"
        skills_dir.mkdir()
        agent = GooseBinaryAgent(
            logs_dir=tmp_path,
            model_name="databricks/model",
            goose_binary=str(goose_binary),
            goose_profile=str(goose_profile),
            skills_dir=str(skills_dir),
        )
        fake_env = FakeEnvironment()
        await agent.run("fix the repo", fake_env, object())
        return fake_env

    environment = asyncio.run(scenario())

    profile_skills = f"{CONTAINER_GOOSE_PATH_ROOT}/config/skills"
    matching = [
        text
        for text in (str(entry["command"]) for entry in environment.commands)
        if profile_skills in text and "~/.config/goose/skills" not in text
    ]
    assert matching
def test_run_chowns_uploaded_recipe_for_image_default_agent_user(
    goose_binary: Path,
    goose_profile: Path,
    tmp_path: Path,
) -> None:
    """Run queries the agent uid/gid and chowns the uploaded recipe file."""

    async def scenario() -> FakeEnvironment:
        fake_env = FakeEnvironment()
        agent = GooseBinaryAgent(
            logs_dir=tmp_path,
            model_name="databricks/model",
            goose_binary=str(goose_binary),
            goose_profile=str(goose_profile),
        )
        await agent.run("fix the repo", fake_env, object())
        return fake_env

    environment = asyncio.run(scenario())
    issued = [str(entry["command"]) for entry in environment.commands]
    expected_chown = f"chown 1000:1000 {CONTAINER_RECIPE_PATH}"

    assert [text for text in issued if "id -u && id -g" in text]
    assert [text for text in issued if expected_chown in text]
+145
View File
@@ -0,0 +1,145 @@
from __future__ import annotations
import json
from pathlib import Path
import pytest
from goose_harbor import runner
@pytest.fixture(autouse=True)
def clear_package_index_env(monkeypatch: pytest.MonkeyPatch) -> None:
    """Remove package-index variables so every test starts without them."""
    index_variables = ("UV_DEFAULT_INDEX", "PIP_INDEX_URL", "UV_INDEX_URL")
    for variable in index_variables:
        monkeypatch.delenv(variable, raising=False)
def test_dry_run_writes_config_without_running_harbor(tmp_path: Path) -> None:
    """A dry run exits 0 and writes a config with the dataset and agent kwargs."""
    binary = tmp_path / "goose"
    binary.write_text("#!/bin/sh\n")
    profile = tmp_path / "goose-profile"
    profile.mkdir()
    configs = tmp_path / "configs"

    argv = [
        "--goose-binary", str(binary),
        "--goose-profile", str(profile),
        "--dataset", "terminal-bench/terminal-bench-2",
        "--model", "databricks/model",
        "--task", "terminal-bench/fix-git",
        "--install-goose-runtime-deps",
        "--config-dir", str(configs),
        "--dry-run",
    ]
    exit_code = runner.main(argv)

    assert exit_code == 0
    written_path = next(configs.glob("*.json"))
    written = json.loads(written_path.read_text())
    expected_dataset = {
        "name": "terminal-bench/terminal-bench-2",
        "task_names": ["terminal-bench/fix-git"],
    }
    assert written["datasets"] == [expected_dataset]
    assert written["agents"][0]["kwargs"]["install_goose_runtime_deps"] is True
def test_package_dataset_suffix_uses_ref(tmp_path: Path) -> None:
    """An ``@`` suffix on a slash-containing dataset name is stored as ``ref``."""
    binary = tmp_path / "goose"
    binary.write_text("#!/bin/sh\n")
    profile = tmp_path / "goose-profile"
    profile.mkdir()
    configs = tmp_path / "configs"

    argv = [
        "--goose-binary", str(binary),
        "--goose-profile", str(profile),
        "--dataset", "terminal-bench/terminal-bench-2@v1",
        "--model", "databricks/model",
        "--config-dir", str(configs),
        "--dry-run",
    ]
    exit_code = runner.main(argv)

    assert exit_code == 0
    written = json.loads(next(configs.glob("*.json")).read_text())
    expected_dataset = {"name": "terminal-bench/terminal-bench-2", "ref": "v1"}
    assert written["datasets"] == [expected_dataset]
def test_registry_dataset_suffix_uses_version(tmp_path: Path) -> None:
    """An ``@`` suffix on a slash-free dataset name is stored as ``version``."""
    binary = tmp_path / "goose"
    binary.write_text("#!/bin/sh\n")
    profile = tmp_path / "goose-profile"
    profile.mkdir()
    configs = tmp_path / "configs"

    argv = [
        "--goose-binary", str(binary),
        "--goose-profile", str(profile),
        "--dataset", "terminal-bench@2.0",
        "--model", "databricks/model",
        "--config-dir", str(configs),
        "--dry-run",
    ]
    exit_code = runner.main(argv)

    assert exit_code == 0
    written = json.loads(next(configs.glob("*.json")).read_text())
    expected_dataset = {"name": "terminal-bench", "version": "2.0"}
    assert written["datasets"] == [expected_dataset]
def test_dry_run_accepts_unexpanded_home_paths(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``~``-prefixed paths are expanded against ``HOME`` before being stored."""
    home = tmp_path / "home"
    goose_binary = home / "bin" / "goose"
    goose_binary.parent.mkdir(parents=True)
    goose_binary.write_text("#!/bin/sh\n")
    goose_profile = home / "goose-profile"
    goose_profile.mkdir()
    config_dir = tmp_path / "configs"
    monkeypatch.setenv("HOME", str(home))

    result = runner.main(
        [
            "--goose-binary",
            "~/bin/goose",
            "--goose-profile",
            "~/goose-profile",
            "--dataset",
            "terminal-bench/terminal-bench-2",
            "--model",
            "databricks/model",
            "--config-dir",
            str(config_dir),
            "--dry-run",
        ]
    )

    assert result == 0
    config = json.loads(next(config_dir.glob("*.json")).read_text())
    kwargs = config["agents"][0]["kwargs"]
    # The runner stores resolved paths, so the expectations are resolved too;
    # otherwise the comparison breaks when tmp_path goes through a symlink.
    assert kwargs["goose_binary"] == str(goose_binary.resolve())
    assert kwargs["goose_profile"] == str(goose_profile.resolve())