mirror of
https://github.com/aaif-goose/goose.git
synced 2026-06-02 06:14:27 +02:00
Use media marker parts for local vision prompts
Signed-off-by: jh-block <jhugo@block.xyz>
This commit is contained in:
@@ -188,17 +188,28 @@ pub fn recommend_local_model(runtime: &InferenceRuntime) -> String {
|
||||
FEATURED_MODELS[0].spec.to_string()
|
||||
}
|
||||
|
||||
fn build_openai_messages_json(system: &str, messages: &[Message]) -> String {
|
||||
fn build_openai_messages_json(
|
||||
system: &str,
|
||||
messages: &[Message],
|
||||
media_marker: Option<&str>,
|
||||
) -> String {
|
||||
use crate::providers::formats::openai::format_messages;
|
||||
use crate::providers::utils::ImageFormat;
|
||||
|
||||
let mut arr: Vec<Value> = vec![json!({"role": "system", "content": system})];
|
||||
arr.extend(format_messages(messages, &ImageFormat::OpenAi));
|
||||
strip_image_parts_from_messages(&mut arr);
|
||||
if let Some(marker) = media_marker {
|
||||
convert_text_media_markers(&mut arr, marker);
|
||||
}
|
||||
serde_json::to_string(&arr).unwrap_or_else(|_| "[]".to_string())
|
||||
}
|
||||
|
||||
fn build_openai_text_messages_json(system: &str, messages: &[Message]) -> String {
|
||||
fn build_openai_text_messages_json(
|
||||
system: &str,
|
||||
messages: &[Message],
|
||||
media_marker: Option<&str>,
|
||||
) -> String {
|
||||
let mut arr: Vec<Value> = vec![json!({"role": "system", "content": system})];
|
||||
arr.extend(messages.iter().filter_map(|m| {
|
||||
let content = extract_text_content(m);
|
||||
@@ -211,9 +222,75 @@ fn build_openai_text_messages_json(system: &str, messages: &[Message]) -> String
|
||||
};
|
||||
Some(json!({"role": role, "content": content}))
|
||||
}));
|
||||
if let Some(marker) = media_marker {
|
||||
convert_text_media_markers(&mut arr, marker);
|
||||
}
|
||||
serde_json::to_string(&arr).unwrap_or_else(|_| "[]".to_string())
|
||||
}
|
||||
|
||||
fn convert_text_media_markers(messages: &mut [Value], marker: &str) {
|
||||
if marker.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
for msg in messages {
|
||||
let Some(content) = msg.get_mut("content") else {
|
||||
continue;
|
||||
};
|
||||
|
||||
if let Some(text) = content.as_str() {
|
||||
if let Some(parts) = split_media_marker_text(text, marker) {
|
||||
*content = json!(parts);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
let Some(content_parts) = content.as_array_mut() else {
|
||||
continue;
|
||||
};
|
||||
let mut updated = Vec::new();
|
||||
let mut changed = false;
|
||||
for part in content_parts.iter() {
|
||||
if part.get("type").and_then(|v| v.as_str()) == Some("text") {
|
||||
if let Some(text) = part.get("text").and_then(|v| v.as_str()) {
|
||||
if let Some(parts) = split_media_marker_text(text, marker) {
|
||||
updated.extend(parts);
|
||||
changed = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
updated.push(part.clone());
|
||||
}
|
||||
if changed {
|
||||
*content_parts = updated;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn split_media_marker_text(text: &str, marker: &str) -> Option<Vec<Value>> {
|
||||
let mut parts = Vec::new();
|
||||
let mut rest = text;
|
||||
let mut found_marker = false;
|
||||
while let Some((before, after)) = rest.split_once(marker) {
|
||||
found_marker = true;
|
||||
let before = before.strip_suffix('\n').unwrap_or(before);
|
||||
if !before.is_empty() {
|
||||
parts.push(json!({"type": "text", "text": before}));
|
||||
}
|
||||
parts.push(json!({"type": "media_marker", "text": marker}));
|
||||
rest = after;
|
||||
rest = rest.strip_prefix('\n').unwrap_or(rest);
|
||||
}
|
||||
if !found_marker {
|
||||
return None;
|
||||
}
|
||||
if !rest.is_empty() {
|
||||
parts.push(json!({"type": "text", "text": rest}));
|
||||
}
|
||||
Some(parts)
|
||||
}
|
||||
|
||||
/// Remove `image_url` content parts from OpenAI-format messages JSON, replacing
|
||||
/// each with a text note. This prevents an FFI crash in llama.cpp which does not
|
||||
/// accept `image_url` content-part types.
|
||||
@@ -577,3 +654,51 @@ impl Provider for LocalInferenceProvider {
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Marker string used by every test in this module.
    const MARKER: &str = "<__media__>";

    /// Wraps `content` in a single user message, runs
    /// `convert_text_media_markers` over it, and returns the resulting
    /// `"content"` value.
    fn converted_user_content(content: Value) -> Value {
        let mut messages = vec![json!({
            "role": "user",
            "content": content,
        })];

        convert_text_media_markers(&mut messages, MARKER);

        messages[0]["content"].clone()
    }

    #[test]
    fn converts_marker_in_string_content_to_media_marker_part() {
        // A marker on its own line: the surrounding newlines are dropped.
        let actual = converted_user_content(json!("look\n<__media__>\nclosely"));

        let expected = json!([
            {"type": "text", "text": "look"},
            {"type": "media_marker", "text": "<__media__>"},
            {"type": "text", "text": "closely"},
        ]);
        assert_eq!(actual, expected);
    }

    #[test]
    fn converts_marker_inside_text_content_parts() {
        // Only the first text part contains the marker; the plain text part and
        // the pre-existing media_marker part must pass through unchanged.
        let actual = converted_user_content(json!([
            {"type": "text", "text": "<__media__>describe"},
            {"type": "text", "text": "next"},
            {"type": "media_marker", "text": "<__media__>"},
        ]));

        let expected = json!([
            {"type": "media_marker", "text": "<__media__>"},
            {"type": "text", "text": "describe"},
            {"type": "text", "text": "next"},
            {"type": "media_marker", "text": "<__media__>"},
        ]);
        assert_eq!(actual, expected);
    }
}
|
||||
|
||||
@@ -449,6 +449,7 @@ impl LocalInferenceBackend for LlamaCppBackend {
|
||||
} else {
|
||||
(Vec::new(), None)
|
||||
};
|
||||
let has_media = !images.is_empty();
|
||||
let effective_messages = vision_messages.as_deref().unwrap_or(request.messages);
|
||||
|
||||
let code_mode_enabled = request.tools.iter().any(|t| t.name == CODE_EXECUTION_TOOL);
|
||||
@@ -469,7 +470,11 @@ impl LocalInferenceBackend for LlamaCppBackend {
|
||||
if matches!(request.settings.tool_calling, ToolCallingMode::Auto)
|
||||
&& has_native_tool_payload
|
||||
{
|
||||
let messages_json = build_openai_messages_json(request.system, effective_messages);
|
||||
let messages_json = build_openai_messages_json(
|
||||
request.system,
|
||||
effective_messages,
|
||||
has_media.then_some(marker),
|
||||
);
|
||||
if let Some(template) = loaded.templates.tool_use.as_ref() {
|
||||
supports_native_tool_calling(
|
||||
loaded,
|
||||
@@ -506,9 +511,17 @@ impl LocalInferenceBackend for LlamaCppBackend {
|
||||
};
|
||||
|
||||
let oai_messages_json = if use_emulator {
|
||||
build_openai_text_messages_json(&system_prompt, effective_messages)
|
||||
build_openai_text_messages_json(
|
||||
&system_prompt,
|
||||
effective_messages,
|
||||
has_media.then_some(marker),
|
||||
)
|
||||
} else {
|
||||
build_openai_messages_json(&system_prompt, effective_messages)
|
||||
build_openai_messages_json(
|
||||
&system_prompt,
|
||||
effective_messages,
|
||||
has_media.then_some(marker),
|
||||
)
|
||||
};
|
||||
|
||||
if !images.is_empty() && loaded.mtmd_ctx.is_none() {
|
||||
|
||||
Reference in New Issue
Block a user