Use media marker parts for local vision prompts

Signed-off-by: jh-block <jhugo@block.xyz>
This commit is contained in:
jh-block
2026-05-27 21:01:48 +02:00
parent 35d1fc7c51
commit 2f5e6613e1
2 changed files with 143 additions and 5 deletions
+127 -2
View File
@@ -188,17 +188,28 @@ pub fn recommend_local_model(runtime: &InferenceRuntime) -> String {
FEATURED_MODELS[0].spec.to_string()
}
fn build_openai_messages_json(system: &str, messages: &[Message]) -> String {
/// Serializes a system prompt plus conversation into an OpenAI-style messages
/// JSON array.
///
/// The system prompt is emitted first, followed by `messages` formatted via
/// the OpenAI formatter. Image parts are then stripped, and when
/// `media_marker` is `Some`, occurrences of that marker in text content are
/// converted into dedicated media-marker parts.
///
/// Returns the serialized array, or `"[]"` if serialization fails.
fn build_openai_messages_json(
    system: &str,
    messages: &[Message],
    media_marker: Option<&str>,
) -> String {
    use crate::providers::formats::openai::format_messages;
    use crate::providers::utils::ImageFormat;

    let system_entry = json!({"role": "system", "content": system});
    let mut payload: Vec<Value> = vec![system_entry];
    payload.extend(format_messages(messages, &ImageFormat::OpenAi));

    // Image parts must be removed before marker conversion, matching the
    // order in which the two passes have always run.
    strip_image_parts_from_messages(&mut payload);
    if let Some(marker) = media_marker {
        convert_text_media_markers(&mut payload, marker);
    }

    match serde_json::to_string(&payload) {
        Ok(serialized) => serialized,
        Err(_) => "[]".to_string(),
    }
}
fn build_openai_text_messages_json(system: &str, messages: &[Message]) -> String {
fn build_openai_text_messages_json(
system: &str,
messages: &[Message],
media_marker: Option<&str>,
) -> String {
let mut arr: Vec<Value> = vec![json!({"role": "system", "content": system})];
arr.extend(messages.iter().filter_map(|m| {
let content = extract_text_content(m);
@@ -211,9 +222,75 @@ fn build_openai_text_messages_json(system: &str, messages: &[Message]) -> String
};
Some(json!({"role": role, "content": content}))
}));
if let Some(marker) = media_marker {
convert_text_media_markers(&mut arr, marker);
}
serde_json::to_string(&arr).unwrap_or_else(|_| "[]".to_string())
}
fn convert_text_media_markers(messages: &mut [Value], marker: &str) {
if marker.is_empty() {
return;
}
for msg in messages {
let Some(content) = msg.get_mut("content") else {
continue;
};
if let Some(text) = content.as_str() {
if let Some(parts) = split_media_marker_text(text, marker) {
*content = json!(parts);
}
continue;
}
let Some(content_parts) = content.as_array_mut() else {
continue;
};
let mut updated = Vec::new();
let mut changed = false;
for part in content_parts.iter() {
if part.get("type").and_then(|v| v.as_str()) == Some("text") {
if let Some(text) = part.get("text").and_then(|v| v.as_str()) {
if let Some(parts) = split_media_marker_text(text, marker) {
updated.extend(parts);
changed = true;
continue;
}
}
}
updated.push(part.clone());
}
if changed {
*content_parts = updated;
}
}
}
/// Splits `text` around every occurrence of `marker` into content parts.
///
/// Each occurrence yields a `{"type": "media_marker", "text": marker}` part,
/// and each non-empty stretch of text between occurrences yields a
/// `{"type": "text", "text": ...}` part. A single `\n` immediately before and
/// immediately after each marker is dropped.
///
/// Returns `None` when `marker` is empty or does not occur in `text`, so the
/// caller can leave the original content as it is.
fn split_media_marker_text(text: &str, marker: &str) -> Option<Vec<Value>> {
    // An empty pattern matches at position 0 without consuming any input, so
    // the loop below would never advance and `parts` would grow forever.
    if marker.is_empty() {
        return None;
    }

    let mut parts = Vec::new();
    let mut rest = text;
    while let Some((before, after)) = rest.split_once(marker) {
        let before = before.strip_suffix('\n').unwrap_or(before);
        if !before.is_empty() {
            parts.push(json!({"type": "text", "text": before}));
        }
        parts.push(json!({"type": "media_marker", "text": marker}));
        rest = after.strip_prefix('\n').unwrap_or(after);
    }

    // Every loop iteration pushes a marker part, so an empty list means the
    // marker never occurred.
    if parts.is_empty() {
        return None;
    }
    if !rest.is_empty() {
        parts.push(json!({"type": "text", "text": rest}));
    }
    Some(parts)
}
/// Remove `image_url` content parts from OpenAI-format messages JSON, replacing
/// each with a text note. This prevents an FFI crash in llama.cpp which does not
/// accept `image_url` content-part types.
@@ -577,3 +654,51 @@ impl Provider for LocalInferenceProvider {
}))
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Marker string shared by the fixtures in this module.
    const MARKER: &str = "<__media__>";

    /// String content containing the marker is replaced by an array of parts,
    /// with the newlines adjacent to the marker dropped.
    #[test]
    fn converts_marker_in_string_content_to_media_marker_part() {
        let mut messages = vec![json!({
            "role": "user",
            "content": "look\n<__media__>\nclosely",
        })];
        let expected = json!([
            {"type": "text", "text": "look"},
            {"type": "media_marker", "text": MARKER},
            {"type": "text", "text": "closely"},
        ]);

        convert_text_media_markers(&mut messages, MARKER);

        assert_eq!(messages[0]["content"], expected);
    }

    /// Text parts inside array content are expanded in place; parts without
    /// the marker and existing media-marker parts pass through unchanged.
    #[test]
    fn converts_marker_inside_text_content_parts() {
        let mut messages = vec![json!({
            "role": "user",
            "content": [
                {"type": "text", "text": "<__media__>describe"},
                {"type": "text", "text": "next"},
                {"type": "media_marker", "text": MARKER},
            ],
        })];
        let expected = json!([
            {"type": "media_marker", "text": MARKER},
            {"type": "text", "text": "describe"},
            {"type": "text", "text": "next"},
            {"type": "media_marker", "text": MARKER},
        ]);

        convert_text_media_markers(&mut messages, MARKER);

        assert_eq!(messages[0]["content"], expected);
    }
}
@@ -449,6 +449,7 @@ impl LocalInferenceBackend for LlamaCppBackend {
} else {
(Vec::new(), None)
};
let has_media = !images.is_empty();
let effective_messages = vision_messages.as_deref().unwrap_or(request.messages);
let code_mode_enabled = request.tools.iter().any(|t| t.name == CODE_EXECUTION_TOOL);
@@ -469,7 +470,11 @@ impl LocalInferenceBackend for LlamaCppBackend {
if matches!(request.settings.tool_calling, ToolCallingMode::Auto)
&& has_native_tool_payload
{
let messages_json = build_openai_messages_json(request.system, effective_messages);
let messages_json = build_openai_messages_json(
request.system,
effective_messages,
has_media.then_some(marker),
);
if let Some(template) = loaded.templates.tool_use.as_ref() {
supports_native_tool_calling(
loaded,
@@ -506,9 +511,17 @@ impl LocalInferenceBackend for LlamaCppBackend {
};
let oai_messages_json = if use_emulator {
build_openai_text_messages_json(&system_prompt, effective_messages)
build_openai_text_messages_json(
&system_prompt,
effective_messages,
has_media.then_some(marker),
)
} else {
build_openai_messages_json(&system_prompt, effective_messages)
build_openai_messages_json(
&system_prompt,
effective_messages,
has_media.then_some(marker),
)
};
if !images.is_empty() && loaded.mtmd_ctx.is_none() {