common/autoparser: fixes for newline handling / forced tool calls (#22654)
* chat/autoparser: the fixes * Move optspace() to chat-peg-parser; comment out the server tests that are invalidated because content is now allowed before forced tool calls. * Trim whitespace on apply instead
This commit is contained in:
committed by
GitHub
parent
994118a183
commit
a4701c98f7
@@ -136,10 +136,10 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
|
||||
if (!end.empty()) {
|
||||
if (!start.empty()) {
|
||||
// Standard tag-based: optional(<think>reasoning</think>)
|
||||
return p.optional(start + p.reasoning(p.until(end)) + end + p.space());
|
||||
return p.optional(p.optspace(start) + p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end));
|
||||
}
|
||||
// Delimiter-style (empty start)
|
||||
return p.optional(p.reasoning(p.until(end)) + end + p.space());
|
||||
return p.optional(p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -186,7 +186,6 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
|
||||
common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_context & ctx) const {
|
||||
auto & p = ctx.p;
|
||||
const auto & inputs = ctx.inputs;
|
||||
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
|
||||
// Build effective field names with dot notation if function_field is set
|
||||
std::string name_field = format.name_field;
|
||||
@@ -225,8 +224,7 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
|
||||
tool_start = format.per_call_start;
|
||||
}
|
||||
|
||||
return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(p.until(tool_start)))) + tools_parser +
|
||||
p.end();
|
||||
return ctx.reasoning_parser + p.optional(p.content(p.until(tool_start))) + tools_parser + p.end();
|
||||
}
|
||||
|
||||
common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p, const std::string & name,
|
||||
@@ -270,7 +268,6 @@ common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p,
|
||||
common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context & ctx) const {
|
||||
auto & p = ctx.p;
|
||||
const auto & inputs = ctx.inputs;
|
||||
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
|
||||
common_peg_parser tool_choice = p.choice();
|
||||
|
||||
@@ -336,14 +333,12 @@ common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context
|
||||
|
||||
std::string trigger_marker = !format.section_start.empty() ? format.section_start : format.per_call_start;
|
||||
auto content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
|
||||
return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
|
||||
p.end();
|
||||
return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end();
|
||||
}
|
||||
|
||||
common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_context & ctx) const {
|
||||
auto & p = ctx.p;
|
||||
const auto & inputs = ctx.inputs;
|
||||
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
|
||||
auto until_suffix = p.rule("until-suffix", p.until(arguments.value_suffix));
|
||||
|
||||
@@ -471,8 +466,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
|
||||
|
||||
std::string trigger_marker = !format.section_start.empty() ? format.section_start : format.per_call_start;
|
||||
auto content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
|
||||
return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
|
||||
p.end();
|
||||
return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end();
|
||||
}
|
||||
|
||||
} // namespace autoparser
|
||||
|
||||
@@ -342,7 +342,7 @@ void analyze_reasoning::compare_thinking_enabled() {
|
||||
if (left_trimmed.empty() && !diff.right.empty()) {
|
||||
if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
|
||||
if (start.empty()) {
|
||||
start = trim_leading_whitespace(diff.right);
|
||||
start = diff.right;
|
||||
mode = reasoning_mode::TAG_BASED;
|
||||
}
|
||||
}
|
||||
@@ -353,7 +353,7 @@ void analyze_reasoning::compare_thinking_enabled() {
|
||||
if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) {
|
||||
start = seg[seg.size() - 2].value;
|
||||
}
|
||||
end = trim_trailing_whitespace(diff.left);
|
||||
end = diff.left;
|
||||
mode = reasoning_mode::TAG_BASED;
|
||||
}
|
||||
}
|
||||
@@ -445,14 +445,14 @@ void analyze_reasoning::compare_reasoning_scope() {
|
||||
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
|
||||
if (result.result.success()) {
|
||||
start = result.tags["pre"];
|
||||
end = trim_trailing_whitespace(result.tags["post"]);
|
||||
end = result.tags["post"];
|
||||
} else {
|
||||
auto parser_delimiter = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
|
||||
return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())));
|
||||
});
|
||||
result = parser_delimiter.parse_anywhere_and_extract(comparison->output_B);
|
||||
if (result.result.success()) {
|
||||
end = trim_trailing_whitespace(result.tags["post"]);
|
||||
end = result.tags["post"];
|
||||
} else {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Unable to extract reasoning markers, falling back to reasoning = NONE\n" ANSI_RESET, __func__);
|
||||
mode = reasoning_mode::NONE;
|
||||
|
||||
@@ -816,6 +816,32 @@ common_peg_parser common_chat_peg_builder::prefix(const std::string & s, const s
|
||||
return literal(s.substr(0, s.rfind(delimiter)));
|
||||
}
|
||||
|
||||
// Build a parser for `tag` in which every leading and trailing whitespace
// character of `tag` is individually optional, while the part between the
// first and last non-whitespace character (the "core", interior whitespace
// included) must match literally.
//
// The result is the sequence:
//   optional(literal(c)) for each leading whitespace character c,
//   literal(core),
//   optional(literal(c)) for each trailing whitespace character c.
//
// For an empty or all-whitespace tag the core is the empty string and every
// character of the tag is emitted as an optional literal.
common_peg_parser common_chat_peg_builder::optspace(const std::string & tag) {
    // std::isspace has undefined behavior for negative arguments other than EOF,
    // which a plain (signed) char produces for bytes >= 0x80 (e.g. UTF-8 in tags),
    // so always classify through unsigned char.
    const auto is_space = [](char c) {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    };

    auto parser = eps();

    // Index of the first non-whitespace character; tag.size() if there is none.
    size_t end_of_prefix_space = tag.size();
    for (size_t i = 0; i < tag.size(); i++) {
        if (!is_space(tag[i])) {
            end_of_prefix_space = i;
            break;
        }
    }

    // One past the index of the last non-whitespace character; tag.size() if there is none.
    size_t start_of_suffix_space = tag.size();
    for (size_t i = tag.size(); i > 0; i--) {
        if (!is_space(tag[i - 1])) {
            start_of_suffix_space = i;
            break;
        }
    }

    // Leading whitespace: each character may be present or absent.
    for (size_t i = 0; i < end_of_prefix_space; i++) {
        parser += optional(literal(std::string(1, tag[i])));
    }

    // Mandatory core of the tag.
    parser += literal(tag.substr(end_of_prefix_space, start_of_suffix_space - end_of_prefix_space));

    // Trailing whitespace: each character may be present or absent.
    for (size_t i = start_of_suffix_space; i < tag.size(); i++) {
        parser += optional(literal(std::string(1, tag[i])));
    }

    return parser;
}
|
||||
|
||||
common_peg_parser common_chat_peg_builder::standard_json_tools(
|
||||
const std::string & section_start,
|
||||
const std::string & section_end,
|
||||
|
||||
@@ -96,6 +96,9 @@ class common_chat_peg_builder : public common_peg_parser_builder {
|
||||
// Return a parser that parses the prefix of a string, up to a given delimiter.
|
||||
common_peg_parser prefix(const std::string & s, const std::string & delimiter = {});
|
||||
|
||||
// Return a parser that matches tag, where each leading and trailing whitespace character of tag is optional
|
||||
common_peg_parser optspace(const std::string & tag);
|
||||
|
||||
// Legacy-compatible helper for building standard JSON tool calls
|
||||
// Used by tests and manual parsers
|
||||
// name_key/args_key: JSON key names for function name and arguments
|
||||
|
||||
+2
-2
@@ -2221,8 +2221,8 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
||||
auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
|
||||
auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
|
||||
if (auto_params.supports_thinking) {
|
||||
auto_params.thinking_start_tag = autoparser.reasoning.start;
|
||||
auto_params.thinking_end_tag = autoparser.reasoning.end;
|
||||
auto_params.thinking_start_tag = trim_whitespace(autoparser.reasoning.start);
|
||||
auto_params.thinking_end_tag = trim_whitespace(autoparser.reasoning.end);
|
||||
}
|
||||
auto_params.generation_prompt = params.generation_prompt;
|
||||
common_peg_arena arena;
|
||||
|
||||
@@ -158,6 +158,8 @@ static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_tok
|
||||
for (size_t i = 0; i < cur_p->size; i++) {
|
||||
if (cur_p->data[i].id != forced) {
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
} else {
|
||||
cur_p->data[i].logit = +INFINITY; // force the token
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -79,7 +79,7 @@ def print_info(msg):
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def chat_completion(url, messages, tools=None, stream=False):
|
||||
def chat_completion(url, messages, tools=None, stream=False, force_tools=False):
|
||||
payload = {
|
||||
"messages": messages,
|
||||
"stream": stream,
|
||||
@@ -87,7 +87,10 @@ def chat_completion(url, messages, tools=None, stream=False):
|
||||
}
|
||||
if tools:
|
||||
payload["tools"] = tools
|
||||
payload["tool_choice"] = "auto"
|
||||
if force_tools:
|
||||
payload["tool_choice"] = "required"
|
||||
else:
|
||||
payload["tool_choice"] = "auto"
|
||||
|
||||
try:
|
||||
response = requests.post(url, json=payload, stream=stream)
|
||||
@@ -160,7 +163,13 @@ def chat_completion(url, messages, tools=None, stream=False):
|
||||
return result
|
||||
|
||||
|
||||
def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6):
|
||||
def all_tools_called(tools, all_tool_calls):
    """Return True once every tool declared in `tools` has been called at least once.

    Both arguments are lists of dicts that carry the tool name under
    ["function"]["name"]. Calls to names that are not declared in `tools`
    are ignored, so a stray call cannot keep the result False forever.
    """
    declared_names = {tool["function"]["name"] for tool in tools}
    called_names = {call["function"]["name"] for call in all_tool_calls}
    # Subset rather than equality: extra (undeclared) called names must not matter.
    return declared_names.issubset(called_names)
|
||||
|
||||
|
||||
def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6, force_tools=False):
|
||||
"""
|
||||
Drive the multi-turn tool-call loop:
|
||||
1. Send messages to model.
|
||||
@@ -172,8 +181,8 @@ def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turn
|
||||
msgs = list(messages)
|
||||
all_tool_calls: list[dict] = []
|
||||
|
||||
for _ in range(max_turns):
|
||||
result = chat_completion(url, msgs, tools=tools, stream=stream)
|
||||
for t in range(max_turns):
|
||||
result = chat_completion(url, msgs, tools=tools, stream=stream, force_tools=(force_tools and not all_tools_called(tools, all_tool_calls)))
|
||||
if result is None:
|
||||
return all_tool_calls, None
|
||||
|
||||
@@ -235,10 +244,10 @@ def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turn
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def run_test(url, test_case, stream):
|
||||
def run_test(url, test_case, stream, force_tools):
|
||||
name = test_case["name"]
|
||||
mode = f"{'stream' if stream else 'non-stream'}"
|
||||
print_header(f"{name} [{mode}]")
|
||||
print_header(f"{name} [{mode}, force_tools={force_tools}] ")
|
||||
|
||||
all_tool_calls, final_content = run_agentic_loop(
|
||||
url,
|
||||
@@ -246,6 +255,7 @@ def run_test(url, test_case, stream):
|
||||
tools=test_case["tools"],
|
||||
mock_tool_responses=test_case["mock_tool_responses"],
|
||||
stream=stream,
|
||||
force_tools=force_tools
|
||||
)
|
||||
|
||||
if final_content is None and not all_tool_calls:
|
||||
@@ -1093,6 +1103,9 @@ def main():
|
||||
parser.add_argument(
|
||||
"--stream-only", action="store_true", help="Only run streaming mode tests"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--force-tools", action="store_true", help="Change tool mode to forced instead of auto"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--test",
|
||||
help="Run only the test whose name contains this substring (case-insensitive)",
|
||||
@@ -1103,10 +1116,13 @@ def main():
|
||||
print_info(f"Testing server at {url}")
|
||||
|
||||
modes = []
|
||||
force_tools = False
|
||||
if not args.stream_only:
|
||||
modes.append(False)
|
||||
if not args.no_stream:
|
||||
modes.append(True)
|
||||
if args.force_tools:
|
||||
force_tools = True
|
||||
|
||||
cases: list[dict] = ALL_TEST_CASES
|
||||
if args.test:
|
||||
@@ -1121,7 +1137,7 @@ def main():
|
||||
for stream in modes:
|
||||
for case in cases:
|
||||
total += 1
|
||||
if run_test(url, case, stream=stream):
|
||||
if run_test(url, case, stream=stream, force_tools=force_tools):
|
||||
passed += 1
|
||||
|
||||
color = GREEN if passed == total else RED
|
||||
|
||||
+265
-12
@@ -542,6 +542,36 @@ static common_chat_tool edit_tool{
|
||||
})",
|
||||
};
|
||||
|
||||
// Tool definition named "manage_todo_list"; its JSON-schema parameters are an
// object with a single required property "todos" of type array.
static common_chat_tool manage_todo_list_tool{
|
||||
/* .name = */ "manage_todo_list",
|
||||
/* .description = */ "Create or update the todo list",
|
||||
/* .parameters = */ R"({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"todos": {
|
||||
"type": "array",
|
||||
"description": "List of TODO list items"
|
||||
}
|
||||
},
|
||||
"required": ["todos"]
|
||||
})",
|
||||
};
|
||||
|
||||
// Tool definition named "run_in_terminal"; its JSON-schema parameters are an
// object with a single required property "command" of type string.
static common_chat_tool run_in_terminal_tool{
|
||||
/* .name = */ "run_in_terminal",
|
||||
/* .description = */ "Run a shell command.",
|
||||
/* .parameters = */ R"({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"command": {
|
||||
"type": "string",
|
||||
"description": "Shell command to run"
|
||||
}
|
||||
},
|
||||
"required": ["command"]
|
||||
})",
|
||||
};
|
||||
|
||||
static common_chat_tool magic_tool{
|
||||
/* .name = */ "magic",
|
||||
/* .description = */ "Magic tool that takes a hash",
|
||||
@@ -1379,6 +1409,16 @@ class peg_test_builder {
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Store the requested tool-choice mode in this test case's parameters.
// Returns *this so calls can be chained.
peg_test_builder & tool_choice(common_chat_tool_choice choice) {
    auto & test_params = tc_.params;
    test_params.tool_choice = choice;
    return *this;
}
|
||||
|
||||
// Replace the message list in this test case's parameters, taking ownership
// of the passed vector. Returns *this so calls can be chained.
peg_test_builder & messages(std::vector<common_chat_msg> messages) {
    auto & stored_messages = tc_.params.messages;
    stored_messages = std::move(messages);
    return *this;
}
|
||||
|
||||
// Execute the test
|
||||
void run() {
|
||||
// Check template filter
|
||||
@@ -1755,23 +1795,23 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
"hello()\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>"
|
||||
)
|
||||
"</tool_call>")
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({
|
||||
python_tool
|
||||
})
|
||||
.expect_reasoning("Let's call a tool: <tool_call>\n"
|
||||
"<function=python>\n"
|
||||
"<parameter=code>\n"
|
||||
"def hello():\n"
|
||||
" print(\"Not the real call!\")\n"
|
||||
"\n"
|
||||
"hello()\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.expect_reasoning(
|
||||
"Let's call a tool: <tool_call>\n"
|
||||
"<function=python>\n"
|
||||
"<parameter=code>\n"
|
||||
"def hello():\n"
|
||||
" print(\"Not the real call!\")\n"
|
||||
"\n"
|
||||
"hello()\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.expect_tool_calls({
|
||||
{ "python", "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}", {} },
|
||||
})
|
||||
@@ -1800,6 +1840,219 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.tools({ empty_args_tool_no_properties })
|
||||
.expect(message_with_tool_calls("empty_args_no_props", "{}"))
|
||||
.run();
|
||||
|
||||
// Edge cases when reasoning traces are not sent
|
||||
tst.test(
|
||||
"<think>\n\n</think>\n\n"
|
||||
"<tool_call>\n"
|
||||
"<function=special_function>\n"
|
||||
"<parameter=arg1>\n1\n</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({
|
||||
special_function_tool
|
||||
})
|
||||
.expect_reasoning("<think>\n\n")
|
||||
.expect_tool_calls({ { "special_function", "{\"arg1\": 1}", "" } })
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
"</think>\n\n"
|
||||
"<tool_call>\n"
|
||||
"<function=special_function>\n"
|
||||
"<parameter=arg1>\n1\n</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({
|
||||
special_function_tool
|
||||
})
|
||||
.expect_reasoning("")
|
||||
.expect_tool_calls({ { "special_function", "{\"arg1\": 1}", "" } })
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
"</think>\n\n"
|
||||
"<tool_call>\n"
|
||||
"<function=run_in_terminal>\n"
|
||||
"<parameter=command>\n"
|
||||
"pwd\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.enable_thinking(true)
|
||||
.tools({
|
||||
run_in_terminal_tool
|
||||
})
|
||||
.expect_tool_calls({
|
||||
{ "run_in_terminal", R"({"command": "pwd"})", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
"</think>\n\n"
|
||||
"Let me inspect the current directory.\n"
|
||||
"<tool_call>\n"
|
||||
"<function=run_in_terminal>\n"
|
||||
"<parameter=command>\n"
|
||||
"pwd\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.enable_thinking(true)
|
||||
.tools({
|
||||
run_in_terminal_tool
|
||||
})
|
||||
.expect_content("Let me inspect the current directory.\n")
|
||||
.expect_tool_calls({
|
||||
{ "run_in_terminal", R"({"command": "pwd"})", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
"</think>\n\n"
|
||||
"Let me inspect the current directory.\n"
|
||||
"<tool_call>\n"
|
||||
"<function=run_in_terminal>\n"
|
||||
"<parameter=command>\n"
|
||||
"pwd\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.enable_thinking(true)
|
||||
.tools({
|
||||
run_in_terminal_tool
|
||||
})
|
||||
.tool_choice(COMMON_CHAT_TOOL_CHOICE_REQUIRED)
|
||||
.expect_content("Let me inspect the current directory.\n")
|
||||
.expect_tool_calls({
|
||||
{ "run_in_terminal", R"({"command": "pwd"})", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
"I should inspect the directory.\n"
|
||||
"</think>\n\n"
|
||||
"Let me inspect it now.\n"
|
||||
"<tool_call>\n"
|
||||
"<function=run_in_terminal>\n"
|
||||
"<parameter=command>\n"
|
||||
"pwd\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.enable_thinking(true)
|
||||
.tools({
|
||||
run_in_terminal_tool
|
||||
})
|
||||
.expect_reasoning("I should inspect the directory.")
|
||||
.expect_content("Let me inspect it now.\n")
|
||||
.expect_tool_calls({
|
||||
{ "run_in_terminal", R"({"command": "pwd"})", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
"I might call <tool_call> later, but I am still thinking.\n"
|
||||
"</think>\n\n"
|
||||
"Final answer without tools.")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.enable_thinking(true)
|
||||
.tools({ run_in_terminal_tool })
|
||||
.expect_reasoning("I might call <tool_call> later, but I am still thinking.")
|
||||
.expect_content("Final answer without tools.")
|
||||
.run();
|
||||
|
||||
{
|
||||
common_chat_msg user_start;
|
||||
user_start.role = "user";
|
||||
user_start.content = "Create a todo list, then inspect the repository.";
|
||||
|
||||
common_chat_msg assistant_todos =
|
||||
simple_assist_msg("", "", "manage_todo_list",
|
||||
R"({"todos":[{"item":"Inspect repository","selected":false}]})", "call_todos");
|
||||
|
||||
common_chat_msg tool_result;
|
||||
tool_result.role = "tool";
|
||||
tool_result.content = "Successfully wrote todo list";
|
||||
tool_result.tool_call_id = "call_todos";
|
||||
|
||||
common_chat_msg user_continue;
|
||||
user_continue.role = "user";
|
||||
user_continue.content = "Proceed.";
|
||||
|
||||
tst.test(
|
||||
"I need to run a terminal command.\n"
|
||||
"</think>\n\n"
|
||||
"<tool_call>\n"
|
||||
"<function=run_in_terminal>\n"
|
||||
"<parameter=command>\n"
|
||||
"pwd\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.enable_thinking(true)
|
||||
.tools({
|
||||
manage_todo_list_tool, run_in_terminal_tool
|
||||
})
|
||||
.messages({ user_start, assistant_todos, tool_result, user_continue })
|
||||
.expect_reasoning("I need to run a terminal command.")
|
||||
.expect_tool_calls({
|
||||
{ "run_in_terminal", R"({"command": "pwd"})", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
"I need to run a terminal command.\n"
|
||||
"</think>\n\n"
|
||||
"Let me inspect the current directory.\n"
|
||||
"<tool_call>\n"
|
||||
"<function=run_in_terminal>\n"
|
||||
"<parameter=command>\n"
|
||||
"pwd\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.enable_thinking(true)
|
||||
.tools({
|
||||
manage_todo_list_tool, run_in_terminal_tool
|
||||
})
|
||||
.tool_choice(COMMON_CHAT_TOOL_CHOICE_REQUIRED)
|
||||
.messages({ user_start, assistant_todos, tool_result, user_continue })
|
||||
.expect_reasoning("I need to run a terminal command.")
|
||||
.expect_content("Let me inspect the current directory.\n")
|
||||
.expect_tool_calls({
|
||||
{ "run_in_terminal", R"({"command": "pwd"})", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
"</think>\n\n"
|
||||
"<tool_call>\n"
|
||||
"<function=run_in_terminal>\n"
|
||||
"<parameter=command>\n"
|
||||
"pwd\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.enable_thinking(true)
|
||||
.tools({
|
||||
manage_todo_list_tool, run_in_terminal_tool
|
||||
})
|
||||
.messages({ user_start, assistant_todos, tool_result, user_continue })
|
||||
.expect_tool_calls({
|
||||
{ "run_in_terminal", R"({"command": "pwd"})", {} },
|
||||
})
|
||||
.run();
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
|
||||
@@ -70,20 +70,20 @@ static void test_reasoning_budget(
|
||||
llama_sampler_apply(sampler, &cur_p);
|
||||
|
||||
// Check if forcing is active (all logits except one should be -INFINITY)
|
||||
size_t finite_count = 0;
|
||||
llama_token finite_token = -1;
|
||||
size_t not_neg_inf = 0;
|
||||
llama_token not_neg_inf_token = -1;
|
||||
for (size_t j = 0; j < cur.size(); j++) {
|
||||
if (std::isfinite(cur[j].logit)) {
|
||||
finite_count++;
|
||||
finite_token = cur[j].id;
|
||||
if (std::isfinite(cur[j].logit) || cur[j].logit > 0) { // +INFINITY
|
||||
not_neg_inf++;
|
||||
not_neg_inf_token = cur[j].id;
|
||||
}
|
||||
}
|
||||
|
||||
llama_sampler_accept(sampler, sequence[i]);
|
||||
|
||||
fprintf(stderr, " i=%zu: token=%d, finite_count=%zu, finite_token=%d\n", i, (int)sequence[i], finite_count, (int)finite_token);
|
||||
fprintf(stderr, " i=%zu: token=%d, not_neg_inf_count=%zu, not_neg_inf_token=%d\n", i, (int)sequence[i], not_neg_inf, (int)not_neg_inf_token);
|
||||
|
||||
if (finite_count == 1) {
|
||||
if (not_neg_inf == 1) {
|
||||
if (actual_force_start == SIZE_MAX) {
|
||||
actual_force_start = i;
|
||||
}
|
||||
|
||||
@@ -126,69 +126,70 @@ def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict
|
||||
actual_arguments = json.loads(actual_arguments)
|
||||
assert argument_key in actual_arguments, f"tool arguments: {actual_arguments}, expected: {argument_key}"
|
||||
|
||||
# PR #22654: commented out since we now allow content before tool calls with tool_choice: required, so we can't force this
|
||||
# in the tiny model just by using the grammar
|
||||
#
|
||||
# @pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
|
||||
# @pytest.mark.parametrize("template_name,tool,argument_key", [
|
||||
# ("Qwen3-Coder", TEST_TOOL, "success"),
|
||||
# ("Qwen3-Coder", TEST_TOOL, "success"),
|
||||
# ("meta-llama-Llama-3.3-70B-Instruct", TEST_TOOL, "success"),
|
||||
# ("meta-llama-Llama-3.3-70B-Instruct", TEST_TOOL, "success"),
|
||||
# ("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"),
|
||||
# ("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"),
|
||||
# ])
|
||||
# def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
|
||||
# global server
|
||||
# n_predict = 1024
|
||||
# # server = ServerPreset.stories15m_moe()
|
||||
# server.jinja = True
|
||||
# server.n_predict = n_predict
|
||||
# server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
# server.start()
|
||||
# do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED, temperature=0.0, top_k=1, top_p=1.0)
|
||||
|
||||
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
|
||||
@pytest.mark.parametrize("template_name,tool,argument_key", [
|
||||
("Qwen3-Coder", TEST_TOOL, "success"),
|
||||
("Qwen3-Coder", TEST_TOOL, "success"),
|
||||
("meta-llama-Llama-3.3-70B-Instruct", TEST_TOOL, "success"),
|
||||
("meta-llama-Llama-3.3-70B-Instruct", TEST_TOOL, "success"),
|
||||
("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"),
|
||||
("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"),
|
||||
])
|
||||
def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
|
||||
global server
|
||||
n_predict = 1024
|
||||
# server = ServerPreset.stories15m_moe()
|
||||
server.jinja = True
|
||||
server.n_predict = n_predict
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start()
|
||||
do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED, temperature=0.0, top_k=1, top_p=1.0)
|
||||
# @pytest.mark.slow
|
||||
# @pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
|
||||
# @pytest.mark.parametrize("template_name,tool,argument_key", [
|
||||
# ("meta-llama-Llama-3.1-8B-Instruct", TEST_TOOL, "success"),
|
||||
# ("meta-llama-Llama-3.1-8B-Instruct", PYTHON_TOOL, "code"),
|
||||
|
||||
# ("meetkai-functionary-medium-v3.1", TEST_TOOL, "success"),
|
||||
# ("meetkai-functionary-medium-v3.1", PYTHON_TOOL, "code"),
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
|
||||
@pytest.mark.parametrize("template_name,tool,argument_key", [
|
||||
("meta-llama-Llama-3.1-8B-Instruct", TEST_TOOL, "success"),
|
||||
("meta-llama-Llama-3.1-8B-Instruct", PYTHON_TOOL, "code"),
|
||||
# ("meetkai-functionary-medium-v3.2", TEST_TOOL, "success"),
|
||||
# # Functionary v3.2 format supports raw python content, which w/ a dummy stories model will never end on its own.
|
||||
# # ("meetkai-functionary-medium-v3.2", PYTHON_TOOL, "code"),
|
||||
|
||||
("meetkai-functionary-medium-v3.1", TEST_TOOL, "success"),
|
||||
("meetkai-functionary-medium-v3.1", PYTHON_TOOL, "code"),
|
||||
# ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL, "success"),
|
||||
# ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL, "code"),
|
||||
|
||||
("meetkai-functionary-medium-v3.2", TEST_TOOL, "success"),
|
||||
# Functionary v3.2 format supports raw python content, which w/ a dummy stories model will never end on its own.
|
||||
# ("meetkai-functionary-medium-v3.2", PYTHON_TOOL, "code"),
|
||||
# ("meta-llama-Llama-3.2-3B-Instruct", TEST_TOOL, "success"),
|
||||
# ("meta-llama-Llama-3.2-3B-Instruct", PYTHON_TOOL, "code"),
|
||||
|
||||
("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL, "success"),
|
||||
("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL, "code"),
|
||||
# ("mistralai-Mistral-Nemo-Instruct-2407", TEST_TOOL, "success"),
|
||||
# ("mistralai-Mistral-Nemo-Instruct-2407", PYTHON_TOOL, "code"),
|
||||
|
||||
("meta-llama-Llama-3.2-3B-Instruct", TEST_TOOL, "success"),
|
||||
("meta-llama-Llama-3.2-3B-Instruct", PYTHON_TOOL, "code"),
|
||||
# ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", TEST_TOOL, "success"),
|
||||
# ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", PYTHON_TOOL, "code"),
|
||||
|
||||
("mistralai-Mistral-Nemo-Instruct-2407", TEST_TOOL, "success"),
|
||||
("mistralai-Mistral-Nemo-Instruct-2407", PYTHON_TOOL, "code"),
|
||||
# ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", TEST_TOOL, "success"),
|
||||
# ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"),
|
||||
|
||||
("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", TEST_TOOL, "success"),
|
||||
("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", PYTHON_TOOL, "code"),
|
||||
# ("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"),
|
||||
# # ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "codeFalse), True),
|
||||
# # ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"),
|
||||
|
||||
("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", TEST_TOOL, "success"),
|
||||
("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"),
|
||||
|
||||
("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"),
|
||||
# ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "codeFalse), True),
|
||||
# ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"),
|
||||
|
||||
])
|
||||
def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
|
||||
global server
|
||||
n_predict = 512
|
||||
# server = ServerPreset.stories15m_moe()
|
||||
server.jinja = True
|
||||
server.n_predict = n_predict
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_START_SLOW)
|
||||
do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED)
|
||||
# ])
|
||||
# def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
|
||||
# global server
|
||||
# n_predict = 512
|
||||
# # server = ServerPreset.stories15m_moe()
|
||||
# server.jinja = True
|
||||
# server.n_predict = n_predict
|
||||
# server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
# server.start(timeout_seconds=TIMEOUT_START_SLOW)
|
||||
# do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
|
||||
Reference in New Issue
Block a user