logs : reduce (#23021)
* logs : reduce * args : fix envs * server : fix build * common : print verbosity level at start * server : clean-up logs * server : print prompt processing timings + sampling params * minor : whitespaces
This commit is contained in:
+12
-8
@@ -308,12 +308,14 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
|
||||
common_download_opts opts;
|
||||
opts.bearer_token = params.hf_token;
|
||||
opts.offline = params.offline;
|
||||
|
||||
LOG_TRC("%s: looking for remote preset at %s\n", __func__, preset_url.c_str());
|
||||
const int status = common_download_file_single(preset_url, preset_path, opts);
|
||||
const bool has_preset = status >= 200 && status < 400;
|
||||
|
||||
// remote preset is optional, so we don't error out if not found
|
||||
if (has_preset) {
|
||||
LOG_INF("applying remote preset from %s\n", preset_url.c_str());
|
||||
LOG_TRC("%s: applying remote preset from %s\n", __func__, preset_url.c_str());
|
||||
common_preset_context ctx(ex, /* only_remote_allowed */ true);
|
||||
common_preset global;
|
||||
auto remote_presets = ctx.load_from_ini(preset_path, global);
|
||||
@@ -326,7 +328,7 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
|
||||
throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
|
||||
}
|
||||
} else {
|
||||
LOG_INF("%s", "no remote preset found, skipping\n");
|
||||
LOG_TRC("%s: no remote preset found, skipping\n", __func__);
|
||||
}
|
||||
|
||||
return has_preset;
|
||||
@@ -3301,18 +3303,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_env("LLAMA_LOG_VERBOSITY"));
|
||||
add_opt(common_arg(
|
||||
{"--log-prefix"},
|
||||
{"--no-log-prefix"},
|
||||
"Enable prefix in log messages",
|
||||
[](common_params &) {
|
||||
common_log_set_prefix(common_log_main(), true);
|
||||
[](common_params &, bool value) {
|
||||
common_log_set_prefix(common_log_main(), value);
|
||||
}
|
||||
).set_env("LLAMA_LOG_PREFIX"));
|
||||
).set_env("LLAMA_ARG_LOG_PREFIX"));
|
||||
add_opt(common_arg(
|
||||
{"--log-timestamps"},
|
||||
{"--no-log-timestamps"},
|
||||
"Enable timestamps in log messages",
|
||||
[](common_params &) {
|
||||
common_log_set_timestamps(common_log_main(), true);
|
||||
[](common_params &, bool value) {
|
||||
common_log_set_timestamps(common_log_main(), value);
|
||||
}
|
||||
).set_env("LLAMA_LOG_TIMESTAMPS"));
|
||||
).set_env("LLAMA_ARG_LOG_TIMESTAMPS"));
|
||||
|
||||
//
|
||||
// speculative parameters
|
||||
|
||||
+22
-7
@@ -366,15 +366,29 @@ void common_init() {
|
||||
SetConsoleCP(CP_UTF8);
|
||||
#endif
|
||||
|
||||
llama_log_set(common_log_default_callback, NULL);
|
||||
common_log_set_prefix(common_log_main(), true);
|
||||
common_log_set_timestamps(common_log_main(), true);
|
||||
|
||||
llama_log_set(common_log_default_callback, NULL);
|
||||
}
|
||||
|
||||
void common_params_print_info(const common_params & params) {
|
||||
#ifdef NDEBUG
|
||||
const char * build_type = "";
|
||||
#else
|
||||
const char * build_type = " (debug)";
|
||||
#endif
|
||||
LOG_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
|
||||
|
||||
LOG_DBG("build: %d (%s) with %s for %s%s\n", llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
|
||||
LOG_INF("log_info: verbosity = %d (adjust with the `-lv N` CLI arg)\n", common_log_get_verbosity_thold());
|
||||
LOG_INF("device_info:\n");
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
auto * dev = ggml_backend_dev_get(i);
|
||||
size_t free, total;
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
LOG_INF(" - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
|
||||
}
|
||||
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
std::string common_params_get_system_info(const common_params & params) {
|
||||
@@ -1147,7 +1161,8 @@ common_init_result::common_init_result(common_params & params) :
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
|
||||
if (params.fit_params) {
|
||||
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
|
||||
LOG_INF("%s: fitting params to device memory ...\n", __func__);
|
||||
LOG_INF("%s: (for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on)\n", __func__);
|
||||
common_fit_params(params.model.path.c_str(), &mparams, &cparams,
|
||||
params.tensor_split,
|
||||
params.tensor_buft_overrides.data(),
|
||||
@@ -1196,7 +1211,7 @@ common_init_result::common_init_result(common_params & params) :
|
||||
// initialize once
|
||||
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
|
||||
if (llama_vocab_is_eog(vocab, i)) {
|
||||
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
|
||||
LOG_TRC("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
|
||||
params.sampling.logit_bias_eog.push_back({i, -INFINITY});
|
||||
}
|
||||
}
|
||||
@@ -1209,12 +1224,12 @@ common_init_result::common_init_result(common_params & params) :
|
||||
}
|
||||
|
||||
//if (params.sampling.penalty_last_n == -1) {
|
||||
// LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
||||
// LOG_TRC("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
||||
// params.sampling.penalty_last_n = llama_n_ctx(lctx);
|
||||
//}
|
||||
|
||||
//if (params.sampling.dry_penalty_last_n == -1) {
|
||||
// LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
||||
// LOG_TRC("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
||||
// params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
|
||||
//}
|
||||
|
||||
@@ -1422,7 +1437,7 @@ common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {
|
||||
|
||||
// try to remove the last tokens
|
||||
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
|
||||
LOG_WRN("%s: the context does not support partial sequence removal\n", __func__);
|
||||
LOG_TRC("%s: the context does not support partial sequence removal\n", __func__);
|
||||
res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
|
||||
goto done;
|
||||
}
|
||||
|
||||
@@ -686,6 +686,7 @@ struct common_params {
|
||||
// initializes the logging system and prints info about the build
|
||||
void common_init();
|
||||
|
||||
void common_params_print_info(const common_params & params);
|
||||
std::string common_params_get_system_info(const common_params & params);
|
||||
|
||||
bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
||||
|
||||
+2
-2
@@ -320,9 +320,9 @@ static int common_download_file_single_online(const std::string & url,
|
||||
|
||||
auto head = cli.Head(parts.path);
|
||||
if (!head || head->status < 200 || head->status >= 300) {
|
||||
LOG_WRN("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1);
|
||||
LOG_TRC("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1);
|
||||
if (file_exists) {
|
||||
LOG_INF("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str());
|
||||
LOG_TRC("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str());
|
||||
return 304; // 304 Not Modified - fake cached response
|
||||
}
|
||||
return head ? head->status : -1;
|
||||
|
||||
+44
-44
@@ -168,7 +168,7 @@ static void common_params_fit_impl(
|
||||
|
||||
// step 1: get data for default parameters and check whether any changes are necessary in the first place
|
||||
|
||||
LOG_INF("%s: getting device memory data for initial parameters:\n", __func__);
|
||||
LOG_TRC("%s: getting device memory data for initial parameters:\n", __func__);
|
||||
const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
const size_t nd = devs.size(); // number of devices
|
||||
|
||||
@@ -213,13 +213,13 @@ static void common_params_fit_impl(
|
||||
LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (sum_projected_free >= margins[0]) {
|
||||
LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
|
||||
LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
|
||||
__func__, sum_projected_free/MiB, margins[0]/MiB);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
if (nd > 1) {
|
||||
LOG_INF("%s: projected memory use with initial parameters [MiB]:\n", __func__);
|
||||
LOG_TRC("%s: projected memory use with initial parameters [MiB]:\n", __func__);
|
||||
}
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
const llama_device_memory_data & dmd = dmds_full[id];
|
||||
@@ -234,16 +234,16 @@ static void common_params_fit_impl(
|
||||
sum_projected_model += dmd.mb.model;
|
||||
|
||||
if (nd > 1) {
|
||||
LOG_INF("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
|
||||
LOG_TRC("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
|
||||
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
|
||||
}
|
||||
}
|
||||
assert(sum_free >= 0 && sum_projected_used >= 0);
|
||||
LOG_INF("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
|
||||
LOG_TRC("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (nd == 1) {
|
||||
if (projected_free_per_device[0] >= margins[0]) {
|
||||
LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
|
||||
LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
|
||||
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
|
||||
return;
|
||||
}
|
||||
@@ -256,7 +256,7 @@ static void common_params_fit_impl(
|
||||
}
|
||||
}
|
||||
if (!changes_needed) {
|
||||
LOG_INF("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
|
||||
LOG_TRC("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -275,10 +275,10 @@ static void common_params_fit_impl(
|
||||
}
|
||||
if (global_surplus < 0) {
|
||||
if (nd <= 1) {
|
||||
LOG_INF("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
|
||||
LOG_TRC("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
|
||||
__func__, margins[0]/MiB, -global_surplus/MiB);
|
||||
} else {
|
||||
LOG_INF(
|
||||
LOG_TRC(
|
||||
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
|
||||
__func__, -global_surplus/MiB);
|
||||
}
|
||||
@@ -320,28 +320,28 @@ static void common_params_fit_impl(
|
||||
|
||||
const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
|
||||
const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
|
||||
LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
||||
LOG_TRC("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
||||
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
||||
if (nd <= 1) {
|
||||
LOG_INF("%s: entire model can be fit by reducing context\n", __func__);
|
||||
LOG_TRC("%s: entire model can be fit by reducing context\n", __func__);
|
||||
return;
|
||||
}
|
||||
LOG_INF("%s: entire model should be fit across devices by reducing context\n", __func__);
|
||||
LOG_TRC("%s: entire model should be fit across devices by reducing context\n", __func__);
|
||||
} else {
|
||||
const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
|
||||
LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
||||
LOG_TRC("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
||||
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
||||
}
|
||||
} else {
|
||||
if (n_ctx_min == UINT32_MAX) {
|
||||
LOG_INF("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
|
||||
LOG_TRC("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
|
||||
} else {
|
||||
LOG_INF("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
|
||||
LOG_TRC("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
|
||||
__func__, hp_nct, n_ctx_min);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG_INF("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
|
||||
LOG_TRC("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -485,10 +485,10 @@ static void common_params_fit_impl(
|
||||
const dmds_t dmd_nl = common_get_device_memory_data(
|
||||
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
|
||||
LOG_INF("%s: memory for test allocation by device:\n", func_name);
|
||||
LOG_TRC("%s: memory for test allocation by device:\n", func_name);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
const ngl_t & n = ngl_per_device[id];
|
||||
LOG_INF(
|
||||
LOG_TRC(
|
||||
"%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
|
||||
func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
|
||||
}
|
||||
@@ -509,7 +509,7 @@ static void common_params_fit_impl(
|
||||
tensor_buft_overrides[1] = {nullptr, nullptr};
|
||||
mparams->tensor_buft_overrides = tensor_buft_overrides;
|
||||
|
||||
LOG_INF("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
|
||||
LOG_TRC("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
|
||||
const dmds_t dmds_cpu_moe = common_get_device_memory_data(
|
||||
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
|
||||
@@ -519,10 +519,10 @@ static void common_params_fit_impl(
|
||||
}
|
||||
|
||||
if (global_surplus_cpu_moe > 0) {
|
||||
LOG_INF("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
|
||||
LOG_TRC("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
|
||||
__func__, global_surplus_cpu_moe/MiB);
|
||||
} else {
|
||||
LOG_INF("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
|
||||
LOG_TRC("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
|
||||
__func__, -global_surplus_cpu_moe/MiB);
|
||||
}
|
||||
|
||||
@@ -535,7 +535,7 @@ static void common_params_fit_impl(
|
||||
targets.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
targets.push_back(dmds_full[id].free - margins[id]);
|
||||
LOG_INF("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
|
||||
LOG_TRC("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
|
||||
}
|
||||
|
||||
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
|
||||
@@ -555,9 +555,9 @@ static void common_params_fit_impl(
|
||||
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
|
||||
// - the last device has the output layer, which cannot be a partial layer
|
||||
if (hp_nex == 0) {
|
||||
LOG_INF("%s: filling dense layers back-to-front:\n", __func__);
|
||||
LOG_TRC("%s: filling dense layers back-to-front:\n", __func__);
|
||||
} else {
|
||||
LOG_INF("%s: filling dense-only layers back-to-front:\n", __func__);
|
||||
LOG_TRC("%s: filling dense-only layers back-to-front:\n", __func__);
|
||||
}
|
||||
for (int id = nd - 1; id >= 0; id--) {
|
||||
uint32_t n_unassigned = hp_ngl + 1;
|
||||
@@ -576,7 +576,7 @@ static void common_params_fit_impl(
|
||||
if (mem_high[id] > targets[id]) {
|
||||
assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
|
||||
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
|
||||
LOG_INF("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
|
||||
LOG_TRC("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
|
||||
while (delta > 1) {
|
||||
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
|
||||
step_size = std::max(step_size, uint32_t(1));
|
||||
@@ -593,11 +593,11 @@ static void common_params_fit_impl(
|
||||
if (mem_test[id] <= targets[id]) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
mem = mem_test;
|
||||
LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
||||
LOG_TRC("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
||||
} else {
|
||||
ngl_per_device_high = ngl_per_device_test;
|
||||
mem_high = mem_test;
|
||||
LOG_INF("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
|
||||
LOG_TRC("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
|
||||
}
|
||||
delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
|
||||
}
|
||||
@@ -605,12 +605,12 @@ static void common_params_fit_impl(
|
||||
assert(ngl_per_device_high[id].n_layer == n_unassigned);
|
||||
ngl_per_device = ngl_per_device_high;
|
||||
mem = mem_high;
|
||||
LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
||||
LOG_TRC("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
||||
LOG_INF(
|
||||
LOG_TRC(
|
||||
"%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
@@ -634,7 +634,7 @@ static void common_params_fit_impl(
|
||||
}
|
||||
assert(id_dense_start < nd);
|
||||
|
||||
LOG_INF("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
|
||||
LOG_TRC("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
|
||||
for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
|
||||
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
|
||||
for (size_t jd = id_dense_start; jd < nd; jd++) {
|
||||
@@ -674,13 +674,13 @@ static void common_params_fit_impl(
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
||||
LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
} else {
|
||||
ngl_per_device_high = ngl_per_device_test;
|
||||
mem_high = mem_test;
|
||||
id_dense_start_high = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
|
||||
LOG_TRC("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
|
||||
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
|
||||
}
|
||||
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
|
||||
@@ -690,7 +690,7 @@ static void common_params_fit_impl(
|
||||
ngl_per_device = ngl_per_device_high;
|
||||
mem = mem_high;
|
||||
id_dense_start = id_dense_start_high;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
||||
LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
}
|
||||
|
||||
@@ -710,44 +710,44 @@ static void common_params_fit_impl(
|
||||
if (id < nd - 1) {
|
||||
overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
|
||||
}
|
||||
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
|
||||
LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
|
||||
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
overflow_bufts = overflow_bufts_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
|
||||
LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
|
||||
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
|
||||
LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
|
||||
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
overflow_bufts = overflow_bufts_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
|
||||
LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
}
|
||||
} else {
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
|
||||
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
|
||||
LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
|
||||
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
overflow_bufts = overflow_bufts_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
|
||||
LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
||||
LOG_INF(
|
||||
LOG_TRC(
|
||||
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
@@ -755,7 +755,7 @@ static void common_params_fit_impl(
|
||||
// print info for devices that were not changed during the conversion from dense only to full layers:
|
||||
for (size_t id = id_dense_start + 1; id < nd; id++) {
|
||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
||||
LOG_INF(
|
||||
LOG_TRC(
|
||||
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
@@ -776,7 +776,7 @@ enum common_params_fit_status common_fit_params(
|
||||
common_params_fit_status status = COMMON_PARAMS_FIT_STATUS_SUCCESS;
|
||||
try {
|
||||
common_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
|
||||
LOG_INF("%s: successfully fit params to free device memory\n", __func__);
|
||||
LOG_TRC("%s: successfully fit params to free device memory\n", __func__);
|
||||
} catch (const common_params_fit_exception & e) {
|
||||
LOG_WRN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
|
||||
status = COMMON_PARAMS_FIT_STATUS_FAILURE;
|
||||
@@ -785,7 +785,7 @@ enum common_params_fit_status common_fit_params(
|
||||
status = COMMON_PARAMS_FIT_STATUS_ERROR;
|
||||
}
|
||||
const int64_t t1_us = llama_time_us();
|
||||
LOG_INF("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
|
||||
LOG_TRC("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
|
||||
return status;
|
||||
}
|
||||
|
||||
@@ -925,7 +925,7 @@ void common_memory_breakdown_print(const struct llama_context * ctx) {
|
||||
}
|
||||
}
|
||||
for (const auto & td : table_data) {
|
||||
LOG_INF(td[0].c_str(),
|
||||
LOG_TRC(td[0].c_str(),
|
||||
__func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
|
||||
td[6].c_str(), td[7].c_str(), td[8].c_str());
|
||||
}
|
||||
|
||||
+2
-2
@@ -435,10 +435,10 @@ void common_log_flush(struct common_log * log) {
|
||||
static int common_get_verbosity(enum ggml_log_level level) {
|
||||
switch (level) {
|
||||
case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
|
||||
case GGML_LOG_LEVEL_INFO: return LOG_LEVEL_INFO;
|
||||
case GGML_LOG_LEVEL_INFO: return LOG_LEVEL_TRACE;
|
||||
case GGML_LOG_LEVEL_WARN: return LOG_LEVEL_WARN;
|
||||
case GGML_LOG_LEVEL_ERROR: return LOG_LEVEL_ERROR;
|
||||
case GGML_LOG_LEVEL_CONT: return LOG_LEVEL_INFO; // same as INFO
|
||||
case GGML_LOG_LEVEL_CONT: return LOG_LEVEL_TRACE;
|
||||
case GGML_LOG_LEVEL_NONE:
|
||||
default:
|
||||
return LOG_LEVEL_OUTPUT;
|
||||
|
||||
+5
-2
@@ -21,7 +21,8 @@
|
||||
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
#endif
|
||||
|
||||
#define LOG_LEVEL_DEBUG 4
|
||||
#define LOG_LEVEL_DEBUG 5
|
||||
#define LOG_LEVEL_TRACE 4
|
||||
#define LOG_LEVEL_INFO 3
|
||||
#define LOG_LEVEL_WARN 2
|
||||
#define LOG_LEVEL_ERROR 1
|
||||
@@ -111,13 +112,15 @@ void common_log_flush (struct common_log * log); // f
|
||||
#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
|
||||
|
||||
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_LEVEL_DEBUG, __VA_ARGS__)
|
||||
#define LOG_TRC(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, LOG_LEVEL_TRACE, __VA_ARGS__)
|
||||
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, LOG_LEVEL_INFO, __VA_ARGS__)
|
||||
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, LOG_LEVEL_WARN, __VA_ARGS__)
|
||||
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, LOG_LEVEL_INFO, __VA_ARGS__) // same as INFO
|
||||
|
||||
#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
|
||||
#define LOG_TRCV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_TRACE, verbosity, __VA_ARGS__)
|
||||
#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__)
|
||||
#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__)
|
||||
#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
|
||||
#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
|
||||
#define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT, verbosity, __VA_ARGS__)
|
||||
|
||||
@@ -984,7 +984,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
}
|
||||
|
||||
if (impls.empty()) {
|
||||
LOG_WRN("%s", "no implementations specified for speculative decoding\n");
|
||||
LOG_WRN("%s: no implementations specified for speculative decoding\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
||||
@@ -672,7 +672,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
|
||||
![[dev->mtl_device name] containsString:@"M6"] &&
|
||||
![[dev->mtl_device name] containsString:@"A19"] &&
|
||||
![[dev->mtl_device name] containsString:@"A20"]) {
|
||||
GGML_LOG_WARN("%s: tensor API disabled for pre-M5 and pre-A19 devices\n", __func__);
|
||||
GGML_LOG_INFO("%s: tensor API disabled for pre-M5 and pre-A19 devices\n", __func__);
|
||||
dev->props.has_tensor = false;
|
||||
}
|
||||
|
||||
|
||||
@@ -473,10 +473,10 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, ..
|
||||
va_end(args);
|
||||
}
|
||||
|
||||
#define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
||||
#define LOG_INF(...) clip_log_internal(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
|
||||
#define LOG_WRN(...) clip_log_internal(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
|
||||
#define LOG_ERR(...) clip_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||
#define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
||||
#define LOG_CNT(...) clip_log_internal(GGML_LOG_LEVEL_CONT, __VA_ARGS__)
|
||||
|
||||
//
|
||||
|
||||
+1
-1
@@ -169,7 +169,7 @@ struct clip_ctx {
|
||||
throw std::runtime_error("failed to initialize CPU backend");
|
||||
}
|
||||
if (ctx_params.use_gpu) {
|
||||
auto backend_name = std::getenv("MTMD_BACKEND_DEVICE");
|
||||
auto * backend_name = std::getenv("MTMD_BACKEND_DEVICE");
|
||||
if (backend_name != nullptr) {
|
||||
backend = ggml_backend_init_by_name(backend_name, nullptr);
|
||||
if (!backend) {
|
||||
|
||||
@@ -15,17 +15,19 @@
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
|
||||
#define SLT_TRC(slot, fmt, ...) LOG_TRC("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
|
||||
#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
|
||||
#define SLT_CNT(slot, fmt, ...) LOG_CNT("" fmt, __VA_ARGS__)
|
||||
#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
|
||||
#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
|
||||
#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
|
||||
#define SLT_CNT(slot, fmt, ...) LOG_CNT("" fmt, __VA_ARGS__)
|
||||
|
||||
#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define SRV_TRC(fmt, ...) LOG_TRC("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define SRV_CNT(fmt, ...) LOG_CNT("" fmt, __VA_ARGS__)
|
||||
#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define SRV_CNT(fmt, ...) LOG_CNT("" fmt, __VA_ARGS__)
|
||||
|
||||
using raw_buffer = std::vector<uint8_t>;
|
||||
|
||||
|
||||
@@ -166,6 +166,7 @@ struct server_slot {
|
||||
// stats
|
||||
size_t n_sent_text = 0; // number of sent text character
|
||||
|
||||
int64_t t_print_last = 0;
|
||||
int64_t t_start_process_prompt;
|
||||
int64_t t_start_generation;
|
||||
|
||||
@@ -233,7 +234,7 @@ struct server_slot {
|
||||
}
|
||||
}
|
||||
|
||||
SLT_INF(*this, "init sampler, took %0.2f ms, tokens: text = %d, total = %d\n",
|
||||
SLT_TRC(*this, "init sampler, took %0.2f ms, tokens: text = %d, total = %d\n",
|
||||
(ggml_time_us() - t_start) / 1000.0, n_text, (int) prompt.tokens.size());
|
||||
}
|
||||
|
||||
@@ -417,6 +418,36 @@ struct server_slot {
|
||||
return stop_pos;
|
||||
}
|
||||
|
||||
void print_timings_tg() {
|
||||
if (n_decoded < 100) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t t_now = ggml_time_us();
|
||||
|
||||
if (t_now - t_print_last < 3*1000*1000) {
|
||||
return;
|
||||
}
|
||||
|
||||
t_print_last = t_now;
|
||||
|
||||
const double n_gen_second = 1e3 / t_token_generation * n_decoded;
|
||||
|
||||
SLT_INF(*this, "n_decoded = %6d, tg = %6.2f t/s\n", n_decoded, n_gen_second);
|
||||
}
|
||||
|
||||
void print_timings_pp() const {
|
||||
const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
|
||||
const double f_progress = (float) prompt.n_tokens() / task->n_tokens();
|
||||
|
||||
if (t_prompt_processing < 3000.0) {
|
||||
return;
|
||||
}
|
||||
|
||||
SLT_INF(*this, "prompt processing, n_tokens = %6d, progress = %.2f, t = %6.2f s / %.2f tokens per second\n",
|
||||
n_prompt_tokens_processed, f_progress, t_prompt_processing / 1e3, n_prompt_second);
|
||||
}
|
||||
|
||||
void print_timings() const {
|
||||
const double t_prompt = t_prompt_processing / n_prompt_tokens_processed;
|
||||
const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
|
||||
@@ -588,6 +619,10 @@ public:
|
||||
// note: chat_params must not be refreshed upon existing sleeping state
|
||||
server_chat_params chat_params;
|
||||
|
||||
server_context_impl() {
|
||||
mtmd_helper_log_set(common_log_default_callback, nullptr);
|
||||
}
|
||||
|
||||
~server_context_impl() {
|
||||
if (!sleeping) {
|
||||
// destroy() is already called when entering sleeping state
|
||||
@@ -749,10 +784,6 @@ private:
|
||||
|
||||
std::string & mmproj_path = params_base.mmproj.path;
|
||||
if (!mmproj_path.empty()) {
|
||||
if (!is_resume) {
|
||||
mtmd_helper_log_set(common_log_default_callback, nullptr);
|
||||
}
|
||||
|
||||
mtmd_context_params mparams = mtmd_context_params_default();
|
||||
|
||||
mparams.use_gpu = params_base.mmproj_use_gpu;
|
||||
@@ -896,17 +927,17 @@ private:
|
||||
|
||||
if (params_base.cache_ram_mib != 0) {
|
||||
if (params_base.cache_ram_mib < 0) {
|
||||
SRV_WRN("prompt cache is enabled, size limit: %s\n", "no limit");
|
||||
SRV_INF("prompt cache is enabled, size limit: %s\n", "no limit");
|
||||
} else {
|
||||
SRV_WRN("prompt cache is enabled, size limit: %d MiB\n", params_base.cache_ram_mib);
|
||||
SRV_INF("prompt cache is enabled, size limit: %d MiB\n", params_base.cache_ram_mib);
|
||||
}
|
||||
SRV_WRN("%s", "use `--cache-ram 0` to disable the prompt cache\n");
|
||||
SRV_INF("%s", "use `--cache-ram 0` to disable the prompt cache\n");
|
||||
|
||||
prompt_cache = std::make_unique<server_prompt_cache>(params_base.cache_ram_mib, n_ctx);
|
||||
} else {
|
||||
SRV_WRN("%s", "prompt cache is disabled - use `--cache-ram N` to enable it\n");
|
||||
SRV_INF("%s", "prompt cache is disabled - use `--cache-ram N` to enable it\n");
|
||||
}
|
||||
SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
|
||||
SRV_INF("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
|
||||
|
||||
if (!params_base.model_alias.empty()) {
|
||||
// backward compat: use first alias as model name
|
||||
@@ -954,13 +985,13 @@ private:
|
||||
|
||||
if (params_base.cache_idle_slots) {
|
||||
if (!params_base.kv_unified) {
|
||||
SRV_WRN("%s: --cache-idle-slots requires --kv-unified, disabling\n", __func__);
|
||||
SRV_WRN("%s", "--cache-idle-slots requires --kv-unified, disabling\n");
|
||||
params_base.cache_idle_slots = false;
|
||||
} else if (params_base.cache_ram_mib == 0) {
|
||||
SRV_WRN("%s: --cache-idle-slots requires --cache-ram, disabling\n", __func__);
|
||||
SRV_WRN("%s", "--cache-idle-slots requires --cache-ram, disabling\n");
|
||||
params_base.cache_idle_slots = false;
|
||||
} else {
|
||||
SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
|
||||
SRV_INF("%s", "idle slots will be saved to prompt cache and cleared upon starting a new task\n");
|
||||
SRV_DBG("%s", "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__\n");
|
||||
}
|
||||
}
|
||||
@@ -1112,7 +1143,7 @@ private:
|
||||
update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION;
|
||||
|
||||
if (update_cache) {
|
||||
SRV_WRN("%s", "updating prompt cache\n");
|
||||
SRV_INF("%s", "updating prompt cache\n");
|
||||
|
||||
const int64_t t_start = ggml_time_us();
|
||||
|
||||
@@ -1127,7 +1158,7 @@ private:
|
||||
|
||||
prompt_cache->update();
|
||||
|
||||
SRV_WRN("prompt cache update took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0);
|
||||
SRV_INF("prompt cache update took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1186,10 +1217,10 @@ private:
|
||||
if (!are_lora_equal(task_loras, slot.lora)) {
|
||||
// if lora has changed, check to see if the cache should be cleared
|
||||
if (lora_should_clear_cache(slot.lora, task_loras)) {
|
||||
SLT_INF(slot, "clearing cache for lora change. %zu loras -> %zu loras\n", slot.lora.size(), task.params.lora.size());
|
||||
SLT_TRC(slot, "clearing cache for lora change. %zu loras -> %zu loras\n", slot.lora.size(), task.params.lora.size());
|
||||
slot.prompt.tokens.clear();
|
||||
} else {
|
||||
SLT_INF(slot, "keeping cache for alora. %zu target loras\n", task_loras.size());
|
||||
SLT_TRC(slot, "keeping cache for alora. %zu target loras\n", task_loras.size());
|
||||
}
|
||||
slot.lora = task_loras;
|
||||
}
|
||||
@@ -1281,7 +1312,8 @@ private:
|
||||
llama_set_sampler(ctx_tgt, slot.id, nullptr);
|
||||
}
|
||||
|
||||
SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str());
|
||||
SLT_TRC(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str());
|
||||
SLT_TRC(slot, "sampler params: \n%s\n", task.params.sampling.print().c_str());
|
||||
} else {
|
||||
slot.smpl.reset();
|
||||
}
|
||||
@@ -1800,7 +1832,7 @@ private:
|
||||
cur.update_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
cur.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
|
||||
SLT_WRN(slot,
|
||||
SLT_INF(slot,
|
||||
"created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
|
||||
(int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min,
|
||||
cur.pos_max, cur.n_tokens, (float) cur.size() / 1024 / 1024);
|
||||
@@ -2339,7 +2371,7 @@ private:
|
||||
|
||||
slot.state = SLOT_STATE_PROCESSING_PROMPT;
|
||||
|
||||
SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, task.n_tokens = %d\n",
|
||||
SLT_TRC(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, task.n_tokens = %d\n",
|
||||
slot.n_ctx, slot.task->params.n_keep, slot.task->n_tokens());
|
||||
|
||||
// print prompt tokens (for debugging)
|
||||
@@ -2454,7 +2486,7 @@ private:
|
||||
}
|
||||
|
||||
if (n_match >= (size_t) n_cache_reuse) {
|
||||
SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
|
||||
SLT_TRC(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
|
||||
//for (size_t i = head_p; i < head_p + n_match; i++) {
|
||||
// SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx_tgt, prompt_tokens[i]).c_str());
|
||||
//}
|
||||
@@ -2620,10 +2652,14 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t t_current = ggml_time_us();
|
||||
slot.t_prompt_processing = (t_current - slot.t_start_process_prompt) / 1e3;
|
||||
slot.print_timings_pp();
|
||||
|
||||
// truncate any tokens that are beyond n_past for this slot
|
||||
const llama_pos p0 = slot.prompt.tokens.pos_next();
|
||||
|
||||
SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0);
|
||||
SLT_TRC(slot, "cached n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0);
|
||||
|
||||
if (!llama_memory_seq_rm(llama_get_memory(ctx_tgt), slot.id, p0, -1)) {
|
||||
SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
|
||||
@@ -2764,7 +2800,6 @@ private:
|
||||
slot.i_batch = batch.n_tokens - 1;
|
||||
|
||||
slot.init_sampler();
|
||||
SLT_INF(slot, "prompt processing done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens);
|
||||
} else {
|
||||
if (slot.task->n_tokens() < slot.prompt.n_tokens() + n_ubatch) {
|
||||
// near the end of the prompt
|
||||
@@ -2786,8 +2821,6 @@ private:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SLT_INF(slot, "prompt processing progress, n_tokens = %d, batch.n_tokens = %d, progress = %f\n", slot.prompt.n_tokens(), batch.n_tokens, (float) slot.prompt.n_tokens() / slot.task->n_tokens());
|
||||
}
|
||||
|
||||
const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id);
|
||||
@@ -3084,6 +3117,8 @@ private:
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
slot.print_timings_tg();
|
||||
}
|
||||
|
||||
// speculative decoding - main model sample and accept
|
||||
@@ -3196,6 +3231,8 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
slot.print_timings_tg();
|
||||
|
||||
SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int) ids.size() - 1, (int) n_draft, slot.prompt.n_tokens());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,7 +47,7 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
|
||||
|
||||
// reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch
|
||||
|
||||
SRV_INF("done request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
|
||||
SRV_TRC("done request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
|
||||
|
||||
SRV_DBG("request: %s\n", req.body.c_str());
|
||||
SRV_DBG("response: %s\n", res.body.c_str());
|
||||
@@ -89,10 +89,10 @@ bool server_http_context::init(const common_params & params) {
|
||||
hostname = params.hostname;
|
||||
|
||||
if (gcp.enabled) {
|
||||
LOG_INF("%s: Google Cloud Platform compat: health route = %s, predict route = %s, port = %d\n", __func__, gcp.path_health.c_str(), gcp.path_predict.c_str(), gcp.port);
|
||||
SRV_INF("Google Cloud Platform compat: health route = %s, predict route = %s, port = %d\n", gcp.path_health.c_str(), gcp.path_predict.c_str(), gcp.port);
|
||||
|
||||
if (port != gcp.port) {
|
||||
LOG_WRN("%s: Google Cloud Platform compat: overriding server port %d with AIP_HTTP_PORT %d\n", __func__, port, gcp.port);
|
||||
SRV_WRN("Google Cloud Platform compat: overriding server port %d with AIP_HTTP_PORT %d\n", port, gcp.port);
|
||||
}
|
||||
|
||||
port = gcp.port;
|
||||
@@ -102,17 +102,17 @@ bool server_http_context::init(const common_params & params) {
|
||||
|
||||
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
|
||||
if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
|
||||
LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str());
|
||||
SRV_INF("running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str());
|
||||
srv.reset(
|
||||
new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str())
|
||||
);
|
||||
} else {
|
||||
LOG_INF("Running without SSL\n");
|
||||
SRV_INF("%s", "running without SSL\n");
|
||||
srv.reset(new httplib::Server());
|
||||
}
|
||||
#else
|
||||
if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
|
||||
LOG_ERR("Server is built without SSL support\n");
|
||||
SRV_ERR("%s", "the server is built without SSL support\n");
|
||||
return false;
|
||||
}
|
||||
srv.reset(new httplib::Server());
|
||||
@@ -134,7 +134,7 @@ bool server_http_context::init(const common_params & params) {
|
||||
|
||||
res.status = 500;
|
||||
res.set_content(message, "text/plain");
|
||||
LOG_ERR("got exception: %s\n", message.c_str());
|
||||
SRV_ERR("got exception: %s\n", message.c_str());
|
||||
});
|
||||
|
||||
srv->set_error_handler([](const httplib::Request &, httplib::Response & res) {
|
||||
@@ -162,7 +162,7 @@ bool server_http_context::init(const common_params & params) {
|
||||
#ifdef SO_REUSEPORT
|
||||
httplib::set_socket_opt(sock, SOL_SOCKET, SO_REUSEPORT, 1);
|
||||
#else
|
||||
LOG_WRN("%s: SO_REUSEPORT is not supported\n", __func__);
|
||||
SRV_WRN("%s", "SO_REUSEPORT is not supported\n");
|
||||
#endif
|
||||
}
|
||||
});
|
||||
@@ -170,9 +170,9 @@ bool server_http_context::init(const common_params & params) {
|
||||
if (params.api_keys.size() == 1) {
|
||||
auto key = params.api_keys[0];
|
||||
std::string substr = key.substr(std::max((int)(key.length() - 4), 0));
|
||||
LOG_INF("%s: api_keys: ****%s\n", __func__, substr.c_str());
|
||||
SRV_INF("api_keys: ****%s\n", substr.c_str());
|
||||
} else if (params.api_keys.size() > 1) {
|
||||
LOG_INF("%s: api_keys: %zu keys loaded\n", __func__, params.api_keys.size());
|
||||
SRV_INF("api_keys: %zu keys loaded\n", params.api_keys.size());
|
||||
}
|
||||
|
||||
//
|
||||
@@ -232,7 +232,7 @@ bool server_http_context::init(const common_params & params) {
|
||||
"application/json; charset=utf-8"
|
||||
);
|
||||
|
||||
LOG_WRN("Unauthorized: Invalid API Key\n");
|
||||
SRV_WRN("%s", "unauthorized: Invalid API Key\n");
|
||||
|
||||
return false;
|
||||
};
|
||||
@@ -292,7 +292,7 @@ bool server_http_context::init(const common_params & params) {
|
||||
// +4 threads for monitoring, health and some threads reserved for MCP and other tasks in the future
|
||||
n_threads_http = std::max(params.n_parallel + 4, (int32_t) std::thread::hardware_concurrency() - 1);
|
||||
}
|
||||
LOG_INF("%s: using %d threads for HTTP server\n", __func__, n_threads_http);
|
||||
SRV_INF("using %d threads for HTTP server\n", n_threads_http);
|
||||
srv->new_task_queue = [n_threads_http] {
|
||||
// spawn n_threads_http fixed thread (always alive), while allow up to 1024 max possible additional threads
|
||||
// when n_threads_http is used, server will create new "dynamic" threads that will be destroyed after processing each request
|
||||
@@ -306,14 +306,14 @@ bool server_http_context::init(const common_params & params) {
|
||||
//
|
||||
|
||||
if (!params.webui) {
|
||||
LOG_INF("Web UI is disabled\n");
|
||||
SRV_INF("%s", "the WebUI is disabled\n");
|
||||
} else {
|
||||
// register static assets routes
|
||||
if (!params.public_path.empty()) {
|
||||
// Set the base directory for serving static files
|
||||
bool is_found = srv->set_mount_point(params.api_prefix + "/", params.public_path);
|
||||
if (!is_found) {
|
||||
LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str());
|
||||
SRV_ERR("static assets path not found: %s\n", params.public_path.c_str());
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
@@ -348,13 +348,13 @@ bool server_http_context::start() {
|
||||
bool is_sock = false;
|
||||
if (string_ends_with(std::string(hostname), ".sock")) {
|
||||
is_sock = true;
|
||||
LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
|
||||
SRV_INF("%s", "setting address family to AF_UNIX\n");
|
||||
srv->set_address_family(AF_UNIX);
|
||||
// bind_to_port requires a second arg, any value other than 0 should
|
||||
// simply get ignored
|
||||
was_bound = srv->bind_to_port(hostname, 8080);
|
||||
} else {
|
||||
LOG_INF("%s: binding port with default address family\n", __func__);
|
||||
SRV_INF("%s", "binding port with default address family\n");
|
||||
// bind HTTP listen port
|
||||
if (port == 0) {
|
||||
int bound_port = srv->bind_to_any_port(hostname);
|
||||
@@ -368,7 +368,7 @@ bool server_http_context::start() {
|
||||
}
|
||||
|
||||
if (!was_bound) {
|
||||
LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, hostname.c_str(), port);
|
||||
SRV_ERR("couldn't bind HTTP server socket, hostname: %s, port: %d\n", hostname.c_str(), port);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -580,7 +580,7 @@ void server_http_context::register_gcp_compat() {
|
||||
}
|
||||
|
||||
if (handlers.count(gcp.path_predict)) {
|
||||
LOG_ERR("%s: AIP_PREDICT_ROUTE=%s conflicts with an existing llama-server route\n", __func__, gcp.path_predict.c_str());
|
||||
SRV_ERR("AIP_PREDICT_ROUTE=%s conflicts with an existing llama-server route\n", gcp.path_predict.c_str());
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@@ -651,7 +651,7 @@ void server_http_context::register_gcp_compat() {
|
||||
payload.erase("@requestFormat");
|
||||
|
||||
if (payload.contains("stream")) {
|
||||
LOG_WRN("%s: ignoring client-provided stream field in instance, streaming is not supported in predict route\n", __func__);
|
||||
SRV_WRN("%s", "ignoring client-provided stream field in instance, streaming is not supported in predict route\n");
|
||||
payload["stream"] = false;
|
||||
}
|
||||
|
||||
|
||||
@@ -1988,7 +1988,7 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t
|
||||
const int cur_lcp_len = it->tokens.get_common_prefix(prompt.tokens);
|
||||
|
||||
if (cur_lcp_len == (int) prompt.tokens.size()) {
|
||||
SRV_WRN("%s", " - prompt is already in the cache, skipping\n");
|
||||
SRV_INF("%s", " - prompt is already in the cache, skipping\n");
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
@@ -2043,7 +2043,7 @@ bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tok
|
||||
float f_keep_best = prompt.tokens.size() > 0 ? float(lcp_best) / prompt.tokens.size() : -1.0f; // empty slot: any cache entry wins
|
||||
float sim_best = float(lcp_best) / tokens_new.size();
|
||||
|
||||
SRV_WRN(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);
|
||||
SRV_INF(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);
|
||||
|
||||
auto it_best = states.end();
|
||||
|
||||
@@ -2068,7 +2068,7 @@ bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tok
|
||||
}
|
||||
|
||||
if (it_best != states.end()) {
|
||||
SRV_WRN(" - found better prompt with f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);
|
||||
SRV_INF(" - found better prompt with f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);
|
||||
|
||||
{
|
||||
auto & data = it_best->data.main;
|
||||
@@ -2076,7 +2076,7 @@ bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tok
|
||||
const size_t size = data.size();
|
||||
const size_t n = llama_state_seq_set_data_ext(ctx_tgt, data.data(), size, id_slot, 0);
|
||||
if (n != size) {
|
||||
SRV_WRN("failed to restore state with size %zu\n", size);
|
||||
SRV_ERR("failed to restore state with size %zu\n", size);
|
||||
|
||||
return false;
|
||||
}
|
||||
@@ -2145,11 +2145,11 @@ void server_prompt_cache::update() {
|
||||
}
|
||||
}
|
||||
|
||||
SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n",
|
||||
SRV_INF(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n",
|
||||
states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur);
|
||||
|
||||
for (const auto & state : states) {
|
||||
SRV_WRN(" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",
|
||||
SRV_INF(" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",
|
||||
(const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
|
||||
}
|
||||
}
|
||||
|
||||
+21
-23
@@ -83,17 +83,22 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
common_params_print_info(params);
|
||||
|
||||
// validate batch size for embeddings
|
||||
// embeddings require all tokens to be processed in a single ubatch
|
||||
// see https://github.com/ggml-org/llama.cpp/issues/12836
|
||||
if (params.embedding && params.n_batch > params.n_ubatch) {
|
||||
LOG_WRN("%s: embeddings enabled with n_batch (%d) > n_ubatch (%d)\n", __func__, params.n_batch, params.n_ubatch);
|
||||
LOG_WRN("%s: setting n_batch = n_ubatch = %d to avoid assertion failure\n", __func__, params.n_ubatch);
|
||||
SRV_WRN("embeddings enabled with n_batch (%d) > n_ubatch (%d)\n", params.n_batch, params.n_ubatch);
|
||||
SRV_WRN("setting n_batch = n_ubatch = %d to avoid assertion failure\n", params.n_ubatch);
|
||||
params.n_batch = params.n_ubatch;
|
||||
}
|
||||
|
||||
if (params.n_parallel < 0) {
|
||||
LOG_INF("%s: n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n", __func__);
|
||||
SRV_INF("%s", "n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n");
|
||||
|
||||
params.n_parallel = 4;
|
||||
params.kv_unified = true;
|
||||
@@ -107,15 +112,9 @@ int main(int argc, char ** argv) {
|
||||
// struct that contains llama context and inference
|
||||
server_context ctx_server;
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
LOG_INF("build_info: %s\n", llama_build_info());
|
||||
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
||||
|
||||
server_http_context ctx_http;
|
||||
if (!ctx_http.init(params)) {
|
||||
LOG_ERR("%s: failed to initialize HTTP server\n", __func__);
|
||||
SRV_ERR("%s", "failed to initialize HTTP server\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -134,7 +133,7 @@ int main(int argc, char ** argv) {
|
||||
try {
|
||||
models_routes.emplace(params, argc, argv);
|
||||
} catch (const std::exception & e) {
|
||||
LOG_ERR("%s: failed to initialize router models: %s\n", __func__, e.what());
|
||||
SRV_ERR("failed to initialize router models: %s\n", e.what());
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -222,7 +221,7 @@ int main(int argc, char ** argv) {
|
||||
try {
|
||||
tools.setup(params.server_tools);
|
||||
} catch (const std::exception & e) {
|
||||
LOG_ERR("%s: tools setup failed: %s\n", __func__, e.what());
|
||||
SRV_ERR("tools setup failed: %s\n", e.what());
|
||||
return 1;
|
||||
}
|
||||
SRV_WRN("%s", "-----------------\n");
|
||||
@@ -240,7 +239,7 @@ int main(int argc, char ** argv) {
|
||||
std::function<void()> clean_up;
|
||||
|
||||
if (is_router_server) {
|
||||
LOG_INF("%s: starting router server, no model will be loaded in this process\n", __func__);
|
||||
SRV_INF("%s", "starting router server, no model will be loaded in this process\n");
|
||||
|
||||
clean_up = [&models_routes]() {
|
||||
SRV_INF("%s: cleaning up before exit...\n", __func__);
|
||||
@@ -252,7 +251,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (!ctx_http.start()) {
|
||||
clean_up();
|
||||
LOG_ERR("%s: exiting due to HTTP server error\n", __func__);
|
||||
SRV_ERR("%s", "exiting due to HTTP server error\n");
|
||||
return 1;
|
||||
}
|
||||
ctx_http.is_ready.store(true);
|
||||
@@ -273,12 +272,12 @@ int main(int argc, char ** argv) {
|
||||
// start the HTTP server before loading the model to be able to serve /health requests
|
||||
if (!ctx_http.start()) {
|
||||
clean_up();
|
||||
LOG_ERR("%s: exiting due to HTTP server error\n", __func__);
|
||||
SRV_ERR("%s", "exiting due to HTTP server error\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// load the model
|
||||
LOG_INF("%s: loading model\n", __func__);
|
||||
SRV_INF("%s", "loading model\n");
|
||||
|
||||
if (server_models::is_child_server()) {
|
||||
ctx_server.on_sleeping_changed([&](bool sleeping) {
|
||||
@@ -291,14 +290,14 @@ int main(int argc, char ** argv) {
|
||||
if (ctx_http.thread.joinable()) {
|
||||
ctx_http.thread.join();
|
||||
}
|
||||
LOG_ERR("%s: exiting due to model loading error\n", __func__);
|
||||
SRV_ERR("%s", "exiting due to model loading error\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
routes.update_meta(ctx_server);
|
||||
ctx_http.is_ready.store(true);
|
||||
|
||||
LOG_INF("%s: model loaded\n", __func__);
|
||||
SRV_INF("%s", "model loaded\n");
|
||||
|
||||
shutdown_handler = [&](int) {
|
||||
// this will unblock start_loop()
|
||||
@@ -322,9 +321,9 @@ int main(int argc, char ** argv) {
|
||||
#endif
|
||||
|
||||
if (is_router_server) {
|
||||
LOG_INF("%s: router server is listening on %s\n", __func__, ctx_http.listening_address.c_str());
|
||||
LOG_INF("%s: NOTE: router mode is experimental\n", __func__);
|
||||
LOG_INF("%s: it is not recommended to use this mode in untrusted environments\n", __func__);
|
||||
SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str());
|
||||
SRV_WRN("%s", "NOTE: router mode is experimental\n");
|
||||
SRV_WRN("%s", " it is not recommended to use this mode in untrusted environments\n");
|
||||
if (ctx_http.thread.joinable()) {
|
||||
ctx_http.thread.join(); // keep the main thread alive
|
||||
}
|
||||
@@ -332,8 +331,7 @@ int main(int argc, char ** argv) {
|
||||
// when the HTTP server stops, clean up and exit
|
||||
clean_up();
|
||||
} else {
|
||||
LOG_INF("%s: server is listening on %s\n", __func__, ctx_http.listening_address.c_str());
|
||||
LOG_INF("%s: starting the main loop...\n", __func__);
|
||||
SRV_INF("server is listening on %s\n", ctx_http.listening_address.c_str());
|
||||
|
||||
// optionally, notify router server that this instance is ready
|
||||
std::thread monitor_thread;
|
||||
|
||||
+1
-1
@@ -604,8 +604,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl[0]));
|
||||
LOG_INF("sampler params: \n%s\n", params.sampling.print().c_str());
|
||||
LOG_INF("sampler chain: %s\n", common_sampler_print(smpl[0]).c_str());
|
||||
LOG_INF("sampler params: \n%s\n", params.sampling.print().c_str());
|
||||
|
||||
LOG_INF("%s: loading done\n", __func__);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user