llama : enable layer input extraction
This commit is contained in:
+16
-1
@@ -65,6 +65,8 @@ llama_context::llama_context(
|
||||
cparams.cb_eval = params.cb_eval;
|
||||
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
||||
|
||||
cparams.output_layer_inp.resize(hparams.n_layer, false);
|
||||
|
||||
// Initialize backend samplers here so they are part of the sampling graph
|
||||
// before the reserve passes run later in this function. This avoids a later
|
||||
// re-reserve when graph nodes change.
|
||||
@@ -1168,6 +1170,16 @@ bool llama_context::set_adapter_cvec(
|
||||
return res;
|
||||
}
|
||||
|
||||
// Enable or disable exposing the input of layer `layer_id` as a graph output.
//
// layer_id must be smaller than model.hparams.n_layer (enforced with GGML_ASSERT).
// The flag is stored in cparams.output_layer_inp[layer_id] and sched_need_reserve
// is raised afterwards.
// NOTE(review): presumably the reserve is needed because the set of output tensors
// in the graph changes - confirm against the scheduler reserve logic.
void llama_context::set_output_layer_inp(uint32_t layer_id, bool enable) {
    // layer_id is unsigned, so it is printed with %u; enable is promoted to int for %d
    LLAMA_LOG_DEBUG("%s: layer_id = %u, enable = %d\n", __func__, layer_id, enable);

    GGML_ASSERT(layer_id < model.hparams.n_layer);

    cparams.output_layer_inp[layer_id] = enable;

    sched_need_reserve = true;
}
|
||||
|
||||
llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
|
||||
if (mctx && !mctx->apply()) {
|
||||
LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
|
||||
@@ -1904,7 +1916,6 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
has_embd = true;
|
||||
}
|
||||
|
||||
|
||||
size_t backend_float_count = 0;
|
||||
size_t backend_token_count = 0;
|
||||
|
||||
@@ -3779,3 +3790,7 @@ void llama_opt_epoch(
|
||||
llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx) {
|
||||
return ctx->memory_breakdown();
|
||||
}
|
||||
|
||||
// Public C API entry point: forwards to llama_context::set_output_layer_inp on ctx.
// ctx is dereferenced without a null check.
void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable) {
|
||||
ctx->set_output_layer_inp(layer_id, enable);
|
||||
}
|
||||
|
||||
@@ -121,6 +121,8 @@ struct llama_context {
|
||||
int32_t il_start,
|
||||
int32_t il_end);
|
||||
|
||||
void set_output_layer_inp(uint32_t layer_id, bool enable);
|
||||
|
||||
// process a single ubatch with a specific graph type
|
||||
// if memory_context is provided, it will be applied first to the context's memory
|
||||
// ret contains the status of the graph computation
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
#define LLAMA_MAX_SEQ 256
|
||||
|
||||
@@ -40,6 +41,8 @@ struct llama_cparams {
|
||||
bool kv_unified;
|
||||
bool pipeline_parallel;
|
||||
|
||||
std::vector<bool> output_layer_inp;
|
||||
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval;
|
||||
|
||||
@@ -88,3 +88,14 @@ LLAMA_API int32_t llama_model_n_devices(const struct llama_model * model);
|
||||
LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i);
|
||||
|
||||
LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx);
|
||||
|
||||
//
|
||||
// model/context data extraction
|
||||
//
|
||||
|
||||
// set whether the input embeddings of the given layer should be exposed as an output
|
||||
LLAMA_API void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable);
|
||||
|
||||
LLAMA_API ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model);
|
||||
LLAMA_API void llama_model_set_tok_embd( struct llama_model * model, ggml_tensor * tensor);
|
||||
|
||||
|
||||
+13
-1
@@ -810,6 +810,10 @@ void llm_graph_result::reset() {
|
||||
t_logits = nullptr;
|
||||
t_embd = nullptr;
|
||||
t_embd_pooled = nullptr;
|
||||
|
||||
t_layer_inp.resize(LLAMA_MAX_LAYERS);
|
||||
std::fill(t_layer_inp.begin(), t_layer_inp.end(), nullptr);
|
||||
|
||||
t_sampled.clear();
|
||||
t_sampled_probs.clear();
|
||||
t_sampled_logits.clear();
|
||||
@@ -838,7 +842,7 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
void llm_graph_result::set_outputs() {
|
||||
void llm_graph_result::set_outputs(const llm_graph_params & params) {
|
||||
if (t_logits != nullptr) {
|
||||
ggml_set_output(t_logits);
|
||||
}
|
||||
@@ -848,6 +852,14 @@ void llm_graph_result::set_outputs() {
|
||||
if (t_embd_pooled != nullptr) {
|
||||
ggml_set_output(t_embd_pooled);
|
||||
}
|
||||
{
    // Mark the requested layer inputs as graph outputs.
    // reset() fills t_layer_inp with nullptr and only graph builders that assign
    // res->t_layer_inp[il] populate a slot, so a requested layer may have no tensor.
    // Skip such slots (same nullptr guard as the other outputs in this function)
    // instead of passing nullptr to ggml_set_output.
    const auto & output_layer_inp = params.cparams.output_layer_inp;

    for (size_t il = 0; il < output_layer_inp.size() && il < t_layer_inp.size(); ++il) {
        if (!output_layer_inp[il]) {
            continue;
        }

        if (t_layer_inp[il] != nullptr) {
            ggml_set_output(t_layer_inp[il]);
        }
    }
}
|
||||
for (auto & [seq_id, t] : t_sampled) {
|
||||
if (t != nullptr) {
|
||||
ggml_set_output(t);
|
||||
|
||||
+9
-5
@@ -645,6 +645,8 @@ public:
|
||||
ggml_tensor * get_embd() const { return t_embd; }
|
||||
ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
|
||||
|
||||
// returns the entry of t_layer_inp at index il; il is not bounds-checked and
// slots that no graph builder has assigned since reset() hold nullptr
ggml_tensor * get_layer_inp(int il) const { return t_layer_inp[il]; }
|
||||
|
||||
ggml_cgraph * get_gf() const { return gf; }
|
||||
ggml_context * get_ctx() const { return ctx_compute.get(); }
|
||||
|
||||
@@ -653,7 +655,7 @@ public:
|
||||
void reset();
|
||||
|
||||
void set_inputs(const llama_ubatch * ubatch);
|
||||
void set_outputs();
|
||||
void set_outputs(const llm_graph_params & params);
|
||||
|
||||
// try to update the existing graph result using the new graph parameters in order to reuse it
|
||||
// this can only be done if we determine that the resulting graph using the new graph parameters
|
||||
@@ -673,10 +675,12 @@ public:
|
||||
ggml_tensor * t_embd = nullptr;
|
||||
ggml_tensor * t_embd_pooled = nullptr;
|
||||
|
||||
std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
|
||||
std::map<llama_seq_id, ggml_tensor*> t_candidates;
|
||||
std::map<llama_seq_id, ggml_tensor*> t_sampled;
|
||||
std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
|
||||
std::vector<ggml_tensor *> t_layer_inp;
|
||||
|
||||
std::map<llama_seq_id, ggml_tensor *> t_sampled_logits;
|
||||
std::map<llama_seq_id, ggml_tensor *> t_candidates;
|
||||
std::map<llama_seq_id, ggml_tensor *> t_sampled;
|
||||
std::map<llama_seq_id, ggml_tensor *> t_sampled_probs;
|
||||
|
||||
std::vector<llm_graph_input_ptr> inputs;
|
||||
|
||||
|
||||
@@ -71,7 +71,7 @@ uint32_t llama_hparams::n_rot(uint32_t il) const {
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_inp() const {
|
||||
uint32_t n_embd_inp = n_embd;
|
||||
uint32_t n_embd_inp = n_embd_inp_impl > 0 ? n_embd_inp_impl : n_embd;
|
||||
|
||||
if (n_deepstack_layers > 0) {
|
||||
n_embd_inp += n_embd * n_deepstack_layers;
|
||||
|
||||
@@ -42,6 +42,7 @@ struct llama_hparams {
|
||||
|
||||
uint32_t n_ctx_train; // context size the model was trained on
|
||||
uint32_t n_embd;
|
||||
uint32_t n_embd_inp_impl = 0;
|
||||
uint32_t n_layer;
|
||||
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
|
||||
uint32_t n_expert = 0;
|
||||
|
||||
+9
-1
@@ -2071,7 +2071,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
// TODO: move reranking logic here and generalize
|
||||
llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers);
|
||||
|
||||
llm->res->set_outputs();
|
||||
llm->res->set_outputs(params);
|
||||
|
||||
return llm->res->get_gf();
|
||||
}
|
||||
@@ -2515,3 +2515,11 @@ void llama_model_base::create_tensor_qkv(llama_layer & layer, int bid,
|
||||
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the tok_embd tensor pointer stored on the model (the pointer itself, no copy).
// model is dereferenced without a null check.
ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model) {
|
||||
return model->tok_embd;
|
||||
}
|
||||
|
||||
// Overwrites the model's tok_embd pointer with tensor.
// No validation is done here and the previously stored pointer is simply replaced.
// NOTE(review): ownership and lifetime of both the old and the new tensor are not
// visible from this function - confirm who is responsible for freeing them.
void llama_model_set_tok_embd(struct llama_model * model, ggml_tensor * tensor) {
|
||||
model->tok_embd = tensor;
|
||||
}
|
||||
|
||||
@@ -123,6 +123,8 @@ llama_model_llama::graph<embed>::graph(const llama_model & model, const llm_grap
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
res->t_layer_inp[il] = inpL;
|
||||
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
|
||||
@@ -75,6 +75,8 @@ llama_model_openai_moe::graph::graph(const llama_model & model, const llm_graph_
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
res->t_layer_inp[il] = inpL;
|
||||
|
||||
const float freq_base_l = model.get_rope_freq_base (cparams, il);
|
||||
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
||||
|
||||
|
||||
@@ -68,6 +68,8 @@ llama_model_qwen3::graph::graph(const llama_model & model, const llm_graph_param
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
res->t_layer_inp[il] = inpL;
|
||||
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
|
||||
@@ -78,6 +78,8 @@ llama_model_qwen3moe::graph::graph(const llama_model & model, const llm_graph_pa
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
res->t_layer_inp[il] = inpL;
|
||||
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
|
||||
Reference in New Issue
Block a user