ggml : add context enumeration functions (#3605 )

finetune : fix assert failure in ggml-alloc
CLBlast: Fix matrix-vector multiplication (#3544 )
2023-10-13 12:23:10 +02:00 · 2023-10-12 21:59:47 +02:00
8 changed files with 149 additions and 53 deletions
@@ -1,14 +1,50 @@
 #include "sampling.h"

+llama_sampling_context::~llama_sampling_context() { // frees every grammar still held by an entry of sequence_contexts; the top-level `grammar` member is not touched here
+    for (auto & it : sequence_contexts) {
+        if (it.second.grammar != NULL) { // entries without a grammar hold NULL and are skipped
+            llama_grammar_free(it.second.grammar);
+            it.second.grammar = NULL; // clear the pointer once it has been freed
+        }
+    }
+}
+
 llama_sampling_context llama_sampling_context_init(
        const struct gpt_params & params,
                  llama_grammar * grammar) {
-    llama_sampling_context result;
+  llama_sampling_context result;

-    result.params = params.sampling_params;
-    result.grammar = grammar;
+  result.params = params.sampling_params;
+  result.grammar = grammar;
+  return result;
+}

-    return result;
+// Note: Creates the context if it doesn't exist, so this always returns something.
+llama_sampler_sequence_context & llama_sampling_get_sequence_context(
+              llama_sampling_context & ctx_sampling,
+        const llama_seq_id             seq) {
+    const auto it = ctx_sampling.sequence_contexts.find(seq);
+    if (it != ctx_sampling.sequence_contexts.end()) {
+        return it->second; // state already stored for this sequence id
+    }
+    llama_sampler_sequence_context new_ctx = {
+        2.0f * ctx_sampling.params.mirostat_tau, // initial value of mirostat_mu (first member of the struct)
+        ctx_sampling.grammar != NULL ? llama_grammar_copy(ctx_sampling.grammar) : NULL, // each sequence gets its own copy of the base grammar, or NULL when there is none
+    };
+    return ctx_sampling.sequence_contexts.insert({seq, new_ctx}).first->second; // reference to the element now stored in the map
+}
+
+bool llama_sampling_context_reset( // removes the stored state for `seq`; returns true if an entry was removed, false if none existed
+              llama_sampling_context & ctx_sampling,
+        const llama_seq_id             seq) {
+    const auto it = ctx_sampling.sequence_contexts.find(seq);
+    if (it == ctx_sampling.sequence_contexts.end()) return false; // nothing stored for this sequence id
+    if (it->second.grammar != NULL) {
+        llama_grammar_free(it->second.grammar); // free the entry's grammar before the entry itself is erased
+        it->second.grammar = NULL;
+    }
+    ctx_sampling.sequence_contexts.erase(it);
+    return true;
 }

 llama_token llama_sampling_sample(
@@ -17,7 +53,8 @@ llama_token llama_sampling_sample(
                  struct llama_sampling_context & ctx_sampling,
        const std::vector<llama_token> & last_tokens,
         std::vector<llama_token_data> & candidates,
-        const                      int   idx) {
+        const                      int   idx,
+                          llama_seq_id   seq) {
    const int n_ctx   = llama_n_ctx(ctx);
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));

@@ -78,8 +115,10 @@ llama_token llama_sampling_sample(
        }
    }

-    if (ctx_sampling.grammar != NULL) {
-        llama_sample_grammar(ctx, &cur_p, ctx_sampling.grammar);
+    llama_sampler_sequence_context & ctx_seq = llama_sampling_get_sequence_context(ctx_sampling, seq);
+
+    if (ctx_seq.grammar != NULL) {
+        llama_sample_grammar(ctx, &cur_p, ctx_seq.grammar);
    }

    if (temp <= 0) {
@@ -89,10 +128,10 @@ llama_token llama_sampling_sample(
        if (mirostat == 1) {
            const int mirostat_m = 100;
            llama_sample_temp(ctx, &cur_p, temp);
-            id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling.mirostat_mu);
+            id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_seq.mirostat_mu);
        } else if (mirostat == 2) {
            llama_sample_temp(ctx, &cur_p, temp);
-            id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling.mirostat_mu);
+            id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &ctx_seq.mirostat_mu);
        } else {
            // Temperature sampling
            size_t min_keep = std::max(1, params.n_probs);
@@ -119,8 +158,8 @@ llama_token llama_sampling_sample(
        }
    }

-    if (ctx_sampling.grammar != NULL) {
-        llama_grammar_accept_token(ctx, ctx_sampling.grammar, id);
+    if (ctx_seq.grammar != NULL) {
+        llama_grammar_accept_token(ctx, ctx_seq.grammar, id);
    }

    return id;
@@ -34,14 +34,27 @@ typedef struct llama_sampling_params {

 } llama_sampling_params;

+// per-sequence sampler context
+typedef struct llama_sampler_sequence_context {
+    float mirostat_mu; // mirostat sampler state
+    llama_grammar * grammar; // grammar for this sequence, may be NULL; freed by ~llama_sampling_context and llama_sampling_context_reset
+} llama_sampler_sequence_context;
+
 // general sampler context
 typedef struct llama_sampling_context {
-    // parameters that will be used for sampling
+    ~llama_sampling_context(); // frees the grammars held by sequence_contexts. NOTE(review): no copy operations are declared, so a copy made after entries exist would share (and later double-free) those grammar pointers — confirm instances are never copied once populated
+
+    // parameters that will be used for sampling and when creating
+    // new llama_sampler_sequence_context instances
    llama_sampling_params params;

-    // mirostat sampler state
-    float mirostat_mu;
+    // map of sequence ids to sampler contexts
+    std::unordered_map<llama_seq_id, llama_sampler_sequence_context> sequence_contexts;

+    // when non-NULL, new instances of llama_sampler_sequence_context
+    // will get a copy of the grammar here
+    // note: only the pointer is stored here, it is not a copy of
+    //       the grammar and shouldn't be freed
    llama_grammar * grammar;
 } llama_sampling_context;

@@ -52,6 +65,13 @@ llama_sampling_context llama_sampling_context_init(
        const struct gpt_params & params,
                  llama_grammar * grammar = NULL);

+// Fetches the sampler context for the specified sequence id (defaults to 0).
+// If the context for that sequence id doesn't already exist, it will be created with
+// default values based on the parameters in the ctx_sampling argument.
+llama_sampler_sequence_context & llama_sampling_get_sequence_context(
+              llama_sampling_context & ctx_sampling,
+        const llama_seq_id             seq = 0);
+
 // Reset the sampler context for the supplied sequence id (defaults to 0).
 // This is necessary to reuse a sequence id or free memory used by sequences
 // that are no longer required.
@@ -84,4 +104,5 @@ llama_token llama_sampling_sample(
                  struct llama_sampling_context & ctx_sampling,
        const std::vector<llama_token> & last_tokens,
         std::vector<llama_token_data> & candidates,
-        const                      int   idx = 0);
+        const                      int   idx = 0,
+                          llama_seq_id   seq = 0);
@@ -529,13 +529,14 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
    set_param_lora(lora);

    // measure data size
-    struct ggml_allocr * alloc = NULL;
-    alloc = ggml_allocr_new_measure(tensor_alignment);
-    alloc_lora(alloc, lora);
+    size_t size = 0;
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
+    }

    // allocate data
-    lora->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment);
-    ggml_allocr_free(alloc);
+    struct ggml_allocr * alloc = NULL;
+    lora->data.resize(size + tensor_alignment);
    alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment);
    alloc_lora(alloc, lora);
    ggml_allocr_free(alloc);
@@ -1714,11 +1715,9 @@ int main(int argc, char ** argv) {
    struct ggml_tensor * target_probs  = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab,  n_tokens, n_batch);

    // measure required memory for input tensors
-    alloc = ggml_allocr_new_measure(tensor_alignment);
-    ggml_allocr_alloc(alloc, tokens_input);
-    ggml_allocr_alloc(alloc, target_probs);
-    size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
-    ggml_allocr_free(alloc);
+    size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
+                            GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
+                            tensor_alignment;
    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));

    // allocate input tensors
@@ -69,8 +69,6 @@ struct client {
    std::string response;

    std::vector<llama_token> tokens_prev;
-
-    llama_sampling_context ctx_sampling;
 };

 static void print_date_time() {
@@ -127,6 +125,8 @@ int main(int argc, char ** argv) {
    params.logits_all = true;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);

+    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, NULL);
+
    // load the prompts from an external file if there are any
    if (params.prompt.empty()) {
        printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
@@ -156,7 +156,6 @@ int main(int argc, char ** argv) {
        client.id = i;
        client.tokens_prev.resize(std::max(256, params.n_predict));
        std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
-        client.ctx_sampling = llama_sampling_context_init(params, NULL);
    }

    std::vector<llama_token_data> candidates;
@@ -342,7 +341,7 @@ int main(int argc, char ** argv) {
                //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
                //        client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);

-                const llama_token id = llama_sampling_sample(ctx, NULL, client.ctx_sampling, client.tokens_prev, candidates, client.i_batch - i);
+                const llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, client.tokens_prev, candidates, client.i_batch - i, client.seq_id);

                if (client.n_decoded == 1) {
                    // start measuring generation time after the first token to make sure all concurrent clients
@@ -387,7 +386,7 @@ int main(int argc, char ** argv) {

                    n_total_prompt += client.n_prompt;
                    n_total_gen    += client.n_decoded;
-
+                    llama_sampling_context_reset(ctx_sampling, client.seq_id);
                    client.seq_id = -1;
                }

@@ -9,12 +9,6 @@
 #include <string>
 #include <vector>

-struct seq_draft {
-    std::vector<llama_token> tokens;
-
-    struct llama_grammar * grammar = NULL;
-};
-
 int main(int argc, char ** argv) {
    gpt_params params;

@@ -219,8 +213,13 @@ int main(int argc, char ** argv) {
            if (grammar_dft) {
                llama_grammar_free(grammar_dft);
            }
-
-            grammar_dft = llama_grammar_copy(ctx_sampling.grammar);
+            // Note: Hardcoded to sequence id 0, if this ever supports parallel generation
+            //       that will need to change.
+            auto it = ctx_sampling.sequence_contexts.find(0);
+            GGML_ASSERT(it != ctx_sampling.sequence_contexts.end());
+            // This is necessary because each sequence id in sequence_contexts
+            // uses a copy of the original grammar.
+            grammar_dft = llama_grammar_copy(it->second.grammar);

            LOG("copied target grammar to draft grammar\n");
        }
@@ -19,7 +19,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-#define CL_DMMV_BLOCK_SIZE 32
+#define CL_DMMV_LOCAL_SIZE 32

 #ifndef K_QUANTS_PER_ITERATION
 #define K_QUANTS_PER_ITERATION 1
@@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx,
    const int row = get_group_id(0);

    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);

    __global const struct block_q2_K * x = xx + ib0;

@@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx,
    const int row = get_group_id(0);

    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);

    __global const struct block_q3_K * x = xx + ib0;

@@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx,

    const int row = get_group_id(0);
    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);

    const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION;  // 0...15
    const int ix  = get_local_id(0)%K_QUANTS_PER_ITERATION;
@@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx,

    const int row = get_group_id(0);
    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);

    const int tid = get_local_id(0)/2;  // 0...15
    const int ix  = get_local_id(0)%2;
@@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
    const int row = get_group_id(0);

    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);

    __global const struct block_q6_K * x = xx + ib0;

@@ -745,19 +745,21 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {

 std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
-    const int block_size = get_local_size(0);
+    const int local_size = get_local_size(0);
    const int row = get_group_id(0);
    const int tid = get_local_id(0);

    const uint qk = QUANT_K;
    const uint qr = QUANT_R;

+    const int col_step = local_size * 2;
    const int y_offset = qr == 1 ? 1 : qk/2;

+    x += get_global_offset(0);
+
    tmp[tid] = 0;

-    for (int i = 0; i < ncols/block_size; i += 2) {
-        const int col = i*block_size + 2*tid;
+    for (int col = tid*2; col < ncols; col += col_step) {
        const int ib = (row*ncols + col)/qk; // block index
        const int iqs = (col%qk)/qr; // quant index
        const int iybs = col - col%qk; // y block start index
@@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float

    // sum up partial sums and write back result
    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=block_size/2; s>0; s>>=1) {
+    for (int s=local_size/2; s>0; s>>=1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
@@ -1704,7 +1706,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
    const int nb2  = dst->nb[2];
    const int nb3  = dst->nb[3];
    const ggml_type type = src0->type;
-    const bool mul_mat_vec = ne11 == 1;
+    const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;

    const int64_t r2 = ne12 / ne02;
    const int64_t r3 = ne13 / ne03;
@@ -1737,7 +1739,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
    GGML_ASSERT(to_fp32_cl != nullptr);

    const size_t global_denom = ggml_cl_global_denom(type);
-    const size_t local = ggml_cl_local_size(type);
+    const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);

    size_t ev_idx = 0;
    std::vector<cl_event> events;
@@ -1770,8 +1772,8 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));

                // compute
-                const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
-                const size_t local = CL_DMMV_BLOCK_SIZE;
+                const size_t global = ne01 * local;
+                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                const cl_int ncols = ne00;
                events.emplace_back();
                CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1779,7 +1781,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
                CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
                CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
            } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                // convert src0 to fp32 on device
                const size_t global = x_ne / global_denom;
@@ -5494,6 +5494,39 @@ struct ggml_tensor * ggml_view_tensor(
    return result;
 }

+struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) { // returns the first object of type GGML_OBJECT_TENSOR in ctx's object list, or NULL if there is none
+    struct ggml_object * obj = ctx->objects_begin;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        if (obj->type == GGML_OBJECT_TENSOR) { // objects of any other type are skipped
+            return (struct ggml_tensor *)(mem_buffer + obj->offs); // obj->offs is applied as a byte offset into the context's mem_buffer
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
+struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) { // returns the next GGML_OBJECT_TENSOR object after `tensor` in the object list, or NULL at the end; `tensor` is used without a NULL check
+    struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE); // assumes the tensor's ggml_object header sits GGML_OBJECT_SIZE bytes before it, i.e. `tensor` was allocated from a ggml context — TODO confirm
+    obj = obj->next;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        if (obj->type == GGML_OBJECT_TENSOR) { // objects of any other type are skipped
+            return (struct ggml_tensor *)(mem_buffer + obj->offs); // obj->offs is applied as a byte offset into ctx's mem_buffer
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
 struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
    struct ggml_object * obj = ctx->objects_begin;

@@ -8647,6 +8680,7 @@ void ggml_set_param(

    GGML_ASSERT(tensor->grad == NULL);
    tensor->grad = ggml_dup_tensor(ctx, tensor);
+    ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
 }

 // ggml_compute_forward_dup
@@ -704,6 +704,9 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
    GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);

+    // Context tensor enumeration and lookup
+    GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
+    GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
Author	SHA1	Message	Date
slaren	424b6381c4	ggml : add context enumeration functions (#3605 ) finetune : fix assert failure in ggml-alloc	2023-10-13 12:23:10 +02:00
shibe2	1e0e873c37	CLBlast: Fix matrix-vector multiplication (#3544 )	2023-10-12 21:59:47 +02:00