Compare commits

..

14 Commits

Author SHA1 Message Date
shibe2 40e5ce054f CLBlast: Fix temporary buffer size for f16 conversion (wsize)
Fix buffer overflow.
Reduce the size to fit just one 2D slice.
Assert sufficient size.
2023-10-17 21:02:30 +04:00
slaren a5e8c1d8c7 train-text-from-scratch : fix assert failure in ggml-alloc (#3618) 2023-10-17 20:00:58 +03:00
Georgi Gerganov e74c705e15 editorconfig : remove trailing spaces 2023-10-17 19:52:53 +03:00
coezbek 3ad1e3f1a1 server : documentation of JSON return value of /completion endpoint (#3632)
* Added documentation of JSON return value of /completion endpoint

* Update examples/server/README.md

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-10-17 19:51:02 +03:00
Georgi Gerganov 1142013da4 save-load-state : fix example + add ci test (#3655)
* save-load-state : fix example (close #3606)

* ci : add test for save-load-state example

ggml-ci
2023-10-17 19:12:46 +03:00
ldwang 5fe268a4d9 readme : add Aquila2 links (#3610)
Signed-off-by: ldwang <ftgreat@gmail.com>
Co-authored-by: ldwang <ftgreat@gmail.com>
2023-10-17 18:52:33 +03:00
staviq 1a159553f9 tokenizer : special token handling (#3538)
* Rewrite special token handling from #1931

* shorten param name, add st verification by type

* use offsets instead of copy by substr

* formatting, remove copying iterator on delete

* llama : normalize code-style

* swift fix

* print pfx/sfx if verb, main: split pfx input sfx

* dont add space when using special tokens

* minor : comment + spacing

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-10-17 18:11:01 +03:00
Georgi Gerganov 281ef73c25 k-quants : fix quantization ranges (#3646) 2023-10-17 09:19:28 +03:00
Georgi Gerganov 940efa95fe llava : fix tokenization to not add bos between image embeddings and user prompt (#3645)
* llava : fix tokenization to not add bos after system prompt

* set seed

---------

Co-authored-by: M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
2023-10-16 23:58:00 +03:00
cebtenzzre 11bff29045 MPT : support GQA for replit-code-v1.5 (#3627) 2023-10-15 09:32:06 +03:00
M. Yusuf Sarıgöz 11dc1091f6 Honor -ngl option for Cuda offloading in llava (#3621) 2023-10-14 04:52:44 -06:00
Daniel Bevenius 2a4bcbacea llama : remove n_threads from llama_decode_internal (#3614)
This commit removes `n_threads` from the `llama_decode_internal`
functions doc comment as it does not exist anymore.

It looks like this parameter was removed in
Commit 16bc66d947 ("llama.cpp : split
llama_context_params into model and context params").

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
2023-10-13 13:33:16 +03:00
slaren 424b6381c4 ggml : add context enumeration functions (#3605)
finetune : fix assert failure in ggml-alloc
2023-10-13 12:23:10 +02:00
shibe2 1e0e873c37 CLBlast: Fix matrix-vector multiplication (#3544) 2023-10-12 21:59:47 +02:00
24 changed files with 618 additions and 194 deletions
+1
View File
@@ -93,6 +93,7 @@ as the main playground for developing new features for the [ggml](https://github
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
- [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
- [X] [Aquila2-7B](https://huggingface.co/BAAI/Aquila2-7B) / [AquilaChat2-7B](https://huggingface.co/BAAI/AquilaChat2-7B) / [AquilaChat2-34B](https://huggingface.co/BAAI/AquilaChat2-34B) / [Aquila2-34B](https://huggingface.co/BAAI/Aquila2-34B)
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
+6
View File
@@ -208,6 +208,8 @@ function gg_run_open_llama_3b_v2 {
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
@@ -296,6 +298,7 @@ function gg_sum_open_llama_3b_v2 {
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
@@ -382,6 +385,8 @@ function gg_run_open_llama_7b_v2 {
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
@@ -470,6 +475,7 @@ function gg_sum_open_llama_7b_v2 {
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
#gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
+7 -5
View File
@@ -879,21 +879,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
std::vector<llama_token> llama_tokenize(
const struct llama_context * ctx,
const std::string & text,
bool add_bos) {
return llama_tokenize(llama_get_model(ctx), text, add_bos);
bool add_bos,
bool special) {
return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
}
std::vector<llama_token> llama_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_bos) {
bool add_bos,
bool special) {
// upper limit for the number of tokens
int n_tokens = text.length() + add_bos;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos);
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos);
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
+4 -2
View File
@@ -137,12 +137,14 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
std::vector<llama_token> llama_tokenize(
const struct llama_context * ctx,
const std::string & text,
bool add_bos);
bool add_bos,
bool special = false);
std::vector<llama_token> llama_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_bos);
bool add_bos,
bool special = false);
// tokenizes a token into a piece
// should work similar to Python's `tokenizer.id_to_piece`
+50 -11
View File
@@ -1,14 +1,50 @@
#include "sampling.h"
llama_sampling_context::~llama_sampling_context() {
for (auto & it : sequence_contexts) {
if (it.second.grammar != NULL) {
llama_grammar_free(it.second.grammar);
it.second.grammar = NULL;
}
}
}
llama_sampling_context llama_sampling_context_init(
const struct gpt_params & params,
llama_grammar * grammar) {
llama_sampling_context result;
llama_sampling_context result;
result.params = params.sampling_params;
result.grammar = grammar;
result.params = params.sampling_params;
result.grammar = grammar;
return result;
}
return result;
// Note: Creates the context if it doesn't exist, so this always return something.
llama_sampler_sequence_context & llama_sampling_get_sequence_context(
llama_sampling_context & ctx_sampling,
const llama_seq_id seq) {
const auto it = ctx_sampling.sequence_contexts.find(seq);
if (it != ctx_sampling.sequence_contexts.end()) {
return it->second;
}
llama_sampler_sequence_context new_ctx = {
2.0f * ctx_sampling.params.mirostat_tau,
ctx_sampling.grammar != NULL ? llama_grammar_copy(ctx_sampling.grammar) : NULL,
};
return ctx_sampling.sequence_contexts.insert({seq, new_ctx}).first->second;
}
bool llama_sampling_context_reset(
llama_sampling_context & ctx_sampling,
const llama_seq_id seq) {
const auto it = ctx_sampling.sequence_contexts.find(seq);
if (it == ctx_sampling.sequence_contexts.end()) return false;
if (it->second.grammar != NULL) {
llama_grammar_free(it->second.grammar);
it->second.grammar = NULL;
}
ctx_sampling.sequence_contexts.erase(it);
return true;
}
llama_token llama_sampling_sample(
@@ -17,7 +53,8 @@ llama_token llama_sampling_sample(
struct llama_sampling_context & ctx_sampling,
const std::vector<llama_token> & last_tokens,
std::vector<llama_token_data> & candidates,
const int idx) {
const int idx,
llama_seq_id seq) {
const int n_ctx = llama_n_ctx(ctx);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
@@ -78,8 +115,10 @@ llama_token llama_sampling_sample(
}
}
if (ctx_sampling.grammar != NULL) {
llama_sample_grammar(ctx, &cur_p, ctx_sampling.grammar);
llama_sampler_sequence_context & ctx_seq = llama_sampling_get_sequence_context(ctx_sampling, seq);
if (ctx_seq.grammar != NULL) {
llama_sample_grammar(ctx, &cur_p, ctx_seq.grammar);
}
if (temp <= 0) {
@@ -89,10 +128,10 @@ llama_token llama_sampling_sample(
if (mirostat == 1) {
const int mirostat_m = 100;
llama_sample_temp(ctx, &cur_p, temp);
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling.mirostat_mu);
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_seq.mirostat_mu);
} else if (mirostat == 2) {
llama_sample_temp(ctx, &cur_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling.mirostat_mu);
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &ctx_seq.mirostat_mu);
} else {
// Temperature sampling
size_t min_keep = std::max(1, params.n_probs);
@@ -119,8 +158,8 @@ llama_token llama_sampling_sample(
}
}
if (ctx_sampling.grammar != NULL) {
llama_grammar_accept_token(ctx, ctx_sampling.grammar, id);
if (ctx_seq.grammar != NULL) {
llama_grammar_accept_token(ctx, ctx_seq.grammar, id);
}
return id;
+25 -4
View File
@@ -34,14 +34,27 @@ typedef struct llama_sampling_params {
} llama_sampling_params;
// per-sequence sampler context
typedef struct llama_sampler_sequence_context {
float mirostat_mu; // mirostat sampler state
llama_grammar * grammar;
} llama_sampler_sequence_context;
// general sampler context
typedef struct llama_sampling_context {
// parameters that will be used for sampling
~llama_sampling_context();
// parameters that will be used for sampling and when creating
// new llama_sampler_sequence_context instances
llama_sampling_params params;
// mirostat sampler state
float mirostat_mu;
// map of sequence ids to sampler contexts
std::unordered_map<llama_seq_id, llama_sampler_sequence_context> sequence_contexts;
// when non-NULL, new instances of llama_sampler_sequence_context
// will get a copy of the grammar here
// note: only the pointer is stored here, it is not a copy of
// the grammar and shouldn't be freed
llama_grammar * grammar;
} llama_sampling_context;
@@ -52,6 +65,13 @@ llama_sampling_context llama_sampling_context_init(
const struct gpt_params & params,
llama_grammar * grammar = NULL);
// Fetches the sampler context for the specified sequence id (defaults to 0).
// If the context for that sequence id doesn't already exist, it will be created with
// default values based on the parameters in the ctx_sampling argument.
llama_sampler_sequence_context & llama_sampling_get_sequence_context(
llama_sampling_context & ctx_sampling,
const llama_seq_id seq = 0);
// Reset the sampler context for the supplied sequence id (defaults to 0).
// This is necessary to reuse a sequence id or free memory used by sequences
// that are no longer required.
@@ -84,4 +104,5 @@ llama_token llama_sampling_sample(
struct llama_sampling_context & ctx_sampling,
const std::vector<llama_token> & last_tokens,
std::vector<llama_token_data> & candidates,
const int idx = 0);
const int idx = 0,
llama_seq_id seq = 0);
+4 -4
View File
@@ -863,7 +863,7 @@ size_t tokenize_file(
(int) buf.size(),
out_tokens.data(),
(int) out_tokens.size(),
false);
false, false);
if (n_tokens < 0) {
out_tokens.resize(-n_tokens);
n_tokens = llama_tokenize(
@@ -872,7 +872,7 @@ size_t tokenize_file(
(int) buf.size(),
out_tokens.data(),
(int) out_tokens.size(),
false);
false, false);
}
if (n_tokens >= 0) {
out_tokens.resize(n_tokens);
@@ -966,7 +966,7 @@ size_t tokenize_file(
(int) buf_sample.size(),
tok_sample.data(),
(int) tok_sample.size(),
false);
false, false);
if (n_tokens < 0) {
tok_sample.resize(-n_tokens);
n_tokens = llama_tokenize(llama_get_model(lctx),
@@ -974,7 +974,7 @@ size_t tokenize_file(
(int) buf_sample.size(),
tok_sample.data(),
(int) tok_sample.size(),
false);
false, false);
GGML_ASSERT(n_tokens >= 0);
}
GGML_ASSERT(n_tokens <= (int) tok_sample.size());
+2
View File
@@ -98,6 +98,8 @@ gguf_writer.add_embedding_length(hparams["d_model"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(4 * hparams["d_model"])
gguf_writer.add_head_count(hparams["n_heads"])
if kv_n_heads := hparams["attn_config"].get("kv_n_heads"):
gguf_writer.add_head_count_kv(kv_n_heads)
gguf_writer.add_layer_norm_eps(1e-05)
if hparams["attn_config"]["clip_qkv"] is not None:
gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])
+1 -1
View File
@@ -209,7 +209,7 @@ llama_print_timings(context)
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let n_tokens = text.count + (add_bos ? 1 : 0)
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos)
let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
var swiftTokens: [llama_token] = []
for i in 0 ..< tokenCount {
swiftTokens.append(tokens[Int(i)])
+9 -10
View File
@@ -529,13 +529,14 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
set_param_lora(lora);
// measure data size
struct ggml_allocr * alloc = NULL;
alloc = ggml_allocr_new_measure(tensor_alignment);
alloc_lora(alloc, lora);
size_t size = 0;
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
}
// allocate data
lora->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment);
ggml_allocr_free(alloc);
struct ggml_allocr * alloc = NULL;
lora->data.resize(size + tensor_alignment);
alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment);
alloc_lora(alloc, lora);
ggml_allocr_free(alloc);
@@ -1714,11 +1715,9 @@ int main(int argc, char ** argv) {
struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
// measure required memory for input tensors
alloc = ggml_allocr_new_measure(tensor_alignment);
ggml_allocr_alloc(alloc, tokens_input);
ggml_allocr_alloc(alloc, target_probs);
size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
ggml_allocr_free(alloc);
size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
tensor_alignment;
printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
// allocate input tensors
+2 -2
View File
@@ -49,9 +49,9 @@ inline bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
return eval_tokens(ctx_llama, tokens, 1, n_past);
}
inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past){
inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, true);
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
return true;
}
+14 -6
View File
@@ -79,7 +79,13 @@ int main(int argc, char ** argv) {
llama_backend_init(params.numa);
llama_model_params model_params = llama_model_default_params();
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = params.n_gpu_layers;
model_params.main_gpu = params.main_gpu;
model_params.tensor_split = params.tensor_split;
model_params.use_mmap = params.use_mmap;
model_params.use_mlock = params.use_mlock;
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
@@ -91,6 +97,7 @@ int main(int argc, char ** argv) {
ctx_params.n_ctx = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
ctx_params.seed = params.seed;
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
@@ -100,7 +107,8 @@ int main(int argc, char ** argv) {
}
// make sure that the correct mmproj was used, i.e., compare apples to apples
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
const int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
if (n_img_embd != n_llama_embd) {
printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_img_embd, n_llama_embd);
@@ -119,14 +127,14 @@ int main(int argc, char ** argv) {
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
// GG: are we sure that the should be a trailing whitespace at the end of this string?
eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER: ", params.n_batch, &n_past);
eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params.n_batch, &n_past, true);
eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past);
eval_string(ctx_llama, params.prompt.c_str(), params.n_batch, &n_past);
eval_string(ctx_llama, "\nASSISTANT:", params.n_batch, &n_past);
eval_string(ctx_llama, (params.prompt + "\nASSISTANT:").c_str(), params.n_batch, &n_past, false);
// generate the response
printf("\n");
printf("prompt: '%s'\n", params.prompt.c_str());
printf("\n");
for (int i = 0; i < max_tgt_len; i++) {
+30 -10
View File
@@ -238,7 +238,7 @@ int main(int argc, char ** argv) {
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
} else {
LOG("use session tokens\n");
embd_inp = session_tokens;
@@ -260,10 +260,10 @@ int main(int argc, char ** argv) {
if (ctx_guidance) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));
original_prompt_len = original_inp.size();
@@ -320,8 +320,8 @@ int main(int argc, char ** argv) {
}
// prefix & suffix for instruct mode
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
@@ -383,6 +383,12 @@ int main(int argc, char ** argv) {
if (!params.antiprompt.empty()) {
for (const auto & antiprompt : params.antiprompt) {
LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
}
@@ -392,10 +398,22 @@ int main(int argc, char ** argv) {
if (!params.input_prefix.empty()) {
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
if (!params.input_suffix.empty()) {
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
}
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
@@ -717,7 +735,7 @@ int main(int argc, char ** argv) {
if (params.interactive) {
if (!params.antiprompt.empty()) {
// tokenize and inject first reverse prompt
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
is_antiprompt = true;
}
@@ -744,8 +762,7 @@ int main(int argc, char ** argv) {
std::string buffer;
if (!params.input_prefix.empty()) {
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
buffer += params.input_prefix;
printf("%s", buffer.c_str());
printf("%s", params.input_prefix.c_str());
}
// color user input only
@@ -767,7 +784,6 @@ int main(int argc, char ** argv) {
// append input suffix if any
if (!params.input_suffix.empty()) {
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
buffer += params.input_suffix;
printf("%s", params.input_suffix.c_str());
}
@@ -782,10 +798,14 @@ int main(int argc, char ** argv) {
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
}
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));
embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
// instruct mode: insert response suffix
if (params.instruct) {
+4 -5
View File
@@ -69,8 +69,6 @@ struct client {
std::string response;
std::vector<llama_token> tokens_prev;
llama_sampling_context ctx_sampling;
};
static void print_date_time() {
@@ -127,6 +125,8 @@ int main(int argc, char ** argv) {
params.logits_all = true;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, NULL);
// load the prompts from an external file if there are any
if (params.prompt.empty()) {
printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
@@ -156,7 +156,6 @@ int main(int argc, char ** argv) {
client.id = i;
client.tokens_prev.resize(std::max(256, params.n_predict));
std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
client.ctx_sampling = llama_sampling_context_init(params, NULL);
}
std::vector<llama_token_data> candidates;
@@ -342,7 +341,7 @@ int main(int argc, char ** argv) {
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
const llama_token id = llama_sampling_sample(ctx, NULL, client.ctx_sampling, client.tokens_prev, candidates, client.i_batch - i);
const llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, client.tokens_prev, candidates, client.i_batch - i, client.seq_id);
if (client.n_decoded == 1) {
// start measuring generation time after the first token to make sure all concurrent clients
@@ -387,7 +386,7 @@ int main(int argc, char ** argv) {
n_total_prompt += client.n_prompt;
n_total_gen += client.n_decoded;
llama_sampling_context_reset(ctx_sampling, client.seq_id);
client.seq_id = -1;
}
+46 -51
View File
@@ -8,10 +8,7 @@
int main(int argc, char ** argv) {
gpt_params params;
llama_sampling_params & sparams = params.sampling_params;
params.seed = 42;
params.n_threads = 4;
sparams.repeat_last_n = 64;
params.prompt = "The quick brown fox";
if (!gpt_params_parse(argc, argv, params)) {
@@ -25,56 +22,49 @@ int main(int argc, char ** argv) {
}
auto n_past = 0;
auto last_n_tokens_data = std::vector<llama_token>(sparams.repeat_last_n, 0);
std::string result0;
std::string result1;
// init
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params( params );
if (model == nullptr) {
return 1;
}
if (ctx == nullptr) {
llama_free_model(model);
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
return 1;
}
// tokenize prompt
auto tokens = llama_tokenize(ctx, params.prompt, true);
auto n_prompt_tokens = tokens.size();
if (n_prompt_tokens < 1) {
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
llama_free(ctx);
llama_free_model(model);
return 1;
}
// evaluate prompt
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt_tokens, n_past, 0));
llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
n_past += tokens.size();
last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
n_past += n_prompt_tokens;
const size_t state_size = llama_get_state_size(ctx);
uint8_t * state_mem = new uint8_t[state_size];
// Save state (rng, logits, embedding and kv_cache) to file
// save state (rng, logits, embedding and kv_cache) to file
{
FILE *fp_write = fopen("dump_state.bin", "wb");
llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
fwrite(state_mem, 1, state_size, fp_write);
fclose(fp_write);
std::vector<uint8_t> state_mem(llama_get_state_size(ctx));
{
FILE *fp_write = fopen("dump_state.bin", "wb");
llama_copy_state_data(ctx, state_mem.data()); // could also copy directly to memory mapped file
fwrite(state_mem.data(), 1, state_mem.size(), fp_write);
fclose(fp_write);
}
}
// save state (last tokens)
const auto last_n_tokens_data_saved = std::vector<llama_token>(last_n_tokens_data);
const auto n_past_saved = n_past;
// first run
printf("\n%s", params.prompt.c_str());
printf("\nfirst run: %s", params.prompt.c_str());
for (auto i = 0; i < params.n_predict; i++) {
auto * logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(model);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@@ -83,9 +73,10 @@ int main(int argc, char ** argv) {
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
auto next_token = llama_sample_token(ctx, &candidates_p);
auto next_token_str = llama_token_to_piece(ctx, next_token);
last_n_tokens_data.push_back(next_token);
printf("%s", next_token_str.c_str());
result0 += next_token_str;
if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx);
@@ -103,32 +94,28 @@ int main(int argc, char ** argv) {
// make new context
auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
// Load state (rng, logits, embedding and kv_cache) from file
{
FILE *fp_read = fopen("dump_state.bin", "rb");
if (state_size != llama_get_state_size(ctx2)) {
fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
llama_free(ctx2);
llama_free_model(model);
return 1;
}
printf("\nsecond run: %s", params.prompt.c_str());
const size_t ret = fread(state_mem, 1, state_size, fp_read);
if (ret != state_size) {
// load state (rng, logits, embedding and kv_cache) from file
{
std::vector<uint8_t> state_mem(llama_get_state_size(ctx2));
FILE * fp_read = fopen("dump_state.bin", "rb");
const size_t ret = fread(state_mem.data(), 1, state_mem.size(), fp_read);
if (ret != state_mem.size()) {
fprintf(stderr, "\n%s : failed to read state\n", __func__);
llama_free(ctx2);
llama_free_model(model);
return 1;
}
llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
llama_set_state_data(ctx2, state_mem.data());
fclose(fp_read);
}
delete[] state_mem;
// restore state (last tokens)
last_n_tokens_data = last_n_tokens_data_saved;
n_past = n_past_saved;
// second run
@@ -143,10 +130,11 @@ int main(int argc, char ** argv) {
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
auto next_token = llama_sample_token(ctx2, &candidates_p);
auto next_token_str = llama_token_to_piece(ctx2, next_token);
last_n_tokens_data.push_back(next_token);
printf("%s", next_token_str.c_str());
if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
result1 += next_token_str;
if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx2);
llama_free_model(model);
@@ -155,10 +143,17 @@ int main(int argc, char ** argv) {
n_past += 1;
}
printf("\n\n");
printf("\n");
llama_free(ctx2);
llama_free_model(model);
if (result0 != result1) {
fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
return 1;
}
fprintf(stderr, "\n%s : success\n", __func__);
return 0;
}
+36 -6
View File
@@ -106,25 +106,25 @@ node index.js
## API Endpoints
- **POST** `/completion`: Given a prompt, it returns the predicted completion.
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
*Options:*
`prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. If the prompt is a string or an array with the first element given as a string, a `bos` token is inserted in the front like `main` does.
`temperature`: Adjust the randomness of the generated text (default: 0.8).
`top_k`: Limit the next token selection to the K most probable tokens (default: 40).
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).
`n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
`n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the prompt.
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
`prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does.
`stop`: Specify a JSON array of stopping strings.
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
@@ -158,6 +158,36 @@ node index.js
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
*Result JSON:*
Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
`content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
`stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
`generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
`model`: The path to the model loaded with `-m`
`prompt`: The provided `prompt`
`stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
`stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
`stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
`stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
`timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
`tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
`tokens_evaluated`: Number of tokens evaluated in total from the prompt
`truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
- **POST** `/tokenize`: Tokenize a given text.
*Options:*
+7 -8
View File
@@ -9,12 +9,6 @@
#include <string>
#include <vector>
struct seq_draft {
std::vector<llama_token> tokens;
struct llama_grammar * grammar = NULL;
};
int main(int argc, char ** argv) {
gpt_params params;
@@ -219,8 +213,13 @@ int main(int argc, char ** argv) {
if (grammar_dft) {
llama_grammar_free(grammar_dft);
}
grammar_dft = llama_grammar_copy(ctx_sampling.grammar);
// Note: Hardcoded to sequence id 0, if this ever supports parallel generation
// that will need to change.
auto it = ctx_sampling.sequence_contexts.find(0);
GGML_ASSERT(it != ctx_sampling.sequence_contexts.end());
// This is necessary because each sequence id in sequence_contexts
// uses a copy of the original grammar.
grammar_dft = llama_grammar_copy(it->second.grammar);
LOG("copied target grammar to draft grammar\n");
}
@@ -253,13 +253,14 @@ static void init_model(struct my_llama_model * model) {
set_param_model(model);
// measure data size
struct ggml_allocr * alloc = NULL;
alloc = ggml_allocr_new_measure(tensor_alignment);
alloc_model(alloc, model);
size_t size = 0;
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
}
// allocate data
model->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment);
ggml_allocr_free(alloc);
struct ggml_allocr * alloc = NULL;
model->data.resize(size + tensor_alignment);
alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment);
alloc_model(alloc, model);
ggml_allocr_free(alloc);
@@ -1094,11 +1095,9 @@ int main(int argc, char ** argv) {
struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
// measure required memory for input tensors
alloc = ggml_allocr_new_measure(tensor_alignment);
ggml_allocr_alloc(alloc, tokens_input);
ggml_allocr_alloc(alloc, target_probs);
size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
ggml_allocr_free(alloc);
size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
tensor_alignment;
printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
// allocate input tensors
+24 -19
View File
@@ -19,7 +19,7 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#define CL_DMMV_BLOCK_SIZE 32
#define CL_DMMV_LOCAL_SIZE 32
#ifndef K_QUANTS_PER_ITERATION
#define K_QUANTS_PER_ITERATION 1
@@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx,
const int row = get_group_id(0);
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const int ib0 = row*num_blocks_per_row + get_global_offset(0);
__global const struct block_q2_K * x = xx + ib0;
@@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx,
const int row = get_group_id(0);
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const int ib0 = row*num_blocks_per_row + get_global_offset(0);
__global const struct block_q3_K * x = xx + ib0;
@@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx,
const int row = get_group_id(0);
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const int ib0 = row*num_blocks_per_row + get_global_offset(0);
const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15
const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
@@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx,
const int row = get_group_id(0);
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const int ib0 = row*num_blocks_per_row + get_global_offset(0);
const int tid = get_local_id(0)/2; // 0...15
const int ix = get_local_id(0)%2;
@@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
const int row = get_group_id(0);
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const int ib0 = row*num_blocks_per_row + get_global_offset(0);
__global const struct block_q6_K * x = xx + ib0;
@@ -745,19 +745,21 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
const int block_size = get_local_size(0);
const int local_size = get_local_size(0);
const int row = get_group_id(0);
const int tid = get_local_id(0);
const uint qk = QUANT_K;
const uint qr = QUANT_R;
const int col_step = local_size * 2;
const int y_offset = qr == 1 ? 1 : qk/2;
x += get_global_offset(0);
tmp[tid] = 0;
for (int i = 0; i < ncols/block_size; i += 2) {
const int col = i*block_size + 2*tid;
for (int col = tid*2; col < ncols; col += col_step) {
const int ib = (row*ncols + col)/qk; // block index
const int iqs = (col%qk)/qr; // quant index
const int iybs = col - col%qk; // y block start index
@@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
// sum up partial sums and write back result
barrier(CLK_LOCAL_MEM_FENCE);
for (int s=block_size/2; s>0; s>>=1) {
for (int s=local_size/2; s>0; s>>=1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
@@ -1566,7 +1568,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
ggml_cl_pool_free(d_D, d_size);
}
static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
GGML_ASSERT(fp16_support);
const int64_t ne00 = src0->ne[0];
@@ -1596,6 +1598,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
const int y_ne = ne11 * ne10;
const int d_ne = ne11 * ne01;
GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne);
GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne);
ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata;
size_t x_size;
size_t y_size;
size_t d_size;
@@ -1632,7 +1638,6 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
// convert src1 to fp16
// TODO: use multiple threads
ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
if (src1_cont_rows) {
if (src1_cont_cols) {
@@ -1704,7 +1709,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
const ggml_type type = src0->type;
const bool mul_mat_vec = ne11 == 1;
const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;
const int64_t r2 = ne12 / ne02;
const int64_t r3 = ne13 / ne03;
@@ -1737,7 +1742,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
GGML_ASSERT(to_fp32_cl != nullptr);
const size_t global_denom = ggml_cl_global_denom(type);
const size_t local = ggml_cl_local_size(type);
const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);
size_t ev_idx = 0;
std::vector<cl_event> events;
@@ -1770,8 +1775,8 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
// compute
const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
const size_t local = CL_DMMV_BLOCK_SIZE;
const size_t global = ne01 * local;
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
const cl_int ncols = ne00;
events.emplace_back();
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1779,7 +1784,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
} else { // general dequantization kernel + CLBlast matrix matrix multiplication
// convert src0 to fp32 on device
const size_t global = x_ne / global_denom;
@@ -1895,8 +1900,8 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor *
}
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
return ggml_nelements(src1) * sizeof(ggml_fp16_t);
if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]);
}
return 0;
}
+34
View File
@@ -5494,6 +5494,39 @@ struct ggml_tensor * ggml_view_tensor(
return result;
}
struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
struct ggml_object * obj = ctx->objects_begin;
char * const mem_buffer = ctx->mem_buffer;
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TENSOR) {
return (struct ggml_tensor *)(mem_buffer + obj->offs);
}
obj = obj->next;
}
return NULL;
}
struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
obj = obj->next;
char * const mem_buffer = ctx->mem_buffer;
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TENSOR) {
return (struct ggml_tensor *)(mem_buffer + obj->offs);
}
obj = obj->next;
}
return NULL;
}
struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
struct ggml_object * obj = ctx->objects_begin;
@@ -8647,6 +8680,7 @@ void ggml_set_param(
GGML_ASSERT(tensor->grad == NULL);
tensor->grad = ggml_dup_tensor(ctx, tensor);
ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
}
// ggml_compute_forward_dup
+3
View File
@@ -704,6 +704,9 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
// Context tensor enumeration and lookup
GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
+11 -19
View File
@@ -462,12 +462,9 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
}
size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
const int nb = k / QK_K;
(void)hist; // TODO: collect histograms
// TODO - collect histograms - although, at a second thought, I don't really care about them
(void)hist;
for (int j = 0; j < nb; j += k) {
for (int j = 0; j < n; j += k) {
block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
quantize_row_q2_K_reference(src + j, y, k);
}
@@ -678,12 +675,9 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
}
size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
const int nb = k / QK_K;
(void)hist; // TODO: collect histograms
// TODO - collect histograms - although, at a second thought, I don't really care about them
(void)hist;
for (int j = 0; j < nb; j += k) {
for (int j = 0; j < n; j += k) {
block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
quantize_row_q3_K_reference(src + j, y, k);
}
@@ -846,9 +840,9 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
(void)hist; // TODO: collect histograms
for (int j = 0; j < nb; j += k) {
for (int j = 0; j < n; j += k) {
block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
quantize_row_q4_K_reference(src + j, y, k);
}
@@ -1052,9 +1046,9 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
(void)hist;
for (int j = 0; j < nb; j += k) {
(void)hist; // TODO: collect histograms
for (int j = 0; j < n; j += k) {
block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
quantize_row_q5_K_reference(src + j, y, k);
}
@@ -1200,11 +1194,9 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
(void)hist; // TODO: collect histograms
(void)hist; // TODO
for (int j = 0; j < nb; j += k) {
for (int j = 0; j < n; j += k) {
block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
quantize_row_q6_K_reference(src + j, y, k);
}
+281 -16
View File
@@ -75,6 +75,7 @@
#include <thread>
#include <unordered_map>
#include <set>
#include <forward_list>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -1183,6 +1184,8 @@ struct llama_vocab {
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
std::unordered_map<token, id> special_tokens_cache;
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
// default LLaMA special tokens
@@ -2125,7 +2128,7 @@ static void llm_load_hparams(
}
// TODO: This should probably be in llama.h
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
static void llm_load_vocab(
@@ -2241,6 +2244,101 @@ static void llm_load_vocab(
GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
// build special tokens cache
{
// TODO: It is unclear (to me) at this point, whether special tokes are guaranteed to be of a deterministic type,
// and will always be correctly labeled in 'added_tokens.json' etc.
// The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
// to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
// are special tokens.
// From testing, this appears to corelate 1:1 with special tokens.
//
// Counting special tokens and verifying in only one direction
// is sufficient to detect difference in those two sets.
//
uint32_t special_tokens_count_by_type = 0;
uint32_t special_tokens_count_from_verification = 0;
bool special_tokens_definition_mismatch = false;
for (const auto & t : vocab.token_to_id) {
const auto & token = t.first;
const auto & id = t.second;
// Count all non-normal tokens in the vocab while iterating
if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
special_tokens_count_by_type++;
}
// Skip single character tokens
if (token.length() > 1) {
bool is_tokenizable = false;
// Split token string representation in two, in all possible ways
// and check if both halves can be matched to a valid token
for (unsigned i = 1; i < token.length();) {
const auto left = token.substr(0, i);
const auto right = token.substr(i);
// check if we didnt partition in the middle of a utf sequence
auto utf = utf8_len(left.at(left.length() - 1));
if (utf == 1) {
if (vocab.token_to_id.find(left) != vocab.token_to_id.end() &&
vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
is_tokenizable = true;
break;
}
i++;
} else {
// skip over the rest of multibyte utf sequence
i += utf - 1;
}
}
if (!is_tokenizable) {
// Some tokens are multibyte, but they are utf sequences with equivalent text length of 1
// it's faster to re-filter them here, since there are way less candidates now
// Calculate a total "utf" length of a token string representation
size_t utf8_str_len = 0;
for (unsigned i = 0; i < token.length();) {
utf8_str_len++;
i += utf8_len(token.at(i));
}
// And skip the ones which are one character
if (utf8_str_len > 1) {
// At this point what we have left are special tokens only
vocab.special_tokens_cache[token] = id;
// Count manually found special tokens
special_tokens_count_from_verification++;
// If this manually found special token is not marked as such, flag a mismatch
if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) {
special_tokens_definition_mismatch = true;
}
}
}
}
}
if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) {
fprintf(stderr, "%s: warning: Mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
__func__,
special_tokens_count_from_verification, vocab.id_to_token.size(),
special_tokens_count_by_type, vocab.id_to_token.size()
);
} else {
fprintf(stderr, "%s: Special tokens definition check successful ( %u/%zu ).\n",
__func__,
special_tokens_count_from_verification, vocab.id_to_token.size()
);
}
}
}
static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
@@ -2839,8 +2937,8 @@ static void llm_load_tensors(
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
@@ -5368,7 +5466,7 @@ static struct ggml_cgraph * llm_build_mpt(
const int64_t n_layer = hparams.n_layer;
const int64_t n_ctx = cparams.n_ctx;
const int64_t n_head = hparams.n_head;
const int64_t n_head_kv = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_head = hparams.n_embd_head();
const int64_t n_embd_gqa = hparams.n_embd_gqa();
@@ -5721,7 +5819,6 @@ static struct ggml_cgraph * llama_build_graph(
//
// - lctx: llama context
// - batch: batch to evaluate
// - n_threads: number of threads to use
//
// return 0 on success
// return positive int on warning
@@ -6465,7 +6562,137 @@ private:
llm_bigram_bpe::queue work_queue;
};
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
} FRAGMENT_BUFFER_VARIANT_TYPE;
struct fragment_buffer_variant{
fragment_buffer_variant(llama_vocab::id _token)
:
type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
token(_token),
raw_text(_dummy),
offset(0),
length(0){}
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
:
type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
token((llama_vocab::id)-1),
raw_text(_raw_text),
offset(_offset),
length(_length){
GGML_ASSERT( _offset >= 0 );
GGML_ASSERT( _length >= 1 );
GGML_ASSERT( offset + length <= raw_text.length() );
}
const FRAGMENT_BUFFER_VARIANT_TYPE type;
const llama_vocab::id token;
const std::string _dummy;
const std::string & raw_text;
const uint64_t offset;
const uint64_t length;
};
// #define PRETOKENIZERDEBUG
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
{
// for each special token
for (const auto & st: vocab.special_tokens_cache) {
const auto & special_token = st.first;
const auto & special_id = st.second;
// for each text fragment
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
while (it != buffer.end()) {
auto & fragment = (*it);
// if a fragment is text ( not yet processed )
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto * raw_text = &(fragment.raw_text);
auto raw_text_base_offset = fragment.offset;
auto raw_text_base_length = fragment.length;
// loop over the text
while (true) {
// find the first occurence of a given special token in this fragment
// passing offset argument only limit the "search area" but match coordinates
// are still relative to the source full raw_text
auto match = raw_text->find(special_token, raw_text_base_offset);
// no occurences found, stop processing this fragment for a given special token
if (match == std::string::npos) break;
// check if match is within bounds of offset <-> length
if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
#ifdef PRETOKENIZERDEBUG
fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
#endif
auto source = std::distance(buffer.begin(), it);
// if match is further than base offset
// then we have some text to the left of it
if (match > raw_text_base_offset) {
// left
const int64_t left_reminder_offset = raw_text_base_offset + 0;
const int64_t left_reminder_length = match - raw_text_base_offset;
buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
#ifdef PRETOKENIZERDEBUG
fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
#endif
it++;
}
// special token
buffer.emplace_after(it, special_id);
it++;
// right
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
const int64_t right_reminder_offset = match + special_token.length();
const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
#ifdef PRETOKENIZERDEBUG
fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
#endif
it++;
if (source == 0) {
buffer.erase_after(buffer.before_begin());
} else {
buffer.erase_after(std::next(buffer.begin(), (source-1)));
}
// repeat for the right side
raw_text_base_offset = right_reminder_offset;
raw_text_base_length = right_reminder_length;
#ifdef PRETOKENIZERDEBUG
fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
#endif
} else {
if (source == 0) {
buffer.erase_after(buffer.before_begin());
} else {
buffer.erase_after(std::next(buffer.begin(), (source-1)));
}
break;
}
}
}
it++;
}
}
}
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
std::vector<llama_vocab::id> output;
// OG tokenizer behavior:
@@ -6481,20 +6708,58 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
return output;
}
std::forward_list<fragment_buffer_variant> fragment_buffer;
fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
if (special) tokenizer_st_partition( vocab, fragment_buffer );
switch (vocab.type) {
case LLAMA_VOCAB_TYPE_SPM:
{
// without adding this leading whitespace, we do not get the same results as the original tokenizer
raw_text = " " + raw_text;
for (const auto & fragment: fragment_buffer)
{
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
{
// without adding this leading whitespace, we do not get the same results as the original tokenizer
llm_tokenizer_spm tokenizer(vocab);
llama_escape_whitespace(raw_text);
tokenizer.tokenize(raw_text, output);
// TODO: It's likely possible to get rid of this string copy entirely
// by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
// and passing 'add space prefix' as bool argument
//
auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
#endif
llm_tokenizer_spm tokenizer(vocab);
llama_escape_whitespace(raw_text);
tokenizer.tokenize(raw_text, output);
}
else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
{
output.push_back(fragment.token);
}
}
} break;
case LLAMA_VOCAB_TYPE_BPE:
{
llm_tokenizer_bpe tokenizer(vocab);
tokenizer.tokenize(raw_text, output);
for (const auto & fragment: fragment_buffer)
{
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
{
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
#endif
llm_tokenizer_bpe tokenizer(vocab);
tokenizer.tokenize(raw_text, output);
}
else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
{
output.push_back(fragment.token);
}
}
} break;
}
@@ -9408,15 +9673,15 @@ llama_token llama_token_eot(const struct llama_context * ctx) {
return ctx->model.vocab.special_eot_id;
}
int llama_tokenize(
const struct llama_model * model,
const char * text,
int text_len,
llama_token * tokens,
int n_max_tokens,
bool add_bos) {
auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
bool add_bos,
bool special) {
auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
if (n_max_tokens < (int) res.size()) {
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+8 -5
View File
@@ -511,17 +511,20 @@ extern "C" {
// Tokenization
//
// Convert the provided text into tokens.
// The tokens pointer must be large enough to hold the resulting tokens.
// Returns the number of tokens on success, no more than n_max_tokens
// Returns a negative number on failure - the number of tokens that would have been returned
/// @details Convert the provided text into tokens.
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
/// @return Returns the number of tokens on success, no more than n_max_tokens
/// @return Returns a negative number on failure - the number of tokens that would have been returned
/// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
/// Does not insert a leading space.
LLAMA_API int llama_tokenize(
const struct llama_model * model,
const char * text,
int text_len,
llama_token * tokens,
int n_max_tokens,
bool add_bos);
bool add_bos,
bool special);
// Token Id -> Piece.
// Uses the vocabulary in the provided context.