- diff --git a/tools/server/server.cpp b/tools/server/server.cpp
- index b23e35d3..53bdfd75 100644
- --- a/tools/server/server.cpp
- +++ b/tools/server/server.cpp
- @@ -4228,57 +4228,16 @@ int main(int argc, char ** argv) {
- // TODO: this log can become very long, put it behind a flag or think about a more compact format
- //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
- - // process files
- - mtmd::bitmaps bitmaps;
- - const bool has_mtmd = ctx_server.mctx != nullptr;
- - {
- - if (!has_mtmd && !files.empty()) {
- - throw std::runtime_error("This server does not support multimodal");
- - }
- - for (auto & file : files) {
- - mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size()));
- - if (!bmp.ptr) {
- - throw std::runtime_error("Failed to load image or audio file");
- - }
- - // calculate bitmap hash (for KV caching)
- - std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
- - bmp.set_id(hash.c_str());
- - bitmaps.entries.push_back(std::move(bmp));
- - }
- - }
- -
- // process prompt
- std::vector<server_tokens> inputs;
- - if (has_mtmd) {
- - // multimodal
- - std::string prompt_str = prompt.get<std::string>();
- - mtmd_input_text inp_txt = {
- - prompt_str.c_str(),
- - /* add_special */ true,
- - /* parse_special */ true,
- - };
- - mtmd::input_chunks chunks(mtmd_input_chunks_init());
- - auto bitmaps_c_ptr = bitmaps.c_ptr();
- - int32_t tokenized = mtmd_tokenize(ctx_server.mctx,
- - chunks.ptr.get(),
- - &inp_txt,
- - bitmaps_c_ptr.data(),
- - bitmaps_c_ptr.size());
- - if (tokenized != 0) {
- - throw std::runtime_error("Failed to tokenize prompt");
- - }
- -
- - server_tokens tmp(chunks, true);
- - inputs.push_back(std::move(tmp));
- - } else {
- - // non-multimodal version
- - auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
- - for (auto & p : tokenized_prompts) {
- - auto tmp = server_tokens(p, ctx_server.mctx != nullptr);
- - inputs.push_back(std::move(tmp));
- - }
- - }
- + if (oaicompat && ctx_server.mctx != nullptr) {
- + // This is the case used by the OAI-compatible chat path with MTMD. TODO: it can be moved to the path below.
- + inputs.push_back(std::move(process_mtmd_prompt(ctx_server.mctx, prompt.get<std::string>(), files)));
- + } else {
- + // Everything else, including multimodal completions.
- + inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
- + }
- tasks.reserve(inputs.size());
- for (size_t i = 0; i < inputs.size(); i++) {
- @@ -4451,7 +4410,7 @@ int main(int argc, char ** argv) {
- data["input_extra"] = input_extra; // default to empty array if it's not exist
- std::string prompt = json_value(data, "prompt", std::string());
- - std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, false, true);
- + std::vector<server_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, false, true);
- SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
- data["prompt"] = format_infill(
- ctx_server.vocab,
- @@ -4462,7 +4421,7 @@ int main(int argc, char ** argv) {
- ctx_server.params_base.n_predict,
- ctx_server.slots[0].n_ctx, // TODO: there should be a better way
- ctx_server.params_base.spm_infill,
- - tokenized_prompts[0]
- + tokenized_prompts[0].get_text_tokens() // TODO: this could maybe be multimodal.
- );
- std::vector<raw_buffer> files; // dummy
- @@ -4640,7 +4599,7 @@ int main(int argc, char ** argv) {
- }
- }
- - auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
- + auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
- for (const auto & tokens : tokenized_prompts) {
- // this check is necessary for models that do not add BOS token to the input
- if (tokens.empty()) {
- @@ -4668,7 +4627,7 @@ int main(int argc, char ** argv) {
- task.id = ctx_server.queue_tasks.get_new_id();
- task.index = i;
- - task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr);
- + task.prompt_tokens = std::move(tokenized_prompts[i]);
- // OAI-compat
- task.params.oaicompat = oaicompat;
- @@ -4755,7 +4714,7 @@ int main(int argc, char ** argv) {
- return;
- }
- - llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, query, /* add_special */ false, true)[0];
- + server_tokens tokenized_query = std::move(tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, query, /* add_special */ false, true)[0]);
- // create and queue the task
- json responses = json::array();
- @@ -4763,14 +4722,14 @@ int main(int argc, char ** argv) {
- std::unordered_set<int> task_ids;
- {
- std::vector<server_task> tasks;
- - auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
- + auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, documents, /* add_special */ false, true);
- tasks.reserve(tokenized_docs.size());
- for (size_t i = 0; i < tokenized_docs.size(); i++) {
- auto tmp = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
- server_task task = server_task(SERVER_TASK_TYPE_RERANK);
- task.id = ctx_server.queue_tasks.get_new_id();
- task.index = i;
- - task.prompt_tokens = server_tokens(tmp, ctx_server.mctx != nullptr);
- + task.prompt_tokens = std::move(tmp);
- tasks.push_back(std::move(task));
- }
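
For context, the JSON prompt object handled above can be exercised against the /completion endpoint with a plain HTTP request. The sketch below is illustrative only: it assumes a llama-server built with multimodal (mtmd) support listening on http://localhost:8080, the third-party requests package, and a hypothetical local image file; the field names mirror the tests added below.

import base64
import requests  # third-party HTTP client, assumed installed

# inline an image as base64, matching the "multimodal_data" field used in the tests
with open("example.png", "rb") as f:  # hypothetical input file
    image_b64 = base64.b64encode(f.read()).decode("ascii")

res = requests.post("http://localhost:8080/completion", json={
    "prompt": {
        "prompt": "Describe this image: ",
        "multimodal_data": image_b64,
    },
    "temperature": 1.0,
    "cache_prompt": False,
})
res.raise_for_status()
print(res.json()["content"])
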
- diff --git a/tools/server/tests/unit/test_completion.py b/tools/server/tests/unit/test_completion.py
- index be3a0052..c3317140 100644
- --- a/tools/server/tests/unit/test_completion.py
- +++ b/tools/server/tests/unit/test_completion.py
- @@ -231,6 +231,27 @@ def test_nocache_long_input_prompt():
- })
- assert res.status_code == 200
- +def test_nocache_json_prompt():
- + global server
- + server.start()
- + res = server.make_request("POST", "/completion", data={
- + "prompt": { "prompt": "I believe the meaning of life is" },
- + "seed": 42,
- + "temperature": 1.0,
- + "cache_prompt": False,
- + })
- + assert res.status_code == 200
- +
- +def test_nocache_multimodal_prompt():
- + global server
- + server.start()
- + res = server.make_request("POST", "/completion", data={
- + "prompt": { "prompt": "I believe the meaning of life is", "multimodal_data": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=" },
- + "seed": 42,
- + "temperature": 1.0,
- + "cache_prompt": False,
- + })
- + assert res.status_code == 200
- def test_completion_with_tokens_input():
- global server
- @@ -269,6 +290,15 @@ def test_completion_with_tokens_input():
- assert len(res.body) == 2
- assert res.body[0]["content"] == res.body[1]["content"]
- + # mixed multimodal and tokens
- + res = server.make_request("POST", "/completion", data={
- + "prompt": [tokens, { "prompt": "My name is ", "multimodal_data": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=" }],
- + })
- + assert res.status_code == 200
- + assert type(res.body) == list
- + assert len(res.body) == 2
- + assert res.body[0]["content"] == res.body[1]["content"]
- +
- # mixed string and tokens in one sequence
- res = server.make_request("POST", "/completion", data={
- "prompt": [1, 2, 3, 4, 5, 6, prompt_str, 7, 8, 9, 10, prompt_str],
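
The mixed-prompt test above relies on the server treating a top-level "prompt" array as multiple parallel tasks, so the response body is a list with one result per element. A rough standalone equivalent, again assuming a local mtmd-enabled server on port 8080 and a hypothetical image file (the raw token ids are placeholders and model-specific):

import base64
import requests  # third-party HTTP client, assumed installed

with open("example.png", "rb") as f:  # hypothetical input file
    image_b64 = base64.b64encode(f.read()).decode("ascii")

# one request, two tasks: a token-id prompt and a multimodal prompt object
res = requests.post("http://localhost:8080/completion", json={
    "prompt": [
        [1, 2, 3, 4, 5],  # placeholder token ids
        {"prompt": "My name is ", "multimodal_data": image_b64},
    ],
})
res.raise_for_status()
print(len(res.json()))  # one completion per prompt in the list
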
- diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
- index f3dfc822..2865d977 100644
- --- a/tools/server/utils.hpp
- +++ b/tools/server/utils.hpp
- @@ -186,48 +186,6 @@ static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_
- return prompt_tokens;
- }
- -/**
- - * break the input "prompt" object into multiple prompt if needed, then tokenize them
- - * this supports these cases:
- - * - "prompt": "string"
- - * - "prompt": [12, 34, 56]
- - * - "prompt": [12, 34, "string", 56, 78]
- - * and multiple prompts (multi-tasks):
- - * - "prompt": ["string1", "string2"]
- - * - "prompt": ["string1", [12, 34, 56]]
- - * - "prompt": [[12, 34, 56], [78, 90, 12]]
- - * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
- - */
- -static std::vector<llama_tokens> tokenize_input_prompts(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
- - std::vector<llama_tokens> result;
- - if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
- - // string or mixed
- - result.push_back(tokenize_mixed(vocab, json_prompt, add_special, parse_special));
- - } else if (json_is_array_of_numbers(json_prompt)) {
- - // array of tokens
- - result.push_back(json_prompt.get<llama_tokens>());
- - } else if (json_prompt.is_array()) {
- - // array of prompts
- - result.reserve(json_prompt.size());
- - for (const auto & p : json_prompt) {
- - if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
- - result.push_back(tokenize_mixed(vocab, p, add_special, parse_special));
- - } else if (json_is_array_of_numbers(p)) {
- - // array of tokens
- - result.push_back(p.get<llama_tokens>());
- - } else {
- - throw std::runtime_error("element of \"prompt\" must be a string, an list of tokens, or a list of mixed strings & tokens");
- - }
- - }
- - } else {
- - throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
- - }
- - if (result.empty()) {
- - throw std::runtime_error("\"prompt\" must not be empty");
- - }
- - return result;
- -}
- -
- // return the last index of character that can form a valid string
- // if the last character is potentially cut in half, return the index before the cut
- // if validate_utf8(text) == text.size(), then the whole text is valid utf8
- @@ -262,35 +220,6 @@ static size_t validate_utf8(const std::string& text) {
- // template utils
- //
- -// format rerank task: [BOS]query[EOS][SEP]doc[EOS]
- -static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) {
- - llama_tokens result;
- -
- - // Get EOS token - use SEP token as fallback if EOS is not available
- - llama_token eos_token = llama_vocab_eos(vocab);
- - if (eos_token == LLAMA_TOKEN_NULL) {
- - eos_token = llama_vocab_sep(vocab);
- - }
- -
- - result.reserve(doc.size() + query.size() + 4);
- - if (llama_vocab_get_add_bos(vocab)) {
- - result.push_back(llama_vocab_bos(vocab));
- - }
- - result.insert(result.end(), query.begin(), query.end());
- - if (llama_vocab_get_add_eos(vocab)) {
- - result.push_back(eos_token);
- - }
- - if (llama_vocab_get_add_sep(vocab)) {
- - result.push_back(llama_vocab_sep(vocab));
- - }
- - result.insert(result.end(), doc.begin(), doc.end());
- - if (llama_vocab_get_add_eos(vocab)) {
- - result.push_back(eos_token);
- - }
- -
- - return result;
- -}
- -
- // format infill task
- static llama_tokens format_infill(
- const llama_vocab * vocab,
- @@ -1186,6 +1115,18 @@ public:
- }
- }
- + // appends the tokens of another server_tokens; carrying over its media map is still TODO (see below).
- + void push_back(const server_tokens & tokens) {
- + size_t start_size = tokens.size();
- + for (size_t i = 0; i < start_size; i++) {
- + push_back(tokens[i]);
- + }
- + // TODO: currently this breaks multimodal document ranking!
- + //for (auto it = tokens.map_pos_to_media.begin(); it != tokens.map_pos_to_media.end(); ) {
- + // map_pos_to_media[start_size+it->first]=std::move(it->second);
- + //}
- + }
- +
- // for compatibility with context shift and prompt truncation
- void insert(const llama_tokens & inp_tokens) {
- GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
- @@ -1356,3 +1297,146 @@ static std::string fnv_hash(const uint8_t * data, size_t len) {
- }
- return std::to_string(hash);
- }
- +
- +
- +// format rerank task: [BOS]query[EOS][SEP]doc[EOS]
- +static server_tokens format_rerank(const struct llama_vocab * vocab, const server_tokens & query, const server_tokens & doc) {
- + server_tokens result = {};
- +
- + // Get EOS token - use SEP token as fallback if EOS is not available
- + llama_token eos_token = llama_vocab_eos(vocab);
- + if (eos_token == LLAMA_TOKEN_NULL) {
- + eos_token = llama_vocab_sep(vocab);
- + }
- + if (llama_vocab_get_add_bos(vocab)) {
- + result.push_back(llama_vocab_bos(vocab));
- + }
- + result.push_back(query);
- + if (llama_vocab_get_add_eos(vocab)) {
- + result.push_back(eos_token);
- + }
- + if (llama_vocab_get_add_sep(vocab)) {
- + result.push_back(llama_vocab_sep(vocab));
- + }
- + result.push_back(doc);
- + if (llama_vocab_get_add_eos(vocab)) {
- + result.push_back(eos_token);
- + }
- + return result;
- +}
- +
- +
- +static server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files) {
- + mtmd::bitmaps bitmaps;
- + for (auto & file : files) {
- + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
- + if (!bmp.ptr) {
- + throw std::runtime_error("Failed to load image or audio file");
- + }
- + // calculate bitmap hash (for KV caching)
- + std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
- + bmp.set_id(hash.c_str());
- + bitmaps.entries.push_back(std::move(bmp));
- + }
- + // tokenize the prompt together with the preprocessed media
- + mtmd_input_text inp_txt = {
- + prompt.c_str(),
- + /* add_special */ true,
- + /* parse_special */ true,
- + };
- + mtmd::input_chunks chunks(mtmd_input_chunks_init());
- + auto bitmaps_c_ptr = bitmaps.c_ptr();
- + int32_t tokenized = mtmd_tokenize(mctx,
- + chunks.ptr.get(),
- + &inp_txt,
- + bitmaps_c_ptr.data(),
- + bitmaps_c_ptr.size());
- + if (tokenized != 0) {
- + throw std::runtime_error("Failed to tokenize prompt");
- + }
- + auto result = server_tokens(chunks, true);
- + return result;
- +}
- +
- +/**
- + * break the input "prompt" object into multiple prompts if needed, then tokenize them
- + * this supports these cases:
- + * - "prompt": "string"
- + * - "prompt": [12, 34, 56]
- + * - "prompt": [12, 34, "string", 56, 78]
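- + * - "prompt": { "prompt": "string", "multimodal_data": "base64" } (multimodal_data needs mtmd)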
- + * and multiple prompts (multi-tasks):
- + * - "prompt": ["string1", "string2"]
- + * - "prompt": ["string1", [12, 34, 56]]
- + * - "prompt": [[12, 34, 56], [78, 90, 12]]
- + * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
- + */
- +static std::vector<server_tokens> tokenize_input_prompts(const llama_vocab* vocab, mtmd_context * mctx, const json & json_prompt, bool add_special, bool parse_special) {
- + std::vector<server_tokens> result;
- + bool has_mtmd = mctx != nullptr;
- + if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
- + // string or mixed
- + llama_tokens toks = tokenize_mixed(vocab, json_prompt, add_special, parse_special);
- + auto tmp = server_tokens(toks, false);
- + result.push_back(std::move(tmp));
- + } else if (json_is_array_of_numbers(json_prompt)) {
- + // array of tokens
- + llama_tokens toks = json_prompt.get<llama_tokens>();
- + auto tmp = server_tokens(toks, false);
- + result.push_back(std::move(tmp));
- + } else if (json_prompt.find("prompt") != json_prompt.end()) {
- + // JSON object with prompt key.
- + if (has_mtmd && json_prompt.find("multimodal_data") != json_prompt.end()) {
- + // JSON object with prompt and multimodal key.
- + std::vector<raw_buffer> files;
- + for (const auto& entry : json_prompt.at("multimodal_data")) {
- + files.push_back(base64_decode(entry));
- + }
- + result.push_back(std::move(process_mtmd_prompt(mctx, json_prompt.at("prompt"), files)));
- + } else {
- + // Not multimodal, but contains a subobject.
- + llama_tokens toks = tokenize_mixed(vocab, json_prompt.at("prompt"), add_special, parse_special);
- + auto tmp = server_tokens(toks, false);
- + result.push_back(std::move(tmp));
- + }
- + } else if (json_prompt.is_array()) {
- + // array of prompts
- + result.reserve(json_prompt.size());
- + for (const auto & p : json_prompt) {
- + if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
- + llama_tokens toks = tokenize_mixed(vocab, p, add_special, parse_special);
- + auto tmp = server_tokens(toks, false);
- + result.push_back(std::move(tmp));
- + } else if (json_is_array_of_numbers(p)) {
- + // array of tokens
- + llama_tokens toks = p.get<llama_tokens>();
- + auto tmp = server_tokens(toks, false);
- + result.push_back(std::move(tmp));
- + } else if (has_mtmd && p.find("prompt") != p.end()) {
- + if (p.find("multimodal_data") != p.end()) {
- + // Multimodal JSON object.
- + std::vector<raw_buffer> files;
- + for (const auto& entry : p.at("multimodal_data")) {
- + files.push_back(base64_decode(entry));
- + }
- + result.push_back(process_mtmd_prompt(mctx, p.at("prompt"), files));
- + } else {
- + // Non-multimodal JSON object.
- + llama_tokens toks = tokenize_mixed(vocab, p, add_special, parse_special);
- + auto tmp = server_tokens(toks, false);
- + result.push_back(std::move(tmp));
- + }
- + } else {
- + throw std::runtime_error("element of \"prompt\" must be a string, a list of tokens, or a list of mixed strings & tokens");
- + }
- + }
- + } else {
- + throw std::runtime_error("\"prompt\" must be a string, a list of tokens, a list of mixed strings & tokens, or a list of prompts");
- + }
- + if (result.empty()) {
- + throw std::runtime_error("\"prompt\" must not be empty");
- + }
- + return result;
- +}
- +
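
Taken together, after this patch tokenize_input_prompts accepts the following "prompt" shapes; the object form carries optional multimodal_data, which requires a server started with an mtmd projector. A quick reference expressed as Python/JSON literals (token ids and base64 strings are placeholders):

ACCEPTED_PROMPT_SHAPES = [
    "a plain string",
    [12, 34, 56],                                         # list of token ids
    [12, 34, "string", 56, 78],                           # mixed token ids and strings
    {"prompt": "string"},                                 # object form, text only
    {"prompt": "string", "multimodal_data": "<base64>"},  # object form with media (mtmd only)
    ["string1", [12, 34, 56],                             # top-level list = multiple parallel tasks
     {"prompt": "string2", "multimodal_data": "<base64>"}],
]
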