Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "ggml.h"
- #include "gguf.h"
- #include "common.h"
- #include <algorithm>
- #include <cinttypes>
- #include <cstdio>
- #include <cstdlib>
- #include <stdexcept>
- #include <cstring>
- #include <fstream>
- #include <string>
- #include <vector>
// Returns true when `suffix` appears at the very end of `str`.
// Every string (including "") ends with the empty suffix.
bool ends_with(const std::string &str, const std::string &suffix) {
    if (suffix.size() > str.size()) {
        return false;
    }
    // Compare the two tails back-to-front.
    return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
}
// Returns a copy of `str` with every occurrence of `from` replaced by `to`.
// Scanning resumes just past each inserted replacement, so replacement text
// is never itself re-matched.
std::string replace_string(const std::string &str, const std::string &from, const std::string &to) {
    std::string out{str};
    for (size_t pos = 0; (pos = out.find(from, pos)) != std::string::npos; pos += to.length()) {
        out.replace(pos, from.length(), to);
    }
    return out;
}
// Write `n` zero bytes to `file` (used for GGUF alignment padding).
// Writes in fixed-size chunks instead of one byte per write() call,
// avoiding n separate stream operations for large paddings.
void zeros(std::ofstream &file, size_t n) {
    static const char buf[256] = {0}; // zero-filled scratch block
    while (n > 0) {
        const size_t chunk = n < sizeof(buf) ? n : sizeof(buf);
        file.write(buf, chunk);
        n -= chunk;
    }
}
- int main(int argc, const char **argv) {
- if (argc != 3) {
- fprintf(stderr, "Usage: %s <input.gguf> <output.gguf>\n", argv[0]);
- return EXIT_FAILURE;
- }
- const std::string input_path = argv[1];
- const std::string output_path = argv[2];
- // Load the original GGUF file
- struct ggml_context *ctx_meta = nullptr;
- struct gguf_init_params params = {
- /*.no_alloc = */ true,
- /*.ctx = */ &ctx_meta,
- };
- struct gguf_context *original_ctx = gguf_init_from_file(input_path.c_str(), params);
- if (!original_ctx) {
- fprintf(stderr, "Failed to load input GGUF file: %s\n", input_path.c_str());
- return EXIT_FAILURE;
- }
- // Create a new GGUF context
- struct gguf_context *new_ctx = gguf_init_empty();
- if (!new_ctx) {
- fprintf(stderr, "Failed to initialize new GGUF context\n");
- gguf_free(original_ctx);
- return EXIT_FAILURE;
- }
- // Copy metadata from original to new context (excluding tensors)
- const int n_kv = gguf_get_n_kv(original_ctx);
- for (int i = 0; i < n_kv; ++i) {
- const char *key = gguf_get_key(original_ctx, i);
- const enum gguf_type type = gguf_get_kv_type(original_ctx, i);
- switch (type) {
- case GGUF_TYPE_UINT8:
- gguf_set_val_u8(new_ctx, key, gguf_get_val_u8(original_ctx, i));
- break;
- case GGUF_TYPE_INT8:
- gguf_set_val_i8(new_ctx, key, gguf_get_val_i8(original_ctx, i));
- break;
- case GGUF_TYPE_UINT16:
- gguf_set_val_u16(new_ctx, key, gguf_get_val_u16(original_ctx, i));
- break;
- case GGUF_TYPE_INT16:
- gguf_set_val_i16(new_ctx, key, gguf_get_val_i16(original_ctx, i));
- break;
- case GGUF_TYPE_UINT32:
- gguf_set_val_u32(new_ctx, key, gguf_get_val_u32(original_ctx, i));
- break;
- case GGUF_TYPE_INT32:
- gguf_set_val_i32(new_ctx, key, gguf_get_val_i32(original_ctx, i));
- break;
- case GGUF_TYPE_FLOAT32:
- gguf_set_val_f32(new_ctx, key, gguf_get_val_f32(original_ctx, i));
- break;
- case GGUF_TYPE_UINT64:
- gguf_set_val_u64(new_ctx, key, gguf_get_val_u64(original_ctx, i));
- break;
- case GGUF_TYPE_INT64:
- gguf_set_val_i64(new_ctx, key, gguf_get_val_i64(original_ctx, i));
- break;
- case GGUF_TYPE_FLOAT64:
- gguf_set_val_f64(new_ctx, key, gguf_get_val_f64(original_ctx, i));
- break;
- case GGUF_TYPE_BOOL:
- gguf_set_val_bool(new_ctx, key, gguf_get_val_bool(original_ctx, i));
- break;
- case GGUF_TYPE_STRING:
- gguf_set_val_str(new_ctx, key, gguf_get_val_str(original_ctx, i));
- break;
- case GGUF_TYPE_ARRAY:
- // Handle arrays if needed (not typically required for this conversion)
- break;
- default:
- fprintf(stderr, "Unhandled metadata type: %d\n", type);
- break;
- }
- }
- std::vector<std::vector<uint8_t>> tensor_data_buffers;
- std::ifstream f_input(input_path, std::ios::binary);
- if (!f_input.is_open()) {
- fprintf(stderr, "Failed to open input file for reading: %s\n", input_path.c_str());
- gguf_free(original_ctx);
- gguf_free(new_ctx);
- return EXIT_FAILURE;
- }
- // Retrieve hyperparameters from metadata (replace keys if necessary)
- int num_key_value_heads = 128; // TODO: Don't hardcode
- int v_head_dim = 128; // TODO: Don't hardcode
- int qk_nope_head_dim = 128; // TODO: Don't hardcode
- const int n_tensors = gguf_get_n_tensors(original_ctx);
- int split_count = 0;
- for (int i = 0; i < n_tensors; ++i) {
- const char *tensor_name = gguf_get_tensor_name(original_ctx, i);
- fprintf(stderr, "Processing tensor_name: %s\n", tensor_name);
- struct ggml_tensor *tensor = ggml_get_tensor(ctx_meta, tensor_name);
- if (ends_with(tensor_name, "kv_b.weight")) {
- fprintf(stderr, "GOING TO SPLIT %s\n", tensor_name);
- // Read tensor data
- const size_t data_size = ggml_nbytes(tensor);
- std::vector<uint8_t> data(data_size);
- const size_t offset = gguf_get_data_offset(original_ctx) + gguf_get_tensor_offset(original_ctx, i);
- f_input.seekg(offset);
- f_input.read(reinterpret_cast<char*>(data.data()), data_size);
- // Assuming F32 data type
- float *f_data = reinterpret_cast<float*>(data.data());
- const int a = tensor->ne[0]; // rows
- const int b = tensor->ne[1]; // columns
- // Validate hyperparameters
- if (a != num_key_value_heads * (v_head_dim + qk_nope_head_dim)) {
- fprintf(stderr, "Tensor shape does not match hyperparameters, EXPECTED (num_key_value_heads * (v_head_dim + qk_nope_head_dim)):%d, ACTUAL:%d\n", num_key_value_heads * (v_head_dim + qk_nope_head_dim), a);
- gguf_free(original_ctx);
- gguf_free(new_ctx);
- return EXIT_FAILURE;
- }
- const int n_head_kv = num_key_value_heads;
- const int qkn = qk_nope_head_dim;
- const int vhd = v_head_dim;
- // Prepare new tensors' data
- std::vector<float> k_data(n_head_kv * b * qkn, 0.0f);
- std::vector<float> v_data(n_head_kv * vhd * b, 0.0f);
- for (int h = 0; h < n_head_kv; ++h) {
- // Process k part
- for (int q = 0; q < qkn; ++q) {
- for (int c = 0; c < b; ++c) {
- const size_t original_idx = h * (vhd + qkn) * b + q * b + c;
- const size_t new_idx = (h * b + c) * qkn + q;
- k_data[new_idx] = f_data[original_idx];
- }
- }
- // Process v part
- for (int v_row = 0; v_row < vhd; ++v_row) {
- const size_t original_start = h * (vhd + qkn) * b + (qkn + v_row) * b;
- const size_t new_start = (h * vhd + v_row) * b;
- memcpy(&v_data[new_start], &f_data[original_start], b * sizeof(float));
- }
- }
- // Create new tensor names
- std::string k_name = replace_string(tensor_name, "kv_b", "k_b");
- std::string v_name = replace_string(tensor_name, "kv_b", "v_b");
- // Add new tensors to the context
- struct ggml_tensor *k_tensor = ggml_new_tensor_2d(ctx_meta, GGML_TYPE_F32, qkn, n_head_kv * b);
- gguf_add_tensor(new_ctx, k_tensor);
- struct ggml_tensor *v_tensor = ggml_new_tensor_2d(ctx_meta, GGML_TYPE_F32, b, n_head_kv * vhd);
- gguf_add_tensor(new_ctx, v_tensor);
- // Store data buffers
- std::vector<uint8_t> k_buffer(reinterpret_cast<uint8_t*>(k_data.data()), reinterpret_cast<uint8_t*>(k_data.data() + k_data.size()));
- std::vector<uint8_t> v_buffer(reinterpret_cast<uint8_t*>(v_data.data()), reinterpret_cast<uint8_t*>(v_data.data() + v_data.size()));
- tensor_data_buffers.push_back(k_buffer);
- tensor_data_buffers.push_back(v_buffer);
- split_count++;
- }
- // Add original tensor to new context
- gguf_add_tensor(new_ctx, tensor);
- // Read data
- const size_t data_size = ggml_nbytes(tensor);
- std::vector<uint8_t> data(data_size);
- const size_t offset = gguf_get_data_offset(original_ctx) + gguf_get_tensor_offset(original_ctx, i);
- f_input.seekg(offset);
- f_input.read(reinterpret_cast<char*>(data.data()), data_size);
- tensor_data_buffers.push_back(data);
- }
- fprintf(stderr, "Finished Processing tensors, will now write output>\n");
- // Write the new GGUF file
- std::ofstream f_output(output_path, std::ios::binary);
- if (!f_output.is_open()) {
- fprintf(stderr, "Failed to open output file: %s\n", output_path.c_str());
- gguf_free(original_ctx);
- gguf_free(new_ctx);
- return EXIT_FAILURE;
- }
- // Write metadata
- const size_t meta_size = gguf_get_meta_size(new_ctx);
- std::vector<uint8_t> meta_buffer(meta_size);
- gguf_get_meta_data(new_ctx, meta_buffer.data());
- f_output.write(reinterpret_cast<const char*>(meta_buffer.data()), meta_size);
- // Write tensor data
- for (const auto &data : tensor_data_buffers) {
- f_output.write(reinterpret_cast<const char*>(data.data()), data.size());
- const size_t pad_size = GGML_PAD(data.size(), GGUF_DEFAULT_ALIGNMENT) - data.size();
- zeros(f_output, pad_size);
- }
- // Cleanup
- gguf_free(original_ctx);
- gguf_free(new_ctx);
- f_input.close();
- f_output.close();
- fprintf(stderr, "Successfully converted %d tensors. Output written to %s\n", split_count, output_path.c_str());
- return EXIT_SUCCESS;
- }
Advertisement
Add Comment
Please, Sign In to add comment