Skip to main content
1

Clone the repository

git clone https://github.com/ggml-org/ggml
cd ggml
2

Install Python dependencies (optional)

Some examples require Python tooling to download model weights. Skip this step if you only want to build the library.
python3.10 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
3

Build with CMake

mkdir build && cd build
cmake ..
cmake --build . --config Release -j 8
Compiled binaries are placed in build/bin/.
4

Run the simple example

The simple-ctx example performs a matrix multiplication using the CPU backend.
./build/bin/simple-ctx
Expected output:
mul mat (4 x 3) (transposed result):
[ 60.00 55.00 50.00 110.00
 90.00 54.00 54.00 126.00
 42.00 29.00 28.00 64.00 ]

Working examples

The two simple examples demonstrate the two main APIs.
This example allocates a context that owns tensor data, builds a matrix multiplication graph, and executes it on the CPU.
simple-ctx.cpp
#include "ggml.h"
#include "ggml-cpu.h"

#include <cassert>
#include <cstdio>
#include <cstring>
#include <vector>

// Minimal "model": two input tensors plus the ggml context that owns their
// memory. Filled in by load_model(); the context (and with it both tensors)
// is released with ggml_free(model.ctx).
struct simple_model {
    struct ggml_tensor * a;    // left operand, created as [cols_A, rows_A] in load_model()
    struct ggml_tensor * b;    // right operand, created as [cols_B, rows_B] in load_model()
    struct ggml_context * ctx; // owns the tensor data and the graph memory
};

// Allocate a ggml context large enough for the two input tensors, the
// mul_mat result, and the compute graph, then create tensors a and b and
// copy the caller's row-major data into them.
//
// a: rows_A x cols_A floats (row-major); b: rows_B x cols_B floats.
// On return model.ctx owns all tensor memory; release with ggml_free().
void load_model(simple_model & model, float * a, float * b,
                int rows_A, int cols_A, int rows_B, int cols_B) {
    size_t ctx_size = 0;
    ctx_size += rows_A * cols_A * ggml_type_size(GGML_TYPE_F32); // tensor a data
    ctx_size += rows_B * cols_B * ggml_type_size(GGML_TYPE_F32); // tensor b data
    // the mul_mat result (rows_A x rows_B) is allocated in this same context
    // by build_graph(), so budget its data and metadata here as well —
    // otherwise the context can run out of memory when the graph is built
    ctx_size += rows_A * rows_B * ggml_type_size(GGML_TYPE_F32); // result data
    ctx_size += 3 * ggml_tensor_overhead();                      // a, b, result
    ctx_size += ggml_graph_overhead();
    ctx_size += 1024; // slack for alignment/rounding

    struct ggml_init_params params {
        /*.mem_size   =*/ ctx_size,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false, // the context allocates tensor data itself
    };

    model.ctx = ggml_init(params);
    assert(model.ctx != NULL); // fail fast if the context could not be created

    // ggml stores ne = [cols, rows]: ne[0] is the fastest-varying dimension
    model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_A, rows_A);
    model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_B, rows_B);

    memcpy(model.a->data, a, ggml_nbytes(model.a));
    memcpy(model.b->data, b, ggml_nbytes(model.b));
}

// Record the forward graph: a single mul_mat node. Nothing is evaluated
// here — compute() runs the graph afterwards.
struct ggml_cgraph * build_graph(const simple_model & model) {
    struct ggml_cgraph * graph = ggml_new_graph(model.ctx);

    // result = a * b^T (operation is only recorded, not executed)
    struct ggml_tensor * product = ggml_mul_mat(model.ctx, model.a, model.b);
    ggml_build_forward_expand(graph, product);

    return graph;
}

// Build the forward graph and execute it on the CPU.
//
// n_threads controls the CPU thread count; it defaults to 1 so existing
// callers keep the original single-threaded behavior.
// Returns the last graph node, i.e. the mul_mat result tensor.
struct ggml_tensor * compute(const simple_model & model, int n_threads = 1) {
    struct ggml_cgraph * gf = build_graph(model);
    ggml_graph_compute_with_ctx(model.ctx, gf, n_threads);
    // the final node of the graph is the mul_mat result
    return ggml_graph_node(gf, -1);
}

int main(void) {
    ggml_time_init();

    // A: 4 x 2, row-major
    const int rows_A = 4, cols_A = 2;
    float matrix_A[rows_A * cols_A] = { 2, 8, 5, 1, 4, 2, 8, 6 };

    // B: 3 x 2, row-major
    const int rows_B = 3, cols_B = 2;
    float matrix_B[rows_B * cols_B] = { 10, 5, 9, 9, 5, 4 };

    simple_model model;
    load_model(model, matrix_A, matrix_B, rows_A, cols_A, rows_B, cols_B);

    struct ggml_tensor * result = compute(model);

    // copy the result out of the context before printing
    std::vector<float> out_data(ggml_nelements(result));
    memcpy(out_data.data(), result->data, ggml_nbytes(result));

    const int n_cols = (int) result->ne[0];
    const int n_rows = (int) result->ne[1];

    printf("mul mat (%d x %d) (transposed result):\n[", n_cols, n_rows);
    for (int r = 0; r < n_rows; r++) {
        if (r != 0) {
            printf("\n");
        }
        for (int c = 0; c < n_cols; c++) {
            printf(" %.2f", out_data[r * n_cols + c]);
        }
    }
    printf(" ]\n");

    ggml_free(model.ctx);
    return 0;
}
Key points:
  • ggml_init() creates a context that owns tensor memory (no_alloc = false).
  • ggml_new_tensor_2d() allocates a tensor inside the context.
  • ggml_mul_mat() records the operation in the graph — no computation yet.
  • ggml_graph_compute_with_ctx() executes the graph on the CPU.
  • ggml_free() releases the entire context and all its tensors.