module Llama

Defined in:

llama.cr
llama/batch.cr
llama/batch/error.cr
llama/chat.cr
llama/context.cr
llama/context/error.cr
llama/error.cr
llama/kv_cache.cr
llama/kv_cache/error.cr
llama/lib_llama.cr
llama/model.cr
llama/model/error.cr
llama/sampler.cr
llama/sampler/base.cr
llama/sampler/dist.cr
llama/sampler/error.cr
llama/sampler/grammar.cr
llama/sampler/grammar_lazy_patterns.cr
llama/sampler/greedy.cr
llama/sampler/infill.cr
llama/sampler/min_p.cr
llama/sampler/mirostat.cr
llama/sampler/mirostat_v2.cr
llama/sampler/penalties.cr
llama/sampler/temp.cr
llama/sampler/temp_ext.cr
llama/sampler/top_k.cr
llama/sampler/top_n_sigma.cr
llama/sampler/top_p.cr
llama/sampler/typical.cr
llama/sampler/xtc.cr
llama/sampler_chain.cr
llama/state.cr
llama/state/error.cr
llama/vocab.cr

Constant Summary

DEFAULT_SEED = LibLlama::LLAMA_DEFAULT_SEED

Native constants (wrapped for user convenience)

FILE_MAGIC_GGLA = LibLlama::LLAMA_FILE_MAGIC_GGLA
FILE_MAGIC_GGSN = LibLlama::LLAMA_FILE_MAGIC_GGSN
FILE_MAGIC_GGSQ = LibLlama::LLAMA_FILE_MAGIC_GGSQ
LLAMA_CPP_COMPATIBLE_VERSION = (read_file("/srv/crystaldoc.info/github-kojix2-llama.cr-main/src/LLAMA_VERSION")).chomp
LOG_LEVEL_DEBUG = 0

Log level constants (from llama.cpp / ggml)

LOG_LEVEL_ERROR = 3
LOG_LEVEL_INFO = 1
LOG_LEVEL_NONE = 4
LOG_LEVEL_WARNING = 2
SESSION_MAGIC = LibLlama::LLAMA_SESSION_MAGIC
SESSION_VERSION = LibLlama::LLAMA_SESSION_VERSION
TOKEN_NULL = LibLlama::LLAMA_TOKEN_NULL
VERSION = "0.1.0"

Class Method Summary

  • .apply_chat_template(template : String | Nil, messages : Array(ChatMessage), add_assistant : Bool = true) : String
  • .builtin_chat_templates : Array(String)
  • .error_message(code : Int32) : String
  • .format_error(message : String, code : Int32 | Nil = nil, context : String | Nil = nil) : String
  • .generate(model_path : String, prompt : String, max_tokens : Int32 = 128, temperature : Float32 = 0.8) : String
  • .init
  • .log_level
  • .log_level=(level : Int32)
  • .log_set(&block : Int32, String -> )
  • .measure_ms(&)
  • .process_escapes(text : String) : String
  • .system_info : String
  • .time_ms : Int64
  • .time_us : Int64
  • .tokenize_and_format(vocab : Vocab, text : String, add_bos : Bool = true, parse_special : Bool = true, ids_only : Bool = false) : String
  • .uninit

Class Method Detail

def self.apply_chat_template(template : String | Nil, messages : Array(ChatMessage), add_assistant : Bool = true) : String #

Applies a chat template to a list of messages

Parameters:

  • template: The template string (nil to use model's default)
  • messages: Array of chat messages
  • add_assistant: Whether to end with an assistant message prefix

Returns:

  • The formatted prompt string

Raises:

  • Llama::Error if template application fails
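
A minimal sketch is shown below. The named role/content arguments to ChatMessage.new are an assumption based on the parameter list above; check llama/chat.cr for the exact constructor.

messages = [
  Llama::ChatMessage.new(role: "system", content: "You are a helpful assistant."),
  Llama::ChatMessage.new(role: "user", content: "Hello!")
]

# "chatml" is one of llama.cpp's built-in template names; any name returned by
# Llama.builtin_chat_templates should work, or pass nil to use the model's default.
prompt = Llama.apply_chat_template("chatml", messages, add_assistant: true)
puts prompt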

[View source]
def self.builtin_chat_templates : Array(String) #

Gets the list of built-in chat templates

Returns:

  • Array of template names
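
For example, printing every built-in template name:

# List the chat template names bundled with llama.cpp.
Llama.builtin_chat_templates.each do |name|
  puts name
end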

[View source]
def self.error_message(code : Int32) : String #

[View source]
def self.format_error(message : String, code : Int32 | Nil = nil, context : String | Nil = nil) : String #
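
No description is attached to this helper. Going only by the signature above, a hypothetical call (the code and context values here are made up for illustration) might look like:

puts Llama.format_error("failed to decode batch", code: -1, context: "generate")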

[View source]
def self.generate(model_path : String, prompt : String, max_tokens : Int32 = 128, temperature : Float32 = 0.8) : String #

Generates text from a prompt using a model

This is a convenience method that loads a model, creates a context, and generates text in a single call.

response = Llama.generate(
  "/path/to/model.gguf",
  "Once upon a time",
  max_tokens: 100,
  temperature: 0.7
)
puts response

Parameters:

  • model_path: Path to the model file (.gguf format)
  • prompt: The input prompt
  • max_tokens: Maximum number of tokens to generate (must be positive)
  • temperature: Sampling temperature (0.0 = greedy, 1.0 = more random)

Returns:

  • The generated text

Raises:

  • ArgumentError if parameters are invalid
  • Llama::Model::Error if model loading fails
  • Llama::Context::Error if text generation fails

[View source]
def self.init #

Thread-safe, idempotent initialization of the llama.cpp backend. You do not need to call this manually in most cases.
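
A minimal sketch of an explicit lifecycle, paired with .uninit (both calls are idempotent, so repeating them is harmless):

Llama.init
# ... load models and run inference ...
Llama.uninit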


[View source]
def self.log_level #

Get the current log level

Returns:

  • The current log level
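
For example, to silence logging around a noisy call and restore the previous level afterwards:

previous = Llama.log_level
Llama.log_level = Llama::LOG_LEVEL_NONE
# ... noisy operation ...
Llama.log_level = previous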

[View source]
def self.log_level=(level : Int32) #

Set the log level

Parameters:

  • level : Int32 - log level (0=DEBUG, 1=INFO, 2=WARNING, 3=ERROR, 4=NONE)

Example:

Llama.log_level = Llama::LOG_LEVEL_ERROR # Only show errors
Llama.log_level = Llama::LOG_LEVEL_NONE  # Disable all logging


[View source]
def self.log_set(&block : Int32, String -> ) #

Set a custom log callback

The block receives:

  • level : Int32 - log level (0=DEBUG, 1=INFO, 2=WARNING, 3=ERROR)
  • message : String - log message

Example:

Llama.log_set do |level, message|
  if level >= Llama::LOG_LEVEL_ERROR
    STDERR.print message
  end
end


[View source]
def self.measure_ms(&) #

Measures elapsed time in milliseconds for a block using llama.cpp's clock.

elapsed = Llama.measure_ms do
  # ... code to measure ...
end
puts "Elapsed: #{elapsed} ms"

Returns:

  • Float64: elapsed milliseconds

[View source]
def self.process_escapes(text : String) : String #

Process escape sequences in a string

This method processes common escape sequences like \n, \t, etc. in a string, converting them to their actual character representations.

text = Llama.process_escapes("Hello\\nWorld")
puts text # Prints "Hello" and "World" on separate lines

Parameters:

  • text: The input string containing escape sequences

Returns:

  • A new string with escape sequences processed

[View source]
def self.system_info : String #

Returns the llama.cpp system information

This method provides information about the llama.cpp build, including BLAS configuration, CPU features, and GPU support.

info = Llama.system_info
puts info

Returns:

  • A string containing system information

[View source]
def self.time_ms : Int64 #

Returns the current time in milliseconds since the Unix epoch (llama.cpp compatible).

t0 = Llama.time_ms
# ... some processing ...
t1 = Llama.time_ms
elapsed = t1 - t0
puts "Elapsed: #{elapsed} ms"

Returns:

  • Int64: milliseconds since epoch

[View source]
def self.time_us : Int64 #

Returns the current time in microseconds since the Unix epoch (llama.cpp compatible).

This is a high-level wrapper for LibLlama.llama_time_us.

t0 = Llama.time_us
# ... some processing ...
t1 = Llama.time_us
elapsed_ms = (t1 - t0) / 1000.0
puts "Elapsed: #{elapsed_ms} ms"

Returns:

  • Int64: microseconds since epoch

[View source]
def self.tokenize_and_format(vocab : Vocab, text : String, add_bos : Bool = true, parse_special : Bool = true, ids_only : Bool = false) : String #

Tokenize text and return formatted output

This is a convenience method that tokenizes text and returns a formatted string representation of the tokens.

model = Llama::Model.new("/path/to/model.gguf")
result = Llama.tokenize_and_format(model.vocab, "Hello, world!", ids_only: true)
puts result # Prints "[1, 2, 3, ...]"

Parameters:

  • vocab: The vocabulary to use for tokenization
  • text: The text to tokenize
  • add_bos: Whether to add BOS token (default: true)
  • parse_special: Whether to parse special tokens (default: true)
  • ids_only: Whether to return only token IDs (default: false)

Returns:

  • A formatted string representation of the tokens

[View source]
def self.uninit #

Thread-safe, idempotent finalization of the llama.cpp backend. Call this if you want to explicitly release all backend resources before program exit.


[View source]