guide
practical patterns for using aruminium.
device setup
use ;
let device = open?;
println!;
enumerate all GPUs (Mac Pro with multiple GPUs):
for dev in all?
buffers
shared (CPU + GPU)
let buf = device.buffer?; // 1024 floats
// write from CPU
buf.write_f32;
// read back after GPU work
buf.read_f32;
private (GPU-only)
let gpu_buf = device.buffer_private?;
assert!; // no CPU access
// copy data in via blit
let staging = device.buffer_with_data?;
let cmd = queue.commands()?;
let blit = cmd.copier()?;
blit.copy(&staging, &gpu_buf); // NOTE: argument list lost in extraction — confirm (src, dst) order against the API
blit.finish();
cmd.submit();
cmd.wait();
use private buffers for intermediate results that stay on GPU between kernel dispatches. higher bandwidth than shared.
compile and dispatch
let source = r#"
#include <metal_stdlib>
using namespace metal;
kernel void saxpy(device float *y buffer(0),
device const float *x buffer(1),
constant float &a buffer(2),
uint id thread_position_in_grid) {
y[id] = a * x[id] + y[id];
}
"#;
let lib = device.compile(source)?;
let func = lib.function("saxpy")?;
let pipeline = device.pipeline(&func)?;
let queue = device.new_command_queue()?;
let cmd = queue.commands()?;
let enc = cmd.encoder()?;
enc.bind;
enc.bind_buffer;
enc.bind_buffer;
let alpha: f32 = 2.0;
let alpha_bytes = alpha.to_ne_bytes();
enc.push;
enc.launch;
enc.finish();
cmd.submit();
cmd.wait();
choosing threadgroup size
query the pipeline for hardware limits:
let max = pipeline.max_total_threads_per_threadgroup(); // e.g. 1024
let simd = pipeline.thread_execution_width(); // e.g. 32
let tg_mem = pipeline.static_threadgroup_memory_length(); // bytes used
rules of thumb:
- 1D work: `(256, 1, 1)`, or `(simd, 1, 1)` for small workloads
- 2D work (matmul): `(16, 16, 1)` = 256 threads
- never exceed `max_total_threads_per_threadgroup`

`launch` handles non-uniform grids (n not divisible by group size); `launch_groups` needs a manual `ceil(n / group)` calculation.
GPU timing
cmd.submit();
cmd.wait();
let gpu_ms = cmd.gpu_time() * 1000.0;
println!;
gpu_start_time() and gpu_end_time() return absolute seconds
since device boot. gpu_time() = end - start.
hot-loop dispatch (inference)
for repeated dispatches (inference decode loop), use Dispatch:
use Dispatch;
let disp = new;
// single dispatch
unsafe
batch dispatch (multiple kernels, one command buffer)
unsafe
pipelined dispatch (overlap CPU encoding with GPU execution)
let mut prev = None;
for layer in 0..num_layers
if let Some = prev
raw batch (caller manages autorelease pool)
autorelease_pool;
one pool for the entire loop instead of per-batch.
fp16 conversion
use ;
// single value
let half: u16 = f32_to_fp16(value); // NOTE: original argument lost in extraction
let full: f32 = fp16_to_f32(half);
// bulk (NEON-optimized on aarch64)
let src: &[u16] = weights_fp16; // fp16 values stored as raw u16 bits
let mut dst = vec![0.0f32; src.len()];
cast_f16_f32(src, &mut dst); // NOTE: confirm argument order against the API
error handling
all fallible operations return Result<T, GpuError>. error variants
carry context strings from Metal.framework (e.g. shader compilation
errors include the MSL compiler diagnostic).
match device.compile