Ollama Guide

Add trace-level observability to AI agents powered by Ollama local LLMs — record latency, token counts, and errors for every model call.

What you'll get: one Nexus trace per agent run, one span per Ollama call, with model name, output token count (eval_count), latency, and error detection for empty responses.

Installation

Install the Nexus Python client and the requests library (add the openai package as well if you plan to use Pattern 2):

pip install nexus-client requests

Get your API key from Dashboard → API Keys and set it as an environment variable:

export NEXUS_API_KEY="nxs_..."
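Before adding tracing, it's worth confirming Ollama is reachable and the model is pulled (assuming the default port, 11434):

curl http://localhost:11434/api/tags
ollama pull llama3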

Pattern 1: Direct Ollama REST API

Call Ollama's /api/chat endpoint directly and wrap it in a Nexus span. This pattern gives you the most accurate token counts via Ollama's native eval_count and prompt_eval_count fields:

import os
import time
import requests
from nexus_client import NexusClient

nexus = NexusClient(
    api_key=os.environ["NEXUS_API_KEY"],
    agent_id="my-ollama-agent",
)

OLLAMA_URL = "http://localhost:11434"

def chat(prompt: str, model: str = "llama3") -> str:
    trace = nexus.start_trace(
        name=f"ollama: {prompt[:60]}",
        metadata={"model": model},
    )
    span = trace.add_span(
        name="ollama-chat",
        input={"prompt": prompt, "model": model},
    )
    start = time.time()
    try:
        resp = requests.post(
            f"{OLLAMA_URL}/api/chat",
            json={"model": model, "messages": [{"role": "user", "content": prompt}], "stream": False},
            timeout=120,
        )
        resp.raise_for_status()
        data = resp.json()
        content = data["message"]["content"]
        latency_ms = int((time.time() - start) * 1000)

        if not content.strip():
            span.end(status="error", output={"error": "empty_response", "model": model, "latency_ms": latency_ms})
            trace.end(status="error")
            raise ValueError("Ollama returned an empty response")

        span.end(status="ok", output={
            "model": model,
            "output_tokens": data.get("eval_count", 0),
            "prompt_tokens": data.get("prompt_eval_count", 0),
            "latency_ms": latency_ms,
        })
        trace.end(status="success")
        return content
    except requests.RequestException as e:
        span.end(status="error", output={"error": str(e), "model": model})
        trace.end(status="error")
        raise
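A quick usage sketch (the prompt is illustrative):

print(chat("Why is the sky blue?"))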

Pattern 2: OpenAI-Compatible Endpoint

If your codebase already uses the OpenAI Python SDK, point it at Ollama's http://localhost:11434/v1 endpoint and wrap the call with Nexus the same way:

import os
import time

from openai import OpenAI
from nexus_client import NexusClient

nexus = NexusClient(api_key=os.environ["NEXUS_API_KEY"], agent_id="my-ollama-agent")

# Ollama's OpenAI-compatible endpoint
ollama = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

def chat_openai_compat(prompt: str, model: str = "llama3") -> str:
    trace = nexus.start_trace(name=f"ollama: {prompt[:60]}", metadata={"model": model})
    span = trace.add_span(name="ollama-chat", input={"prompt": prompt, "model": model})
    start = time.time()
    try:
        resp = ollama.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
        )
        content = resp.choices[0].message.content or ""
        latency_ms = int((time.time() - start) * 1000)
        # usage.completion_tokens is available; prompt_tokens may be None when served from KV cache
        out_tokens = resp.usage.completion_tokens if resp.usage else 0
        if not content.strip():
            span.end(status="error", output={"error": "empty_response", "latency_ms": latency_ms})
            trace.end(status="error")
            raise ValueError("Ollama returned empty response")
        span.end(status="ok", output={"output_tokens": out_tokens, "latency_ms": latency_ms})
        trace.end(status="success")
        return content
    except Exception as e:
        span.end(status="error", output={"error": str(e)})
        trace.end(status="error")
        raise

Note: usage.prompt_tokens may be None when Ollama serves from KV cache. The direct REST API pattern is more reliable for prompt token counts.
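If you stay on the OpenAI-compatible path, it's safest to treat every usage field as optional. A minimal sketch; safe_usage is a hypothetical helper name, not part of either SDK:

def safe_usage(resp) -> tuple[int, int]:
    # Hypothetical helper: on Ollama's OpenAI-compatible endpoint, usage
    # itself or prompt_tokens may be None (e.g. prompt served from KV cache).
    if resp.usage is None:
        return 0, 0
    return (resp.usage.prompt_tokens or 0, resp.usage.completion_tokens or 0)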

Multi-Turn Agent Loops

For agents that call Ollama multiple times, create one span per LLM call and share a single trace across all iterations:

def agent_loop(task: str, model: str = "llama3") -> str:
    trace = nexus.start_trace(name=f"agent: {task[:60]}", metadata={"model": model})
    messages = [{"role": "user", "content": task}]
    iteration = 0
    try:
        while iteration < 10:
            iteration += 1
            span = trace.add_span(
                name=f"llm-call-{iteration}",
                input={"iteration": iteration, "messages": len(messages)},
            )
            start = time.time()
            try:
                resp = requests.post(
                    f"{OLLAMA_URL}/api/chat",
                    json={"model": model, "messages": messages, "stream": False},
                    timeout=120,
                )
                resp.raise_for_status()
                data = resp.json()
                content = data["message"]["content"]
            except Exception as e:
                # End the open span before the outer handler ends the trace
                span.end(status="error", output={"error": str(e), "iteration": iteration})
                raise
            span.end(status="ok", output={
                "output_tokens": data.get("eval_count", 0),
                "prompt_tokens": data.get("prompt_eval_count", 0),
                "latency_ms": int((time.time() - start) * 1000),
            })
            messages.append({"role": "assistant", "content": content})
            if "DONE" in content or iteration >= 10:
                break
            messages.append({"role": "user", "content": "Continue."})
        trace.end(status="success")
        return messages[-1]["content"]
    except Exception:
        trace.end(status="error")
        raise
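The loop stops on the literal string DONE, so the task prompt should ask for it explicitly. An illustrative call:

result = agent_loop("Outline a blog post on local LLMs, then reply DONE.")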

Ollama Token Field Reference

Ollama field        Meaning                           Notes
eval_count          Output tokens generated           Always present in non-stream responses
prompt_eval_count   Input tokens processed            May be absent if Ollama served the prompt from KV cache
eval_duration       Generation time in nanoseconds    Use time.time() for wall-clock latency instead
message.content     Generated text                    Check for an empty string; Ollama may return "" on resource exhaustion
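
The fields above describe non-streamed responses. With "stream": True, Ollama emits newline-delimited JSON chunks and reports eval_count / prompt_eval_count only on the final chunk, where done is true. A minimal sketch (chat_stream is an illustrative name; wrap it in a span the same way as Pattern 1):

import json

def chat_stream(prompt: str, model: str = "llama3") -> tuple[str, int]:
    # Stream chunks from /api/chat; token counts arrive only on the final chunk.
    with requests.post(
        f"{OLLAMA_URL}/api/chat",
        json={"model": model, "messages": [{"role": "user", "content": prompt}], "stream": True},
        stream=True,
        timeout=120,
    ) as resp:
        resp.raise_for_status()
        parts, out_tokens = [], 0
        for line in resp.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            if chunk.get("done"):
                out_tokens = chunk.get("eval_count", 0)  # final chunk carries the counts
            else:
                parts.append(chunk["message"]["content"])
    return "".join(parts), out_tokens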