Ollama Guide
Add trace-level observability to AI agents powered by Ollama local LLMs: record latency, token counts, and errors for every model call. The patterns below cover token accounting (`prompt_eval_count` and `eval_count`), latency, and error detection for empty responses.
Installation
Install the Nexus Python client and the requests library:
```bash
pip install nexus-client requests
```
Get your API key from Dashboard → API Keys and set it as an environment variable:
```bash
export NEXUS_API_KEY="nxs_..."
```
Pattern 1: Direct Ollama REST API
Call Ollama's `/api/chat` endpoint directly and wrap it in a Nexus span. This pattern gives you the most accurate token counts via Ollama's native `eval_count` and `prompt_eval_count` fields:
```python
import os
import time

import requests

from nexus_client import NexusClient

nexus = NexusClient(
    api_key=os.environ["NEXUS_API_KEY"],
    agent_id="my-ollama-agent",
)

OLLAMA_URL = "http://localhost:11434"


def chat(prompt: str, model: str = "llama3") -> str:
    trace = nexus.start_trace(
        name=f"ollama: {prompt[:60]}",
        metadata={"model": model},
    )
    span = trace.add_span(
        name="ollama-chat",
        input={"prompt": prompt, "model": model},
    )
    start = time.time()
    try:
        resp = requests.post(
            f"{OLLAMA_URL}/api/chat",
            json={"model": model, "messages": [{"role": "user", "content": prompt}], "stream": False},
            timeout=120,
        )
        resp.raise_for_status()
        data = resp.json()
        content = data["message"]["content"]
        latency_ms = int((time.time() - start) * 1000)
        if not content.strip():
            span.end(status="error", output={"error": "empty_response", "model": model, "latency_ms": latency_ms})
            trace.end(status="error")
            raise ValueError("Ollama returned an empty response")
        span.end(status="ok", output={
            "model": model,
            "output_tokens": data.get("eval_count", 0),
            "prompt_tokens": data.get("prompt_eval_count", 0),
            "latency_ms": latency_ms,
        })
        trace.end(status="success")
        return content
    except requests.RequestException as e:
        span.end(status="error", output={"error": str(e), "model": model})
        trace.end(status="error")
        raise
```
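A quick smoke test, assuming `ollama serve` is running locally and the `llama3` model has already been pulled. Each call records one trace containing a single `ollama-chat` span:

```python
# Assumes a local Ollama server with llama3 pulled (`ollama pull llama3`).
answer = chat("Explain the difference between a trace and a span in two sentences.")
print(answer)
```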
Pattern 2: OpenAI-Compatible Endpoint
If your codebase already uses the OpenAI Python SDK, point it at Ollama's `http://localhost:11434/v1` endpoint and wrap the call with Nexus the same way:
```python
import os
import time

from openai import OpenAI

from nexus_client import NexusClient

nexus = NexusClient(api_key=os.environ["NEXUS_API_KEY"], agent_id="my-ollama-agent")

# Ollama's OpenAI-compatible endpoint; the SDK requires an api_key
# value, but Ollama ignores it
ollama = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")


def chat_openai_compat(prompt: str, model: str = "llama3") -> str:
    trace = nexus.start_trace(name=f"ollama: {prompt[:60]}", metadata={"model": model})
    span = trace.add_span(name="ollama-chat", input={"prompt": prompt, "model": model})
    start = time.time()
    try:
        resp = ollama.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
        )
        content = resp.choices[0].message.content or ""
        latency_ms = int((time.time() - start) * 1000)
        # usage.completion_tokens is available; prompt_tokens may be None
        # when the prompt was served from the KV cache
        out_tokens = resp.usage.completion_tokens if resp.usage else 0
        if not content.strip():
            span.end(status="error", output={"error": "empty_response", "latency_ms": latency_ms})
            trace.end(status="error")
            raise ValueError("Ollama returned an empty response")
        span.end(status="ok", output={"output_tokens": out_tokens, "latency_ms": latency_ms})
        trace.end(status="success")
        return content
    except ValueError:
        raise  # span and trace were already closed above
    except Exception as e:
        span.end(status="error", output={"error": str(e)})
        trace.end(status="error")
        raise
```
Note: `usage.prompt_tokens` may be `None` when Ollama serves the prompt from its KV cache. The direct REST API pattern is more reliable for prompt token counts.
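If you want to log prompt tokens in this pattern anyway, a defensive reader is a reasonable sketch. `safe_prompt_tokens` is a hypothetical helper, not part of either SDK:

```python
# Hypothetical helper: read prompt tokens from an OpenAI-compatible
# response, falling back to 0 when usage or prompt_tokens is missing
# or None (e.g. when Ollama served the prompt from its KV cache).
def safe_prompt_tokens(resp) -> int:
    usage = getattr(resp, "usage", None)
    return getattr(usage, "prompt_tokens", None) or 0
```

Logging an explicit `0` keeps the span output schema consistent across calls instead of omitting the field.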
Multi-Turn Agent Loops
For agents that call Ollama multiple times, create one span per LLM call and share a single trace across all iterations:
```python
def agent_loop(task: str, model: str = "llama3") -> str:
    trace = nexus.start_trace(name=f"agent: {task[:60]}", metadata={"model": model})
    messages = [{"role": "user", "content": task}]
    iteration = 0
    try:
        while iteration < 10:
            iteration += 1
            span = trace.add_span(
                name=f"llm-call-{iteration}",
                input={"iteration": iteration, "messages": len(messages)},
            )
            start = time.time()
            try:
                resp = requests.post(
                    f"{OLLAMA_URL}/api/chat",
                    json={"model": model, "messages": messages, "stream": False},
                    timeout=120,
                )
                resp.raise_for_status()
                data = resp.json()
                content = data["message"]["content"]
            except Exception as e:
                # Close the open span before the outer handler ends the trace
                span.end(status="error", output={"error": str(e)})
                raise
            span.end(status="ok", output={
                "output_tokens": data.get("eval_count", 0),
                "prompt_tokens": data.get("prompt_eval_count", 0),
                "latency_ms": int((time.time() - start) * 1000),
            })
            messages.append({"role": "assistant", "content": content})
            # Break before appending "Continue." so the final message
            # is always the assistant's answer
            if "DONE" in content or iteration >= 10:
                break
            messages.append({"role": "user", "content": "Continue."})
        trace.end(status="success")
        return messages[-1]["content"]
    except Exception:
        trace.end(status="error")
        raise
```
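Invoking the loop looks like this. The stop check scans for the literal string `DONE`, so the task prompt should instruct the model to emit it; otherwise the loop simply ends after 10 iterations:

```python
# The task prompt asks for "DONE" explicitly so the stop check can fire.
result = agent_loop("Draft a haiku about tracing, refine it once, then say DONE.")
print(result)
```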
Ollama Token Field Reference
| Ollama field | Meaning | Notes |
|---|---|---|
| `eval_count` | Output tokens generated | Always present in non-stream responses |
| `prompt_eval_count` | Input tokens processed | May be absent if Ollama served from KV cache |
| `eval_duration` | Generation time in nanoseconds | Use `time.time()` deltas for wall-clock latency instead |
| `message.content` | Generated text | Check for empty string; Ollama may return `""` on resource exhaustion |
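To keep span outputs uniform across both patterns, the fields above can be folded into one small helper. A minimal sketch; `token_metrics` is a hypothetical function, not part of the Nexus client or Ollama:

```python
# Collect Ollama's token fields into one dict suitable as span output.
# The .get() fallbacks cover KV-cache responses where prompt_eval_count
# is absent; eval_duration is reported in nanoseconds and converted to
# milliseconds here for consistency with latency_ms.
def token_metrics(data: dict) -> dict:
    return {
        "output_tokens": data.get("eval_count", 0),
        "prompt_tokens": data.get("prompt_eval_count", 0),
        "generation_ms": data.get("eval_duration", 0) // 1_000_000,
    }
```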