Cloudflare Workers AI Guide

Add trace-level observability to Cloudflare Workers AI agents — record model name, latency, and token counts for every env.AI.run() call.

What you'll get: one Nexus trace per Worker invocation, one span per inference call, with model name, latency, and token counts when available. Error spans for empty responses and thrown exceptions.

Installation

npm install @keylightdigital/nexus

Add your API key as a Workers secret (never hard-code it in your Worker source):

# Add your API key as a Workers secret
wrangler secret put NEXUS_API_KEY

Add NEXUS_API_KEY: string to your Env interface so TypeScript knows about it.

Basic Pattern

Wrap every env.AI.run() call in a Nexus trace and span. Create a new NexusClient per request — Worker invocations don't share state, and client construction is lightweight enough to do on every request:

import { NexusClient } from '@keylightdigital/nexus'

/** Bindings available to this Worker. */
export interface Env {
  // Workers AI binding — presumably configured in wrangler config; verify binding name
  AI: Ai
  // Secret set via `wrangler secret put NEXUS_API_KEY` (never hard-coded)
  NEXUS_API_KEY: string
}

export default {
  /**
   * Handles one Worker invocation: runs a single Workers AI inference and
   * records it as one Nexus trace containing one span, including model name,
   * latency, and token counts (null when the model reports no usage).
   */
  async fetch(request: Request, env: Env): Promise<Response> {
    // Fresh client per invocation — Worker invocations don't share state,
    // and construction is cheap.
    const nexus = new NexusClient({
      apiKey: env.NEXUS_API_KEY,
      agentId: 'my-workers-ai-agent',
    })

    const { prompt } = await request.json<{ prompt: string }>()
    const model = '@cf/meta/llama-3-8b-instruct'

    // One trace per request; the trace name carries a prompt preview.
    const trace = await nexus.startTrace({
      name: `inference: ${prompt.slice(0, 60)}`,
      metadata: { model },
    })
    const span = await trace.addSpan({
      name: 'workers-ai-inference',
      input: { prompt, model },
    })

    const startedAt = Date.now()
    try {
      const result = await env.AI.run(model, { prompt }) as {
        response: string
        usage?: { input_tokens: number; output_tokens: number }
      }
      const elapsed = Date.now() - startedAt

      // A blank completion counts as a failure worth recording.
      const text = result.response
      if (!text?.trim()) {
        await span.end({ status: 'error', output: { error: 'empty_response', model, latency_ms: elapsed } })
        await trace.end({ status: 'error' })
        return new Response('Empty response', { status: 500 })
      }

      await span.end({
        status: 'ok',
        output: {
          model,
          latency_ms: elapsed,
          // usage is undefined for some models — record null instead of crashing.
          input_tokens: result.usage?.input_tokens ?? null,
          output_tokens: result.usage?.output_tokens ?? null,
        },
      })
      await trace.end({ status: 'success' })
      return Response.json({ result: text })
    } catch (err) {
      // Close both span and trace with error status before propagating.
      await span.end({ status: 'error', output: { error: String(err), model, latency_ms: Date.now() - startedAt } })
      await trace.end({ status: 'error' })
      throw err
    }
  },
}

Agentic Loops

For Workers that call env.AI.run() multiple times in a loop, create one span per call and share a single trace:

export default {
  /**
   * Agentic-loop handler: runs up to 6 sequential env.AI.run() calls that
   * share one Nexus trace, with one span per inference call. The loop stops
   * early when the model emits "DONE".
   *
   * Fix: the original leaked an open span when env.AI.run() threw — the span
   * created before the call was never ended, because the outer catch only
   * closed the trace. The inference call is now wrapped so the span is ended
   * with an error status before the exception propagates.
   */
  async fetch(request: Request, env: Env): Promise<Response> {
    const nexus = new NexusClient({ apiKey: env.NEXUS_API_KEY, agentId: 'workers-ai-agent' })
    const { task } = await request.json<{ task: string }>()
    const model = '@cf/meta/llama-3-8b-instruct'

    const trace = await nexus.startTrace({ name: `agent: ${task.slice(0, 60)}`, metadata: { model } })
    // Mutated via push only, never reassigned — const, not let.
    const messages = [{ role: 'user', content: task }]
    let iteration = 0

    try {
      while (iteration < 6) {
        iteration++
        const span = await trace.addSpan({ name: `llm-call-${iteration}`, input: { iteration } })
        const start = Date.now()
        let resp: { response: string; usage?: { input_tokens: number; output_tokens: number } }
        try {
          resp = await env.AI.run(model, { messages }) as {
            response: string; usage?: { input_tokens: number; output_tokens: number }
          }
        } catch (err) {
          // End the span before rethrowing so a failed inference call is not
          // left as an open (leaked) span; the outer catch then ends the trace.
          await span.end({ status: 'error', output: { error: String(err), latency_ms: Date.now() - start } })
          throw err
        }
        await span.end({
          status: 'ok',
          output: {
            latency_ms: Date.now() - start,
            // usage is undefined for some models — record null instead of crashing.
            output_tokens: resp.usage?.output_tokens ?? null,
          },
        })
        messages.push({ role: 'assistant', content: resp.response })
        // The `iteration >= 6` check is load-bearing: on the final iteration it
        // breaks before pushing 'Continue.', so the last message stays the
        // assistant's reply (which is what the response below returns).
        if (resp.response.includes('DONE') || iteration >= 6) break
        messages.push({ role: 'user', content: 'Continue.' })
      }
      await trace.end({ status: 'success' })
      return Response.json({ result: messages[messages.length - 1].content })
    } catch (err) {
      await trace.end({ status: 'error' })
      throw err
    }
  },
}

Token Field Notes

Workers AI returns a usage object for some models and undefined for others. Always guard against undefined:

// Safe pattern — null recorded when usage is unavailable.
// Optional chaining handles models that return no usage object at all;
// ?? (not ||) preserves a legitimate token count of 0.
const inputTokens = response.usage?.input_tokens ?? null
const outputTokens = response.usage?.output_tokens ?? null

The models that currently return usage include the Llama 3 family. Mistral 7B and some other models may return undefined — recording null in the span output is preferable to crashing.