Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,15 @@ pnpm watch # Rebuild extension and webviews on changes
Press F5 to launch the Extension Development Host. Use "Developer: Reload
Webviews" to see webview changes.

## Telemetry

Local telemetry instrumentation follows a shared style: how spans are threaded,
how events and properties are named, and properties vs measurements. Read this
before adding new telemetry so it stays consistent across the codebase. It lives
next to the code:

**[`src/instrumentation/CONVENTIONS.md`](src/instrumentation/CONVENTIONS.md)**

## Testing

There are a few ways you can test the "Open in VS Code" flow:
Expand Down
16 changes: 8 additions & 8 deletions src/commands.ts
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ export class Commands {
return;
}
if (resolved.status === "failed") {
telemetry.fail(resolved.category);
telemetry.error(resolved.category);
return;
}

Expand Down Expand Up @@ -346,7 +346,7 @@ export class Commands {
telemetry.abort("progress");
return;
}
telemetry.fail();
telemetry.error();
this.logger.error("Speed test failed", result.error);
vscode.window.showErrorMessage(
`Speed test failed: ${toError(result.error).message}`,
Expand All @@ -364,14 +364,14 @@ export class Commands {
});
} catch (err) {
if (err instanceof ZodError || err instanceof SyntaxError) {
telemetry.fail("parse_error");
telemetry.error("parse_error");
this.logger.error("Failed to parse speedtest output", err);
vscode.window.showErrorMessage(
"Speed test output did not match the expected format. Check `Output > Coder` for details.",
);
return;
}
telemetry.fail();
telemetry.error();
this.logger.error("Failed to display speedtest results", err);
vscode.window.showErrorMessage(
`Speed test returned unexpected output: ${toError(err).message}`,
Expand All @@ -395,7 +395,7 @@ export class Commands {
return;
}
if (resolved.status === "failed") {
telemetry.fail(resolved.category);
telemetry.error(resolved.category);
return;
}

Expand Down Expand Up @@ -444,7 +444,7 @@ export class Commands {
telemetry.abort("progress");
return;
}
telemetry.fail(
telemetry.error(
result.error instanceof SupportBundleUnsupportedCliError
? "unsupported_cli"
: undefined,
Expand Down Expand Up @@ -915,7 +915,7 @@ export class Commands {
return false;
}
if (pick.status === "failed") {
telemetry.fail(pick.category);
telemetry.error(pick.category);
return false;
}
workspace = pick.workspace;
Expand Down Expand Up @@ -1224,7 +1224,7 @@ export class Commands {
}
})
.catch((ex) => {
fetchErrorCategory = "fetch_failed";
fetchErrorCategory = "fetch_error";
this.logger.error("Failed to fetch workspaces", ex);
if (ex instanceof CertificateError) {
void ex.showNotification();
Expand Down
4 changes: 2 additions & 2 deletions src/core/cliManager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,7 @@ export class CliManager {
);
verifySpan.setProperty("outcome", result.kind);
if (result.kind === "sig_not_found") {
verifySpan.setProperty("sigStatus", String(result.status));
verifySpan.setProperty("sig_status", String(result.status));
}
});
}
Expand Down Expand Up @@ -590,7 +590,7 @@ export class CliManager {
}
} finally {
if (bytesWritten > 0) {
downloadSpan.setMeasurement("downloadedBytes", bytesWritten);
downloadSpan.setMeasurement("downloaded_bytes", bytesWritten);
}
await downloadProgress.clearProgress(progressLogPath);
}
Expand Down
161 changes: 161 additions & 0 deletions src/instrumentation/CONVENTIONS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
# Telemetry conventions

How to add telemetry so every instrumentation reads the same way. The framework
lives in `src/telemetry`; the per-domain instrumentation business code talks to
lives here in `src/instrumentation`.

## Checklist

- One instrumentation class per domain (`FooTelemetry`) wrapping
`TelemetryService`; business code imports that, never a raw span.
- Event name is `domain.snake_case`; point-in-time logs use past tense.
- Event names and attribute keys follow OTel: lowercase, `.` for hierarchy, `_`
to split words, never camelCase. Enumerated values are typed `snake_case`
unions, never bare `string`.
- Numbers go in `measurements` (raw), never pre-bucketed into string properties.
- Set attributes imperatively with `setProperty`/`setMeasurement`; never add a
return value that exists only to be logged.
- No secrets, tokens, query strings, or unbounded values in properties; routes
go through `normalizeRoute`.
- Let the framework set `result`; add a domain `outcome` only when an operation
has several success modes. Errors go to a typed `error.type` union; non-error
early exits call `markAborted`.

## Layers

- **Framework** (`src/telemetry`): `TelemetryService` (`trace`/`log`/`logError`)
hands out `Span` handles and owns IDs, timing, `result`, level-gating, and the
wire format. Telemetry-off is handled here (`NOOP_SPAN`), so instrumentation
never checks whether telemetry is enabled.
- **Instrumentation** (`src/instrumentation/*`): one typed class per domain, the
only telemetry surface business code sees.

## Structure

- Split instrumentation files along the same boundaries as the business code,
not one catch-all module.
- Shared span helpers (`recordError`, `recordAborted`) live in one shared module,
not duplicated per file.
- Record-error-then-rethrow-outside-the-span logic lives once per class, in a
single private helper, not in every `traceX` method.

## Threading

Spans are passed **explicitly** as a callback argument; there is no
ambient/active-span context. Two patterns keep telemetry out of business logic:

1. **Imperative attributes** — `span.setProperty("outcome", "cache_hit")` at the
point the value is known. This is the standard OpenTelemetry model.
2. **Typed phases** — wrap an async step in `span.phase(...)` and read one
property off its _natural_ return value, e.g.
`trace.versionCheck(() => this.checkBinary(...))`. Extraction stays out of
the business function.

Never return a value purely so a caller can log it; that couples the return type
to observability. Returning is fine when the business uses the value too.

## Callers

- Declare telemetry dimensions explicitly at the call site; pass `source: "uri"`
rather than inferring it from which arguments happen to be set.
- Keep business bodies in named private `runX(args, trace)` methods; the public
method just opens the span and wraps them. Small diffs, named telemetry seam.
- When sibling events share a correlating property, emit it on every event in the
family; don't drop it from new ones.

## Spans, phases, logs

- `trace(name, fn, props?, meas?)` — a span with framework-set `result` and
`durationMs`. Use for an operation with a start and end.
- `span.phase(name, fn, ...)` — the same, nested (composed as `parent.child`);
child names cannot contain `.`.
- `span.log(name, ...)` / `span.logError(name, error, ...)` — point-in-time
events under a span, no `result`/`durationMs`. Use for discrete signals.

## Naming

| Thing | Convention | Examples |
| -------------------------- | ------------------------------------------- | --------------------------------------------------------- |
| Event name | `domain.snake_case` | `cli.resolve`, `remote.setup`, `connection.dropped` |
| Point-in-time log | past tense | `connection.dropped`, `ssh.process.lost` |
| Child phase | bare `snake_case` | `cache_lookup`, `version_check`, `connection_handoff` |
| Property / measurement key | lowercase; `_` splits words, `.` for groups | `cache_source`, `error.type`, `status.from` |
| Enumerated value | typed `snake_case` union | `"cache_hit"`, `"session_token"`, `"unrecoverable_close"` |

This is the [OTel attribute convention](https://opentelemetry.io/docs/specs/semconv/general/naming/):
`.` is the namespace delimiter, `_` joins words within a segment, never
camelCase. Default to a flat `snake_case` key; use `.` only to group genuinely
related attributes (a `status.from` / `status.to` pair). Keep phase names
subject-first within a domain (`agent_resolve`, not `resolve_agent`).

**Methods.** Span-wrapper methods are `trace<Noun>` (`traceOpen`,
`traceConfirmationPrompt`); don't echo the event's past-tense suffix
(`traceUpdateConfirmationPrompted`), and drop qualifiers the class implies.

**Grouping.** Group related events under a shared dotted namespace so a prefix
query returns the whole family: `workspace.update.triggered` and
`workspace.update.prompted` both sit under `workspace.update.*`, as do
`auth.token_refresh.completed` / `.deduped` under `auth.token_refresh.*`. Keep
the namespace node a pure prefix; don't also emit an event with that exact name,
since dotted names otherwise read as span phases (`parent.child`). This is
[OTel namespacing](https://opentelemetry.io/docs/specs/semconv/general/naming/),
which exists precisely so related signals query together.

**Namespacing.** OTel suggests prefixing custom attributes (`com.acme.*`) to
avoid clashing with future semconv. We don't namespace event-level attributes:
each is already scoped by its (namespaced) event name and only flows into
Coder's own pipeline, so a bare `cache_source` can't collide with a future OTel
`cache.source`. Resource and provenance attributes stay namespaced
(`coder.deployment.url`, `coder.event.*`).

## Properties vs measurements

- **Properties** are low-cardinality string dimensions (the framework stringifies
`string | number | boolean`). Use them for what you group or filter by.
- **Measurements** are raw numbers. Don't pre-bucket into string labels: both
export as record attributes, and a query can bucket the raw number at read
time. `result` and `durationMs` are framework-managed and cannot be set.
- **Units.** There is no unit field at emit time, so put the unit in the
measurement key as a `_ms` / `_mbits` suffix, the same way for every event.
The OTLP exporter then resolves it per signal: for metric events
(`http.requests`, `ssh.network.sampled`) it moves the suffix into the OTLP
`unit` field and drops it from the metric name (`latency_ms` exports as metric
`latency`, unit `ms`, which Prometheus then suffixes itself); log and span
attributes keep the key as written. You always author `latency_ms`; only the
exported metric name changes.
- **Counts.** Name a count `<entity>.count`, singular entity (`agent.count`,
`workspace.count`), per OTel (`system.process.count`) — not flat `agent_count`.
Related counts share the namespace (`agent.count`, `agent.connected_count`); a
count with no entity (`retry_count`) stays flat.

## Outcomes, errors, aborts

- The framework sets `result` (`success` / `error` / `aborted`) on every span;
don't duplicate it.
- Add a domain `outcome` property only when an operation has several success
modes worth distinguishing (e.g. `cli.resolve`: `cache_hit`, `downloaded`).
- Classify errors into a typed `error.type` union via a `categorize*Error` helper
rather than emitting raw error strings; the framework captures the error block.
- For a non-error early exit (backed out, not-found), call `span.markAborted()`
rather than throwing, recording its reason in a separate key, not `error.type`.
- A trace exposes one outcome trio — `abort(stage)`, `error(category?)`,
`succeed*(payload)` — over the shared `recordAborted` / `recordError` helpers;
each outcome sets one, never two.
- Prefer **error** over "failure" and **abort** over "cancel" here (`recordError`
/ `error.type` / `error()`; `recordAborted` / `markAborted` / `abort()`).
Point-in-time logs keep past tense (`recovery_failed`) — they state what
happened, not a span's `error` result.

## Safety

Never put tokens, credentials, full URLs with query strings, or unbounded user
input into properties. Routes go through `normalizeRoute`
(`src/logging/routeNormalization.ts`). Prefer a closed union over a free-form
string for any property a dashboard groups by.

## Tests

- Telemetry-only tests of business code are `<subject>.telemetry.test.ts`;
instrumentation modules keep `<module>.test.ts` and split when the module does.
- Assert what privacy intends to omit, not only what is present (e.g.
`workspace_name` undefined on command events).
10 changes: 5 additions & 5 deletions src/instrumentation/activation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,25 @@ export interface ActivationTracer {
}

/**
* Emits `activation` with `authState`, plus a sibling `activation.deployment_init`
* Emits `activation` with `auth_state`, plus a sibling `activation.deployment_init`
* trace (sibling, not child, because deployment init outlives the activation span).
*/
export class ActivationTelemetry {
public constructor(private readonly telemetry: TelemetryService) {}

public trace<T>(fn: (tracer: ActivationTracer) => Promise<T>): Promise<T> {
return this.telemetry.trace("activation", async (span) => {
span.setProperty("authState", "none");
span.setProperty("auth_state", "none");
return fn({
setAuthState: (state) => span.setProperty("authState", state),
setAuthState: (state) => span.setProperty("auth_state", state),
traceDeploymentInit: (initFn) =>
this.telemetry.trace(
"activation.deployment_init",
async (childSpan) => {
childSpan.setProperty("authState", "unknown");
childSpan.setProperty("auth_state", "unknown");
const success = await initFn();
childSpan.setProperty(
"authState",
"auth_state",
success ? "valid_token" : "auth_failed",
);
return success;
Expand Down
14 changes: 8 additions & 6 deletions src/instrumentation/auth.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ export class AuthTelemetry {
}
return result;
} catch (error) {
span.setProperty("reason", "exception");
span.setProperty("error.type", "exception");
throw error;
}
},
Expand All @@ -78,7 +78,7 @@ export class AuthTelemetry {
}
return result;
} catch (error) {
span.setProperty("reason", "exception");
span.setProperty("error.type", "exception");
throw error;
}
});
Expand All @@ -88,7 +88,9 @@ export class AuthTelemetry {
trigger: AuthTokenRefreshTrigger,
fn: () => Promise<T>,
): Promise<T> {
return this.telemetry.trace("auth.token_refreshed", fn, { trigger });
return this.telemetry.trace("auth.token_refresh.completed", fn, {
trigger,
});
}

/** Logged when a refresh call joins an in-flight refresh and emits no span of its own. */
Expand All @@ -110,9 +112,9 @@ export class AuthTelemetry {
logReceived: () => span.log("received"),
setRecovery: (recovery) => span.setProperty("recovery", recovery),
setRefreshAttempted: (attempted) =>
span.setProperty("refreshAttempted", attempted),
span.setProperty("refresh_attempted", attempted),
}),
{ recovery: "none", refreshAttempted: false },
{ recovery: "none", refresh_attempted: false },
);
}

Expand Down Expand Up @@ -141,11 +143,11 @@ export class AuthTelemetry {

/** `auth_failed` is a real error; user/URL dismissals are intentional aborts. */
function recordReason(span: Span, reason: LoginPromptReason): void {
span.setProperty("reason", reason);
if (reason === "auth_failed") {
span.setProperty("error.type", reason);
span.markError();
} else {
span.setProperty("reason", reason);
span.markAborted();
}
}
8 changes: 4 additions & 4 deletions src/instrumentation/diagnostics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ export type DiagnosticAbortStage =

export interface DiagnosticTrace {
abort(stage: DiagnosticAbortStage): void;
fail(category?: DiagnosticErrorCategory): void;
error(category?: DiagnosticErrorCategory): void;
setRequestedDuration(seconds: number): void;
succeedSpeedtest(result: SpeedtestResult): void;
succeedExport(format: string, eventCount: number): void;
Expand Down Expand Up @@ -54,7 +54,7 @@ class SpanDiagnosticTrace implements DiagnosticTrace {
recordAborted(this.span, stage);
}

public fail(category: DiagnosticErrorCategory = "error"): void {
public error(category: DiagnosticErrorCategory = "error"): void {
recordError(this.span, category);
}

Expand All @@ -63,7 +63,7 @@ class SpanDiagnosticTrace implements DiagnosticTrace {
}

public succeedSpeedtest(result: SpeedtestResult): void {
this.span.setMeasurement("interval_count", result.intervals.length);
this.span.setMeasurement("interval.count", result.intervals.length);
this.span.setMeasurement(
"throughput_mbits",
result.overall.throughput_mbits,
Expand All @@ -72,6 +72,6 @@ class SpanDiagnosticTrace implements DiagnosticTrace {

public succeedExport(format: string, eventCount: number): void {
this.span.setProperty("format", format);
this.span.setMeasurement("event_count", eventCount);
this.span.setMeasurement("event.count", eventCount);
}
}
Loading