# OpenTelemetry Specification — Possibility Integration Profile # This file documents OpenTelemetry as an external standard that Possibility # integrates with for distributed tracing and metrics collection. # # Version: 0.3.0 # Status: Active # Last Updated: 2026-02-15 # # THREE-GATE TEST: # Gate 1 (Distinctness): OTel is an observability standard — fundamentally # different from activity logging (AS2), workflows, or connectors. # Gate 2 (Reusability): Referenced by every instrumented service and the # SigNoz connector. Central to the telemetry architecture. # Gate 3 (Clarity): Makes the OTel integration discoverable through the # .kno ontology alongside the SigNoz connector entity. # # EMBRACE-AND-EXTEND: This spec references the OpenTelemetry Specification. # https://opentelemetry.io/docs/specs/otel/ # # KEY RELATIONSHIP: OTel provides the instrumentation layer; SigNoz provides # the backend. Activities (AS2) log semantic meaning; traces log execution. # The activity_bridge concept links these two worlds via traceId correlation. # ============================================================================= # @kno:manifest # ============================================================================= $schema: kno@0.0.9 id: 01KHCBQ2H2R5XQ2BZSEX0JKRT5 slug: otel-spec type: spec version: 0.3.0 title: "OpenTelemetry Integration Profile" purpose: | Document how Possibility integrates with OpenTelemetry for distributed tracing and metrics collection across all backend services. **What this IS:** A specification for how OTel is configured, what signals are collected, and how telemetry data relates to .kno activity records. **What this is NOT:** A schema for telemetry entities. Traces, spans, and metrics are infrastructure exhaust — they live in SigNoz (ClickHouse), not in the .kno knowledge graph. Per the Three-Gate Test, telemetry data fails Gates 2 & 3 for .kno entity status. 
**Dual Observability Model:** ``` ┌─────────────────────────────────────────────────────────────┐ │ Possibility Observability │ ├──────────────────────────┬──────────────────────────────────┤ │ Activity Logging (AS2) │ Distributed Tracing (OTel) │ ├──────────────────────────┼──────────────────────────────────┤ │ WHAT happened │ HOW it happened │ │ Semantic events │ Execution paths │ │ .kno entities │ ClickHouse time-series │ │ pspace-activity service │ SigNoz backend │ │ User-facing audit trail │ Operator-facing diagnostics │ ├──────────────────────────┴──────────────────────────────────┤ │ Bridge: activity.auditMetadata.trace_id → SigNoz trace │ └─────────────────────────────────────────────────────────────┘ ``` # ============================================================================= # @kno:taxonomy # ============================================================================= taxonomy: topics: - observability - distributed-tracing - metrics - instrumentation - telemetry keywords: - opentelemetry - otel - traces - spans - metrics - OTLP - collector - SigNoz - instrumentation # ============================================================================= # @kno:relationships # ============================================================================= provenance: origin: id: 01KHCBQ2H2R5XQ2BZSEX0JKRT5 timestamp: "2026-07-18T00:00:00Z" tool: manual relationships: extends: - url: "https://opentelemetry.io/docs/specs/otel/" reason: "OpenTelemetry Specification (embrace-and-extend)" - url: "https://opentelemetry.io/docs/specs/otlp/" reason: "OTLP wire protocol for trace/metric export" depends_on: - xri: "kno://specs/kno-spec" reason: "Conforms to KNO format specification" enables: - xri: "kno://specs/activity-schema" reason: "OTel traces enable activity-to-trace correlation via trace_id in pspace:auditMetadata" related_to: - xri: "kno://specs/activity-schema" reason: "Activity logging captures WHAT; OTel captures HOW. The activity_bridge links them." 
- xri: "kno://connectors/signoz" reason: "SigNoz is the OTel backend — receives, stores, and visualizes telemetry data" - xri: "kno://services/pspace-api" reason: "Instrumented service" - xri: "kno://services/pspace-activity" reason: "Instrumented service" - xri: "kno://services/pspace-spec-registry" reason: "Instrumented service" # ============================================================================= # @kno:quality # ============================================================================= quality: completeness: 0.90 last_reviewed: "2026-02-15" review_status: active reviewed_by: "claude" # ============================================================================= # @kno:history # ============================================================================= _history: version: 3 created: "2026-07-18T00:00:00Z" created_by: "claude" modified: "2026-02-15T00:00:00Z" modified_by: "claude" changelog: - version: "0.3.0" date: "2026-02-15" summary: "Comprehensive update per #119 — documented ESM --import pattern, per-service CMD differences, postgres.js traced-sql wrapper, Hono middleware enrichment, Activity Bridge marked Active, removed pspace-inference reference" - version: "0.2.0" date: "2026-02-13" summary: "Activity Bridge implementation — added enables relationship, marked bridge as Implementing" # ============================================================================= # @kno:index # ============================================================================= _index: - path: "overview" line: 200 keywords: [otel, architecture, signals, dual-model] - path: "signals" line: 254 keywords: [traces, metrics, spans, auto-instrumentation] - path: "instrumented-services" line: 304 keywords: [api, activity, spec-registry] - path: "collector-pipeline" line: 375 keywords: [OTLP, batch, ClickHouse, collector] - path: "activity-bridge" line: 417 keywords: [traceId, auditMetadata, correlation] - path: "configuration" line: 468 keywords: [env-vars, sampling, 
OTEL_SERVICE_NAME] # ============================================================================= # @kno:contains # ============================================================================= contains: - xri: "#overview" role: section title: "Overview" keywords: [architecture, dual-model, signals] - xri: "#signals" role: section title: "Signals Collected" keywords: [traces, metrics, auto-instrumentation] - xri: "#instrumented-services" role: section title: "Instrumented Services" keywords: [api, activity, spec-registry] - xri: "#collector-pipeline" role: section title: "Collector Pipeline" keywords: [OTLP, batch, ClickHouse] - xri: "#activity-bridge" role: section title: "Activity Bridge" keywords: [traceId, correlation, auditMetadata] - xri: "#configuration" role: section title: "Configuration" keywords: [env-vars, sampling] # ============================================================================= # SECTION: Overview # ============================================================================= # @section: overview # @title: Overview # ============================================================================= # OpenTelemetry Integration Architecture # # Possibility uses OpenTelemetry (OTel) for distributed tracing and metrics # collection across all backend services. 
The integration follows an # embrace-and-extend pattern: # # - EMBRACE: Standard OTel SDK, OTLP protocol, auto-instrumentation # - EXTEND: Activity bridge (traceId in AS2 auditMetadata), Possibility- # specific resource attributes, health endpoint filtering # # Architecture: # # ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ # │ pspace-api │ │pspace-activity│ │spec-registry │ # │ (Hono) │ │ (Hono) │ │ (Hono) │ # │ OTel SDK │ │ OTel SDK │ │ OTel SDK │ # └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ # │ │ │ # └─────────────────┴──────────────────┘ # │ # OTLP/gRPC :4317 # │ # ▼ # ┌──────────────────────────┐ # │ OTel Collector │ # │ (signoz-otel-collector) │ # │ receivers → processors │ # │ → exporters │ # └────────────┬──────────────┘ # │ # ClickHouse native # │ # ▼ # ┌──────────────────────────┐ # │ ClickHouse │ # │ (signoz-clickhouse) │ # │ Time-series storage │ # └────────────┬──────────────┘ # │ # ▼ # ┌──────────────────────────┐ # │ SigNoz UI │ # │ (signoz-app :3301) │ # │ Dashboards, traces, │ # │ metrics explorer │ # └──────────────────────────┘ # ============================================================================= # SECTION: Signals Collected # ============================================================================= # @section: signals # @title: Signals Collected # ============================================================================= # Traces (Distributed Tracing) # # Auto-instrumented via @opentelemetry/auto-instrumentations-node: # - HTTP requests (incoming and outgoing) # - DNS lookups # - Express/Hono middleware chains # - fetch() calls (undici instrumentation) # # PostgreSQL queries (postgres.js workaround): # postgres.js uses its own wire protocol, not node-postgres (pg). # The standard @opentelemetry/instrumentation-pg targets the `pg` module # and cannot instrument postgres.js. # # WORKAROUND: `src/traced-sql.ts` — a Proxy-based wrapper around the # postgres.js `sql` tagged template. 
# # Pattern (pspace-api and pspace-activity): # import { tracedSql as sql } from "./traced-sql.js"; # # The Proxy intercepts `sql` template calls and `sql.unsafe()`, creating # OTel spans with DB semantic convention attributes: # - db.system: "postgresql" # - db.operation.name: e.g. "SELECT", "INSERT" # - db.collection.name: extracted table name (best-effort) # - db.response.returned_rows: result count # # This provides full query-level observability without patching postgres.js # internals. Services that don't use SQL (pspace-spec-registry) don't # include this wrapper. # # Metrics (Service Metrics) # # Exported via PeriodicExportingMetricReader (30s interval): # - Runtime metrics (event loop lag, heap usage, GC) # - HTTP request metrics (duration, count, status codes) # - Custom metrics can be added per-service as needed # # Logs # # NOT currently collected via OTel. Services use Hono's built-in logger # which writes to stdout. Container logs are available via `docker logs`. # Future: OTel log bridge for structured log correlation. # ============================================================================= # SECTION: Instrumented Services # ============================================================================= # @section: instrumented-services # @title: Instrumented Services # ============================================================================= # Three Hono backend services are instrumented with OTel. # (pspace-inference was removed — see #120.) # # Service | OTEL_SERVICE_NAME | Port | Runtime # ---------------------|----------------------|------|------------------------ # pspace-api | pspace-api | 3000 | tsx (ESM) # pspace-activity | pspace-activity | 3005 | node (CJS build) # pspace-spec-registry | pspace-spec-registry | 8081 | tsx (ESM) # # Instrumentation loading — ESM --import pattern # # OTel MUST be imported before any other module so that monkey-patching # takes effect. All services use the Node.js --import flag (NOT --require). 
# # Two runtime variants exist: # # Variant A — tsx (ESM, used by pspace-api, pspace-spec-registry): # CMD: npx tsx --import ./src/instrumentation.ts src/index.ts # The tsx loader handles TypeScript natively. OTel SDK registers # auto-instrumentations at import-time before index.ts runs. # NOTE: src/index.ts does NOT import instrumentation — the --import flag # ensures it loads first, before any application code. # # Variant B — node (CJS build, used by pspace-activity): # CMD: node --import @opentelemetry/instrumentation/hook.mjs \ # --import ./dist/instrumentation.js dist/index.js # Uses the OTel ESM loader hook (hook.mjs) for ESM module patching. # The CJS require workaround in instrumentation.ts: # import { createRequire } from "module"; # const require = createRequire(import.meta.url); # require("http"); require("https"); # This triggers CJS resolution so @opentelemetry/instrumentation-http # can patch the modules before ES imports reference them. # # Common instrumentation.ts structure (both variants): # 1. Conditional guard: only initializes if OTEL_EXPORTER_OTLP_ENDPOINT set # 2. resourceFromAttributes({ "service.name": ... }) — v2.x API # 3. OTLPTraceExporter (gRPC) + BatchSpanProcessor # 4. PeriodicExportingMetricReader (OTLPMetricExporter, 30s interval) # 5. getNodeAutoInstrumentations() with endpoint filtering # 6. Graceful shutdown on SIGTERM/SIGINT # # SDK version: @opentelemetry/sdk-node v0.200.0 # Resources API: v2.x uses resourceFromAttributes() (not new Resource()) # # Hono Middleware Enrichment # # All services register a Hono middleware that enriches spans: # app.use("*", async (c, next) => { # const span = trace.getActiveSpan(); # span?.setAttribute("http.route", c.req.path); # await next(); # span?.setAttribute("http.response.status_code", c.res.status); # }); # # pspace-api additionally sets `enduser.id` from the auth context: # span?.setAttribute("enduser.id", c.get("userId")); # Other services do not set enduser.id. 
# # Filtered endpoints (excluded from traces to reduce noise): # All services filter: GET /health # pspace-api also filters: GET /version # No service currently filters /favicon.ico. # ============================================================================= # SECTION: Collector Pipeline # ============================================================================= # @section: collector-pipeline # @title: Collector Pipeline # ============================================================================= # The OTel Collector receives telemetry from all services and routes it to # ClickHouse for storage. Pipeline configuration: # # Config file: services/signoz/otel-collector-config.yaml # # Receivers: # otlp: # protocols: # grpc: # endpoint: 0.0.0.0:4317 # http: # endpoint: 0.0.0.0:4318 # # Processors: # batch: # send_batch_size: 10000 # send_batch_max_size: 11000 # timeout: 10s # resourcedetection: # detectors: [env, system] # override: false # # Exporters: # clickhousetraces: # dsn: tcp://signoz-clickhouse:9000 # docker_multi_node: false # low_cardinal_exception_grouping: true # clickhousemetrics: # dsn: tcp://signoz-clickhouse:9000 # clickhouselogs: # dsn: tcp://signoz-clickhouse:9000 # # Service pipelines: # traces: otlp → batch, resourcedetection → clickhousetraces # metrics: otlp → batch, resourcedetection → clickhousemetrics # logs: otlp → batch, resourcedetection → clickhouselogs # ============================================================================= # SECTION: Activity Bridge # ============================================================================= # @section: activity-bridge # @title: Activity Bridge — Linking Activities to Traces # ============================================================================= # The Activity Bridge is the key Possibility extension to OTel — it links # semantic activity records (AS2) to execution traces (OTel) via traceId.
# # STATUS: Active (implemented in pspace-activity, GitHub issue #108 closed) # # Ontological significance: # This is a CROSS-SYSTEM SEMANTIC BRIDGE. It connects: # - .kno-modeled entity (activity in PostgreSQL) — WHAT happened # - External execution trace (OTel spans in ClickHouse) — HOW it happened # Per P7, this link carries semantic meaning: an activity REFERENCES its trace. # The trace is not a .kno entity (it's infrastructure exhaust), but the # reference TO it from a .kno entity IS semantically meaningful. # # Implementation (pspace-activity/src/routes/activities.ts): # When an activity is POSTed: # 1. The OTel SDK has already created a trace context for the HTTP request # 2. Extract traceId: trace.getActiveSpan()?.spanContext().traceId # 3. Store it in the activity's trace_id column (VARCHAR(32)) # 4. A partial index exists on trace_id WHERE trace_id IS NOT NULL # 5. Activities are queryable by traceId: GET /activities?traceId=abc123... # # All pspace:auditMetadata fields are stored as dedicated indexed columns: # # Schema impact (implemented in activity-schema.kno v0.2.0): # pspace:auditMetadata: # properties: # trace_id: { type: string } # OTel trace ID (32-char hex) # ip_address: { type: string } # Client IP # user_agent: { type: string } # Client user agent # session_id: { type: string } # Auth session # request_id: { type: string } # Request correlation UUID # # Query flow: # 1. User sees activity: "User created possibility 'XYZ'" # 2. Activity record contains trace_id: "abc123..." # 3. Click "View trace" → opens SigNoz filtered to that trace # 4. 
SigNoz shows: HTTP request → auth check → DB write → response # # Implementation: # - @opentelemetry/api: trace.getActiveSpan()?.spanContext().traceId # - Hono context: c.req.header('x-forwarded-for'), c.req.header('user-agent') # - Request ID: crypto.randomUUID() # - SigNoz deep link: /trace/{traceId} # ============================================================================= # SECTION: Configuration # ============================================================================= # @section: configuration # @title: Configuration Reference # ============================================================================= # Environment Variables (set per-service in docker-compose.yml): # # Variable | Default | Purpose # ------------------------------|--------------------------------------|--------------------------- # OTEL_SERVICE_NAME | (service-specific) | Service identity in traces # OTEL_EXPORTER_OTLP_ENDPOINT | http://signoz-otel-collector:4317 | Collector gRPC endpoint # OTEL_TRACES_SAMPLER | parentbased_always_on | Sampling strategy # # NOTE: OTEL_EXPORTER_OTLP_ENDPOINT is unset in the default (no-profile) setup, # which leaves OTel disabled; the value above applies when telemetry is enabled. # # Sampling Strategies: # - parentbased_always_on — Trace everything (development, low traffic) # - parentbased_traceidratio — Sample a percentage (production, high traffic) # Set OTEL_TRACES_SAMPLER_ARG=0.1 for 10% sampling # - always_off — Disable tracing (emergency, performance issues) # # Metric Export Interval: 30 seconds (hardcoded in instrumentation.ts) # # Resource Attributes (auto-detected): # - service.name — From OTEL_SERVICE_NAME # - host.name — From system hostname # - process.pid — From Node.js process # - telemetry.sdk.language — "nodejs" # - telemetry.sdk.name — "opentelemetry" # # Docker Compose Profiles: # - Default (no profile): Services run without OTel (no collector endpoint) # - `--profile telemetry`: Starts SigNoz stack, services export to collector # # Production startup: # ./scripts/start-pspace.sh --telemetry # Starts SigNoz in Phase 3 after app services are healthy.