# Agent Policy Schema — Layer 3 Site Schema
# KNO Schema Version: 0.1.0
#
# Layer 3 schema for declaring how AI agents and crawlers may interact with
# a Possibility-served site. One canonical entity (`content/agent-policy.kno`)
# drives the platform's `/robots.txt`, the "For agents" panel on each page,
# and the Cloudflare WAF/cache decisions documented in the Cloudflare runbook.
#
# DOMAIN: Site policy for AI agents and web crawlers
# PURPOSE: Single source of truth that distinguishes search indexing,
#          on-demand agent fetching, and training-data crawling
#
# KEY CONCEPT: Agent policy is a first-class .kno entity. The `robots.txt`
# served at the site root is a thin transformation of this entity (see
# specs/robots-txt-format.kno and the produces edge below).
#
# NOTE: Per-entity training opt-in is NOT modeled here. When/if individual
# entities need to override the global training stance, an `agent_visibility`
# field will be added to the relevant content schemas. The agent-policy
# entity remains the global default.

# =============================================================================
# SCHEMA DECLARATION
# =============================================================================
$schema: kno@0.0.9

# =============================================================================
# BASIC TIER
# =============================================================================
id: 01KPY2MSKXSGA76X3TPEPPH8M1
slug: agent-policy-schema
type: spec
version: 0.1.0

# =============================================================================
# STANDARD TIER
# =============================================================================
title: "Agent Policy Schema"

purpose: |
  Schema for site-level agent and crawler policy. A single canonical
  `agent-policy` entity declares which AI agents, search crawlers, and
  training-data crawlers may access the site.

  **Three crawler roles:**

  - `on-demand` — A user asked an agent to fetch a specific URL right now
    (e.g., ChatGPT-User, Claude-User, Perplexity-User). These are the agents
    that make "tell my LLM to look at this page" work.
  - `search` — Crawlers building a search index (Googlebot, Bingbot,
    OAI-SearchBot, Claude-SearchBot, PerplexityBot).
  - `training` — Crawlers gathering training data for foundation models
    (GPTBot, ClaudeBot, CCBot, Google-Extended).

  **Three policy verdicts per agent:**

  - `allow` — Permitted to fetch all paths (subject to per-path overrides)
  - `block` — Disallowed via robots.txt
  - `unspecified` — Falls through to `default_policy`

  **Production:** This schema declares `produces: [robots-txt-format]`. The
  `/robots.txt` served at the site root is a thin transformation of the
  policy entity (see `specs/robots-txt-format.kno` and the route at
  `services/pspace-site/src/pages/robots.txt.ts`).
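# =============================================================================
# EXAMPLE (NON-NORMATIVE)
# =============================================================================
# A minimal sketch of an entity conforming to this schema, and the robots.txt
# it would render to under the thin transformation described above. UA tokens,
# URLs, and verdicts are illustrative, not the site's actual policy.
#
#   default_policy: block
#   search_indexing: true
#   agents:
#     - name: Googlebot
#       role: search
#       policy: allow
#     - name: ChatGPT-User
#       role: on-demand
#       policy: allow
#     - name: GPTBot
#       role: training
#       policy: block
#   sitemap_url: "https://example.com/sitemap.xml"
#
# Rendered robots.txt, assuming one group per entry (order of `agents`
# preserved), a `*` group derived from `default_policy`, and a Sitemap
# directive from `sitemap_url`, per the field descriptions below:
#
#   User-agent: Googlebot
#   Allow: /
#
#   User-agent: ChatGPT-User
#   Allow: /
#
#   User-agent: GPTBot
#   Disallow: /
#
#   User-agent: *
#   Disallow: /
#
#   Sitemap: https://example.com/sitemap.xml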
# =============================================================================
# RICH TIER
# =============================================================================
provenance:
  origin:
    id: 01KPY2MSKXSGA76X3TPEPPH8M1
    timestamp: "2026-04-23T00:00:00Z"
    tool: manual-authoring
    issue: "https://github.com/PossibilityTruthy/possibility-space/issues/1807"

taxonomy:
  topics:
    - site-policy
    - agent-discovery
    - crawlers
    - robots-txt
    - ai-agents
  keywords:
    - agent-policy
    - robots
    - crawler
    - user-agent
    - on-demand
    - training
    - search-index

relationships:
  depends_on:
    - xri: "kno://specs/kno-spec"
      reason: "Conforms to KNO format specification"
  produces:
    # Schema-level declaration: any entity conforming to agent-policy-schema
    # produces a robots.txt artifact via thin transformation. The producer
    # edge lives on the schema (per kno-foundational-principles § Forward
    # vs Reverse Edge Storage v1.20.0).
    - xri: "kno://specs/robots-txt-format"
      reason: "Agent policy is materialized as a robots.txt at the site root via thin transformation"
  related_to:
    - xri: "kno://specs/llms-txt-format"
      reason: "Companion well-known artifact for AI agents (corpus index)"
    - xri: "kno://specs/page-schema"
      reason: "Pages surface 'For agents' affordances driven by this policy"
    - xri: "kno://content/agent-discovery-page"
      reason: "Documentation page that explains the discovery surfaces this policy enables"

quality:
  completeness: 0.85
  last_reviewed: "2026-04-23"
  review_status: draft
  reviewed_by: "claude"

# =============================================================================
# HISTORY
# =============================================================================
_history:
  retention: full
  format: changelog
  changelog:
    - version: "0.1.0"
      date: "2026-04-23"
      author: "claude"
      summary: "Initial agent-policy-schema (#1807 Phase 1)"
      changes:
        - "Defined three crawler roles (on-demand, search, training) and three policy verdicts (allow, block, unspecified)"
        - "Declared produces: [robots-txt-format] for thin transformation"
        - "Documented per-entity training opt-in as a deferred extension on content schemas (not on agent-policy itself)"

# =============================================================================
# SCHEMA DEFINITION
# =============================================================================
schema:
  name: agent-policy
  version: 0.1.0
  description: |
    Site-level policy declaring which AI agents and crawlers may access the
    site. One entity per site (or per tenant in a future per-tenant
    extension); drives robots.txt and other agent-facing surfaces.

  # ---------------------------------------------------------------------------
  # Required Fields
  # ---------------------------------------------------------------------------
  required_fields:
    - name: default_policy
      description: "Verdict applied to user-agents NOT explicitly listed in the `agents` array. Use `block` for a deny-by-default stance."
      type: string
      enum:
        - allow
        - block

    - name: search_indexing
      description: "Whether search-engine crawlers (Googlebot, Bingbot, etc.) may index the site. When false, these crawlers MUST appear in `agents` with policy=block (or be covered by default_policy=block)."
      type: boolean

    - name: agents
      description: "Explicit per-user-agent policy entries. Order is preserved in the rendered robots.txt."
      type: array
      items:
        type: object
        required_children:
          - name: name
            description: "User-Agent token as it appears in the UA string (case-sensitive match for robots.txt). Use `*` for the default entry only; prefer `default_policy` instead."
            type: string
          - name: role
            description: "Crawler classification."
            type: string
            enum:
              - on-demand
              - search
              - training
              - unknown
          - name: policy
            description: "Verdict for this agent."
            type: string
            enum:
              - allow
              - block
              - unspecified
        optional_children:
          - name: vendor
            description: "Organization operating the crawler (e.g., OpenAI, Anthropic, Google, Common Crawl)."
            type: string
          - name: documentation_url
            description: "Vendor's published documentation for this UA token."
            type: string
          - name: notes
            description: "Free-form rationale for the policy decision."
            type: string
          - name: disallow_paths
            description: "Optional per-agent path overrides. Defaults to `/` (everything) when policy=block, none when policy=allow."
            type: array
            items:
              type: string
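  # A sketch of one `agents` entry exercising the optional children above.
  # The UA token, vendor, URL, and path are placeholders; per the
  # `disallow_paths` description, policy=allow plus explicit paths means
  # "fetch everything except these paths".
  #
  #   - name: ExampleBot
  #     role: search
  #     policy: allow
  #     vendor: "Example Corp"
  #     documentation_url: "https://example.com/bot-docs"
  #     notes: "Allowed site-wide except draft content."
  #     disallow_paths:
  #       - /drafts/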
  # ---------------------------------------------------------------------------
  # Optional Fields
  # ---------------------------------------------------------------------------
  optional_fields:
    - name: sitemap_url
      description: "URL of the sitemap.xml, emitted as a Sitemap: directive in robots.txt. Omit if no sitemap is published."
      type: string

    - name: llms_txt_url
      description: "URL of the llms.txt corpus, emitted as a comment in robots.txt for discoverability. Convention: site root /llms.txt."
      type: string

    - name: contact
      description: "Contact information for crawler operators with policy questions or abuse reports."
      type: object
      children:
        - name: email
          type: string
        - name: url
          type: string

# =============================================================================
# CONTAINER NAVIGATION
# =============================================================================
contains:
  - xri: "#schema/required_fields"
    role: section
    title: "Required Fields"
    keywords: [ default_policy, search_indexing, agents ]
  - xri: "#schema/optional_fields"
    role: section
    title: "Optional Fields"
    keywords: [ sitemap_url, llms_txt_url, contact ]

_index:
  - path: "identity"
    line: 28
    keywords: [ id, agent-policy-schema, layer-3 ]
  - path: "schema/required_fields"
    line: 145
    keywords: [ default_policy, agents, role, policy ]
  - path: "schema/optional_fields"
    line: 210
    keywords: [ sitemap, llms_txt, contact ]
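# =============================================================================
# APPENDIX: OPTIONAL FIELDS EXAMPLE (NON-NORMATIVE)
# =============================================================================
# A sketch of the optional fields on a conforming entity; URLs and addresses
# are placeholders. Per the field descriptions above, `sitemap_url` becomes a
# `Sitemap:` directive and `llms_txt_url` a discoverability comment in the
# rendered robots.txt; nothing documented here emits `contact` into robots.txt.
#
#   sitemap_url: "https://example.com/sitemap.xml"
#   llms_txt_url: "https://example.com/llms.txt"
#   contact:
#     email: "crawler-policy@example.com"
#     url: "https://example.com/contact"
#
# Emitted robots.txt lines (assumed comment syntax for the llms.txt pointer):
#
#   Sitemap: https://example.com/sitemap.xml
#   # llms.txt: https://example.com/llms.txt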