# Agent Policy Schema — Layer 3 Site Schema
# KNO Schema Version: 0.1.0
#
# Layer 3 schema for declaring how AI agents and crawlers may interact with
# a Possibility-served site. One canonical entity (`content/agent-policy.kno`)
# drives the platform's `/robots.txt`, the "For agents" panel on each page,
# and the Cloudflare WAF/cache decisions documented in the Cloudflare runbook.
#
# DOMAIN: Site policy for AI agents and web crawlers
# PURPOSE: Single source of truth that distinguishes search indexing,
#          on-demand agent fetching, and training-data crawling
#
# KEY CONCEPT: Agent policy is a first-class .kno entity. The `robots.txt`
# served at the site root is a thin transformation of this entity (see
# specs/robots-txt-format.kno and the produces edge below).
#
# NOTE: Per-entity training opt-in is NOT modeled here. When/if individual
# entities need to override the global training stance, an `agent_visibility`
# field will be added to the relevant content schemas. The agent-policy
# entity remains the global default.

# =============================================================================
# SCHEMA DECLARATION
# =============================================================================
$schema: kno@0.0.9

# =============================================================================
# BASIC TIER
# =============================================================================
id: 01KPY2MSKXSGA76X3TPEPPH8M1
slug: agent-policy-schema
type: spec
version: 0.1.0

# =============================================================================
# STANDARD TIER
# =============================================================================
title: "Agent Policy Schema"

purpose: |
  Schema for site-level agent and crawler policy. A single canonical
  `agent-policy` entity declares which AI agents, search crawlers, and
  training-data crawlers may access the site.

  **Three crawler roles:**

  - `on-demand` — A user asked an agent to fetch a specific URL right now
    (e.g., ChatGPT-User, Claude-User, Perplexity-User). These are the agents
    that make "tell my LLM to look at this page" work.
  - `search` — Crawlers building a search index (Googlebot, Bingbot,
    OAI-SearchBot, Claude-SearchBot, PerplexityBot).
  - `training` — Crawlers gathering training data for foundation models
    (GPTBot, ClaudeBot, CCBot, Google-Extended).

  **Three policy verdicts per agent:**

  - `allow` — Permitted to fetch all paths (subject to per-path overrides)
  - `block` — Disallowed via robots.txt
  - `unspecified` — Falls through to `default_policy`

  **Production:** This schema declares `produces: [robots-txt-format]`. The
  `/robots.txt` served at the site root is a thin transformation of the
  policy entity (see `specs/robots-txt-format.kno` and the route at
  `services/pspace-site/src/pages/robots.txt.ts`).
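# =============================================================================
# EXAMPLE (NON-NORMATIVE)
# =============================================================================
# A minimal sketch of an entity conforming to this schema, and the robots.txt
# it would render to under the thin transformation described above. UA tokens,
# URLs, and verdicts are illustrative, not the site's actual policy.
#
#   default_policy: block
#   search_indexing: true
#   agents:
#     - name: Googlebot
#       role: search
#       policy: allow
#     - name: ChatGPT-User
#       role: on-demand
#       policy: allow
#     - name: GPTBot
#       role: training
#       policy: block
#   sitemap_url: "https://example.com/sitemap.xml"
#
# Rendered robots.txt, assuming one group per entry (order of `agents`
# preserved), a `*` group derived from `default_policy`, and a Sitemap
# directive from `sitemap_url`, per the field descriptions below:
#
#   User-agent: Googlebot
#   Allow: /
#
#   User-agent: ChatGPT-User
#   Allow: /
#
#   User-agent: GPTBot
#   Disallow: /
#
#   User-agent: *
#   Disallow: /
#
#   Sitemap: https://example.com/sitemap.xml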
# =============================================================================
# RICH TIER
# =============================================================================
provenance:
  origin:
    id: 01KPY2MSKXSGA76X3TPEPPH8M1
    timestamp: "2026-04-23T00:00:00Z"
    tool: manual-authoring
    issue: "https://github.com/PossibilityTruthy/possibility-space/issues/1807"

taxonomy:
  topics:
    - site-policy
    - agent-discovery
    - crawlers
    - robots-txt
    - ai-agents
  keywords:
    - agent-policy
    - robots
    - crawler
    - user-agent
    - on-demand
    - training
    - search-index

relationships:
  depends_on:
    - xri: "kno://specs/kno-spec"
      reason: "Conforms to KNO format specification"
  produces:
    # Schema-level declaration: any entity conforming to agent-policy-schema
    # produces a robots.txt artifact via thin transformation. The producer
    # edge lives on the schema (per kno-foundational-principles § Forward
    # vs Reverse Edge Storage v1.20.0).
    - xri: "kno://specs/robots-txt-format"
      reason: "Agent policy is materialized as a robots.txt at the site root via thin transformation"
  related_to:
    - xri: "kno://specs/llms-txt-format"
      reason: "Companion well-known artifact for AI agents (corpus index)"
    - xri: "kno://specs/page-schema"
      reason: "Pages surface 'For agents' affordances driven by this policy"
    - xri: "kno://content/agent-discovery-page"
      reason: "Documentation page that explains the discovery surfaces this policy enables"

quality:
  completeness: 0.85
  last_reviewed: "2026-04-23"
  review_status: draft
  reviewed_by: "claude"

# =============================================================================
# HISTORY
# =============================================================================
_history:
  retention: full
  format: changelog
  changelog:
    - version: "0.1.0"
      date: "2026-04-23"
      author: "claude"
      summary: "Initial agent-policy-schema (#1807 Phase 1)"
      changes:
        - "Defined three crawler roles (on-demand, search, training) and three policy verdicts (allow, block, unspecified)"
        - "Declared produces: [robots-txt-format] for thin transformation"
        - "Documented per-entity training opt-in as a deferred extension on content schemas (not on agent-policy itself)"

# =============================================================================
# SCHEMA DEFINITION
# =============================================================================
schema:
  name: agent-policy
  version: 0.1.0
  description: |
    Site-level policy declaring which AI agents and crawlers may access the
    site. One entity per site (or per tenant in a future per-tenant
    extension); drives robots.txt and other agent-facing surfaces.

  # ---------------------------------------------------------------------------
  # Required Fields
  # ---------------------------------------------------------------------------
  required_fields:
    - name: default_policy
      description: "Verdict applied to user-agents NOT explicitly listed in the `agents` array. Use `block` for a deny-by-default stance."
      type: string
      enum:
        - allow
        - block

    - name: search_indexing
      description: "Whether search-engine crawlers (Googlebot, Bingbot, etc.) may index the site. When false, these crawlers MUST appear in `agents` with policy=block (or be covered by default_policy=block)."
      type: boolean

    - name: agents
      description: "Explicit per-user-agent policy entries. Order is preserved in the rendered robots.txt."
      type: array
      items:
        type: object
        required_children:
          - name: name
            description: "User-Agent token as it appears in the UA string (case-sensitive match for robots.txt). Use `*` for the default entry only; prefer `default_policy` instead."
            type: string
          - name: role
            description: "Crawler classification."
            type: string
            enum:
              - on-demand
              - search
              - training
              - unknown
          - name: policy
            description: "Verdict for this agent."
            type: string
            enum:
              - allow
              - block
              - unspecified
        optional_children:
          - name: vendor
            description: "Organization operating the crawler (e.g., OpenAI, Anthropic, Google, Common Crawl)."
            type: string
          - name: documentation_url
            description: "Vendor's published documentation for this UA token."
            type: string
          - name: notes
            description: "Free-form rationale for the policy decision."
            type: string
          - name: disallow_paths
            description: "Optional per-agent path overrides. Defaults to `/` (everything) when policy=block, none when policy=allow."
            type: array
            items:
              type: string
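  # A sketch of one `agents` entry exercising the optional children above.
  # The UA token, vendor, URL, and path are placeholders; per the
  # `disallow_paths` description, policy=allow plus explicit paths means
  # "fetch everything except these paths".
  #
  #   - name: ExampleBot
  #     role: search
  #     policy: allow
  #     vendor: "Example Corp"
  #     documentation_url: "https://example.com/bot-docs"
  #     notes: "Allowed site-wide except draft content."
  #     disallow_paths:
  #       - /drafts/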
  # ---------------------------------------------------------------------------
  # Optional Fields
  # ---------------------------------------------------------------------------
  optional_fields:
    - name: sitemap_url
      description: "URL of the sitemap.xml, emitted as a Sitemap: directive in robots.txt. Omit if no sitemap is published."
      type: string

    - name: llms_txt_url
      description: "URL of the llms.txt corpus, emitted as a comment in robots.txt for discoverability. Convention: site root /llms.txt."
      type: string

    - name: contact
      description: "Contact information for crawler operators with policy questions or abuse reports."
      type: object
      children:
        - name: email
          type: string
        - name: url
          type: string

# =============================================================================
# CONTAINER NAVIGATION
# =============================================================================
contains:
  - xri: "#schema/required_fields"
    role: section
    title: "Required Fields"
    keywords: [ default_policy, search_indexing, agents ]
  - xri: "#schema/optional_fields"
    role: section
    title: "Optional Fields"
    keywords: [ sitemap_url, llms_txt_url, contact ]

_index:
  - path: "identity"
    line: 28
    keywords: [ id, agent-policy-schema, layer-3 ]
  - path: "schema/required_fields"
    line: 145
    keywords: [ default_policy, agents, role, policy ]
  - path: "schema/optional_fields"
    line: 210
    keywords: [ sitemap, llms_txt, contact ]
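# =============================================================================
# APPENDIX: OPTIONAL FIELDS EXAMPLE (NON-NORMATIVE)
# =============================================================================
# A sketch of the optional fields on a conforming entity; URLs and addresses
# are placeholders. Per the field descriptions above, `sitemap_url` becomes a
# `Sitemap:` directive and `llms_txt_url` a discoverability comment in the
# rendered robots.txt; nothing documented here emits `contact` into robots.txt.
#
#   sitemap_url: "https://example.com/sitemap.xml"
#   llms_txt_url: "https://example.com/llms.txt"
#   contact:
#     email: "crawler-policy@example.com"
#     url: "https://example.com/contact"
#
# Emitted robots.txt lines (assumed comment syntax for the llms.txt pointer):
#
#   Sitemap: https://example.com/sitemap.xml
#   # llms.txt: https://example.com/llms.txt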