# llms-txt-format — Aggregate-Corpus Format Specification
# Spec Version: 0.1.0 (KNO schema: kno@0.0.9)
#
# Describes the `llms.txt` aggregate-corpus format, which concatenates N
# source entities (e.g., a guide collection) into a single markdown corpus
# artifact for LLM ingestion.
#
# Added to the bedrock File Type Catalog in v0.5.0 (ADAPT-03).
#
# INDUSTRY CONTEXT:
#   Popularized by Jeremy Howard / Answer.AI (2024); adopted by Stripe, WorkOS,
#   Vercel, Microsoft Learn, Cloudflare, and others as a canonical
#   "AI-readable site map" format.
#
# POSSIBILITY POSITIONING:
#   Produced by thin transformation from collection-level .kno schemas that
#   declare `produces: [llms-txt-format]`. No conversion spec; the production
#   rule IS the schema-graph edge (see kno-system_architecture § Thin
#   Transformations).

# =============================================================================
# SCHEMA DECLARATION
# =============================================================================
$schema: kno@0.0.9

# =============================================================================
# BASIC TIER
# =============================================================================
id: 01KPCQAP9KXRS0ZKSW4MGHV0DJ
slug: llms-txt-format
type: spec
version: 0.1.0

# =============================================================================
# STANDARD TIER
# =============================================================================
title: "llms.txt — Aggregate Corpus Format"
purpose: |
  Define `llms-txt-format` as an aggregate-corpus output format for .kno systems.

  **What is llms.txt?** A plain-text / markdown file served at a well-known
  path (conventionally `/llms.txt` or `/docs/llms.txt`) that concatenates N
  source entities into a single LLM-ingestible artifact. It is a "site map
  for AI agents" — optimized for retrieval and context-window efficiency,
  not for human navigation.

  **Why an aggregate format?** Most .kno formats describe a single entity
  (one markdown document, one YAML file). `llms-txt-format` describes a
  *collection* projection: N entities → 1 corpus. Marking it as a distinct
  format makes aggregate production a first-class pattern rather than
  ad-hoc generation code.

  **Ingest-only:** Unlike markdown or YAML, `llms.txt` is NOT round-trippable.
  It is a one-way projection from a structured collection (source-of-truth
  entities) into a flattened corpus. Parsing `llms.txt` back into individual
  entities is out of scope.

  **Possibility's implementation:** The guide collection schema
  (`specs/guide-schema.kno`) declares `produces: [llms-txt-format]`. A thin
  transformation reads the forward `documents` edges across guide entities
  and concatenates markdown blocks into `/docs/llms.txt`. Adding a guide
  regenerates the corpus deterministically; no side-table required.

# =============================================================================
# RICH TIER — Relationships
# =============================================================================
provenance:
  origin:
    id: 01KPCQAP9KXRS0ZKSW4MGHV0DJ
    timestamp: "2026-04-16T00:00:00Z"
    tool: manual-authoring
taxonomy:
  topics:
    - file-formats
    - aggregate-corpora
    - llm-ingest
    - documentation
    - ai-agents
  keywords:
    - llms.txt
    - aggregate
    - corpus
    - concatenation
    - llm
    - markdown

relationships:
  depends_on:
    - xri: "kno://specs/kno-spec"
      reason: "Conforms to KNO format specification"
    - xri: "kno://specs/markdown-format"
      reason: "Corpus body is markdown"

  related_to:
    - xri: "kno://specs/document-schema"
      reason: "Source entities are typically documents or guides"
    - xri: "kno://specs/guide-schema"
      reason: "Primary producer of llms.txt corpora in Possibility (Phase 2)"

  enables:
    - xri: "kno://capabilities/llm-ingest"
      reason: "Canonical site map for AI agents consuming Possibility docs"
  # This spec is REFERENCED by producer schemas via `produces: [llms-txt-format]`.
  # See foundational-principles § Forward vs Reverse Edge Storage (v1.20.0):
  # the producer edge is stored on the producer; this spec does not carry a
  # reverse `produced_by` edge.

quality:
  completeness: 0.85
  last_reviewed: "2026-04-16"
  review_status: draft
  reviewed_by: "claude"

# =============================================================================
# HISTORY
# =============================================================================
_history:
  retention: full
  format: changelog

  changelog:
    - version: "0.1.0"
      date: "2026-04-16"
      author: "claude"
      summary: "Initial llms-txt-format spec (M38 Phase 2, CONFORMANCE-08 / ADAPT-03)"
      changes:
        - "Created to back the bedrock File Type Catalog v0.5.0 llms-txt-format
          entry"
        - "Defined aggregate-corpus semantics, ingest-only nature, and
          production via thin transformation"
        - "Documented markdown body, optional domain grouping, provenance
          headers"

# =============================================================================
# SPECIFICATION CONTENT
# =============================================================================
spec:
  status: Draft

  # ---------------------------------------------------------------------------
  # Industry References
  # ---------------------------------------------------------------------------
  standards:
    - name: "llms.txt proposal"
      url: "https://llmstxt.org/"
      author: "Jeremy Howard (Answer.AI)"
      year: 2024
      description: |
        Original public proposal for a well-known /llms.txt path serving a
        markdown site map optimized for LLM ingestion. Defines the convention
        followed by Stripe, WorkOS, Vercel, MS Learn, Cloudflare, and others.

  # ---------------------------------------------------------------------------
  # Format Definition
  # ---------------------------------------------------------------------------
  format:
    name: "llms.txt"
    mime_type: "text/markdown; charset=utf-8"
    extensions:
      - ".txt" # Conventional extension for /llms.txt path
      - ".md" # Acceptable; body is markdown
    encoding: "utf-8"
    round_trippable: false
    category: aggregate-corpus # See kno-file_type_catalog v0.5.0

  # ---------------------------------------------------------------------------
  # Corpus Structure
  # ---------------------------------------------------------------------------
  corpus_structure:
    description: |
      An llms.txt corpus is a single UTF-8 markdown document composed of:
        1. A corpus preamble (title, purpose, generation metadata)
        2. Zero or more group sections (by domain, category, or other axis)
        3. One markdown block per source entity (under its group section, if any)

      Reverse edges are NOT embedded in the corpus. The corpus is a flattened
      projection; graph structure lives in the source entities, not here.

    preamble:
      required:
        - title # e.g., "Possibility Developer Guides"
        - purpose # One paragraph
      optional:
        - generated_at # ISO 8601 timestamp; NOTE(review): varies per run — reconcile with production.determinism
        - source_xri # Identity XRI of the producer collection
        - total_entities # Count of concatenated entities

    group_section:
      marker: "## Group: {name}"
      optional: true
      description: |
        OPTIONAL intermediate heading grouping entities by domain, category,
        or other axis. Producers MAY omit if the corpus is flat.

    entity_block:
      required:
        - heading # "## {entity.title}"; NOTE(review): same H2 level as the "## Group: {name}" marker — confirm
        - provenance_line # "Source: kno://...  •  Identity: {ULID}"
        - body # Entity content rendered as markdown
      optional:
        - documents_line # "Documents: kno://specs/..." (forward edges from source)
        - related_line # "Related: kno://..."

  # ---------------------------------------------------------------------------
  # Production Rules
  # ---------------------------------------------------------------------------
  production:
    rule: |
      A domain or collection-level schema declares:
        produces:
          - xri: kno://specs/llms-txt-format
            reason: "Aggregate corpus materialized via thin transformation"

      A thin transformation reads the graph:
        1. Resolve all entities conforming to the producer's schema
        2. Group by a configured axis (default: `domain`)
        3. Emit preamble, group sections, and entity blocks
        4. Serve at a well-known path (convention: `/llms.txt` or `/docs/llms.txt`)

    determinism: |
      Output MUST be deterministic given the same input set and ordering.
      Ordering is stable by (group, slug) unless the producer specifies
      otherwise. This enables content-addressed caching.

    freshness: |
      The corpus is stale once any source entity mutates and must be regenerated;
      the regeneration strategy is up to the producer (on-demand, scheduled, or on-write).

  # ---------------------------------------------------------------------------
  # Serving
  # ---------------------------------------------------------------------------
  serving:
    well_known_path: "/llms.txt"
    possibility_path: "/docs/llms.txt"
    content_type: "text/markdown; charset=utf-8"
    cache_control: |
      Producers SHOULD set a reasonable `Cache-Control` and a content-hash
      ETag so LLM crawlers can check freshness cheaply.

  # ---------------------------------------------------------------------------
  # Example Corpus (truncated)
  # ---------------------------------------------------------------------------
  examples:
    - title: "Guide corpus (excerpt)"
      description: "Illustrative `/docs/llms.txt` produced from guide-schema"
      code: |
        # Possibility Developer Guides

        Aggregate corpus of all `.kno` guides in the Possibility developer docs.
        Generated from `content/guides/*.kno` via the guide-schema
        `produces: [llms-txt-format]` edge.

        - Generated: 2026-04-16T12:00:00Z
        - Source: kno://specs/guide-schema
        - Total entities: 7

        ## Group: auth

        ## Login with Possibility — Overview

        Source: kno://guides/login-with-possibility-overview  •  Identity: 01KM...
        Documents: kno://capabilities/login-with-possibility

        Login with Possibility is an OIDC-based authentication ...

        ## Group: capabilities

        ## Capability Authoring Guide

        Source: kno://guides/capability-authoring-guide  •  Identity: 01KM...
        Documents: kno://capabilities/capability-authoring

        Capabilities are .kno entities that ...

  # ---------------------------------------------------------------------------
  # Notes
  # ---------------------------------------------------------------------------
  notes: |
    ## Why a distinct format (not just markdown)?

    A markdown file and an llms.txt file share a MIME type but differ in
    **purpose and lifecycle**:

    - Markdown describes one entity; llms.txt describes a collection
    - Markdown is authored; llms.txt is produced
    - Markdown is round-trippable; llms.txt is ingest-only
    - Markdown has no required structure beyond CommonMark; llms.txt has
      a required preamble + entity-block pattern

    Treating llms.txt as a distinct format lets us reason about the
    producer relationship (which schemas emit it) and the freshness
    contract independently of markdown authoring.

    ## Relationship to /robots.txt and /sitemap.xml

    Parallel to `/robots.txt` (crawler directives) and `/sitemap.xml`
    (URL index), `/llms.txt` serves LLM consumers specifically. All three
    are well-known-path conventions; none are mandated but all are widely
    observed.

contains:
  - xri: "#identity"
    role: section
    title: "Schema Metadata"
    keywords: [ id, type, version ]
  - xri: "#spec/corpus_structure"
    role: section
    title: "Corpus Structure"
    keywords: [ preamble, group, entity-block ]
  - xri: "#spec/production"
    role: section
    title: "Production Rules"
    keywords: [ produces, thin-transformation, determinism ]
  - xri: "#spec/serving"
    role: section
    title: "Serving"
    keywords: [ well-known, llms.txt, content-type ]

# =============================================================================
# CONTAINER TIER — Navigation Index
# =============================================================================
_index:
  # Navigation index: each entry maps a logical path to its location in this
  # file. `line` is the 1-based line of the key that `path` resolves to
  # (`identity` points at the `id` key that opens the BASIC TIER fields).
  # Re-sync these values whenever lines are added or removed above this block.
  - path: "identity"
    line: 29
    keywords: [ id, llms-txt-format, aggregate-corpus ]
  - path: "spec/corpus_structure"
    line: 169
    keywords: [ preamble, group, entity-block ]
  - path: "spec/production"
    line: 207
    keywords: [ produces, thin-transformation ]
  - path: "spec/serving"
    line: 232
    keywords: [ well-known-path, content-type, cache-control ]
  # `notes` is nested under `spec`, so it carries the same `spec/` prefix as
  # the other spec sections.
  - path: "spec/notes"
    line: 278
    keywords: [ markdown, sitemap, robots ]