# Corpus Domain Schema — Layer 3 Specialized Entity Type
# KNO Schema Version: 0.0.9
#
# A corpus is a collection of source files imported for analysis.
# It is a **source definition** (which repos to pull from), not a frozen snapshot.
# When refreshed, the corpus re-imports from its configured sources.
#
# EXTENDS: document-schema.kno (which composes identity, history, quality)
# ENABLES: Analysis jobs, playbook inference, methodology extraction
#
# DESIGN PRINCIPLE: Timeless Corpora
# - Corpus defines WHAT sources to analyze (repo URLs)
# - Refresh updates the corpus in place (no version explosion)
# - Playbooks link to corpora; refresh updates both

# =============================================================================
# SCHEMA DECLARATION (RFC-007)
# =============================================================================

$schema: kno@0.0.9

# =============================================================================
# IDENTITY (composed from identity-schema.kno)
# =============================================================================

# Identity of this spec document itself (not of a corpus instance).
# Quoted so that no parser can retype the ULID or the version string.
id: "01KGK3V73NNZHVSVW3J3B2A5P7"
slug: corpus-schema
type: spec
version: "0.2.0"

# =============================================================================
# STANDARD TIER
# =============================================================================

title: "Corpus Domain Schema"

purpose: |
  Define the schema for Corpus entities — collections of source documents
  imported for analysis.

  **What is a Corpus?**

  A corpus is a source definition that specifies:
  - Which repositories to import from
  - What file types to include (docs, wiki, instructions, prompts)
  - Provenance tracking (git commits at import time)

  **Timeless Model:**

  | Concept | Traditional | Timeless (Possibility) |
  |---------|-------------|----------------|
  | Corpus | Frozen snapshot (v1, v2, v3...) | Source definition |
  | Refresh | Creates new version | Updates in place |
  | Playbook link | Links to specific version | Links to corpus definition |

  **Layer 3 Position**: Corpus extends document (Layer 2), which composes
  identity, history, and quality (Layer 1).

# =============================================================================
# RICH TIER — Relationships (Edge Maximization)
# =============================================================================

provenance:
  origin:
    id: "01KGK3V73NNZHVSVW3J3B2A5P7"
    timestamp: "2026-02-04T01:47:56Z"
    tool: manual-migration

taxonomy:
  topics:
    - analysis
    - document-collection
    - provenance
    - import
  keywords:
    - corpus
    - documents
    - import
    - repository
    - wiki
    - instructions
    - prompts
    - methodology

# Each edge is an (xri, reason) pair; the key names the edge type.
relationships:
  extends:
    - xri: "kno://specs/document-schema"
      reason: "Layer 2 base type for structured entities"

  depends_on:
    - xri: "kno://specs/kno-spec"
      reason: "RFC-001 defines kno@0.0.9 schema"

  composes:
    # Inherited through document-schema.kno:
    - xri: "kno://specs/identity-schema"
      reason: "Layer 1: id, canonical_id, local_ids, equiv_ids"
    - xri: "kno://specs/history-schema"
      reason: "Layer 1: _history, changelog"
    - xri: "kno://specs/quality-schema"
      reason: "Layer 1: quality, validation, confidence"

  enables:
    - xri: "kno://specs/playbook-schema"
      reason: "Playbooks are generated from corpus analysis"
    - xri: "kno://concepts/analysis-job"
      reason: "Analysis jobs operate on corpora"

  related_to:
    - xri: "kno://specs/user-schema"
      reason: "Users own corpora"
    - xri: "kno://specs/organization-schema"
      reason: "Organizations can own corpora"

  implements:
    - xri: "kno://principles/P9"
      reason: "Temporal/Historical — corpora in Hive are versioned"
    - xri: "kno://principles/VCS-Mandatory-Rule"
      reason: "Corpora stored in Hive MUST be recorded in a VCS backend"

quality:
  completeness: 0.85
  last_reviewed: "2026-01-25"
  review_status: draft

# =============================================================================
# HISTORY (P9 Temporal — composed from history-schema.kno)
# =============================================================================

_history:
  version: 2
  created: "2025-12-14T00:00:00Z"
  created_by: "pspace-core-team"
  modified: "2026-02-21T00:00:00Z"
  modified_by: "claude"

# =============================================================================
# SPECIFICATION CONTENT
# =============================================================================

spec:
  status: Draft

  # Newest entry first.
  changelog:
    - version: "0.2.0"
      date: "2026-02-21"
      changes:
        - "Added VCS-Mandatory Rule cross-reference (implements relationship)"
        - "Corpora stored in Hive must be VCS-recorded"
    - version: "0.1.0"
      date: "2026-01-25"
      changes:
        - "Promoted from holding-pen to specs/"
        - "Updated to kno@0.0.9 schema format"
        - "Aligned with bedrock principles (no facets, edge inference)"
        - "Added Layer 3 positioning in relationships"
        - "Timeless corpus model as design principle"

  description: |
    ## Timeless Model

    ```
    ┌─────────────────────────────────────────────────────────────────────┐
    │  Corpus = Source Definition (not a snapshot)                        │
    ├─────────────────────────────────────────────────────────────────────┤
    │  - Defines WHICH repos to pull from                                 │
    │  - Captures git commit at LAST import (provenance)                  │
    │  - Refresh updates the corpus in place                              │
    │  - Linked playbooks can also refresh                                │
    └─────────────────────────────────────────────────────────────────────┘
    ```

    ## File Types

    Corpora categorize imported files by type:

    | Type | Description | Typical Location |
    |------|-------------|------------------|
    | `docs` | Documentation files | `docs/`, `README.md` |
    | `wiki` | Wiki pages | `*.wiki` repo |
    | `instructions` | Agent instruction files | `.github/instructions/` |
    | `prompts` | Prompt templates | `.github/prompts/`, `.claude/` |

    ## Storage Layout

    ```
    pspace-storage:///hive/corpora/
    └── {corpus_id}/
        ├── manifest.kno          # Corpus entity
        ├── catalog.json          # File manifest
        └── files/
            ├── docs/             # Documentation files
            ├── wiki/             # Wiki pages
            ├── instructions/     # Instruction files
            └── prompts/          # Prompt templates
    ```

    ## Lifecycle

    ```
    ┌──────────────┐     ┌──────────────┐     ┌──────────────┐
    │  importing   │ ──▶ │    ready     │ ──▶ │   archived   │
    └──────────────┘     └──────────────┘     └──────────────┘
                                │
                                ▼ (refresh)
                         ┌──────────────┐
                         │  importing   │
                         └──────────────┘
    ```

  # ===========================================================================
  # SCHEMA
  # ===========================================================================

  schema:
    type: object
    # Only these five fields are mandatory; every other property is optional.
    required:
      - id
      - type
      - version
      - name
      - sources

    properties:
      # -----------------------------------------------------------------------
      # BASIC TIER (from kno-spec)
      # -----------------------------------------------------------------------

      id:
        type: string
        pattern: "^corpus_[a-zA-Z0-9]+$"
        description: |
          Unique identifier for this corpus.
          Convention: corpus_{ulid}
        examples:
          - "corpus_01HXYZ123"
          - "corpus_01JKM456"

      type:
        const: corpus
        description: "Always 'corpus' for this schema"

      version:
        type: string
        # Three dot-separated numeric components, e.g. 0.1.0.
        pattern: "^\\d+\\.\\d+\\.\\d+$"
        description: "Version of this corpus entity"
        default: "0.1.0"

      # -----------------------------------------------------------------------
      # STANDARD TIER
      # -----------------------------------------------------------------------

      name:
        type: string
        description: "Human-readable corpus name"
        examples:
          - "Howl Repository"
          - "Awecelot Combined"

      description:
        type: string
        description: "What this corpus contains and its purpose"

      # -----------------------------------------------------------------------
      # SOURCES (Core of timeless model)
      # -----------------------------------------------------------------------

      sources:
        type: array
        description: |
          Source repositories to import. This is the DEFINITION of what
          to pull; git commits are captured at import time.
        items:
          type: object
          # `repo` is the only mandatory field of a source entry.
          required:
            - repo
          properties:
            repo:
              type: string
              description: "GitHub repository (owner/name)"
              examples:
                - "howl-app/howl"
                - "PossibilityInc/possibility-space"
            branch:
              type: string
              default: "main"
              description: "Branch to import from"
            commit:
              type: string
              description: "Git commit SHA (captured at import time)"
            include_wiki:
              type: boolean
              default: true
              description: "Whether to import the wiki repo"
            wiki_commit:
              type: string
              description: "Wiki commit SHA (captured at import time)"

      # -----------------------------------------------------------------------
      # FILE STATISTICS
      # -----------------------------------------------------------------------

      stats:
        type: object
        description: "File counts and sizes"
        properties:
          total_files:
            type: integer
            description: "Total number of files"
          # Per-category counts; keys mirror the four file types in the
          # "File Types" table of the description above.
          by_type:
            type: object
            properties:
              docs:
                type: integer
              wiki:
                type: integer
              instructions:
                type: integer
              prompts:
                type: integer
          total_size_bytes:
            type: integer
            description: "Total size of all files in bytes"
          total_tokens_estimate:
            type: integer
            description: "Estimated token count (chars/4 approximation)"

      # -----------------------------------------------------------------------
      # IMPORT METADATA
      # -----------------------------------------------------------------------

      imported_at:
        type: string
        format: date-time
        description: "When the corpus was last imported"

      imported_by:
        type: string
        description: "User XRI or system that performed the import"

      owner_id:
        type: string
        description: "User ID who owns this corpus"

      # -----------------------------------------------------------------------
      # STORAGE
      # -----------------------------------------------------------------------

      storage_uri:
        type: string
        description: "MinIO URI for corpus files"
        examples:
          - "pspace-storage:///hive/corpora/corpus_01HXYZ123/"

      catalog_uri:
        type: string
        description: "URI to catalog.json manifest"

      # -----------------------------------------------------------------------
      # STATUS
      # -----------------------------------------------------------------------

      # NOTE(review): the lifecycle states are listed under `examples`, not
      # `enum`. If this schema follows JSON Schema semantics, any string would
      # validate — confirm whether that is intended.
      status:
        type: string
        description: |
          Corpus lifecycle status:
          - importing: Import in progress
          - ready: Available for analysis
          - archived: No longer in active use
        examples:
          - "importing"
          - "ready"
          - "archived"

      # NOTE(review): same as `status` — values are examples, not an enum.
      visibility:
        type: string
        description: "Access control"
        examples:
          - "private"
          - "organization"
          - "public"

      # -----------------------------------------------------------------------
      # METADATA
      # -----------------------------------------------------------------------

      tags:
        type: array
        items:
          type: string
        description: "Optional tags for filtering"

# =============================================================================
# EXAMPLES
# =============================================================================

# NOTE(review): both examples reference corpus-schema@0.1 while this spec
# declares version 0.2.0 — confirm whether the reference should be bumped.
examples:
  - name: "Minimal Corpus"
    description: "Simplest valid corpus"
    value:
      $schema: "kno://specs/corpus-schema@0.1"
      id: "corpus_01HXYZ123"
      type: "corpus"
      version: "0.1.0"
      name: "My Repository"
      sources:
        - repo: "myorg/myrepo"
          branch: "main"
      status: "ready"

  - name: "Full Corpus"
    description: "Corpus with all fields"
    value:
      $schema: "kno://specs/corpus-schema@0.1"
      id: "corpus_01HOWL456"
      type: "corpus"
      version: "0.1.0"
      name: "Howl Repository"
      description: "Complete methodology and documentation from Howl app"
      sources:
        - repo: "howl-app/howl"
          branch: "main"
          commit: "dbebd0f04dbef8417343dbf35e2e7127a5a105b6"  # pragma: allowlist secret
          include_wiki: true
          wiki_commit: "80e7c4aa42cdad7ef03a4c04ce0619dd7e62161c"  # pragma: allowlist secret
      stats:
        total_files: 477
        by_type:
          docs: 314
          wiki: 98
          instructions: 53
          prompts: 12
        total_size_bytes: 2500000
        total_tokens_estimate: 625000
      imported_at: "2026-01-25T10:00:00Z"
      imported_by: "pspace://user:usr_01ADMIN"
      owner_id: "usr_01ADMIN"
      storage_uri: "pspace-storage:///hive/corpora/corpus_01HOWL456/"
      catalog_uri: "pspace-storage:///hive/corpora/corpus_01HOWL456/catalog.json"
      status: "ready"
      visibility: "private"
      tags:
        - "methodology"
        - "pilot"