# Corpus Domain Schema — Layer 3 Specialized Entity Type
# KNO Schema Version: 0.0.9
#
# A corpus is a collection of source files imported for analysis.
# It is a **source definition** (which repos to pull from), not a frozen snapshot.
# When refreshed, the corpus re-imports from its configured sources.
#
# EXTENDS: document-schema.kno (which composes identity, history, quality)
# ENABLES: Analysis jobs, playbook inference, methodology extraction
#
# DESIGN PRINCIPLE: Timeless Corpora
# - Corpus defines WHAT sources to analyze (repo URLs)
# - Refresh updates the corpus in place (no version explosion)
# - Playbooks link to corpora; refresh updates both

# =============================================================================
# SCHEMA DECLARATION (RFC-007)
# =============================================================================
# KNO schema version this document declares conformance to.
$schema: "kno@0.0.9"

# =============================================================================
# IDENTITY (composed from identity-schema.kno)
# =============================================================================
# Identity scalars are double-quoted, like the other string values in this
# file, so they always load as strings.
id: "01KGK3V73NNZHVSVW3J3B2A5P7"
slug: "corpus-schema"
type: "spec"
version: "0.2.0"

# =============================================================================
# STANDARD TIER
# =============================================================================
# Display title of this spec.
title: "Corpus Domain Schema"
# Markdown summary of the spec. Written as a literal block scalar (`|`) so the
# table rows and line breaks below are preserved verbatim.
purpose: |
  Define the schema for Corpus entities — collections of source documents
  imported for analysis.

  **What is a Corpus?** A corpus is a source definition that specifies:
  - Which repositories to import from
  - What file types to include (docs, wiki, instructions, prompts)
  - Provenance tracking (git commits at import time)

  **Timeless Model:**
  | Concept | Traditional | Timeless (Possibility) |
  |---------|-------------|----------------|
  | Corpus | Frozen snapshot (v1, v2, v3...) | Source definition |
  | Refresh | Creates new version | Updates in place |
  | Playbook link | Links to specific version | Links to corpus definition |

  **Layer 3 Position**: Corpus extends document (Layer 2), which composes
  identity, history, and quality (Layer 1).

# =============================================================================
# RICH TIER — Relationships (Edge Maximization)
# =============================================================================
# Origin record for this spec document. `origin.id` carries the same value as
# the top-level `id`.
provenance:
  origin:
    id: 01KGK3V73NNZHVSVW3J3B2A5P7
    timestamp: "2026-02-04T01:47:56Z"
    tool: manual-migration
# Classification terms for this spec. Note that `import` is listed under both
# `topics` and `keywords`.
taxonomy:
  topics:
    - analysis
    - document-collection
    - provenance
    - import
  keywords:
    - corpus
    - documents
    - import
    - repository
    - wiki
    - instructions
    - prompts
    - methodology

# Outgoing edges from this spec, grouped by edge type. Every entry pairs a
# target `xri` with a human-readable `reason`.
relationships:
  # Base type this schema builds on.
  extends:
    - xri: "kno://specs/document-schema"
      reason: "Layer 2 base type for structured entities"

  # Specs this document relies on.
  depends_on:
    - xri: "kno://specs/kno-spec"
      reason: "RFC-001 defines kno@0.0.9 schema"

  composes:
    # Inherited through document-schema.kno:
    - xri: "kno://specs/identity-schema"
      reason: "Layer 1: id, canonical_id, local_ids, equiv_ids"
    - xri: "kno://specs/history-schema"
      reason: "Layer 1: _history, changelog"
    - xri: "kno://specs/quality-schema"
      reason: "Layer 1: quality, validation, confidence"

  # Downstream specs/concepts that build on corpora.
  enables:
    - xri: "kno://specs/playbook-schema"
      reason: "Playbooks are generated from corpus analysis"
    - xri: "kno://concepts/analysis-job"
      reason: "Analysis jobs operate on corpora"

  # Ownership-related specs (users and organizations).
  related_to:
    - xri: "kno://specs/user-schema"
      reason: "Users own corpora"
    - xri: "kno://specs/organization-schema"
      reason: "Organizations can own corpora"

  # Principles/rules this spec claims to satisfy.
  implements:
    - xri: "kno://principles/P9"
      reason: "Temporal/Historical — corpora in Hive are versioned"
    - xri: "kno://principles/VCS-Mandatory-Rule"
      reason: "Corpora stored in Hive MUST be recorded in a VCS backend"

# Quality metadata for this spec document.
# NOTE(review): `last_reviewed` (2026-01-25) is earlier than
# `_history.modified` (2026-02-21) — confirm whether the 0.2.0 change needs a
# fresh review.
quality:
  completeness: 0.85
  last_reviewed: "2026-01-25"
  review_status: draft

# =============================================================================
# HISTORY (P9 Temporal — composed from history-schema.kno)
# =============================================================================
# Creation/modification record for this spec document.
# NOTE(review): `version: 2` here is an integer, distinct from the top-level
# semantic `version` (0.2.0) — presumably a revision counter; confirm.
_history:
  version: 2
  created: "2025-12-14T00:00:00Z"
  created_by: "pspace-core-team"
  modified: "2026-02-21T00:00:00Z"
  modified_by: "claude"

# =============================================================================
# SPECIFICATION CONTENT
# =============================================================================
# Body of the specification: status, changelog, prose description, and the
# corpus entity schema.
spec:
  status: Draft

  # Entries are listed newest first; the top entry matches the document's
  # top-level `version` (0.2.0).
  changelog:
    - version: "0.2.0"
      date: "2026-02-21"
      changes:
        - "Added VCS-Mandatory Rule cross-reference (implements relationship)"
        - "Corpora stored in Hive must be VCS-recorded"
    - version: "0.1.0"
      date: "2026-01-25"
      changes:
        - "Promoted from holding-pen to specs/"
        - "Updated to kno@0.0.9 schema format"
        - "Aligned with bedrock principles (no facets, edge inference)"
        - "Added Layer 3 positioning in relationships"
        - "Timeless corpus model as design principle"

  # Markdown narrative: timeless model, file-type categories, storage layout,
  # and lifecycle. Literal block scalar — everything indented below is string
  # content, so comments cannot be placed inside it.
  description: |
    ## Timeless Model

    ```
    ┌─────────────────────────────────────────────────────────────────────┐
    │  Corpus = Source Definition (not a snapshot)                        │
    ├─────────────────────────────────────────────────────────────────────┤
    │  - Defines WHICH repos to pull from                                 │
    │  - Captures git commit at LAST import (provenance)                  │
    │  - Refresh updates the corpus in place                              │
    │  - Linked playbooks can also refresh                                │
    └─────────────────────────────────────────────────────────────────────┘
    ```

    ## File Types

    Corpora categorize imported files by type:

    | Type | Description | Typical Location |
    |------|-------------|------------------|
    | `docs` | Documentation files | `docs/`, `README.md` |
    | `wiki` | Wiki pages | `*.wiki` repo |
    | `instructions` | Agent instruction files | `.github/instructions/` |
    | `prompts` | Prompt templates | `.github/prompts/`, `.claude/` |

    ## Storage Layout

    ```
    pspace-storage:///hive/corpora/
    └── {corpus_id}/
        ├── manifest.kno           # Corpus entity
        ├── catalog.json           # File manifest
        └── files/
            ├── docs/              # Documentation files
            ├── wiki/              # Wiki pages
            ├── instructions/      # Instruction files
            └── prompts/           # Prompt templates
    ```

    ## Lifecycle

    ```
    ┌──────────────┐     ┌──────────────┐     ┌──────────────┐
    │  importing   │ ──▶ │    ready     │ ──▶ │   archived   │
    └──────────────┘     └──────────────┘     └──────────────┘
                               │
                               ▼ (refresh)
                         ┌──────────────┐
                         │  importing   │
                         └──────────────┘
    ```

  # ===========================================================================
  # SCHEMA
  # ===========================================================================
  # Schema for a corpus entity, expressed with JSON-Schema-style keywords.
  schema:
    type: object
    # Properties every corpus entity must carry; all other properties below
    # are optional.
    required:
      - id
      - type
      - version
      - name
      - sources
    properties:
      # -----------------------------------------------------------------------
      # BASIC TIER (from kno-spec)
      # -----------------------------------------------------------------------
      id:
        type: string
        # Accepts `corpus_` followed by one or more ASCII letters or digits.
        # The pattern does not enforce ULID length or alphabet.
        pattern: "^corpus_[a-zA-Z0-9]+$"
        description: |
          Unique identifier for this corpus.
          Convention: corpus_{ulid}
        examples:
          - "corpus_01HXYZ123"
          - "corpus_01JKM456"

      # Fixed discriminator: only the literal value `corpus` validates.
      type:
        const: corpus
        description: "Always 'corpus' for this schema"

      version:
        type: string
        # Three dot-separated decimal groups (e.g. 0.1.0). The doubled
        # backslashes are YAML double-quote escapes yielding `\d` and `\.`.
        pattern: "^\\d+\\.\\d+\\.\\d+$"
        description: "Version of this corpus entity"
        default: "0.1.0"

      # -----------------------------------------------------------------------
      # STANDARD TIER
      # -----------------------------------------------------------------------
      # Required (listed in `required`).
      name:
        type: string
        description: "Human-readable corpus name"
        examples:
          - "Howl Repository"
          - "Awecelot Combined"

      # Optional free-text summary (not listed in `required`).
      description:
        type: string
        description: "What this corpus contains and its purpose"

      # -----------------------------------------------------------------------
      # SOURCES (Core of timeless model)
      # -----------------------------------------------------------------------
      # Required list of source repositories.
      # NOTE(review): no `minItems` is set, so an empty list validates —
      # confirm whether a corpus with zero sources should be allowed.
      sources:
        type: array
        description: |
          Source repositories to import. This is the DEFINITION of what
          to pull; git commits are captured at import time.
        items:
          type: object
          # Only `repo` is mandatory per source; the remaining fields are
          # optional or defaulted.
          required:
            - repo
          properties:
            repo:
              type: string
              description: "GitHub repository (owner/name)"
              examples:
                - "howl-app/howl"
                - "PossibilityInc/possibility-space"
            branch:
              type: string
              default: "main"
              description: "Branch to import from"
            # Recorded by the import, not part of the source definition
            # itself (see description). No SHA pattern is enforced.
            commit:
              type: string
              description: "Git commit SHA (captured at import time)"
            include_wiki:
              type: boolean
              default: true
              description: "Whether to import the wiki repo"
            # Wiki counterpart of `commit`; likewise unconstrained.
            wiki_commit:
              type: string
              description: "Wiki commit SHA (captured at import time)"

      # -----------------------------------------------------------------------
      # FILE STATISTICS
      # -----------------------------------------------------------------------
      # Aggregate file statistics for the corpus. Every value is a count or a
      # size, so each integer is bounded below by zero; without `minimum` a
      # negative count would validate.
      stats:
        type: object
        description: "File counts and sizes"
        properties:
          total_files:
            type: integer
            minimum: 0
            description: "Total number of files"
          # Keys mirror the four file-type categories (docs, wiki,
          # instructions, prompts) used elsewhere in this spec.
          by_type:
            type: object
            description: "File counts per file type"
            properties:
              docs:
                type: integer
                minimum: 0
              wiki:
                type: integer
                minimum: 0
              instructions:
                type: integer
                minimum: 0
              prompts:
                type: integer
                minimum: 0
          total_size_bytes:
            type: integer
            minimum: 0
            description: "Total size of all files in bytes"
          total_tokens_estimate:
            type: integer
            minimum: 0
            description: "Estimated token count (chars/4 approximation)"

      # -----------------------------------------------------------------------
      # IMPORT METADATA
      # -----------------------------------------------------------------------
      # Timestamp of the most recent import (string in `date-time` format).
      imported_at:
        type: string
        format: date-time
        description: "When the corpus was last imported"

      # Free-form string; no XRI pattern is enforced.
      imported_by:
        type: string
        description: "User XRI or system that performed the import"

      # NOTE(review): `relationships.related_to` states that organizations
      # can own corpora, but this field is described as a user ID only —
      # confirm how organization ownership is represented.
      owner_id:
        type: string
        description: "User ID who owns this corpus"

      # -----------------------------------------------------------------------
      # STORAGE
      # -----------------------------------------------------------------------
      # Location of the corpus directory. The example follows the
      # `pspace-storage:///hive/corpora/{corpus_id}/` layout shown in the
      # Storage Layout section of `spec.description`.
      storage_uri:
        type: string
        description: "MinIO URI for corpus files"
        examples:
          - "pspace-storage:///hive/corpora/corpus_01HXYZ123/"

      # Location of the `catalog.json` file manifest for this corpus.
      catalog_uri:
        type: string
        description: "URI to catalog.json manifest"

      # -----------------------------------------------------------------------
      # STATUS
      # -----------------------------------------------------------------------
      # Lifecycle state of the corpus. The description and the Lifecycle
      # diagram in `spec.description` define exactly three states, so the set
      # is enforced with `enum` rather than only illustrated by `examples`.
      status:
        type: string
        enum:
          - "importing"
          - "ready"
          - "archived"
        description: |
          Corpus lifecycle status:
          - importing: Import in progress
          - ready: Available for analysis
          - archived: No longer in active use
        examples:
          - "importing"
          - "ready"
          - "archived"

      # NOTE(review): the three values below are only `examples`; any string
      # validates. Confirm whether visibility is meant to be a closed set.
      visibility:
        type: string
        description: "Access control"
        examples:
          - "private"
          - "organization"
          - "public"

      # -----------------------------------------------------------------------
      # METADATA
      # -----------------------------------------------------------------------
      # Optional list of free-form string labels (not listed in `required`).
      tags:
        type: array
        items:
          type: string
        description: "Optional tags for filtering"

# =============================================================================
# EXAMPLES
# =============================================================================
# Worked corpus documents illustrating the schema above.
# NOTE(review): both examples pin `corpus-schema@0.1` while this spec's
# `version` is 0.2.0 — confirm the pin is intentional.
examples:
  # Carries every property listed in `required` (id, type, version, name,
  # sources) plus `status`.
  - name: "Minimal Corpus"
    description: "Simplest valid corpus"
    value:
      $schema: "kno://specs/corpus-schema@0.1"
      id: "corpus_01HXYZ123"
      type: "corpus"
      version: "0.1.0"
      name: "My Repository"
      sources:
        - repo: "myorg/myrepo"
          branch: "main"
      status: "ready"

  # Populates every property defined in the schema.
  - name: "Full Corpus"
    description: "Corpus with all fields"
    value:
      $schema: "kno://specs/corpus-schema@0.1"
      id: "corpus_01HOWL456"
      type: "corpus"
      version: "0.1.0"
      name: "Howl Repository"
      description: "Complete methodology and documentation from Howl app"
      sources:
        - repo: "howl-app/howl"
          branch: "main"
          # The pragma comments mark the commit SHAs as allowlisted for
          # secret scanning; they are separated from the value by two spaces.
          commit: "dbebd0f04dbef8417343dbf35e2e7127a5a105b6"  # pragma: allowlist secret
          include_wiki: true
          wiki_commit: "80e7c4aa42cdad7ef03a4c04ce0619dd7e62161c"  # pragma: allowlist secret
      stats:
        # The four `by_type` counts sum to `total_files` (477).
        total_files: 477
        by_type:
          docs: 314
          wiki: 98
          instructions: 53
          prompts: 12
        total_size_bytes: 2500000
        # Equals total_size_bytes / 4, matching the chars/4 approximation.
        total_tokens_estimate: 625000
      imported_at: "2026-01-25T10:00:00Z"
      imported_by: "pspace://user:usr_01ADMIN"
      owner_id: "usr_01ADMIN"
      storage_uri: "pspace-storage:///hive/corpora/corpus_01HOWL456/"
      catalog_uri: "pspace-storage:///hive/corpora/corpus_01HOWL456/catalog.json"
      status: "ready"
      visibility: "private"
      tags:
        - "methodology"
        - "pilot"