src/pqc_training_data/record.py
| 1 | """Data record representation. |
| 2 | |
| 3 | A training record is an arbitrary piece of training data - a document, an |
| 4 | image, an audio file, a row in a structured dataset. What matters to the |
| 5 | commitment is its content hash. |
| 6 | """ |
| 7 | |
| 8 | from __future__ import annotations |
| 9 | |
| 10 | import hashlib |
| 11 | import json |
| 12 | from dataclasses import dataclass, field |
| 13 | from typing import Any |
| 14 | |
| 15 | |
@dataclass(frozen=True)
class RecordHash:
    """Immutable, typed holder for a leaf hash stored as a hex string.

    Exists so that signatures can say ``RecordHash`` instead of a bare
    ``str``. The hex text is not validated on construction; malformed input
    only surfaces when :attr:`bytes` is read.
    """

    # Hex-encoded digest text, stored exactly as given.
    hex: str

    def __str__(self) -> str:
        """Return the hex string unchanged."""
        digest_text = self.hex
        return digest_text

    @property
    def bytes(self) -> bytes:
        """Decode the hex string into raw bytes.

        Raises:
            ValueError: if ``hex`` is not valid hexadecimal text.
        """
        # `bytes` here is the builtin type: a method body looks names up in
        # module/builtin scope, not in the class body where this property
        # of the same name lives.
        decoded = bytes.fromhex(self.hex)
        return decoded
| 28 | |
| 29 | |
@dataclass(frozen=True)
class DataRecord:
    """A single training record: raw content plus optional metadata.

    The leaf hash is derived from ``content`` and the canonical JSON of
    ``metadata``, so two records with the same bytes and equal metadata
    always produce the same hash, regardless of key insertion order.

    Equality compares both fields. ``metadata`` is excluded from
    ``__hash__`` because a ``dict`` is unhashable: without that exclusion the
    dataclass-generated ``__hash__`` raises ``TypeError`` for every instance,
    so records could not be placed in sets or used as dict keys. Equal
    records still hash equally because they necessarily share ``content``.

    Note that ``frozen=True`` only blocks attribute rebinding. The metadata
    dict itself is still mutable, and mutating it changes the leaf hash.
    """

    # Raw record payload; only its SHA3-256 digest enters the leaf hash.
    content: bytes
    # Must be JSON-serializable. hash=False keeps instances hashable.
    metadata: dict[str, Any] = field(default_factory=dict, hash=False)

    def canonical_bytes(self) -> bytes:
        """Return the deterministic serialization of content and metadata.

        Format: ``hex(SHA3-256(content)) + "|" + canonical_json(metadata)``,
        encoded as UTF-8. Content is hashed first so arbitrary binary data
        never has to be embedded in the text form. The digest is a
        fixed-length hex string, so the ``|`` separator is unambiguous.

        Raises:
            TypeError: if ``metadata`` is not JSON-serializable.
        """
        content_hash = hashlib.sha3_256(self.content).hexdigest()
        # sort_keys plus compact separators make the JSON independent of dict
        # insertion order and whitespace. ensure_ascii=False emits non-ASCII
        # text as UTF-8 rather than \uXXXX escapes.
        meta_json = json.dumps(
            self.metadata, sort_keys=True, separators=(",", ":"), ensure_ascii=False
        )
        return f"{content_hash}|{meta_json}".encode("utf-8")

    def leaf_hash(self) -> RecordHash:
        """Return SHA3-256 of the canonical bytes (the Merkle leaf value)."""
        # NOTE(review): no leaf/node domain-separation prefix is applied
        # here; confirm the tree builder accounts for that.
        h = hashlib.sha3_256(self.canonical_bytes()).hexdigest()
        return RecordHash(hex=h)

    def to_dict(self) -> dict[str, Any]:
        """Return a summary that omits the raw content (privacy).

        Keys: ``content_sha3_256``, ``content_size`` (length in bytes),
        ``metadata`` (a shallow copy, so nested values are shared with this
        record) and ``leaf_hash`` (hex string).
        """
        return {
            "content_sha3_256": hashlib.sha3_256(self.content).hexdigest(),
            "content_size": len(self.content),
            "metadata": dict(self.metadata),
            "leaf_hash": self.leaf_hash().hex,
        }
| 66 | |