src/pqc_training_data/record.py
2.0 KB · 66 lines · python Raw
1 """Data record representation.
2
3 A training record is an arbitrary piece of training data - a document, an
4 image, an audio file, a row in a structured dataset. What matters to the
5 commitment is its content hash.
6 """
7
8 from __future__ import annotations
9
10 import hashlib
11 import json
12 from dataclasses import dataclass, field
13 from typing import Any
14
15
@dataclass(frozen=True)
class RecordHash:
    """Typed wrapper that carries a leaf hash as hexadecimal text.

    Attributes:
        hex: The hash value written as a hexadecimal string.
    """

    hex: str

    def __str__(self) -> str:
        """Return the stored hexadecimal text unchanged."""
        return self.hex

    @property
    def bytes(self) -> bytes:
        """Decode the hexadecimal text into its raw byte form."""
        # Inside a method body `bytes` still names the builtin type; the
        # property defined here is only reachable through an instance.
        decoded = bytes.fromhex(self.hex)
        return decoded
28
29
@dataclass(frozen=True)
class DataRecord:
    """One training record: raw content bytes plus optional metadata.

    The leaf hash is derived from the content digest joined with a canonical
    JSON rendering of `metadata`, so identical content and identical
    metadata always yield the same hash.
    """

    content: bytes
    metadata: dict = field(default_factory=dict)

    def canonical_bytes(self) -> bytes:
        """Build the deterministic byte string that feeds the leaf hash.

        Layout: hex SHA3-256 digest of `content`, then "|", then the
        canonical JSON of `metadata`, all encoded as UTF-8. The content is
        digested first so arbitrary binary data never enters the text form.
        """
        digest_hex = hashlib.sha3_256(self.content).hexdigest()
        # Sorted keys and compact separators make the JSON independent of
        # dict insertion order and of incidental whitespace.
        canonical_meta = json.dumps(
            self.metadata,
            ensure_ascii=False,
            separators=(",", ":"),
            sort_keys=True,
        )
        return "|".join((digest_hex, canonical_meta)).encode("utf-8")

    def leaf_hash(self) -> RecordHash:
        """Return the SHA3-256 of `canonical_bytes()` - the Merkle leaf value."""
        hasher = hashlib.sha3_256()
        hasher.update(self.canonical_bytes())
        return RecordHash(hex=hasher.hexdigest())

    def to_dict(self) -> dict[str, Any]:
        """Summarize the record without exposing raw content (privacy).

        Returns:
            A dict holding the content digest, the content length in bytes,
            a shallow copy of the metadata, and the leaf hash as hex text.
        """
        payload = self.content
        summary: dict[str, Any] = {}
        summary["content_sha3_256"] = hashlib.sha3_256(payload).hexdigest()
        summary["content_size"] = len(payload)
        # Shallow copy: callers can add or remove keys on the result without
        # touching this record's own metadata mapping.
        summary["metadata"] = dict(self.metadata)
        summary["leaf_hash"] = self.leaf_hash().hex
        return summary
66