src/pqc_rag_signing/chunk.py
3.1 KB · 106 lines · python Raw
1 """Signed chunk data structures."""
2
3 from __future__ import annotations
4
5 import hashlib
6 import json
7 from dataclasses import asdict, dataclass, field
8 from typing import Any
9
10
@dataclass(frozen=True)
class ChunkMetadata:
    """Metadata about a chunk's source (document, position, etc.).

    Instances are frozen. ``extra`` takes part in equality but is excluded
    from the generated ``__hash__``: a ``dict`` is unhashable, so hashing it
    made every ``hash(ChunkMetadata(...))`` call raise ``TypeError``.
    """

    source: str
    chunk_index: int
    total_chunks: int
    start_offset: int = 0
    end_offset: int = 0
    # compare stays at its default (True) so ``extra`` still matters for ``==``;
    # hash=False keeps it out of ``__hash__`` so instances can live in sets
    # and be used as dict keys. Equal instances still hash equal.
    extra: dict[str, Any] = field(default_factory=dict, hash=False)

    def to_dict(self) -> dict[str, Any]:
        """Return all fields as a plain dict.

        ``dataclasses.asdict`` copies nested containers, so the result does
        not share the ``extra`` dict with this instance.
        """
        return asdict(self)
24
25
@dataclass
class SignedChunk:
    """A chunk of text with a PQC signature envelope.

    This is the unit stored in a vector DB. The ``text`` is the content used
    for embedding; the ``_pqc`` envelope is verified at retrieval time.

    This class only stores and (de)serializes the envelope fields and
    computes the canonical content hash; it does not itself check signatures.
    """

    chunk_id: str
    text: str
    metadata: ChunkMetadata
    content_hash: str
    signer_did: str
    algorithm: str
    signature: str
    public_key: str
    signed_at: str
    corpus_id: str | None = None
    nonce: str = ""

    @staticmethod
    def compute_content_hash(text: str, metadata: ChunkMetadata, nonce: str) -> str:
        """Canonical SHA3-256 of chunk content + metadata + nonce.

        Deterministic: same (text, metadata, nonce) produces the same hash, always.

        The canonical form is compact JSON with sorted keys, non-ASCII
        characters kept as-is, encoded as UTF-8. Changing any of these
        serialization options would change every hash, so they are fixed.

        Returns:
            The lowercase hex digest.

        Raises:
            TypeError: if ``metadata.to_dict()`` contains a value that
                ``json.dumps`` cannot serialize.
        """
        canonical = json.dumps(
            {
                "text": text,
                "metadata": metadata.to_dict(),
                "nonce": nonce,
            },
            sort_keys=True,
            separators=(",", ":"),
            ensure_ascii=False,
        ).encode("utf-8")
        return hashlib.sha3_256(canonical).hexdigest()

    def to_dict(self) -> dict[str, Any]:
        """Serialize for storage in a vector DB (the whole envelope).

        ``metadata`` is flattened through its own ``to_dict()``; every other
        field is emitted under its attribute name.
        """
        return {
            "chunk_id": self.chunk_id,
            "text": self.text,
            "metadata": self.metadata.to_dict(),
            "content_hash": self.content_hash,
            "signer_did": self.signer_did,
            "algorithm": self.algorithm,
            "signature": self.signature,
            "public_key": self.public_key,
            "signed_at": self.signed_at,
            "corpus_id": self.corpus_id,
            "nonce": self.nonce,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> SignedChunk:
        """Deserialize from stored dict.

        ``data["metadata"]`` may be a dict, which is rebuilt into a
        ``ChunkMetadata``, or any other object, which is used unchanged.
        ``corpus_id`` and ``nonce`` are optional and default to ``None`` and
        ``""``; within a metadata dict, ``start_offset``, ``end_offset`` and
        ``extra`` are optional as well.

        Raises:
            KeyError: if a required key is missing from ``data`` or from the
                metadata dict.
        """
        # Function-scope import: the file's import block is outside this class.
        import copy

        meta = data["metadata"]
        if isinstance(meta, dict):
            meta = ChunkMetadata(
                source=meta["source"],
                chunk_index=meta["chunk_index"],
                total_chunks=meta["total_chunks"],
                start_offset=meta.get("start_offset", 0),
                end_offset=meta.get("end_offset", 0),
                # Deep-copy so later mutation of the stored payload cannot
                # change the frozen metadata (and the hash computed from it).
                # The value itself is preserved exactly, so existing content
                # hashes still verify.
                extra=copy.deepcopy(meta.get("extra", {})),
            )
        return cls(
            chunk_id=data["chunk_id"],
            text=data["text"],
            metadata=meta,
            content_hash=data["content_hash"],
            signer_did=data["signer_did"],
            algorithm=data["algorithm"],
            signature=data["signature"],
            public_key=data["public_key"],
            signed_at=data["signed_at"],
            corpus_id=data.get("corpus_id"),
            nonce=data.get("nonce", ""),
        )
106