src/pqc_content_provenance/embed.py
2.9 KB · 80 lines · python Raw
1 """Embed/extract manifests in/from content.
2
3 Two embedding modes:
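1. Sidecar: manifest and base64-encoded content are packaged together in one JSON envelope, meant to be stored next to the content as a separate .c2pa.json file.
2. Inline header: for text, we can prepend a <!--PQC-PROV-BEGIN-->...<!--PQC-PROV-END--> bracketed JSON block.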
6
7 For images/video, real C2PA stores inside metadata (XMP, EXIF). Here we keep it
8 simple: we provide sidecar packaging helpers. Callers can embed the bytes
9 however they want for their own file format.
10 """
11
12 from __future__ import annotations
13
14 import base64
15 import json
16
17 from pqc_content_provenance.errors import InvalidManifestError
18 from pqc_content_provenance.manifest import ContentManifest
19
20
# File extension used for sidecar manifest files.
SIDECAR_EXTENSION = ".c2pa.json"
# Opening and closing markers that bracket the manifest JSON in 'text-header' mode.
TEXT_MARKER_BEGIN = "<!--PQC-PROV-BEGIN-->"
TEXT_MARKER_END = "<!--PQC-PROV-END-->"
24
25
def embed_manifest(content: bytes, manifest: ContentManifest, mode: str = "sidecar") -> bytes:
    """Bundle ``content`` and ``manifest`` into a single byte string.

    Supported modes:

    * ``'sidecar'`` -- build a JSON envelope holding the manifest dict under
      ``"manifest"`` and the base64-encoded content under ``"content_base64"``;
      the UTF-8 encoded envelope is returned (intended for a .c2pa.json file).
    * ``'text-header'`` -- return the content preceded by the manifest JSON
      wrapped in the begin/end text markers, followed by one newline.

    Raises:
        ValueError: if ``mode`` is neither of the supported values.
    """
    # Reject unsupported modes up front so the branches below are exhaustive.
    if mode not in ("sidecar", "text-header"):
        raise ValueError(f"unknown embed mode: {mode}")

    if mode == "text-header":
        manifest_json = manifest.to_json()
        # The single newline separates the header from the untouched content bytes.
        banner = f"{TEXT_MARKER_BEGIN}{manifest_json}{TEXT_MARKER_END}\n"
        return banner.encode("utf-8") + content

    # mode == "sidecar"
    manifest_dict = manifest.to_dict()
    encoded_content = _to_base64(content)
    payload = {"manifest": manifest_dict, "content_base64": encoded_content}
    serialized = json.dumps(payload, indent=2)
    return serialized.encode("utf-8")
44
45
def extract_manifest(blob: bytes, mode: str = "sidecar") -> tuple[ContentManifest, bytes]:
    """Extract ``(manifest, content)`` from an embedded blob.

    Inverse of ``embed_manifest``.

    Supported modes:

    * ``'sidecar'`` -- ``blob`` is a UTF-8 JSON object with a ``"manifest"``
      entry and a base64 string under ``"content_base64"``.
    * ``'text-header'`` -- ``blob`` is the content preceded by the manifest
      JSON bracketed by the begin/end text markers. The content is sliced
      out of ``blob`` at the byte level, so it is returned exactly as it was
      embedded even when it is not valid UTF-8.

    Raises:
        InvalidManifestError: if the sidecar envelope is malformed, if the
            text-header markers are missing or out of order, or if the
            bracketed manifest bytes are not valid UTF-8.
        ValueError: if ``mode`` is neither of the supported values.
    """
    if mode == "sidecar":
        try:
            envelope = json.loads(blob.decode("utf-8"))
            if not isinstance(envelope, dict):
                raise TypeError("top-level JSON value must be an object")
            manifest = ContentManifest.from_dict(envelope["manifest"])
            encoded_content = envelope["content_base64"]
            if not isinstance(encoded_content, str):
                raise TypeError("content_base64 must be a string")
            content = _from_base64(encoded_content)
        except (ValueError, KeyError, TypeError) as e:
            # ValueError covers bad UTF-8, bad JSON and bad base64; TypeError
            # covers envelopes whose JSON has the wrong shape.
            raise InvalidManifestError(f"invalid sidecar envelope: {e}") from e
        return manifest, content

    if mode == "text-header":
        # Work on raw bytes: decoding the whole blob with errors="replace"
        # and re-encoding it would silently alter non-UTF-8 content.
        begin_marker = TEXT_MARKER_BEGIN.encode("utf-8")
        end_marker = TEXT_MARKER_END.encode("utf-8")

        begin_at = blob.find(begin_marker)
        if begin_at < 0:
            raise InvalidManifestError("text-header markers not found")
        json_start = begin_at + len(begin_marker)

        # The end marker must come after the begin marker; an end marker that
        # only appears earlier in the blob does not count.
        json_end = blob.find(end_marker, json_start)
        if json_end < 0:
            raise InvalidManifestError("text-header markers not found")

        try:
            manifest_json = blob[json_start:json_end].decode("utf-8")
        except UnicodeDecodeError as e:
            raise InvalidManifestError(
                f"text-header manifest is not valid UTF-8: {e}"
            ) from e
        manifest = ContentManifest.from_json(manifest_json)

        # Content is everything after the end marker, minus the single
        # separator newline that embed_manifest inserts.
        rest = blob[json_end + len(end_marker):]
        if rest.startswith(b"\n"):
            rest = rest[1:]
        return manifest, bytes(rest)

    raise ValueError(f"unknown embed mode: {mode}")
72
73
def _to_base64(data: bytes) -> str:
    """Return the standard base64 encoding of ``data`` as an ASCII string."""
    encoded = base64.standard_b64encode(data)
    return str(encoded, "ascii")
76
77
def _from_base64(s: str) -> bytes:
    """Decode the standard base64 text ``s`` back into raw bytes.

    ``s`` is encoded to ASCII first, so non-ASCII input fails at that step.
    """
    ascii_payload = s.encode("ascii")
    return base64.standard_b64decode(ascii_payload)
80