src/pqc_content_provenance/embed.py
| 1 | """Embed/extract manifests in/from content. |
| 2 | |
| 3 | Two embedding modes: |
| 4 | 1. Sidecar: manifest is stored next to content as a separate .c2pa.json file. |
| 5 | 2. Inline header: for text, we can prepend a JSON block bracketed by <!--PQC-PROV-BEGIN--> and <!--PQC-PROV-END--> markers. |
| 6 | |
| 7 | For images/video, real C2PA stores inside metadata (XMP, EXIF). Here we keep it |
| 8 | simple: we provide sidecar packaging helpers. Callers can embed the bytes |
| 9 | however they want for their own file format. |
| 10 | """ |
| 11 | |
| 12 | from __future__ import annotations |
| 13 | |
| 14 | import base64 |
| 15 | import json |
| 16 | |
| 17 | from pqc_content_provenance.errors import InvalidManifestError |
| 18 | from pqc_content_provenance.manifest import ContentManifest |
| 19 | |
| 20 | |
# File extension for sidecar envelopes (the JSON produced by
# embed_manifest(mode="sidecar")). Not referenced by the code visible in this
# module; presumably used by callers when naming the output file.
SIDECAR_EXTENSION = ".c2pa.json"
# Delimiters that bracket the manifest JSON in "text-header" mode:
# embed_manifest writes them and extract_manifest searches for them.
TEXT_MARKER_BEGIN = "<!--PQC-PROV-BEGIN-->"
TEXT_MARKER_END = "<!--PQC-PROV-END-->"
| 24 | |
| 25 | |
def embed_manifest(content: bytes, manifest: ContentManifest, mode: str = "sidecar") -> bytes:
    """Bundle *content* and its *manifest* into a single byte string.

    Supported values of ``mode``:

    * ``"sidecar"`` - a UTF-8 encoded, 2-space indented JSON object holding
      the manifest dict under ``"manifest"`` and the base64-encoded content
      under ``"content_base64"``; meant to be saved as a ``.c2pa.json`` file.
    * ``"text-header"`` - the manifest JSON wrapped in the begin/end markers,
      followed by a newline and then the content bytes unchanged.

    Raises:
        ValueError: if ``mode`` is neither of the values above.
    """
    if mode == "text-header":
        manifest_json = manifest.to_json()
        # The newline separates the header from the payload; extract_manifest
        # strips exactly one newline after the end marker.
        prefix = f"{TEXT_MARKER_BEGIN}{manifest_json}{TEXT_MARKER_END}\n"
        return prefix.encode("utf-8") + content

    if mode != "sidecar":
        raise ValueError(f"unknown embed mode: {mode}")

    payload = {}
    payload["manifest"] = manifest.to_dict()
    payload["content_base64"] = _to_base64(content)
    serialized = json.dumps(payload, indent=2)
    return serialized.encode("utf-8")
| 44 | |
| 45 | |
def extract_manifest(blob: bytes, mode: str = "sidecar") -> tuple[ContentManifest, bytes]:
    """Extract (manifest, content) from an embedded blob. Inverse of embed_manifest.

    Args:
        blob: Bytes previously produced by ``embed_manifest`` with the same
            ``mode``.
        mode: ``"sidecar"`` (JSON envelope with base64 content) or
            ``"text-header"`` (marker-bracketed manifest JSON followed by the
            raw content).

    Returns:
        ``(manifest, content)``. In text-header mode ``content`` is the exact
        byte sequence following the end marker (minus one separating newline),
        so payloads that are not valid UTF-8 round-trip unchanged.

    Raises:
        InvalidManifestError: the blob is not a well-formed envelope for
            ``mode`` (bad JSON, wrong JSON shape, missing keys, bad base64,
            missing or mis-ordered markers, undecodable manifest text).
        ValueError: ``mode`` is not recognised.
    """
    if mode == "sidecar":
        try:
            envelope = json.loads(blob.decode("utf-8"))
            # Valid JSON that is not an object (e.g. a list) would otherwise
            # escape as TypeError on the key lookups below.
            if not isinstance(envelope, dict):
                raise ValueError("top-level JSON value is not an object")
            manifest = ContentManifest.from_dict(envelope["manifest"])
            encoded = envelope["content_base64"]
            # A non-string here would otherwise escape as AttributeError.
            if not isinstance(encoded, str):
                raise ValueError("content_base64 is not a string")
            content = _from_base64(encoded)
        except (ValueError, KeyError) as e:
            raise InvalidManifestError(f"invalid sidecar envelope: {e}") from e
        return manifest, content

    if mode == "text-header":
        # Work on the raw bytes: decoding the whole blob and re-encoding the
        # content would corrupt any payload that is not valid UTF-8. The
        # markers are ASCII, so a byte-level search finds the same positions.
        begin_marker = TEXT_MARKER_BEGIN.encode("utf-8")
        end_marker = TEXT_MARKER_END.encode("utf-8")

        begin_at = blob.find(begin_marker)
        if begin_at < 0:
            raise InvalidManifestError("text-header markers not found")
        start = begin_at + len(begin_marker)

        # The end marker must come after the begin marker; an end marker that
        # only appears earlier in the blob does not count.
        end = blob.find(end_marker, start)
        if end < 0:
            raise InvalidManifestError("text-header markers not found")

        try:
            manifest_json = blob[start:end].decode("utf-8")
        except UnicodeDecodeError as e:
            raise InvalidManifestError(f"invalid text-header manifest: {e}") from e
        # Errors raised by from_json itself propagate unchanged.
        manifest = ContentManifest.from_json(manifest_json)

        # content is everything after the end marker (skip the trailing newline)
        rest = blob[end + len(end_marker):]
        if rest.startswith(b"\n"):
            rest = rest[1:]
        return manifest, bytes(rest)

    raise ValueError(f"unknown embed mode: {mode}")
| 72 | |
| 73 | |
def _to_base64(data: bytes) -> str:
    """Return *data* encoded with the standard base64 alphabet as an ASCII ``str``."""
    encoded = base64.b64encode(data)
    return str(encoded, "ascii")
| 76 | |
| 77 | |
def _from_base64(s: str) -> bytes:
    """Decode base64 text *s* back into raw bytes.

    *s* is first encoded as ASCII, so non-ASCII text raises
    ``UnicodeEncodeError``. Decoding uses ``base64.b64decode`` with its
    default (non-validating) settings.
    """
    ascii_bytes = s.encode("ascii")
    return base64.b64decode(ascii_bytes)
| 80 | |