src/pqc_mbom/spdx.py
11.9 KB · 312 lines · python Raw
1 """SPDX 2.3 JSON interoperability for MBOMs.
2
3 SPDX doesn't natively model AI-specific component types (training data,
4 RLHF data, quantization methods, etc.), so we map our MBOM to a superset
5 of SPDX that:
6
7 - Emits each ModelComponent as an SPDX Package.
8 - Records the pqc-mbom ComponentType in `annotations` and in the Package's
9 `externalRefs` as a purl-like identifier.
10 - Stores the SHA3-256 content hash under `checksums`.
11
12 The roundtrip is lossy for fields SPDX doesn't model (commercial_use flag,
13 arbitrary `properties`, ML-DSA signature) - those live in annotations.
14 """
15
16 from __future__ import annotations
17
18 import json
19 from datetime import datetime, timezone
20 from typing import Any
21
22 from pqc_mbom.component import (
23 ComponentReference,
24 ComponentType,
25 LicenseInfo,
26 ModelComponent,
27 )
28 from pqc_mbom.errors import SPDXConversionError
29 from pqc_mbom.mbom import MBOM, SCHEMA_VERSION
30
31
# Fixed document-level values written into every exported SPDX document.
SPDX_VERSION = "SPDX-2.3"
DATA_LICENSE = "CC0-1.0"
_CREATOR = "Tool: pqc-mbom"

# Annotation comments are written as "<key>=<value>"; all keys share this
# namespace prefix.
_ANNOTATION_NS = "pqc-mbom:"

# Keys attached to individual packages (one package per component).
_COMPONENT_TYPE_ANNOTATION = _ANNOTATION_NS + "component_type"
_MBOM_PROPERTIES_ANNOTATION = _ANNOTATION_NS + "properties"
_MBOM_REFERENCES_ANNOTATION = _ANNOTATION_NS + "references"
_MBOM_LICENSE_EXTRA_ANNOTATION = _ANNOTATION_NS + "license_extra"

# Keys attached to the document itself (MBOM-level metadata and signature).
_MBOM_SUPPLIER_ANNOTATION = _ANNOTATION_NS + "supplier"
_MBOM_ROOT_ANNOTATION = _ANNOTATION_NS + "root_hash"
_MBOM_SIGNER_ANNOTATION = _ANNOTATION_NS + "signer_did"
_MBOM_ALGORITHM_ANNOTATION = _ANNOTATION_NS + "algorithm"
_MBOM_SIGNATURE_ANNOTATION = _ANNOTATION_NS + "signature"
_MBOM_PUBKEY_ANNOTATION = _ANNOTATION_NS + "public_key"
_MBOM_SIGNED_AT_ANNOTATION = _ANNOTATION_NS + "signed_at"
46
47
48 def _spdx_id(raw: str) -> str:
49 """Produce a valid SPDXID (alnum, `.`, `-`)."""
50 sanitized = "".join(ch if (ch.isalnum() or ch in ".-") else "-" for ch in raw)
51 return f"SPDXRef-{sanitized or 'UNKNOWN'}"
52
53
def _annotation(comment: str) -> dict[str, Any]:
    """Wrap ``comment`` in an SPDX annotation dict attributed to this tool.

    The annotation has type ``OTHER``, uses ``_CREATOR`` as annotator and is
    stamped with the current UTC time.
    """
    # SPDX 2.3 requires dates in the form YYYY-MM-DDThh:mm:ssZ; `isoformat()`
    # would emit microseconds and a "+00:00" offset instead.
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    return {
        "annotationDate": stamp,
        "annotationType": "OTHER",
        "annotator": _CREATOR,
        "comment": comment,
    }
61
62
def _component_to_package(component: ModelComponent) -> dict[str, Any]:
    """Render one ModelComponent as an SPDX Package dictionary.

    Fields with an SPDX counterpart become package keys, with
    ``NOASSERTION`` standing in for empty values. The content hash is
    written as a ``SHA3-256`` checksum and the content size as a string in
    ``packageVerificationCode``; both are omitted when falsy. The component
    type, ``properties``, ``references`` and the non-SPDX license fields
    travel as ``key=value`` annotation comments, the structured ones
    JSON-encoded with sorted keys.
    """
    lic = component.license
    license_id = lic.spdx_id or "NOASSERTION"

    if component.supplier:
        supplier_field = f"Organization: {component.supplier}"
    else:
        supplier_field = "NOASSERTION"

    if component.author:
        originator_field = f"Person: {component.author}"
    else:
        originator_field = "NOASSERTION"

    package: dict[str, Any] = dict(
        SPDXID=_spdx_id(component.component_id),
        name=component.name,
        versionInfo=component.version or "NOASSERTION",
        downloadLocation=component.external_url or "NOASSERTION",
        filesAnalyzed=False,
        supplier=supplier_field,
        originator=originator_field,
        licenseConcluded=license_id,
        licenseDeclared=license_id,
        copyrightText="NOASSERTION",
    )

    if component.content_hash:
        package["checksums"] = [
            {"algorithm": "SHA3-256", "checksumValue": component.content_hash}
        ]
    if component.content_size:
        package["packageVerificationCode"] = {
            "packageVerificationCodeValue": str(component.content_size)
        }

    def _note(key: str, value: str) -> dict[str, Any]:
        # Every pqc-mbom annotation comment has the shape "<key>=<value>".
        return _annotation(f"{key}={value}")

    notes: list[dict[str, Any]] = [
        _note(_COMPONENT_TYPE_ANNOTATION, component.component_type.value)
    ]
    if component.properties:
        encoded_properties = json.dumps(component.properties, sort_keys=True)
        notes.append(_note(_MBOM_PROPERTIES_ANNOTATION, encoded_properties))
    if component.references:
        encoded_references = json.dumps(
            [ref.to_dict() for ref in component.references], sort_keys=True
        )
        notes.append(_note(_MBOM_REFERENCES_ANNOTATION, encoded_references))

    # The license annotation is always emitted, even when all fields are empty.
    encoded_license = json.dumps(
        {
            "name": lic.name,
            "url": lic.url,
            "commercial_use": lic.commercial_use,
            "attribution_required": lic.attribution_required,
        },
        sort_keys=True,
    )
    notes.append(_note(_MBOM_LICENSE_EXTRA_ANNOTATION, encoded_license))

    package["annotations"] = notes
    return package
106
107
108 def _extract_annotation(pkg: dict[str, Any], prefix: str) -> str | None:
109 for ann in pkg.get("annotations", []):
110 comment = ann.get("comment", "")
111 if comment.startswith(f"{prefix}="):
112 return comment.split("=", 1)[1]
113 return None
114
115
def _package_to_component(pkg: dict[str, Any]) -> ModelComponent:
    """Rebuild a ModelComponent from an SPDX Package dictionary.

    Reads the package keys and pqc-mbom annotations written by
    `_component_to_package`. Packages lacking those annotations still load:
    an unknown or missing component type becomes ``ComponentType.OTHER`` and
    absent fields become empty values. ``NOASSERTION`` is mapped back to an
    empty string.

    Raises:
        SPDXConversionError: if ``SPDXID`` is missing or malformed, or if the
            ``license_extra``, ``properties`` or ``references`` annotation
            does not decode to the expected JSON shape.
    """

    def _text(key: str) -> str:
        # A JSON null or non-string value is treated like a missing field
        # instead of failing later on a string method.
        value = pkg.get(key, "")
        return value if isinstance(value, str) else ""

    def _json_object_annotation(key: str, label: str) -> dict[str, Any]:
        # Decode an annotation that must hold a JSON object; absent -> {}.
        raw = _extract_annotation(pkg, key)
        if not raw:
            return {}
        try:
            decoded = json.loads(raw)
        except json.JSONDecodeError as e:
            raise SPDXConversionError(f"malformed {label} annotation: {e}") from e
        if not isinstance(decoded, dict):
            raise SPDXConversionError(
                f"malformed {label} annotation: expected a JSON object, "
                f"got {type(decoded).__name__}"
            )
        return decoded

    def _license_id(key: str) -> str:
        value = _text(key)
        return "" if value == "NOASSERTION" else value

    spdx_id = pkg.get("SPDXID", "")
    if not isinstance(spdx_id, str) or not spdx_id.startswith("SPDXRef-"):
        raise SPDXConversionError(f"invalid SPDXID: {spdx_id!r}")
    component_id = spdx_id[len("SPDXRef-"):]

    ctype_raw = _extract_annotation(pkg, _COMPONENT_TYPE_ANNOTATION) or ComponentType.OTHER.value
    try:
        ctype = ComponentType(ctype_raw)
    except ValueError:
        ctype = ComponentType.OTHER

    content_hash = ""
    for cs in pkg.get("checksums") or []:
        if cs.get("algorithm") == "SHA3-256":
            content_hash = cs.get("checksumValue", "")
            break

    # The content size is carried as a decimal string in
    # packageVerificationCode. `str.isdigit()` alone also accepts characters
    # such as "²" that `int()` rejects, hence the ASCII check.
    content_size = 0
    pvc_block = pkg.get("packageVerificationCode")
    pvc = pvc_block.get("packageVerificationCodeValue", "") if isinstance(pvc_block, dict) else ""
    if isinstance(pvc, str) and pvc.isascii() and pvc.isdigit():
        content_size = int(pvc)

    supplier_raw = _text("supplier")
    supplier = supplier_raw.split(":", 1)[1].strip() if supplier_raw.startswith("Organization:") else ""

    author_raw = _text("originator")
    author = author_raw.split(":", 1)[1].strip() if author_raw.startswith("Person:") else ""

    lic_extra = _json_object_annotation(_MBOM_LICENSE_EXTRA_ANNOTATION, "license_extra")
    # Prefer the declared license, but let a real concluded license win over
    # a declared "NOASSERTION" placeholder.
    license_info = LicenseInfo(
        spdx_id=_license_id("licenseDeclared") or _license_id("licenseConcluded"),
        name=lic_extra.get("name", ""),
        url=lic_extra.get("url", ""),
        commercial_use=bool(lic_extra.get("commercial_use", False)),
        attribution_required=bool(lic_extra.get("attribution_required", True)),
    )

    properties_obj = _json_object_annotation(_MBOM_PROPERTIES_ANNOTATION, "properties")
    properties: dict[str, str] = {str(k): str(v) for k, v in properties_obj.items()}

    references_raw = _extract_annotation(pkg, _MBOM_REFERENCES_ANNOTATION)
    references: list[ComponentReference] = []
    if references_raw:
        try:
            for r in json.loads(references_raw):
                references.append(ComponentReference(**r))
        except (json.JSONDecodeError, TypeError) as e:
            # TypeError covers both a non-iterable payload and entries whose
            # keys do not match the ComponentReference constructor.
            raise SPDXConversionError(f"malformed references annotation: {e}") from e

    download = _text("downloadLocation")
    external_url = "" if download == "NOASSERTION" else download
    version = _text("versionInfo")
    if version == "NOASSERTION":
        version = ""

    return ModelComponent(
        component_id=component_id,
        component_type=ctype,
        name=_text("name"),
        version=version,
        content_hash=content_hash,
        content_size=content_size,
        supplier=supplier,
        author=author,
        external_url=external_url,
        license=license_info,
        references=references,
        properties=properties,
    )
201
202
def to_spdx_json(mbom: MBOM, *, indent: int = 2) -> str:
    """Serialize an MBOM as an SPDX 2.3 JSON document.

    Every component becomes a package that the document ``DESCRIBES``.
    MBOM-level metadata (id, model name and version, schema version,
    description, supplier, components root hash) is stored as document
    annotations; the signature fields are added only when ``mbom.signature``
    is set.

    Args:
        mbom: The MBOM to export.
        indent: Indentation passed to ``json.dumps``.

    Returns:
        The SPDX document as a JSON string. Non-ASCII characters are written
        as-is rather than escaped.
    """
    # SPDX 2.3 requires `created` in the form YYYY-MM-DDThh:mm:ssZ, so the
    # fallback timestamp is formatted explicitly (`isoformat()` would add
    # microseconds and a "+00:00" offset). An existing `mbom.created_at` is
    # passed through untouched so that it round-trips unchanged.
    created = mbom.created_at or datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    creation_info = {
        "created": created,
        "creators": [_CREATOR, f"Tool: pqc-mbom-schema-{SCHEMA_VERSION}"],
    }

    doc: dict[str, Any] = {
        "spdxVersion": SPDX_VERSION,
        "dataLicense": DATA_LICENSE,
        "SPDXID": "SPDXRef-DOCUMENT",
        "name": f"{mbom.model_name}-{mbom.model_version}-mbom",
        "documentNamespace": f"https://pqc-mbom.dyber.io/{mbom.mbom_id}",
        "creationInfo": creation_info,
        "packages": [_component_to_package(c) for c in mbom.components],
        "relationships": [
            {
                "spdxElementId": "SPDXRef-DOCUMENT",
                "relatedSpdxElement": _spdx_id(c.component_id),
                "relationshipType": "DESCRIBES",
            }
            for c in mbom.components
        ],
    }

    doc_annotations: list[dict[str, Any]] = [
        _annotation(f"pqc-mbom:mbom_id={mbom.mbom_id}"),
        _annotation(f"pqc-mbom:model_name={mbom.model_name}"),
        _annotation(f"pqc-mbom:model_version={mbom.model_version}"),
        _annotation(f"pqc-mbom:schema_version={mbom.schema_version}"),
        _annotation(f"pqc-mbom:description={mbom.description}"),
        _annotation(f"{_MBOM_SUPPLIER_ANNOTATION}={mbom.supplier}"),
        _annotation(f"{_MBOM_ROOT_ANNOTATION}={mbom.components_root_hash}"),
    ]
    if mbom.signature:
        # Signature metadata is only meaningful for a signed MBOM.
        doc_annotations.extend([
            _annotation(f"{_MBOM_SIGNER_ANNOTATION}={mbom.signer_did}"),
            _annotation(f"{_MBOM_ALGORITHM_ANNOTATION}={mbom.algorithm}"),
            _annotation(f"{_MBOM_SIGNATURE_ANNOTATION}={mbom.signature}"),
            _annotation(f"{_MBOM_PUBKEY_ANNOTATION}={mbom.public_key}"),
            _annotation(f"{_MBOM_SIGNED_AT_ANNOTATION}={mbom.signed_at}"),
        ])
    doc["annotations"] = doc_annotations

    return json.dumps(doc, indent=indent, ensure_ascii=False)
249
250
def from_spdx_json(blob: str) -> MBOM:
    """Parse an SPDX JSON document produced by `to_spdx_json` back into an MBOM.

    Lossy for non-pqc-mbom SPDX docs - components without the
    `pqc-mbom:component_type` annotation are mapped to ComponentType.OTHER.

    MBOM metadata is read from the document annotations. When they are
    missing, the id falls back to ``documentNamespace`` (then to
    ``urn:pqc-mbom:spdx-import``), and model name/version are recovered from
    a document name of the form ``<name>-<version>-mbom`` (then default to
    ``unknown`` / ``0``). If no root-hash annotation is present,
    ``mbom.recompute_root()`` is called before returning.

    Raises:
        SPDXConversionError: if ``blob`` is not valid JSON, is not a JSON
            object, has an unsupported ``spdxVersion`` or a wrong document
            ``SPDXID``, has no ``packages`` list, or contains a package that
            cannot be converted.
    """
    try:
        doc = json.loads(blob)
    except json.JSONDecodeError as e:
        raise SPDXConversionError(f"invalid SPDX JSON: {e}") from e
    if not isinstance(doc, dict):
        # Valid JSON that is not an object (list, string, number, null)
        # would otherwise fail below with an AttributeError.
        raise SPDXConversionError(
            f"invalid SPDX JSON: expected a top-level object, got {type(doc).__name__}"
        )

    if doc.get("spdxVersion") != SPDX_VERSION:
        raise SPDXConversionError(
            f"unsupported spdxVersion: {doc.get('spdxVersion')!r} (expected {SPDX_VERSION!r})"
        )
    if doc.get("SPDXID") != "SPDXRef-DOCUMENT":
        raise SPDXConversionError(f"missing or wrong document SPDXID: {doc.get('SPDXID')!r}")
    if "packages" not in doc:
        raise SPDXConversionError("SPDX document has no packages")
    packages = doc["packages"]
    if not isinstance(packages, list):
        raise SPDXConversionError(
            f"SPDX document 'packages' must be a list, got {type(packages).__name__}"
        )

    components = [_package_to_component(p) for p in packages]

    def _doc_ann(prefix: str, default: str = "") -> str:
        # Document annotations use the same "<key>=<value>" comments as
        # packages, so the shared lookup helper is reused.
        value = _extract_annotation(doc, prefix)
        return default if value is None else value

    mbom_id = _doc_ann("pqc-mbom:mbom_id") or doc.get("documentNamespace", "")
    model_name = _doc_ann("pqc-mbom:model_name")
    model_version = _doc_ann("pqc-mbom:model_version")
    if not model_name or not model_version:
        # Fall back to the "<model_name>-<model_version>-mbom" document name;
        # the version is taken to be everything after the last "-".
        name = doc.get("name", "")
        if isinstance(name, str) and name.endswith("-mbom"):
            stripped = name[: -len("-mbom")]
            parts = stripped.rsplit("-", 1)
            if len(parts) == 2:
                model_name = model_name or parts[0]
                model_version = model_version or parts[1]

    creation_info = doc.get("creationInfo")
    created_at = creation_info.get("created", "") if isinstance(creation_info, dict) else ""

    mbom = MBOM(
        mbom_id=mbom_id or "urn:pqc-mbom:spdx-import",
        schema_version=_doc_ann("pqc-mbom:schema_version", SCHEMA_VERSION),
        model_name=model_name or "unknown",
        model_version=model_version or "0",
        supplier=_doc_ann(_MBOM_SUPPLIER_ANNOTATION),
        description=_doc_ann("pqc-mbom:description"),
        components=components,
        created_at=created_at,
        components_root_hash=_doc_ann(_MBOM_ROOT_ANNOTATION),
        signer_did=_doc_ann(_MBOM_SIGNER_ANNOTATION),
        algorithm=_doc_ann(_MBOM_ALGORITHM_ANNOTATION),
        signature=_doc_ann(_MBOM_SIGNATURE_ANNOTATION),
        public_key=_doc_ann(_MBOM_PUBKEY_ANNOTATION),
        signed_at=_doc_ann(_MBOM_SIGNED_AT_ANNOTATION),
    )
    if not mbom.components_root_hash:
        mbom.recompute_root()
    return mbom
312