From 1a30ba00624fcd83e70225cb8eb07a15e1f2a31a Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 14 May 2026 16:04:53 -0500 Subject: [PATCH 01/19] serviceability base plugin --- .../plugins/serviceability/__init__.py | 42 ++++ .../plugins/serviceability/collector_args.py | 134 ++++++++++++ .../serviceability_collector.py | 195 ++++++++++++++++++ .../serviceability/serviceability_data.py | 80 +++++++ .../serviceability_plugin_base.py | 45 ++++ 5 files changed, 496 insertions(+) create mode 100644 nodescraper/plugins/serviceability/__init__.py create mode 100644 nodescraper/plugins/serviceability/collector_args.py create mode 100644 nodescraper/plugins/serviceability/serviceability_collector.py create mode 100644 nodescraper/plugins/serviceability/serviceability_data.py create mode 100644 nodescraper/plugins/serviceability/serviceability_plugin_base.py diff --git a/nodescraper/plugins/serviceability/__init__.py b/nodescraper/plugins/serviceability/__init__.py new file mode 100644 index 00000000..af181362 --- /dev/null +++ b/nodescraper/plugins/serviceability/__init__.py @@ -0,0 +1,42 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from .collector_args import ServiceabilityCollectorArgs +from .serviceability_collector import ServiceabilityCollectorBase +from .serviceability_data import ( + DeviceInfo, + ServiceabilityDataModel, + ServiceabilityResult, +) +from .serviceability_plugin_base import ServiceabilityPluginBase + +__all__ = [ + "DeviceInfo", + "ServiceabilityCollectorArgs", + "ServiceabilityCollectorBase", + "ServiceabilityDataModel", + "ServiceabilityPluginBase", + "ServiceabilityResult", +] diff --git a/nodescraper/plugins/serviceability/collector_args.py b/nodescraper/plugins/serviceability/collector_args.py new file mode 100644 index 00000000..924c3cc9 --- /dev/null +++ b/nodescraper/plugins/serviceability/collector_args.py @@ -0,0 +1,134 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from __future__ import annotations + +from typing import List, Optional, Tuple + +from pydantic import Field, field_validator, model_validator + +from nodescraper.models import CollectorArgs + + +class ServiceabilityCollectorArgs(CollectorArgs): + """Redfish collection arguments for ``ServiceabilityCollectorBase``. + + All Redfish URIs must be supplied by the caller; the base collector does not + embed product paths. Optional sections (assembly inventory, firmware bundle) + are skipped when the corresponding URI or template is omitted. 
+ """ + + uri: Optional[str] = Field( + default=None, + description="Optional alias for ``rf_event_log_uri`` (non-empty string).", + ) + rf_event_log_uri: Optional[str] = Field( + default=None, + description="Redfish URI for the event log ``Entries`` collection.", + ) + rf_chassis_devices: Optional[List[str]] = Field( + default=None, + description="Chassis designations for Assembly GETs; required with ``rf_assembly_uri_template``.", + ) + rf_assembly_uri_template: Optional[str] = Field( + default=None, + description="Redfish URI template containing ``{device}`` for each chassis Assembly resource.", + ) + rf_firmware_bundle_uri: Optional[str] = Field( + default=None, + description="Redfish URI for firmware bundle inventory (e.g. ComponentDetails).", + ) + rf_assembly_fields: Optional[Tuple[str, ...]] = Field( + default=None, + description="Standard Assembly JSON field names mapped into ``DeviceInfo``.", + ) + rf_assembly_oem_fields: Optional[Tuple[str, ...]] = Field( + default=None, + description="OEM Assembly field names (under ``Oem``) mapped into ``DeviceInfo``.", + ) + follow_next_link: bool = Field( + default=True, + description=( + "When True, follow Members@odata.nextLink and merge pages (up to max_pages). " + "When False, only the first GET response is used." + ), + ) + max_pages: int = Field( + default=200, + ge=1, + le=10_000, + description="Safety cap on the number of pages when following event log pagination.", + ) + top: Optional[int] = Field( + default=None, + ge=1, + description=( + "Return only the most recent N entries using $skip when the collection " + "supports OData count; None collects per follow_next_link rules." + ), + ) + from_ac_cycle: int = Field( + default=-1, + description="Passed to ``filter_event_members`` implementations (e.g. A/C cycle window). -1 disables.", + ) + from_date: Optional[str] = Field( + default=None, + description="Passed to ``filter_event_members`` implementations (e.g. 
ISO date window).", + ) + + @field_validator("from_ac_cycle") + @classmethod + def validate_from_ac_cycle(cls, v: int) -> int: + if v != -1 and v < 0: + raise ValueError("from_ac_cycle must be -1 (no filter) or a non-negative integer") + return v + + @model_validator(mode="after") + def _require_event_log_uri(self) -> ServiceabilityCollectorArgs: + if not self.resolved_event_log_uri(): + raise ValueError( + "Provide a non-empty rf_event_log_uri or uri for the event log collection." + ) + return self + + @model_validator(mode="after") + def _assembly_consistency(self) -> ServiceabilityCollectorArgs: + has_tpl = bool( + self.rf_assembly_uri_template and "{device}" in self.rf_assembly_uri_template + ) + has_dev = bool(self.rf_chassis_devices) + if has_tpl != has_dev: + raise ValueError( + "Provide both rf_assembly_uri_template (with '{device}') and rf_chassis_devices, " + "or omit both to skip assembly collection." + ) + return self + + def resolved_event_log_uri(self) -> str: + """Effective event-log URI (``uri`` or ``rf_event_log_uri``).""" + for candidate in (self.uri, self.rf_event_log_uri): + if candidate and str(candidate).strip(): + return str(candidate).strip() + return "" diff --git a/nodescraper/plugins/serviceability/serviceability_collector.py b/nodescraper/plugins/serviceability/serviceability_collector.py new file mode 100644 index 00000000..19942f49 --- /dev/null +++ b/nodescraper/plugins/serviceability/serviceability_collector.py @@ -0,0 +1,195 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from __future__ import annotations + +import abc +from typing import Any, Optional +from urllib.parse import urlparse + +from nodescraper.base import RedfishDataCollector +from nodescraper.connection.redfish import RF_MEMBERS, RF_MEMBERS_COUNT +from nodescraper.enums import ExecutionStatus +from nodescraper.models import TaskResult + +from .collector_args import ServiceabilityCollectorArgs +from .serviceability_data import DeviceInfo, ServiceabilityDataModel + + +class ServiceabilityCollectorBase( + RedfishDataCollector[ServiceabilityDataModel, ServiceabilityCollectorArgs], +): + """Redfish serviceability collection flow with product-specific hooks. + + Subclasses implement event filtering, CPER detection, and CPER attachment handling. + Redfish URIs come only from :class:`ServiceabilityCollectorArgs`. 
+ """ + + DATA_MODEL = ServiceabilityDataModel + + def __init__(self, **kwargs: Any) -> None: + self._log_path: Optional[str] = kwargs.get("log_path") + super().__init__(**kwargs) + + @abc.abstractmethod + def filter_event_members( + self, + members: list[Any], + args: ServiceabilityCollectorArgs, + ) -> list[Any]: + """Return the event list to analyze (e.g. time / A/C window).""" + + @abc.abstractmethod + def is_cper_event(self, event: dict) -> bool: + """Return whether a Redfish event entry should be treated as diagnostic-backed.""" + + @abc.abstractmethod + def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: + """Fetch and decode diagnostic attachments for qualifying events (subclass-defined).""" + + def _fetch_event_log(self, args: ServiceabilityCollectorArgs, uri: str): + if args.follow_next_link: + return self._run_redfish_get_paged(uri, max_pages=args.max_pages) + return self._run_redfish_get(uri, log_artifact=True) + + def collect_data( + self, args: Optional[ServiceabilityCollectorArgs] = None + ) -> tuple[TaskResult, Optional[ServiceabilityDataModel]]: + if args is None: + self.result.status = ExecutionStatus.NOT_RAN + self.result.message = "ServiceabilityCollectorArgs are required" + return self.result, None + + event_uri = args.resolved_event_log_uri() + if args.top is not None: + res = self._fetch_top(args, args.top, args.max_pages) + else: + res = self._fetch_event_log(args, event_uri) + + if not res.success or res.data is None: + self.result.status = ExecutionStatus.ERROR + self.result.message = f"Redfish GET failed for {event_uri}: {res.error}" + return self.result, None + + members = res.data.get(RF_MEMBERS, []) + responses = {res.path: res.data} + raw_base_url = getattr(self.connection, "base_url", None) + bmc_host = urlparse(raw_base_url).hostname if raw_base_url else None + + try: + filtered_members = self.filter_event_members(members, args) + except ValueError as exc: + self.result.status = ExecutionStatus.ERROR + 
self.result.message = f"Event filter failed: {exc}" + return self.result, None + + assembly_info: dict[str, DeviceInfo] = {} + tpl = args.rf_assembly_uri_template + devices = args.rf_chassis_devices + if tpl and devices: + std_fields = tuple(args.rf_assembly_fields or ()) + oem_fields = tuple(args.rf_assembly_oem_fields or ()) + std_to_device = { + "Name": "name", + "PartNumber": "part_number", + "ProductionDate": "production_date", + "SerialNumber": "serial_number", + "Version": "version", + } + + for device in devices: + uri_asm = tpl.format(device=device) + assembly_res = self._run_redfish_get(uri_asm, log_artifact=True) + if not assembly_res.success or assembly_res.data is None: + continue + responses[assembly_res.path] = assembly_res.data + + assemblies = assembly_res.data.get("Assemblies", []) + if not assemblies: + continue + + entry = assemblies[0] + oem = entry.get("Oem", {}) + di_kwargs: dict[str, Any] = {} + for fname in std_fields: + key = std_to_device.get(fname) + if key: + di_kwargs[key] = entry.get(fname) + + for of in oem_fields: + if of == "AssemblyPartNumber": + di_kwargs["assembly_part_number"] = oem.get(of) + elif of == "AssemblySerialNumber": + di_kwargs["assembly_serial_number"] = oem.get(of) + + assembly_info[device] = DeviceInfo(**di_kwargs) + + cper_data = self.collect_cper_data(filtered_members or []) + + data = ServiceabilityDataModel( + responses=responses, + rf_events=filtered_members or [], + assembly_info=assembly_info, + cper_data=cper_data, + component_details=self._fetch_component_details(responses, args), + log_path=self._log_path, + bmc_host=bmc_host, + ) + self.result.status = ExecutionStatus.OK + self.result.message = f"Collected {len(members)} event log member(s)" + return self.result, data + + def _fetch_component_details( + self, responses: dict[str, Any], args: ServiceabilityCollectorArgs + ) -> Optional[str]: + fw_uri = args.rf_firmware_bundle_uri + if not fw_uri or not str(fw_uri).strip(): + return None + fw_uri = 
str(fw_uri).strip() + fw_res = self._run_redfish_get(fw_uri, log_artifact=True) + if not fw_res.success or fw_res.data is None: + return None + responses[fw_res.path] = fw_res.data + + oem = fw_res.data.get("Oem", {}) + version_id = oem.get("AMD", oem).get("VersionID", {}) + return version_id.get("ComponentDetails") + + def _fetch_top(self, args: ServiceabilityCollectorArgs, top: int, max_pages: int): + event_uri = args.resolved_event_log_uri() + probe = self._run_redfish_get(f"{event_uri}?$top=1", log_artifact=True) + if not probe.success or probe.data is None: + return probe + + count = probe.data.get(RF_MEMBERS_COUNT, 0) + + if count <= top: + return self._fetch_event_log(args, event_uri) + + skip = count - top + skip_uri = f"{event_uri}?$skip={skip}" + if args.follow_next_link: + return self._run_redfish_get_paged(skip_uri, max_pages=max_pages) + return self._run_redfish_get(skip_uri, log_artifact=True) diff --git a/nodescraper/plugins/serviceability/serviceability_data.py b/nodescraper/plugins/serviceability/serviceability_data.py new file mode 100644 index 00000000..4329feae --- /dev/null +++ b/nodescraper/plugins/serviceability/serviceability_data.py @@ -0,0 +1,80 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from __future__ import annotations + +import json +import os +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel + +from nodescraper.models import DataModel + + +class DeviceInfo(BaseModel): + """Information for a single chassis device collected via Redfish.""" + + name: Optional[str] = None + part_number: Optional[str] = None + production_date: Optional[str] = None + serial_number: Optional[str] = None + version: Optional[str] = None + assembly_part_number: Optional[str] = None + assembly_serial_number: Optional[str] = None + + +class ServiceabilityResult(BaseModel): + """Structured serviceability output (typically populated by a downstream analyzer).""" + + node: Optional[str] = None + service_recommendations: Dict[str, List[dict]] = {} + service_action_definitions: Dict[str, dict] = {} + afid_sag_metadata: Dict[str, Any] = {} + node_info: Dict[str, Any] = {} + + +class ServiceabilityDataModel(DataModel): + """Collected Redfish responses and intermediate serviceability fields.""" + + responses: dict[str, Any] = {} + rf_events: list[Any] = [] + assembly_info: Dict[str, DeviceInfo] = {} + cper_data: Dict[str, Any] = {} + component_details: Optional[str] = None + log_path: Optional[str] = None + bmc_host: Optional[str] = None + result: Optional[ServiceabilityResult] = None + + def log_model(self, log_path: str) -> None: + """Write raw Redfish responses and decoded CPER data to 
the log directory.""" + os.makedirs(log_path, exist_ok=True) + responses_path = os.path.join(log_path, "redfish_responses.json") + with open(responses_path, "w", encoding="utf-8") as f: + json.dump(self.responses, f, indent=2) + if self.cper_data: + cper_path = os.path.join(log_path, "cper_data.json") + with open(cper_path, "w", encoding="utf-8") as f: + json.dump(self.cper_data, f, indent=2) diff --git a/nodescraper/plugins/serviceability/serviceability_plugin_base.py b/nodescraper/plugins/serviceability/serviceability_plugin_base.py new file mode 100644 index 00000000..fbc8082f --- /dev/null +++ b/nodescraper/plugins/serviceability/serviceability_plugin_base.py @@ -0,0 +1,45 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from nodescraper.base import OOBandDataPlugin + +from .collector_args import ServiceabilityCollectorArgs +from .serviceability_collector import ServiceabilityCollectorBase +from .serviceability_data import ServiceabilityDataModel + + +class ServiceabilityPluginBase( + OOBandDataPlugin[ServiceabilityDataModel, ServiceabilityCollectorArgs, None], +): + """OOB Redfish plugin base: collection only (no analyzer). + + Set ``COLLECTOR`` on a **subclass** to a concrete collector derived from + :class:`ServiceabilityCollectorBase` (the base ``COLLECTOR`` here is abstract + and cannot be instantiated). Add an ``ANALYZER`` on the subclass when needed. + """ + + DATA_MODEL = ServiceabilityDataModel + COLLECTOR = ServiceabilityCollectorBase + COLLECTOR_ARGS = ServiceabilityCollectorArgs From 97aa436b841218fdfc3e2eca19124a7dbb31fe5f Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 15 May 2026 10:07:09 -0500 Subject: [PATCH 02/19] updates: oob, abstracted some dicts --- .../plugins/serviceability/collector_args.py | 48 +-- .../serviceability_collector.py | 55 ++- .../serviceability/serviceability_data.py | 12 +- .../serviceability_plugin_base.py | 7 +- .../plugin/test_serviceability_collector.py | 326 ++++++++++++++++++ 5 files changed, 362 insertions(+), 86 deletions(-) create mode 100644 test/unit/plugin/test_serviceability_collector.py diff --git a/nodescraper/plugins/serviceability/collector_args.py b/nodescraper/plugins/serviceability/collector_args.py index 924c3cc9..4b2511ca 100644 --- a/nodescraper/plugins/serviceability/collector_args.py +++ b/nodescraper/plugins/serviceability/collector_args.py @@ -25,20 +25,15 @@ ############################################################################### from __future__ import annotations -from typing import List, Optional, Tuple +from typing import List, Optional -from pydantic import Field, field_validator, model_validator +from pydantic 
import Field, model_validator from nodescraper.models import CollectorArgs class ServiceabilityCollectorArgs(CollectorArgs): - """Redfish collection arguments for ``ServiceabilityCollectorBase``. - - All Redfish URIs must be supplied by the caller; the base collector does not - embed product paths. Optional sections (assembly inventory, firmware bundle) - are skipped when the corresponding URI or template is omitted. - """ + """URIs and pagination only. Subclasses add filtering and OEM-specific options.""" uri: Optional[str] = Field( default=None, @@ -58,22 +53,11 @@ class ServiceabilityCollectorArgs(CollectorArgs): ) rf_firmware_bundle_uri: Optional[str] = Field( default=None, - description="Redfish URI for firmware bundle inventory (e.g. ComponentDetails).", - ) - rf_assembly_fields: Optional[Tuple[str, ...]] = Field( - default=None, - description="Standard Assembly JSON field names mapped into ``DeviceInfo``.", - ) - rf_assembly_oem_fields: Optional[Tuple[str, ...]] = Field( - default=None, - description="OEM Assembly field names (under ``Oem``) mapped into ``DeviceInfo``.", + description="Redfish URI for firmware bundle inventory when subclasses extract component details.", ) follow_next_link: bool = Field( default=True, - description=( - "When True, follow Members@odata.nextLink and merge pages (up to max_pages). " - "When False, only the first GET response is used." - ), + description="If True, follow Members@odata.nextLink up to max_pages; else single GET.", ) max_pages: int = Field( default=200, @@ -84,26 +68,8 @@ class ServiceabilityCollectorArgs(CollectorArgs): top: Optional[int] = Field( default=None, ge=1, - description=( - "Return only the most recent N entries using $skip when the collection " - "supports OData count; None collects per follow_next_link rules." 
- ), + description="Most recent N entries via $skip after count probe; None collects full window.", ) - from_ac_cycle: int = Field( - default=-1, - description="Passed to ``filter_event_members`` implementations (e.g. A/C cycle window). -1 disables.", - ) - from_date: Optional[str] = Field( - default=None, - description="Passed to ``filter_event_members`` implementations (e.g. ISO date window).", - ) - - @field_validator("from_ac_cycle") - @classmethod - def validate_from_ac_cycle(cls, v: int) -> int: - if v != -1 and v < 0: - raise ValueError("from_ac_cycle must be -1 (no filter) or a non-negative integer") - return v @model_validator(mode="after") def _require_event_log_uri(self) -> ServiceabilityCollectorArgs: @@ -127,7 +93,7 @@ def _assembly_consistency(self) -> ServiceabilityCollectorArgs: return self def resolved_event_log_uri(self) -> str: - """Effective event-log URI (``uri`` or ``rf_event_log_uri``).""" + """Return uri or rf_event_log_uri.""" for candidate in (self.uri, self.rf_event_log_uri): if candidate and str(candidate).strip(): return str(candidate).strip() diff --git a/nodescraper/plugins/serviceability/serviceability_collector.py b/nodescraper/plugins/serviceability/serviceability_collector.py index 19942f49..7e364afd 100644 --- a/nodescraper/plugins/serviceability/serviceability_collector.py +++ b/nodescraper/plugins/serviceability/serviceability_collector.py @@ -41,11 +41,7 @@ class ServiceabilityCollectorBase( RedfishDataCollector[ServiceabilityDataModel, ServiceabilityCollectorArgs], ): - """Redfish serviceability collection flow with product-specific hooks. - - Subclasses implement event filtering, CPER detection, and CPER attachment handling. - Redfish URIs come only from :class:`ServiceabilityCollectorArgs`. 
- """ + """OOB Redfish collection skeleton; subclasses implement filtering, CPER handling, and JSON parsing.""" DATA_MODEL = ServiceabilityDataModel @@ -59,7 +55,7 @@ def filter_event_members( members: list[Any], args: ServiceabilityCollectorArgs, ) -> list[Any]: - """Return the event list to analyze (e.g. time / A/C window).""" + """Return the event list to retain for downstream analysis.""" @abc.abstractmethod def is_cper_event(self, event: dict) -> bool: @@ -69,6 +65,23 @@ def is_cper_event(self, event: dict) -> bool: def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: """Fetch and decode diagnostic attachments for qualifying events (subclass-defined).""" + @abc.abstractmethod + def parse_assembly_entry( + self, + designation: str, + assembly_member_entry: dict[str, Any], + args: ServiceabilityCollectorArgs, + ) -> DeviceInfo: + """Map one Assemblies[] member dict into DeviceInfo.""" + + @abc.abstractmethod + def extract_component_details( + self, + firmware_inventory_payload: dict[str, Any], + args: ServiceabilityCollectorArgs, + ) -> Optional[str]: + """Derive component-details text from a firmware inventory GET payload, or None.""" + def _fetch_event_log(self, args: ServiceabilityCollectorArgs, uri: str): if args.follow_next_link: return self._run_redfish_get_paged(uri, max_pages=args.max_pages) @@ -109,16 +122,6 @@ def collect_data( tpl = args.rf_assembly_uri_template devices = args.rf_chassis_devices if tpl and devices: - std_fields = tuple(args.rf_assembly_fields or ()) - oem_fields = tuple(args.rf_assembly_oem_fields or ()) - std_to_device = { - "Name": "name", - "PartNumber": "part_number", - "ProductionDate": "production_date", - "SerialNumber": "serial_number", - "Version": "version", - } - for device in devices: uri_asm = tpl.format(device=device) assembly_res = self._run_redfish_get(uri_asm, log_artifact=True) @@ -131,20 +134,7 @@ def collect_data( continue entry = assemblies[0] - oem = entry.get("Oem", {}) - di_kwargs: dict[str, 
Any] = {} - for fname in std_fields: - key = std_to_device.get(fname) - if key: - di_kwargs[key] = entry.get(fname) - - for of in oem_fields: - if of == "AssemblyPartNumber": - di_kwargs["assembly_part_number"] = oem.get(of) - elif of == "AssemblySerialNumber": - di_kwargs["assembly_serial_number"] = oem.get(of) - - assembly_info[device] = DeviceInfo(**di_kwargs) + assembly_info[device] = self.parse_assembly_entry(device, entry, args) cper_data = self.collect_cper_data(filtered_members or []) @@ -172,10 +162,7 @@ def _fetch_component_details( if not fw_res.success or fw_res.data is None: return None responses[fw_res.path] = fw_res.data - - oem = fw_res.data.get("Oem", {}) - version_id = oem.get("AMD", oem).get("VersionID", {}) - return version_id.get("ComponentDetails") + return self.extract_component_details(fw_res.data, args) def _fetch_top(self, args: ServiceabilityCollectorArgs, top: int, max_pages: int): event_uri = args.resolved_event_log_uri() diff --git a/nodescraper/plugins/serviceability/serviceability_data.py b/nodescraper/plugins/serviceability/serviceability_data.py index 4329feae..93e57737 100644 --- a/nodescraper/plugins/serviceability/serviceability_data.py +++ b/nodescraper/plugins/serviceability/serviceability_data.py @@ -29,21 +29,23 @@ import os from typing import Any, Dict, List, Optional -from pydantic import BaseModel +from pydantic import BaseModel, Field from nodescraper.models import DataModel class DeviceInfo(BaseModel): - """Information for a single chassis device collected via Redfish.""" + """Chassis fields from Assembly parsing; extra vendor keys belong in oem_extensions.""" name: Optional[str] = None part_number: Optional[str] = None production_date: Optional[str] = None serial_number: Optional[str] = None version: Optional[str] = None - assembly_part_number: Optional[str] = None - assembly_serial_number: Optional[str] = None + oem_extensions: Dict[str, Any] = Field( + default_factory=dict, + description="Opaque vendor/product 
extensions parsed by the concrete collector.", + ) class ServiceabilityResult(BaseModel): @@ -69,7 +71,7 @@ class ServiceabilityDataModel(DataModel): result: Optional[ServiceabilityResult] = None def log_model(self, log_path: str) -> None: - """Write raw Redfish responses and decoded CPER data to the log directory.""" + """Write redfish_responses.json and optional cper_data.json under log_path.""" os.makedirs(log_path, exist_ok=True) responses_path = os.path.join(log_path, "redfish_responses.json") with open(responses_path, "w", encoding="utf-8") as f: diff --git a/nodescraper/plugins/serviceability/serviceability_plugin_base.py b/nodescraper/plugins/serviceability/serviceability_plugin_base.py index fbc8082f..b3ca322a 100644 --- a/nodescraper/plugins/serviceability/serviceability_plugin_base.py +++ b/nodescraper/plugins/serviceability/serviceability_plugin_base.py @@ -33,12 +33,7 @@ class ServiceabilityPluginBase( OOBandDataPlugin[ServiceabilityDataModel, ServiceabilityCollectorArgs, None], ): - """OOB Redfish plugin base: collection only (no analyzer). - - Set ``COLLECTOR`` on a **subclass** to a concrete collector derived from - :class:`ServiceabilityCollectorBase` (the base ``COLLECTOR`` here is abstract - and cannot be instantiated). Add an ``ANALYZER`` on the subclass when needed. - """ + """OOB Redfish collect-only plugin stub; subclass with a concrete COLLECTOR and optional ANALYZER.""" DATA_MODEL = ServiceabilityDataModel COLLECTOR = ServiceabilityCollectorBase diff --git a/test/unit/plugin/test_serviceability_collector.py b/test/unit/plugin/test_serviceability_collector.py new file mode 100644 index 00000000..e3a67d5d --- /dev/null +++ b/test/unit/plugin/test_serviceability_collector.py @@ -0,0 +1,326 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import json +from typing import Any, Optional + +import pytest +from pydantic import ValidationError + +from nodescraper.connection.redfish import ( + RF_MEMBERS, + RF_MEMBERS_COUNT, + RedfishGetResult, +) +from nodescraper.enums import ExecutionStatus +from nodescraper.plugins.serviceability import ( + DeviceInfo, + ServiceabilityCollectorArgs, + ServiceabilityDataModel, + ServiceabilityPluginBase, +) +from nodescraper.plugins.serviceability.serviceability_collector import ( + ServiceabilityCollectorBase, +) + +EVENT_URI = "/redfish/v1/Systems/1/LogServices/SEL/Entries" + + +class _StubServiceabilityCollector(ServiceabilityCollectorBase): + def filter_event_members( + self, + members: list[Any], + args: ServiceabilityCollectorArgs, + ) -> list[Any]: + return members + + def is_cper_event(self, event: dict) -> bool: + return False + + def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: + return {} + + def parse_assembly_entry( + self, + designation: str, + assembly_member_entry: dict[str, Any], + args: ServiceabilityCollectorArgs, + ) -> DeviceInfo: + return DeviceInfo(name=designation, serial_number=assembly_member_entry.get("SerialNumber")) + + def extract_component_details( + self, + firmware_inventory_payload: dict[str, Any], + args: ServiceabilityCollectorArgs, + ) -> Optional[str]: + return firmware_inventory_payload.get("Details") + + +@pytest.fixture +def stub_serviceability_collector(system_info, redfish_conn_mock): + redfish_conn_mock.base_url = "https://bmc.example/redfish/v1" + return _StubServiceabilityCollector( + system_info=system_info, + connection=redfish_conn_mock, + log_path="/tmp/serviceability.log", + ) + + +def test_serviceability_collector_args_requires_event_log_uri(): + with pytest.raises(ValidationError): + ServiceabilityCollectorArgs() + + +def test_serviceability_collector_args_uri_alias_prefers_uri_over_rf_event_log_uri(): + args = 
ServiceabilityCollectorArgs(uri=" /events ", rf_event_log_uri="/other") + assert args.resolved_event_log_uri() == "/events" + + +def test_serviceability_collector_args_assembly_requires_both_template_and_devices(): + with pytest.raises(ValidationError): + ServiceabilityCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_assembly_uri_template="/redfish/v1/Chassis/{device}/Assembly", + ) + with pytest.raises(ValidationError): + ServiceabilityCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_chassis_devices=["C1"], + ) + + +def test_serviceability_collector_args_assembly_template_must_include_device_placeholder(): + with pytest.raises(ValidationError): + ServiceabilityCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_assembly_uri_template="/redfish/v1/Chassis/C1/Assembly", + rf_chassis_devices=["C1"], + ) + + +def test_serviceability_collector_args_assembly_optional_when_omitted(): + args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI) + assert args.rf_assembly_uri_template is None + assert args.rf_chassis_devices is None + + +def test_serviceability_plugin_base_wiring(): + assert ServiceabilityPluginBase.DATA_MODEL is ServiceabilityDataModel + assert ServiceabilityPluginBase.COLLECTOR is ServiceabilityCollectorBase + assert ServiceabilityPluginBase.COLLECTOR_ARGS is ServiceabilityCollectorArgs + assert ServiceabilityPluginBase.ANALYZER is None + + +def test_stub_collector_no_args(stub_serviceability_collector): + result, data = stub_serviceability_collector.collect_data() + assert result.status == ExecutionStatus.NOT_RAN + assert "required" in result.message.lower() + assert data is None + + +def test_stub_collector_event_log_get_fails(stub_serviceability_collector, redfish_conn_mock): + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=False, + error="timeout", + status_code=None, + ) + args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = 
stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.ERROR + assert EVENT_URI in result.message + assert data is None + + +def test_stub_collector_success_minimal(stub_serviceability_collector, redfish_conn_mock): + members = [{"Id": "1"}] + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=True, + data={RF_MEMBERS: members}, + status_code=200, + ) + args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.rf_events == members + assert EVENT_URI in data.responses + assert data.bmc_host == "bmc.example" + assert data.log_path == "/tmp/serviceability.log" + redfish_conn_mock.run_get_paged.assert_called_once() + + +def test_stub_collector_filter_raises_maps_to_error( + stub_serviceability_collector, redfish_conn_mock +): + class _BadFilter(_StubServiceabilityCollector): + def filter_event_members(self, members, args): + raise ValueError("bad filter") + + collector = _BadFilter( + system_info=stub_serviceability_collector.system_info, + connection=redfish_conn_mock, + ) + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=True, + data={RF_MEMBERS: []}, + status_code=200, + ) + args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = collector.collect_data(args=args) + assert result.status == ExecutionStatus.ERROR + assert "Event filter failed" in result.message + assert data is None + + +def test_stub_collector_assembly_and_firmware_paths( + stub_serviceability_collector, redfish_conn_mock +): + tpl = "/redfish/v1/Chassis/{device}/Assembly" + asm_uri = tpl.format(device="C1") + fw_uri = "/redfish/v1/UpdateService/FirmwareInventory" + + def run_get_side_effect(path: str, *_args, **_kwargs): + if path == EVENT_URI: + return RedfishGetResult( + path=EVENT_URI, + 
success=True, + data={RF_MEMBERS: []}, + status_code=200, + ) + if path == asm_uri: + return RedfishGetResult( + path=asm_uri, + success=True, + data={"Assemblies": [{"SerialNumber": "SN-ASM"}]}, + status_code=200, + ) + if path == fw_uri: + return RedfishGetResult( + path=fw_uri, + success=True, + data={"Details": "fw-summary"}, + status_code=200, + ) + raise AssertionError(f"unexpected Redfish GET path: {path!r}") + + redfish_conn_mock.run_get.side_effect = run_get_side_effect + + def run_get_paged_forbidden(*_args, **_kwargs): + raise AssertionError("run_get_paged must not run when follow_next_link=False") + + redfish_conn_mock.run_get_paged.side_effect = run_get_paged_forbidden + + args = ServiceabilityCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_assembly_uri_template=tpl, + rf_chassis_devices=["C1"], + rf_firmware_bundle_uri=fw_uri, + follow_next_link=False, + ) + result, data = stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert "C1" in data.assembly_info + assert data.assembly_info["C1"].serial_number == "SN-ASM" + assert data.component_details == "fw-summary" + assert asm_uri in data.responses + + +def test_stub_collector_top_when_count_exceeds_top_uses_skip_and_paged( + stub_serviceability_collector, redfish_conn_mock +): + probe = RedfishGetResult( + path=f"{EVENT_URI}?$top=1", + success=True, + data={RF_MEMBERS_COUNT: 100}, + status_code=200, + ) + window = RedfishGetResult( + path=f"{EVENT_URI}?$skip=90", + success=True, + data={RF_MEMBERS: [{"Id": "last"}]}, + status_code=200, + ) + redfish_conn_mock.run_get.return_value = probe + redfish_conn_mock.run_get_paged.return_value = window + args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI, top=10) + result, data = stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.rf_events == [{"Id": "last"}] + 
redfish_conn_mock.run_get.assert_called_once() + assert "?$top=1" in redfish_conn_mock.run_get.call_args[0][0] + redfish_conn_mock.run_get_paged.assert_called_once_with( + f"{EVENT_URI}?$skip=90", max_pages=args.max_pages + ) + + +def test_stub_collector_top_when_count_within_top_fetches_full_log( + stub_serviceability_collector, redfish_conn_mock +): + probe = RedfishGetResult( + path=f"{EVENT_URI}?$top=1", + success=True, + data={RF_MEMBERS_COUNT: 3}, + status_code=200, + ) + full = RedfishGetResult( + path=EVENT_URI, + success=True, + data={RF_MEMBERS: [{"Id": "a"}, {"Id": "b"}]}, + status_code=200, + ) + redfish_conn_mock.run_get.return_value = probe + redfish_conn_mock.run_get_paged.return_value = full + args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI, top=50) + result, data = stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert len(data.rf_events) == 2 + redfish_conn_mock.run_get_paged.assert_called_once_with(EVENT_URI, max_pages=args.max_pages) + + +def test_serviceability_data_model_log_model_writes_json(tmp_path): + model = ServiceabilityDataModel( + responses={"/x": {"ok": True}}, + cper_data={"slot": {"raw": "data"}}, + ) + model.log_model(str(tmp_path)) + responses_file = tmp_path / "redfish_responses.json" + cper_file = tmp_path / "cper_data.json" + assert responses_file.is_file() + assert cper_file.is_file() + assert json.loads(responses_file.read_text(encoding="utf-8")) == {"/x": {"ok": True}} + assert json.loads(cper_file.read_text(encoding="utf-8")) == {"slot": {"raw": "data"}} + + +def test_serviceability_data_model_log_model_skips_cper_when_empty(tmp_path): + model = ServiceabilityDataModel(responses={}) + model.log_model(str(tmp_path)) + assert (tmp_path / "redfish_responses.json").is_file() + assert not (tmp_path / "cper_data.json").exists() From d121f99ff60da154f690865f03bafe22bc3b8d27 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 18 May 
2026 19:12:37 -0500 Subject: [PATCH 03/19] updates --- .../plugins/serviceability/__init__.py | 40 ++- .../serviceability/oob_redfish/__init__.py | 44 +++ .../oob_redfish/oob_redfish_collector.py | 76 ++++ .../oob_redfish_collector_args.py} | 51 ++- .../oob_redfish/oob_redfish_data.py | 186 ++++++++++ .../oob_redfish_plugin.py} | 18 +- .../serviceability_collector.py | 182 ---------- .../serviceability/serviceability_data.py | 82 ----- .../plugins/serviceability/time_utils.py | 116 +++++++ .../unit/plugin/test_oob_redfish_collector.py | 181 ++++++++++ .../plugin/test_serviceability_collector.py | 326 ------------------ 11 files changed, 684 insertions(+), 618 deletions(-) create mode 100644 nodescraper/plugins/serviceability/oob_redfish/__init__.py create mode 100644 nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector.py rename nodescraper/plugins/serviceability/{collector_args.py => oob_redfish/oob_redfish_collector_args.py} (67%) create mode 100644 nodescraper/plugins/serviceability/oob_redfish/oob_redfish_data.py rename nodescraper/plugins/serviceability/{serviceability_plugin_base.py => oob_redfish/oob_redfish_plugin.py} (71%) delete mode 100644 nodescraper/plugins/serviceability/serviceability_collector.py delete mode 100644 nodescraper/plugins/serviceability/serviceability_data.py create mode 100644 nodescraper/plugins/serviceability/time_utils.py create mode 100644 test/unit/plugin/test_oob_redfish_collector.py delete mode 100644 test/unit/plugin/test_serviceability_collector.py diff --git a/nodescraper/plugins/serviceability/__init__.py b/nodescraper/plugins/serviceability/__init__.py index af181362..16a87632 100644 --- a/nodescraper/plugins/serviceability/__init__.py +++ b/nodescraper/plugins/serviceability/__init__.py @@ -23,20 +23,34 @@ # SOFTWARE. 
# ############################################################################### -from .collector_args import ServiceabilityCollectorArgs -from .serviceability_collector import ServiceabilityCollectorBase -from .serviceability_data import ( - DeviceInfo, - ServiceabilityDataModel, - ServiceabilityResult, +from .oob_redfish import ( + OobRedfishCollector, + OobRedfishCollectorArgs, + OobRedfishDataModel, + OobRedfishDeviceInfo, + OobRedfishPlugin, + OobRedfishResult, + build_oob_redfish_reporting_version_fields, +) +from .time_utils import ( + TimeOperator, + compare_iso_datetime, + is_valid_iso_datetime, + parse_iso_datetime, + satisfies_time_check, ) -from .serviceability_plugin_base import ServiceabilityPluginBase __all__ = [ - "DeviceInfo", - "ServiceabilityCollectorArgs", - "ServiceabilityCollectorBase", - "ServiceabilityDataModel", - "ServiceabilityPluginBase", - "ServiceabilityResult", + "OobRedfishCollector", + "OobRedfishCollectorArgs", + "OobRedfishDataModel", + "OobRedfishDeviceInfo", + "OobRedfishPlugin", + "OobRedfishResult", + "TimeOperator", + "build_oob_redfish_reporting_version_fields", + "compare_iso_datetime", + "is_valid_iso_datetime", + "parse_iso_datetime", + "satisfies_time_check", ] diff --git a/nodescraper/plugins/serviceability/oob_redfish/__init__.py b/nodescraper/plugins/serviceability/oob_redfish/__init__.py new file mode 100644 index 00000000..e0dae020 --- /dev/null +++ b/nodescraper/plugins/serviceability/oob_redfish/__init__.py @@ -0,0 +1,44 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from .oob_redfish_collector import OobRedfishCollector +from .oob_redfish_collector_args import OobRedfishCollectorArgs +from .oob_redfish_data import ( + OobRedfishDataModel, + OobRedfishDeviceInfo, + OobRedfishResult, + build_oob_redfish_reporting_version_fields, +) +from .oob_redfish_plugin import OobRedfishPlugin + +__all__ = [ + "OobRedfishCollector", + "OobRedfishCollectorArgs", + "OobRedfishDataModel", + "OobRedfishDeviceInfo", + "OobRedfishPlugin", + "OobRedfishResult", + "build_oob_redfish_reporting_version_fields", +] diff --git a/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector.py b/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector.py new file mode 100644 index 00000000..503d7103 --- /dev/null +++ b/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector.py @@ -0,0 +1,76 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from __future__ import annotations + +from typing import Any, Optional + +from nodescraper.base import RedfishDataCollector +from nodescraper.enums import ExecutionStatus +from nodescraper.models import TaskResult +from nodescraper.plugins.serviceability.time_utils import satisfies_time_check + +from .oob_redfish_collector_args import OobRedfishCollectorArgs +from .oob_redfish_data import OobRedfishDataModel + + +class OobRedfishCollector( + RedfishDataCollector[OobRedfishDataModel, OobRedfishCollectorArgs], +): + """Collect OOB Redfish serviceability data.""" + + DATA_MODEL = OobRedfishDataModel + + def __init__(self, **kwargs: Any) -> None: + self._log_path: Optional[str] = kwargs.pop("log_path", None) + super().__init__(**kwargs) + + def satisfies_reference_time( + self, + candidate: str, + args: OobRedfishCollectorArgs, + ) -> bool: + """Test a timestamp against optional reference-time filter settings. + + Args: + candidate: Timestamp string to test. + args: Collector arguments that may define reference_time and time_operator. + + Returns: + True when no filter is configured or the comparison succeeds. + """ + if args.reference_time is None or args.time_operator is None: + return True + return satisfies_time_check(candidate, args.reference_time, args.time_operator) + + def _missing_args_result(self) -> tuple[TaskResult, None]: + """Build a not-ran result when collector arguments are missing. + + Returns: + Task result with NOT_RAN status and no data model. 
+ """ + self.result.status = ExecutionStatus.NOT_RAN + self.result.message = "OobRedfishCollectorArgs are required" + return self.result, None diff --git a/nodescraper/plugins/serviceability/collector_args.py b/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector_args.py similarity index 67% rename from nodescraper/plugins/serviceability/collector_args.py rename to nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector_args.py index 4b2511ca..5c1b0687 100644 --- a/nodescraper/plugins/serviceability/collector_args.py +++ b/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector_args.py @@ -27,13 +27,17 @@ from typing import List, Optional -from pydantic import Field, model_validator +from pydantic import Field, field_validator, model_validator from nodescraper.models import CollectorArgs +from nodescraper.plugins.serviceability.time_utils import ( + TimeOperator, + is_valid_iso_datetime, +) -class ServiceabilityCollectorArgs(CollectorArgs): - """URIs and pagination only. Subclasses add filtering and OEM-specific options.""" +class OobRedfishCollectorArgs(CollectorArgs): + """Arguments for OOB Redfish serviceability collection.""" uri: Optional[str] = Field( default=None, @@ -70,9 +74,32 @@ class ServiceabilityCollectorArgs(CollectorArgs): ge=1, description="Most recent N entries via $skip after count probe; None collects full window.", ) + reference_time: Optional[str] = Field( + default=None, + description=( + "Optional ISO-8601 date or date-time used with time_operator " + "(e.g. 2026-05-17 or 2026-05-17T13:01:00)." 
+ ), + ) + time_operator: Optional[TimeOperator] = Field( + default=None, + description="Comparison operator applied when reference_time is set.", + ) + + @field_validator("reference_time") + @classmethod + def _validate_reference_time_iso(cls, value: Optional[str]) -> Optional[str]: + if value is None: + return None + text = str(value).strip() + if not text: + raise ValueError("reference_time must be a non-empty ISO-8601 string") + if not is_valid_iso_datetime(text): + raise ValueError(f"reference_time is not ISO-8601 compliant: {value!r}") + return text @model_validator(mode="after") - def _require_event_log_uri(self) -> ServiceabilityCollectorArgs: + def _require_event_log_uri(self) -> OobRedfishCollectorArgs: if not self.resolved_event_log_uri(): raise ValueError( "Provide a non-empty rf_event_log_uri or uri for the event log collection." @@ -80,7 +107,7 @@ def _require_event_log_uri(self) -> ServiceabilityCollectorArgs: return self @model_validator(mode="after") - def _assembly_consistency(self) -> ServiceabilityCollectorArgs: + def _assembly_consistency(self) -> OobRedfishCollectorArgs: has_tpl = bool( self.rf_assembly_uri_template and "{device}" in self.rf_assembly_uri_template ) @@ -92,8 +119,20 @@ def _assembly_consistency(self) -> ServiceabilityCollectorArgs: ) return self + @model_validator(mode="after") + def _reference_time_requires_operator(self) -> OobRedfishCollectorArgs: + has_ref = self.reference_time is not None + has_op = self.time_operator is not None + if has_ref != has_op: + raise ValueError("Provide both reference_time and time_operator, or omit both.") + return self + def resolved_event_log_uri(self) -> str: - """Return uri or rf_event_log_uri.""" + """Resolve the configured event log URI. + + Returns: + Non-empty URI from uri or rf_event_log_uri, or an empty string. 
+ """ for candidate in (self.uri, self.rf_event_log_uri): if candidate and str(candidate).strip(): return str(candidate).strip() diff --git a/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_data.py b/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_data.py new file mode 100644 index 00000000..6ad69a7b --- /dev/null +++ b/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_data.py @@ -0,0 +1,186 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from __future__ import annotations + +import json +import os +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel, Field + +from nodescraper.models import DataModel + + +class OobRedfishDeviceInfo(BaseModel): + """Device identity with separate board and product fields.""" + + board_product_name: Optional[str] = Field( + default=None, + description="Board product name (IPMI board information area).", + ) + board_part_number: Optional[str] = Field( + default=None, + description="Board part number.", + ) + board_serial_number: Optional[str] = Field( + default=None, + description="Board serial number.", + ) + board_manufacturing_date: Optional[str] = Field( + default=None, + description=( + "Board manufacturing date as a rendered string " + "(not IPMI minutes-since-1996 encoding)." + ), + ) + product_name: Optional[str] = Field( + default=None, + description="Product name (IPMI product information area).", + ) + product_part_number: Optional[str] = Field( + default=None, + description="Product part or model number.", + ) + product_serial_number: Optional[str] = Field( + default=None, + description="Product serial number.", + ) + product_version: Optional[str] = Field( + default=None, + description="Product version (no board-area equivalent in IPMI FRU).", + ) + oem_extensions: Dict[str, Any] = Field( + default_factory=dict, + description=("Vendor-specific fields: extra board/product data, multirecord, etc."), + ) + + +class OobRedfishResult(BaseModel): + """Structured serviceability report output.""" + + node: Optional[str] = None + node_scraper_version: Optional[str] = Field( + default=None, + description="Version of amd-node-scraper that produced this report.", + ) + plugin_name: Optional[str] = Field( + default=None, + description="Name of the serviceability plugin that produced this report.", + ) + plugin_version: Optional[str] = Field( + default=None, + 
description="Version of the serviceability plugin that produced this report.", + ) + reporter_extensions: Dict[str, str] = Field( + default_factory=dict, + description="Additional tool versions keyed by name.", + ) + service_recommendations: Dict[str, List[dict]] = Field(default_factory=dict) + service_action_definitions: Dict[str, dict] = Field(default_factory=dict) + afid_sag_metadata: Dict[str, Any] = Field(default_factory=dict) + node_info: Dict[str, Any] = Field(default_factory=dict) + extensions: Dict[str, Any] = Field( + default_factory=dict, + description="Additional implementation-specific fields.", + ) + + +def build_oob_redfish_reporting_version_fields( + *, + plugin_name: Optional[str] = None, + plugin_version: Optional[str] = None, + node_scraper_version: Optional[str] = None, + **reporter_extensions: str, +) -> Dict[str, Any]: + """Build keyword arguments for result versioning fields. + + Args: + plugin_name: Name of the reporting plugin. + plugin_version: Version of the reporting plugin. + node_scraper_version: Node scraper version; defaults to the installed package version. + reporter_extensions: Additional tool versions as keyword arguments. + + Returns: + Dictionary of versioning fields for a result model. 
+ """ + import nodescraper + + return { + "node_scraper_version": node_scraper_version or nodescraper.__version__, + "plugin_name": plugin_name, + "plugin_version": plugin_version, + "reporter_extensions": dict(reporter_extensions), + } + + +class OobRedfishDataModel(DataModel): + """Collected OOB Redfish serviceability data model.""" + + collected_data: Dict[str, Any] = Field( + default_factory=dict, + description="Arbitrary keyed payloads from the collector implementation.", + ) + device_info: Dict[str, OobRedfishDeviceInfo] = Field( + default_factory=dict, + description="Optional device identity keyed by implementer-defined labels.", + ) + artifacts: Dict[str, Any] = Field( + default_factory=dict, + description="Filename to JSON-serializable payload for log_model output.", + ) + endpoint: Optional[str] = Field( + default=None, + description="Optional host or service endpoint label (not necessarily a BMC).", + ) + log_path: Optional[str] = None + result: Optional[OobRedfishResult] = None + + def log_model(self, log_path: str) -> None: + """Write artifact files and a JSON summary under the log directory. + + Args: + log_path: Directory path for output files. + + Returns: + None. 
+ """ + os.makedirs(log_path, exist_ok=True) + for filename, payload in self.artifacts.items(): + if not filename or not str(filename).strip(): + continue + artifact_path = os.path.join(log_path, str(filename).strip()) + with open(artifact_path, "w", encoding="utf-8") as handle: + json.dump(payload, handle, indent=2) + summary_path = os.path.join(log_path, "oob_redfish_data.json") + with open(summary_path, "w", encoding="utf-8") as handle: + json.dump( + self.model_dump( + exclude={"artifacts"}, + mode="json", + ), + handle, + indent=2, + ) diff --git a/nodescraper/plugins/serviceability/serviceability_plugin_base.py b/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_plugin.py similarity index 71% rename from nodescraper/plugins/serviceability/serviceability_plugin_base.py rename to nodescraper/plugins/serviceability/oob_redfish/oob_redfish_plugin.py index b3ca322a..b891c522 100644 --- a/nodescraper/plugins/serviceability/serviceability_plugin_base.py +++ b/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_plugin.py @@ -25,16 +25,16 @@ ############################################################################### from nodescraper.base import OOBandDataPlugin -from .collector_args import ServiceabilityCollectorArgs -from .serviceability_collector import ServiceabilityCollectorBase -from .serviceability_data import ServiceabilityDataModel +from .oob_redfish_collector import OobRedfishCollector +from .oob_redfish_collector_args import OobRedfishCollectorArgs +from .oob_redfish_data import OobRedfishDataModel -class ServiceabilityPluginBase( - OOBandDataPlugin[ServiceabilityDataModel, ServiceabilityCollectorArgs, None], +class OobRedfishPlugin( + OOBandDataPlugin[OobRedfishDataModel, OobRedfishCollectorArgs, None], ): - """OOB Redfish collect-only plugin stub; subclass with a concrete COLLECTOR and optional ANALYZER.""" + """OOB Redfish serviceability plugin base.""" - DATA_MODEL = ServiceabilityDataModel - COLLECTOR = ServiceabilityCollectorBase - 
COLLECTOR_ARGS = ServiceabilityCollectorArgs + DATA_MODEL = OobRedfishDataModel + COLLECTOR = OobRedfishCollector + COLLECTOR_ARGS = OobRedfishCollectorArgs diff --git a/nodescraper/plugins/serviceability/serviceability_collector.py b/nodescraper/plugins/serviceability/serviceability_collector.py deleted file mode 100644 index 7e364afd..00000000 --- a/nodescraper/plugins/serviceability/serviceability_collector.py +++ /dev/null @@ -1,182 +0,0 @@ -############################################################################### -# -# MIT License -# -# Copyright (c) 2026 Advanced Micro Devices, Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
-# -############################################################################### -from __future__ import annotations - -import abc -from typing import Any, Optional -from urllib.parse import urlparse - -from nodescraper.base import RedfishDataCollector -from nodescraper.connection.redfish import RF_MEMBERS, RF_MEMBERS_COUNT -from nodescraper.enums import ExecutionStatus -from nodescraper.models import TaskResult - -from .collector_args import ServiceabilityCollectorArgs -from .serviceability_data import DeviceInfo, ServiceabilityDataModel - - -class ServiceabilityCollectorBase( - RedfishDataCollector[ServiceabilityDataModel, ServiceabilityCollectorArgs], -): - """OOB Redfish collection skeleton; subclasses implement filtering, CPER handling, and JSON parsing.""" - - DATA_MODEL = ServiceabilityDataModel - - def __init__(self, **kwargs: Any) -> None: - self._log_path: Optional[str] = kwargs.get("log_path") - super().__init__(**kwargs) - - @abc.abstractmethod - def filter_event_members( - self, - members: list[Any], - args: ServiceabilityCollectorArgs, - ) -> list[Any]: - """Return the event list to retain for downstream analysis.""" - - @abc.abstractmethod - def is_cper_event(self, event: dict) -> bool: - """Return whether a Redfish event entry should be treated as diagnostic-backed.""" - - @abc.abstractmethod - def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: - """Fetch and decode diagnostic attachments for qualifying events (subclass-defined).""" - - @abc.abstractmethod - def parse_assembly_entry( - self, - designation: str, - assembly_member_entry: dict[str, Any], - args: ServiceabilityCollectorArgs, - ) -> DeviceInfo: - """Map one Assemblies[] member dict into DeviceInfo.""" - - @abc.abstractmethod - def extract_component_details( - self, - firmware_inventory_payload: dict[str, Any], - args: ServiceabilityCollectorArgs, - ) -> Optional[str]: - """Derive component-details text from a firmware inventory GET payload, or None.""" - - def 
_fetch_event_log(self, args: ServiceabilityCollectorArgs, uri: str): - if args.follow_next_link: - return self._run_redfish_get_paged(uri, max_pages=args.max_pages) - return self._run_redfish_get(uri, log_artifact=True) - - def collect_data( - self, args: Optional[ServiceabilityCollectorArgs] = None - ) -> tuple[TaskResult, Optional[ServiceabilityDataModel]]: - if args is None: - self.result.status = ExecutionStatus.NOT_RAN - self.result.message = "ServiceabilityCollectorArgs are required" - return self.result, None - - event_uri = args.resolved_event_log_uri() - if args.top is not None: - res = self._fetch_top(args, args.top, args.max_pages) - else: - res = self._fetch_event_log(args, event_uri) - - if not res.success or res.data is None: - self.result.status = ExecutionStatus.ERROR - self.result.message = f"Redfish GET failed for {event_uri}: {res.error}" - return self.result, None - - members = res.data.get(RF_MEMBERS, []) - responses = {res.path: res.data} - raw_base_url = getattr(self.connection, "base_url", None) - bmc_host = urlparse(raw_base_url).hostname if raw_base_url else None - - try: - filtered_members = self.filter_event_members(members, args) - except ValueError as exc: - self.result.status = ExecutionStatus.ERROR - self.result.message = f"Event filter failed: {exc}" - return self.result, None - - assembly_info: dict[str, DeviceInfo] = {} - tpl = args.rf_assembly_uri_template - devices = args.rf_chassis_devices - if tpl and devices: - for device in devices: - uri_asm = tpl.format(device=device) - assembly_res = self._run_redfish_get(uri_asm, log_artifact=True) - if not assembly_res.success or assembly_res.data is None: - continue - responses[assembly_res.path] = assembly_res.data - - assemblies = assembly_res.data.get("Assemblies", []) - if not assemblies: - continue - - entry = assemblies[0] - assembly_info[device] = self.parse_assembly_entry(device, entry, args) - - cper_data = self.collect_cper_data(filtered_members or []) - - data = 
ServiceabilityDataModel( - responses=responses, - rf_events=filtered_members or [], - assembly_info=assembly_info, - cper_data=cper_data, - component_details=self._fetch_component_details(responses, args), - log_path=self._log_path, - bmc_host=bmc_host, - ) - self.result.status = ExecutionStatus.OK - self.result.message = f"Collected {len(members)} event log member(s)" - return self.result, data - - def _fetch_component_details( - self, responses: dict[str, Any], args: ServiceabilityCollectorArgs - ) -> Optional[str]: - fw_uri = args.rf_firmware_bundle_uri - if not fw_uri or not str(fw_uri).strip(): - return None - fw_uri = str(fw_uri).strip() - fw_res = self._run_redfish_get(fw_uri, log_artifact=True) - if not fw_res.success or fw_res.data is None: - return None - responses[fw_res.path] = fw_res.data - return self.extract_component_details(fw_res.data, args) - - def _fetch_top(self, args: ServiceabilityCollectorArgs, top: int, max_pages: int): - event_uri = args.resolved_event_log_uri() - probe = self._run_redfish_get(f"{event_uri}?$top=1", log_artifact=True) - if not probe.success or probe.data is None: - return probe - - count = probe.data.get(RF_MEMBERS_COUNT, 0) - - if count <= top: - return self._fetch_event_log(args, event_uri) - - skip = count - top - skip_uri = f"{event_uri}?$skip={skip}" - if args.follow_next_link: - return self._run_redfish_get_paged(skip_uri, max_pages=max_pages) - return self._run_redfish_get(skip_uri, log_artifact=True) diff --git a/nodescraper/plugins/serviceability/serviceability_data.py b/nodescraper/plugins/serviceability/serviceability_data.py deleted file mode 100644 index 93e57737..00000000 --- a/nodescraper/plugins/serviceability/serviceability_data.py +++ /dev/null @@ -1,82 +0,0 @@ -############################################################################### -# -# MIT License -# -# Copyright (c) 2026 Advanced Micro Devices, Inc. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
-# -############################################################################### -from __future__ import annotations - -import json -import os -from typing import Any, Dict, List, Optional - -from pydantic import BaseModel, Field - -from nodescraper.models import DataModel - - -class DeviceInfo(BaseModel): - """Chassis fields from Assembly parsing; extra vendor keys belong in oem_extensions.""" - - name: Optional[str] = None - part_number: Optional[str] = None - production_date: Optional[str] = None - serial_number: Optional[str] = None - version: Optional[str] = None - oem_extensions: Dict[str, Any] = Field( - default_factory=dict, - description="Opaque vendor/product extensions parsed by the concrete collector.", - ) - - -class ServiceabilityResult(BaseModel): - """Structured serviceability output (typically populated by a downstream analyzer).""" - - node: Optional[str] = None - service_recommendations: Dict[str, List[dict]] = {} - service_action_definitions: Dict[str, dict] = {} - afid_sag_metadata: Dict[str, Any] = {} - node_info: Dict[str, Any] = {} - - -class ServiceabilityDataModel(DataModel): - """Collected Redfish responses and intermediate serviceability fields.""" - - responses: dict[str, Any] = {} - rf_events: list[Any] = [] - assembly_info: Dict[str, DeviceInfo] = {} - cper_data: Dict[str, Any] = {} - component_details: Optional[str] = None - log_path: Optional[str] = None - bmc_host: Optional[str] = None - result: Optional[ServiceabilityResult] = None - - def log_model(self, log_path: str) -> None: - """Write redfish_responses.json and optional cper_data.json under log_path.""" - os.makedirs(log_path, exist_ok=True) - responses_path = os.path.join(log_path, "redfish_responses.json") - with open(responses_path, "w", encoding="utf-8") as f: - json.dump(self.responses, f, indent=2) - if self.cper_data: - cper_path = os.path.join(log_path, "cper_data.json") - with open(cper_path, "w", encoding="utf-8") as f: - json.dump(self.cper_data, f, indent=2) 
diff --git a/nodescraper/plugins/serviceability/time_utils.py b/nodescraper/plugins/serviceability/time_utils.py new file mode 100644 index 00000000..8bbc8a83 --- /dev/null +++ b/nodescraper/plugins/serviceability/time_utils.py @@ -0,0 +1,116 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from __future__ import annotations + +from datetime import datetime +from typing import Literal + +TimeOperator = Literal[">", ">=", "<", "<=", "=="] + +_TIME_OPERATORS: set[str] = {">", ">=", "<", "<=", "=="} + + +def is_valid_iso_datetime(value: str) -> bool: + """Return whether a string is ISO-8601 compliant. + + Args: + value: Date or date-time string to validate. + + Returns: + True if the value parses as ISO-8601. 
+ """ + try: + parse_iso_datetime(value) + except ValueError: + return False + return True + + +def parse_iso_datetime(value: str) -> datetime: + """Parse an ISO-8601 date or date-time string. + + Args: + value: Date (e.g. 2026-05-17) or date-time (e.g. 2026-05-17T13:01:00). + + Returns: + Parsed datetime. + """ + text = str(value).strip() + if not text: + raise ValueError("Empty datetime string") + if text.endswith("Z"): + text = f"{text[:-1]}+00:00" + try: + parsed = datetime.fromisoformat(text) + except ValueError as exc: + raise ValueError(f"Not ISO-8601 compliant: {value!r}") from exc + if "T" not in text and "+" not in text and text.count("-") == 2: + return parsed.replace(hour=0, minute=0, second=0, microsecond=0) + return parsed + + +def compare_iso_datetime(left: str, right: str, operator: TimeOperator) -> bool: + """Compare two ISO-8601 values with a relational operator. + + Args: + left: Left-hand date or date-time string. + right: Right-hand date or date-time string. + operator: One of >, >=, <, <=, or ==. + + Returns: + Result of the comparison. + """ + if operator not in _TIME_OPERATORS: + raise ValueError(f"Unsupported time operator: {operator!r}") + left_dt = parse_iso_datetime(left) + right_dt = parse_iso_datetime(right) + if operator == ">": + return left_dt > right_dt + if operator == ">=": + return left_dt >= right_dt + if operator == "<": + return left_dt < right_dt + if operator == "<=": + return left_dt <= right_dt + return left_dt == right_dt + + +def satisfies_time_check( + candidate: str, + reference: str, + operator: TimeOperator, +) -> bool: + """Test whether candidate satisfies operator against reference. + + Args: + candidate: Date or date-time string to test. + reference: Reference date or date-time string. + operator: One of >, >=, <, <=, or ==. + + Returns: + True when the comparison holds. 
+ """ + return compare_iso_datetime(candidate, reference, operator) diff --git a/test/unit/plugin/test_oob_redfish_collector.py b/test/unit/plugin/test_oob_redfish_collector.py new file mode 100644 index 00000000..e729cedc --- /dev/null +++ b/test/unit/plugin/test_oob_redfish_collector.py @@ -0,0 +1,181 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from typing import Optional + +import pytest +from pydantic import ValidationError + +from nodescraper.base import OOBandDataPlugin +from nodescraper.connection.redfish import RedfishConnectionManager +from nodescraper.enums import ExecutionStatus +from nodescraper.plugins.serviceability import ( + OobRedfishCollector, + OobRedfishCollectorArgs, + OobRedfishDataModel, + OobRedfishDeviceInfo, + OobRedfishPlugin, + OobRedfishResult, + build_oob_redfish_reporting_version_fields, + compare_iso_datetime, + is_valid_iso_datetime, + satisfies_time_check, +) + +EVENT_URI = "/redfish/v1/Systems/1/LogServices/SEL/Entries" + + +class _StubOobRedfishCollector(OobRedfishCollector): + def collect_data(self, args: Optional[OobRedfishCollectorArgs] = None): + if args is None: + return self._missing_args_result() + data = OobRedfishDataModel( + collected_data={"events": []}, + log_path=self._log_path, + ) + self.result.status = ExecutionStatus.OK + self.result.message = "stub collection complete" + return self.result, data + + +@pytest.fixture +def stub_oob_redfish_collector(system_info, redfish_conn_mock): + return _StubOobRedfishCollector( + system_info=system_info, + connection=redfish_conn_mock, + log_path="/tmp/oob_redfish.log", + ) + + +def test_oob_redfish_collector_args_requires_event_log_uri(): + with pytest.raises(ValidationError): + OobRedfishCollectorArgs() + + +def test_oob_redfish_collector_args_uri_alias(): + args = OobRedfishCollectorArgs(uri=" /events ", rf_event_log_uri="/other") + assert args.resolved_event_log_uri() == "/events" + + +def test_oob_redfish_collector_args_assembly_requires_both_template_and_devices(): + with pytest.raises(ValidationError): + OobRedfishCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_assembly_uri_template="/redfish/v1/Chassis/{device}/Assembly", + ) + with pytest.raises(ValidationError): + OobRedfishCollectorArgs( + rf_event_log_uri=EVENT_URI, + 
rf_chassis_devices=["C1"], + ) + + +def test_oob_redfish_collector_args_reference_time_requires_operator(): + with pytest.raises(ValidationError): + OobRedfishCollectorArgs( + rf_event_log_uri=EVENT_URI, + reference_time="2026-05-17", + ) + + +def test_oob_redfish_collector_args_accepts_iso_date_and_datetime(): + date_args = OobRedfishCollectorArgs( + rf_event_log_uri=EVENT_URI, + reference_time="2026-05-17", + time_operator=">=", + ) + assert date_args.reference_time == "2026-05-17" + + +def test_time_utils_iso_validation_and_comparison(): + assert is_valid_iso_datetime("2026-05-17") + assert satisfies_time_check("2026-05-18", "2026-05-17", ">") + assert compare_iso_datetime("2026-05-17T13:01:00", "2026-05-17T13:01:00", "==") + + +def test_oob_redfish_plugin_wiring(): + assert issubclass(OobRedfishPlugin, OOBandDataPlugin) + assert OobRedfishPlugin.DATA_MODEL is OobRedfishDataModel + assert OobRedfishPlugin.COLLECTOR is OobRedfishCollector + assert OobRedfishPlugin.COLLECTOR_ARGS is OobRedfishCollectorArgs + assert OobRedfishPlugin.CONNECTION_TYPE is RedfishConnectionManager + assert OobRedfishPlugin.ANALYZER is None + + +def test_stub_collector_no_args(stub_oob_redfish_collector): + result, data = stub_oob_redfish_collector.collect_data() + assert result.status == ExecutionStatus.NOT_RAN + assert "required" in result.message.lower() + assert data is None + + +def test_stub_collector_success_minimal(stub_oob_redfish_collector): + args = OobRedfishCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = stub_oob_redfish_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.collected_data == {"events": []} + + +def test_collector_satisfies_reference_time_helper(stub_oob_redfish_collector): + args = OobRedfishCollectorArgs( + rf_event_log_uri=EVENT_URI, + reference_time="2026-05-17", + time_operator=">=", + ) + assert stub_oob_redfish_collector.satisfies_reference_time("2026-05-18", args) + assert not 
stub_oob_redfish_collector.satisfies_reference_time("2026-05-16", args) + + +def test_oob_redfish_device_info_fields(): + info = OobRedfishDeviceInfo( + board_product_name="Board-A", + board_serial_number="BSN-1", + product_version="1.0", + ) + assert info.board_product_name == "Board-A" + assert info.product_version == "1.0" + + +def test_oob_redfish_result_reporting_versions(): + version_fields = build_oob_redfish_reporting_version_fields( + plugin_name="example_oob_redfish", + plugin_version="0.1.0", + node_scraper_version="1.2.3", + isa_version="9.8.7", + ) + result = OobRedfishResult(node="node-1", **version_fields) + assert result.plugin_name == "example_oob_redfish" + assert result.reporter_extensions["isa_version"] == "9.8.7" + + +def test_oob_redfish_data_model_log_model(tmp_path): + model = OobRedfishDataModel( + collected_data={"events": [{"id": 1}]}, + artifacts={"events.json": [{"id": 1}]}, + ) + model.log_model(str(tmp_path)) + assert (tmp_path / "events.json").is_file() + assert (tmp_path / "oob_redfish_data.json").is_file() diff --git a/test/unit/plugin/test_serviceability_collector.py b/test/unit/plugin/test_serviceability_collector.py deleted file mode 100644 index e3a67d5d..00000000 --- a/test/unit/plugin/test_serviceability_collector.py +++ /dev/null @@ -1,326 +0,0 @@ -############################################################################### -# -# MIT License -# -# Copyright (c) 2026 Advanced Micro Devices, Inc. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
-# -############################################################################### -import json -from typing import Any, Optional - -import pytest -from pydantic import ValidationError - -from nodescraper.connection.redfish import ( - RF_MEMBERS, - RF_MEMBERS_COUNT, - RedfishGetResult, -) -from nodescraper.enums import ExecutionStatus -from nodescraper.plugins.serviceability import ( - DeviceInfo, - ServiceabilityCollectorArgs, - ServiceabilityDataModel, - ServiceabilityPluginBase, -) -from nodescraper.plugins.serviceability.serviceability_collector import ( - ServiceabilityCollectorBase, -) - -EVENT_URI = "/redfish/v1/Systems/1/LogServices/SEL/Entries" - - -class _StubServiceabilityCollector(ServiceabilityCollectorBase): - def filter_event_members( - self, - members: list[Any], - args: ServiceabilityCollectorArgs, - ) -> list[Any]: - return members - - def is_cper_event(self, event: dict) -> bool: - return False - - def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: - return {} - - def parse_assembly_entry( - self, - designation: str, - assembly_member_entry: dict[str, Any], - args: ServiceabilityCollectorArgs, - ) -> DeviceInfo: - return DeviceInfo(name=designation, serial_number=assembly_member_entry.get("SerialNumber")) - - def extract_component_details( - self, - firmware_inventory_payload: dict[str, Any], - args: ServiceabilityCollectorArgs, - ) -> Optional[str]: - return firmware_inventory_payload.get("Details") - - -@pytest.fixture -def stub_serviceability_collector(system_info, redfish_conn_mock): - redfish_conn_mock.base_url = "https://bmc.example/redfish/v1" - return _StubServiceabilityCollector( - system_info=system_info, - connection=redfish_conn_mock, - log_path="/tmp/serviceability.log", - ) - - -def test_serviceability_collector_args_requires_event_log_uri(): - with pytest.raises(ValidationError): - ServiceabilityCollectorArgs() - - -def test_serviceability_collector_args_uri_alias_prefers_uri_over_rf_event_log_uri(): - args = 
ServiceabilityCollectorArgs(uri=" /events ", rf_event_log_uri="/other") - assert args.resolved_event_log_uri() == "/events" - - -def test_serviceability_collector_args_assembly_requires_both_template_and_devices(): - with pytest.raises(ValidationError): - ServiceabilityCollectorArgs( - rf_event_log_uri=EVENT_URI, - rf_assembly_uri_template="/redfish/v1/Chassis/{device}/Assembly", - ) - with pytest.raises(ValidationError): - ServiceabilityCollectorArgs( - rf_event_log_uri=EVENT_URI, - rf_chassis_devices=["C1"], - ) - - -def test_serviceability_collector_args_assembly_template_must_include_device_placeholder(): - with pytest.raises(ValidationError): - ServiceabilityCollectorArgs( - rf_event_log_uri=EVENT_URI, - rf_assembly_uri_template="/redfish/v1/Chassis/C1/Assembly", - rf_chassis_devices=["C1"], - ) - - -def test_serviceability_collector_args_assembly_optional_when_omitted(): - args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI) - assert args.rf_assembly_uri_template is None - assert args.rf_chassis_devices is None - - -def test_serviceability_plugin_base_wiring(): - assert ServiceabilityPluginBase.DATA_MODEL is ServiceabilityDataModel - assert ServiceabilityPluginBase.COLLECTOR is ServiceabilityCollectorBase - assert ServiceabilityPluginBase.COLLECTOR_ARGS is ServiceabilityCollectorArgs - assert ServiceabilityPluginBase.ANALYZER is None - - -def test_stub_collector_no_args(stub_serviceability_collector): - result, data = stub_serviceability_collector.collect_data() - assert result.status == ExecutionStatus.NOT_RAN - assert "required" in result.message.lower() - assert data is None - - -def test_stub_collector_event_log_get_fails(stub_serviceability_collector, redfish_conn_mock): - redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( - path=EVENT_URI, - success=False, - error="timeout", - status_code=None, - ) - args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI) - result, data = 
stub_serviceability_collector.collect_data(args=args) - assert result.status == ExecutionStatus.ERROR - assert EVENT_URI in result.message - assert data is None - - -def test_stub_collector_success_minimal(stub_serviceability_collector, redfish_conn_mock): - members = [{"Id": "1"}] - redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( - path=EVENT_URI, - success=True, - data={RF_MEMBERS: members}, - status_code=200, - ) - args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI) - result, data = stub_serviceability_collector.collect_data(args=args) - assert result.status == ExecutionStatus.OK - assert data is not None - assert data.rf_events == members - assert EVENT_URI in data.responses - assert data.bmc_host == "bmc.example" - assert data.log_path == "/tmp/serviceability.log" - redfish_conn_mock.run_get_paged.assert_called_once() - - -def test_stub_collector_filter_raises_maps_to_error( - stub_serviceability_collector, redfish_conn_mock -): - class _BadFilter(_StubServiceabilityCollector): - def filter_event_members(self, members, args): - raise ValueError("bad filter") - - collector = _BadFilter( - system_info=stub_serviceability_collector.system_info, - connection=redfish_conn_mock, - ) - redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( - path=EVENT_URI, - success=True, - data={RF_MEMBERS: []}, - status_code=200, - ) - args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI) - result, data = collector.collect_data(args=args) - assert result.status == ExecutionStatus.ERROR - assert "Event filter failed" in result.message - assert data is None - - -def test_stub_collector_assembly_and_firmware_paths( - stub_serviceability_collector, redfish_conn_mock -): - tpl = "/redfish/v1/Chassis/{device}/Assembly" - asm_uri = tpl.format(device="C1") - fw_uri = "/redfish/v1/UpdateService/FirmwareInventory" - - def run_get_side_effect(path: str, *_args, **_kwargs): - if path == EVENT_URI: - return RedfishGetResult( - path=EVENT_URI, - 
success=True, - data={RF_MEMBERS: []}, - status_code=200, - ) - if path == asm_uri: - return RedfishGetResult( - path=asm_uri, - success=True, - data={"Assemblies": [{"SerialNumber": "SN-ASM"}]}, - status_code=200, - ) - if path == fw_uri: - return RedfishGetResult( - path=fw_uri, - success=True, - data={"Details": "fw-summary"}, - status_code=200, - ) - raise AssertionError(f"unexpected Redfish GET path: {path!r}") - - redfish_conn_mock.run_get.side_effect = run_get_side_effect - - def run_get_paged_forbidden(*_args, **_kwargs): - raise AssertionError("run_get_paged must not run when follow_next_link=False") - - redfish_conn_mock.run_get_paged.side_effect = run_get_paged_forbidden - - args = ServiceabilityCollectorArgs( - rf_event_log_uri=EVENT_URI, - rf_assembly_uri_template=tpl, - rf_chassis_devices=["C1"], - rf_firmware_bundle_uri=fw_uri, - follow_next_link=False, - ) - result, data = stub_serviceability_collector.collect_data(args=args) - assert result.status == ExecutionStatus.OK - assert data is not None - assert "C1" in data.assembly_info - assert data.assembly_info["C1"].serial_number == "SN-ASM" - assert data.component_details == "fw-summary" - assert asm_uri in data.responses - - -def test_stub_collector_top_when_count_exceeds_top_uses_skip_and_paged( - stub_serviceability_collector, redfish_conn_mock -): - probe = RedfishGetResult( - path=f"{EVENT_URI}?$top=1", - success=True, - data={RF_MEMBERS_COUNT: 100}, - status_code=200, - ) - window = RedfishGetResult( - path=f"{EVENT_URI}?$skip=90", - success=True, - data={RF_MEMBERS: [{"Id": "last"}]}, - status_code=200, - ) - redfish_conn_mock.run_get.return_value = probe - redfish_conn_mock.run_get_paged.return_value = window - args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI, top=10) - result, data = stub_serviceability_collector.collect_data(args=args) - assert result.status == ExecutionStatus.OK - assert data is not None - assert data.rf_events == [{"Id": "last"}] - 
redfish_conn_mock.run_get.assert_called_once() - assert "?$top=1" in redfish_conn_mock.run_get.call_args[0][0] - redfish_conn_mock.run_get_paged.assert_called_once_with( - f"{EVENT_URI}?$skip=90", max_pages=args.max_pages - ) - - -def test_stub_collector_top_when_count_within_top_fetches_full_log( - stub_serviceability_collector, redfish_conn_mock -): - probe = RedfishGetResult( - path=f"{EVENT_URI}?$top=1", - success=True, - data={RF_MEMBERS_COUNT: 3}, - status_code=200, - ) - full = RedfishGetResult( - path=EVENT_URI, - success=True, - data={RF_MEMBERS: [{"Id": "a"}, {"Id": "b"}]}, - status_code=200, - ) - redfish_conn_mock.run_get.return_value = probe - redfish_conn_mock.run_get_paged.return_value = full - args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI, top=50) - result, data = stub_serviceability_collector.collect_data(args=args) - assert result.status == ExecutionStatus.OK - assert data is not None - assert len(data.rf_events) == 2 - redfish_conn_mock.run_get_paged.assert_called_once_with(EVENT_URI, max_pages=args.max_pages) - - -def test_serviceability_data_model_log_model_writes_json(tmp_path): - model = ServiceabilityDataModel( - responses={"/x": {"ok": True}}, - cper_data={"slot": {"raw": "data"}}, - ) - model.log_model(str(tmp_path)) - responses_file = tmp_path / "redfish_responses.json" - cper_file = tmp_path / "cper_data.json" - assert responses_file.is_file() - assert cper_file.is_file() - assert json.loads(responses_file.read_text(encoding="utf-8")) == {"/x": {"ok": True}} - assert json.loads(cper_file.read_text(encoding="utf-8")) == {"slot": {"raw": "data"}} - - -def test_serviceability_data_model_log_model_skips_cper_when_empty(tmp_path): - model = ServiceabilityDataModel(responses={}) - model.log_model(str(tmp_path)) - assert (tmp_path / "redfish_responses.json").is_file() - assert not (tmp_path / "cper_data.json").exists() From f42ff6eabba4c39bca29156b4bda54007fffe846 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 26 May 
2026 12:56:01 -0500 Subject: [PATCH 04/19] addressed reviews + added alllogs utests + missing functionality --- nodescraper/plugins/inband/rocm/rocmdata.py | 30 +- .../plugins/serviceability/__init__.py | 68 +++- .../plugins/serviceability/afid_events.py | 144 ++++++++ .../plugins/serviceability/analyzer_args.py | 109 ++++++ .../{oob_redfish => mi3xx}/__init__.py | 32 +- .../serviceability/mi3xx/mi3xx_analyzer.py | 88 +++++ .../serviceability/mi3xx/mi3xx_collector.py | 107 ++++++ .../mi3xx_collector_args.py} | 16 +- .../mi3xx_data.py} | 14 +- .../mi3xx/serviceability_plugin_mi3xx.py | 44 +++ .../oob_redfish/oob_redfish_collector.py | 76 ---- .../plugins/serviceability/se_adapter.py | 137 ++++++++ .../plugins/serviceability/se_models.py | 85 +++++ .../plugins/serviceability/se_runner.py | 269 ++++++++++++++ .../serviceability_collector.py | 197 +++++++++++ .../serviceability/serviceability_data.py | 100 ++++++ ...lugin.py => serviceability_plugin_base.py} | 23 +- .../plugins/serviceability/time_utils.py | 30 +- .../unit/plugin/test_oob_redfish_collector.py | 181 ---------- .../plugin/test_serviceability_collector.py | 329 ++++++++++++++++++ 20 files changed, 1759 insertions(+), 320 deletions(-) create mode 100644 nodescraper/plugins/serviceability/afid_events.py create mode 100644 nodescraper/plugins/serviceability/analyzer_args.py rename nodescraper/plugins/serviceability/{oob_redfish => mi3xx}/__init__.py (70%) create mode 100644 nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py create mode 100644 nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py rename nodescraper/plugins/serviceability/{oob_redfish/oob_redfish_collector_args.py => mi3xx/mi3xx_collector_args.py} (91%) rename nodescraper/plugins/serviceability/{oob_redfish/oob_redfish_data.py => mi3xx/mi3xx_data.py} (95%) create mode 100644 nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py delete mode 100644 
nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector.py create mode 100644 nodescraper/plugins/serviceability/se_adapter.py create mode 100644 nodescraper/plugins/serviceability/se_models.py create mode 100644 nodescraper/plugins/serviceability/se_runner.py create mode 100644 nodescraper/plugins/serviceability/serviceability_collector.py create mode 100644 nodescraper/plugins/serviceability/serviceability_data.py rename nodescraper/plugins/serviceability/{oob_redfish/oob_redfish_plugin.py => serviceability_plugin_base.py} (69%) delete mode 100644 test/unit/plugin/test_oob_redfish_collector.py create mode 100644 test/unit/plugin/test_serviceability_collector.py diff --git a/nodescraper/plugins/inband/rocm/rocmdata.py b/nodescraper/plugins/inband/rocm/rocmdata.py index c7e75608..eb1794c3 100644 --- a/nodescraper/plugins/inband/rocm/rocmdata.py +++ b/nodescraper/plugins/inband/rocm/rocmdata.py @@ -24,12 +24,22 @@ # ############################################################################### import re -from typing import List +from typing import List, Optional from pydantic import field_validator from nodescraper.models import DataModel +# e.g. 7.13.0, 7.13.0-123, 7.13.0-123-gfx942, 7.13.0-123-gfx942;gfx950 +_ROCM_VERSION_RE = re.compile(r"^\d+(?:\.\d+){0,3}(?:-\d+)?(?:-gfx\d+(?:;gfx\d+)*)?$") +_ROCM_BUILD_NUMBER_RE = re.compile(r"^\d+(?:\.\d+){0,3}-(\d+)") + + +def _validate_rocm_version_string(rocm_version: str) -> str: + if not _ROCM_VERSION_RE.match(rocm_version): + raise ValueError(f"ROCm version has invalid format: {rocm_version}") + return rocm_version + class RocmDataModel(DataModel): rocm_version: str @@ -58,6 +68,18 @@ def validate_rocm_version(cls, rocm_version: str) -> str: Returns: str: The validated ROCm version string. 
""" - if not re.match(r"^\d+(?:\.\d+){0,3}(-\d+)?$", rocm_version): - raise ValueError(f"ROCm version has invalid format: {rocm_version}") - return rocm_version + return _validate_rocm_version_string(rocm_version) + + @field_validator("rocm_sub_versions") + @classmethod + def validate_rocm_sub_versions(cls, rocm_sub_versions: dict[str, str]) -> dict[str, str]: + for value in rocm_sub_versions.values(): + _validate_rocm_version_string(value) + return rocm_sub_versions + + @property + def build_number(self) -> Optional[str]: + """ROCm package build number from version-rocm sub-version or rocm_version.""" + version_str = self.rocm_sub_versions.get("version-rocm") or self.rocm_version + match = _ROCM_BUILD_NUMBER_RE.match(version_str) + return match.group(1) if match else None diff --git a/nodescraper/plugins/serviceability/__init__.py b/nodescraper/plugins/serviceability/__init__.py index 16a87632..ae190bca 100644 --- a/nodescraper/plugins/serviceability/__init__.py +++ b/nodescraper/plugins/serviceability/__init__.py @@ -23,34 +23,72 @@ # SOFTWARE. 
# ############################################################################### -from .oob_redfish import ( - OobRedfishCollector, - OobRedfishCollectorArgs, - OobRedfishDataModel, - OobRedfishDeviceInfo, - OobRedfishPlugin, - OobRedfishResult, - build_oob_redfish_reporting_version_fields, +from .afid_events import build_afid_events_from_data +from .analyzer_args import ServiceabilityAnalyzerArgs +from .mi3xx import ( + Mi3xxAnalyzer, + Mi3xxCollector, + Mi3xxCollectorArgs, + Mi3xxDataModel, + Mi3xxDeviceInfo, + Mi3xxResult, + ServiceabilityPluginMI3XX, + build_mi3xx_reporting_version_fields, ) +from .se_adapter import afid_events_to_engine_input, serviceability_block_from_engine +from .se_models import ( + AfidEvent, + SeInputPayload, + ServiceabilityBlock, + ServiceabilitySolution, +) +from .se_runner import EngineBackend, SeRunError, resolve_engine_command, run_se +from .serviceability_collector import ServiceabilityCollectorBase +from .serviceability_data import ( + DeviceInfo, + ServiceabilityDataModel, + ServiceabilityResult, +) +from .serviceability_plugin_base import ServiceabilityPluginBase from .time_utils import ( TimeOperator, compare_iso_datetime, is_valid_iso_datetime, + normalize_se_timestamp, parse_iso_datetime, satisfies_time_check, ) __all__ = [ - "OobRedfishCollector", - "OobRedfishCollectorArgs", - "OobRedfishDataModel", - "OobRedfishDeviceInfo", - "OobRedfishPlugin", - "OobRedfishResult", + "AfidEvent", + "DeviceInfo", + "EngineBackend", + "Mi3xxAnalyzer", + "Mi3xxCollector", + "Mi3xxCollectorArgs", + "Mi3xxDataModel", + "Mi3xxDeviceInfo", + "Mi3xxResult", + "SeInputPayload", + "SeRunError", + "ServiceabilityAnalyzerArgs", + "ServiceabilityBlock", + "ServiceabilityCollectorBase", + "ServiceabilityDataModel", + "ServiceabilityPluginBase", + "ServiceabilityPluginMI3XX", + "ServiceabilityResult", + "ServiceabilitySolution", "TimeOperator", - "build_oob_redfish_reporting_version_fields", + "afid_events_to_engine_input", + 
"build_afid_events_from_data", + "serviceability_block_from_engine", + "build_mi3xx_reporting_version_fields", "compare_iso_datetime", "is_valid_iso_datetime", + "normalize_se_timestamp", "parse_iso_datetime", + "resolve_engine_command", + "run_se", "satisfies_time_check", ] diff --git a/nodescraper/plugins/serviceability/afid_events.py b/nodescraper/plugins/serviceability/afid_events.py new file mode 100644 index 00000000..2138c0cf --- /dev/null +++ b/nodescraper/plugins/serviceability/afid_events.py @@ -0,0 +1,144 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
_EVENT_TIMESTAMP_KEYS = ("Created", "EventTimestamp", "Timestamp")
_AFID_KEYS = ("Afid", "AFID", "afid")
_SERVICEABLE_UNIT_KEYS = ("serviceable_unit", "ServiceableUnit", "OriginOfCondition", "Device")


def build_afid_events_from_data(data: ServiceabilityDataModel) -> list[AfidEvent]:
    """Build SE input events from collected Redfish and CPER fields.

    Redfish event members (``data.rf_events``) are processed first, then the
    CPER slots (``data.cper_data``). Entries that do not yield an AFID, a
    serviceable unit and a timestamp are skipped. Events are de-duplicated on
    ``(afid, serviceable_unit, time)``; the first occurrence wins and input
    order is preserved.

    Args:
        data: Collected serviceability data.

    Returns:
        The unique AFID events, in discovery order (possibly empty).
    """
    events: list[AfidEvent] = []
    seen: set[tuple[int, str, str]] = set()

    def _add_unique(parsed: Optional[AfidEvent]) -> None:
        # Shared by both sources so Redfish and CPER entries de-duplicate
        # against each other with exactly the same key.
        if parsed is None:
            return
        key = (parsed.afid, parsed.serviceable_unit, parsed.time)
        if key in seen:
            return
        seen.add(key)
        events.append(parsed)

    for rf_event in data.rf_events:
        _add_unique(_afid_event_from_rf_member(rf_event))

    for unit, payload in data.cper_data.items():
        _add_unique(_afid_event_from_cper_slot(str(unit), payload))

    return events


def _afid_event_from_rf_member(member: Any) -> Optional[AfidEvent]:
    """Convert one Redfish event member to an AfidEvent, or None if unusable."""
    if not isinstance(member, dict):
        return None
    afid = _extract_afid(member)
    unit = _extract_serviceable_unit(member)
    timestamp = _extract_timestamp(member)
    if afid is None or unit is None or timestamp is None:
        return None
    return AfidEvent(
        afid=afid,
        serviceable_unit=unit,
        time=normalize_se_timestamp(timestamp),
    )


def _afid_event_from_cper_slot(unit: str, payload: Any) -> Optional[AfidEvent]:
    """Convert one CPER slot to an AfidEvent, or None if unusable.

    The slot key ``unit`` is the fallback unit name when the payload has no
    truthy ``serviceable_unit`` entry of its own.
    """
    if not isinstance(payload, dict):
        return None
    afid = _extract_afid(payload)
    timestamp = _extract_timestamp(payload)
    unit_name = str(payload.get("serviceable_unit") or unit).strip()
    if afid is None or not unit_name or timestamp is None:
        return None
    return AfidEvent(
        afid=afid,
        serviceable_unit=unit_name,
        time=normalize_se_timestamp(timestamp),
    )


def _coerce_afid(value: Any) -> Optional[int]:
    """Return ``int(value)``, or None when the value is missing or not integer-like."""
    if value is None:
        return None
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # The payload comes from an external endpoint; one malformed field must
        # not abort the whole event build.
        return None


def _extract_afid(payload: dict[str, Any]) -> Optional[int]:
    """Find the AFID in a payload.

    Top-level keys are tried first, then the same keys inside each dict found
    under ``Oem``. Values that cannot be converted to ``int`` are ignored and
    the search continues.
    """
    for key in _AFID_KEYS:
        afid = _coerce_afid(payload.get(key))
        if afid is not None:
            return afid
    oem = payload.get("Oem")
    if isinstance(oem, dict):
        for vendor_payload in oem.values():
            if not isinstance(vendor_payload, dict):
                continue
            for key in _AFID_KEYS:
                afid = _coerce_afid(vendor_payload.get(key))
                if afid is not None:
                    return afid
    return None


def _extract_serviceable_unit(payload: dict[str, Any]) -> Optional[str]:
    """Find the serviceable unit name in a payload.

    Top-level keys are tried in ``_SERVICEABLE_UNIT_KEYS`` order. A dict value
    is treated as a link object and only its ``@odata.id`` / ``odata.id`` is
    used; string-like values containing ``/`` are reduced to their last path
    segment. Falls back to ``serviceable_unit`` / ``ServiceableUnit`` inside
    each dict under ``Oem``.
    """
    for key in _SERVICEABLE_UNIT_KEYS:
        value = payload.get(key)
        if value is None:
            continue
        if isinstance(value, dict):
            odata_id = value.get("@odata.id") or value.get("odata.id")
            if odata_id:
                return _unit_from_odata_id(str(odata_id))
            # A link object without an id names no unit; stringifying it would
            # turn the dict repr into the unit name, so try the next key.
            continue
        text = str(value).strip()
        if text:
            return _unit_from_odata_id(text) if "/" in text else text
    oem = payload.get("Oem")
    if isinstance(oem, dict):
        for vendor_payload in oem.values():
            if isinstance(vendor_payload, dict):
                unit = vendor_payload.get("serviceable_unit") or vendor_payload.get(
                    "ServiceableUnit"
                )
                if unit is not None and str(unit).strip():
                    return str(unit).strip()
    return None


def _extract_timestamp(payload: dict[str, Any]) -> Optional[str]:
    """Return the first non-blank timestamp among the known keys, stripped."""
    for key in _EVENT_TIMESTAMP_KEYS:
        value = payload.get(key)
        if value is not None and str(value).strip():
            return str(value).strip()
    return None


def _unit_from_odata_id(odata_id: str) -> str:
    """Return the last path segment of an ``@odata.id``, or the input if there is none."""
    segment = odata_id.rstrip("/").split("/")[-1]
    return segment or odata_id
EngineBackend = Literal["python", "cli", "subprocess"]


class ServiceabilityAnalyzerArgs(AnalyzerArgs):
    """Configuration for serviceability analyzers.

    Selects how the serviceability engine (SE) is invoked and where its
    inputs live. Unless ``skip_engine`` is set, ``afid_sag_path`` is
    mandatory; the ``cli`` and ``subprocess`` backends additionally need
    exactly one of ``engine_executable`` or ``engine_entry_point``.
    """

    engine_backend: EngineBackend = Field(
        default="python",
        description=(
            "How to invoke the SE: "
            "'python' (serviceability_engine bindings), "
            "'cli' (external analyze subcommand), "
            "or 'subprocess' (--input/--output protocol)."
        ),
    )
    engine_python_module: str = Field(
        default="serviceability_engine",
        description="Python package providing ServiceabilityEngine bindings (python backend).",
    )
    engine_executable: Optional[str] = Field(
        default=None,
        description="Path to the SE binary (cli or subprocess backends).",
    )
    engine_entry_point: Optional[str] = Field(
        default=None,
        description=(
            "Command for cli/subprocess backends: "
            "executable path or argv prefix on PATH. "
            "Required when engine_backend is 'cli' or 'subprocess'."
        ),
    )
    afid_sag_path: Optional[str] = Field(
        default=None,
        description="Path to AFID_SAG.json.",
    )
    engine_extra_args: List[str] = Field(
        default_factory=list,
        description="Extra CLI arguments (cli/subprocess backends).",
    )
    engine_timeout_seconds: int = Field(
        default=600,
        ge=1,
        le=86_400,
        description="Subprocess timeout (cli/subprocess backends).",
    )
    skip_engine: bool = Field(
        default=False,
        description="If True, only build afid_events without running the SE.",
    )

    @field_validator("engine_executable", "engine_entry_point", "afid_sag_path")
    @classmethod
    def _strip_optional_paths(cls, value: Optional[str]) -> Optional[str]:
        """Trim surrounding whitespace; blank or missing values become None."""
        cleaned = None if value is None else str(value).strip()
        return cleaned if cleaned else None

    @model_validator(mode="after")
    def _require_engine_config_when_running(self) -> ServiceabilityAnalyzerArgs:
        """Reject configurations that cannot actually launch the engine.

        Raises:
            ValueError: if the engine will run and ``afid_sag_path`` is unset,
                or a cli/subprocess backend has both or neither of
                ``engine_executable`` / ``engine_entry_point``.
        """
        # Nothing to check when the engine is never launched.
        if not self.skip_engine:
            if not self.afid_sag_path:
                raise ValueError(
                    "afid_sag_path is required when running the serviceability engine."
                )
            # The python backend imports bindings and needs no command.
            if self.engine_backend != "python":
                commands = [
                    candidate
                    for candidate in (self.engine_executable, self.engine_entry_point)
                    if candidate is not None
                ]
                if len(commands) == 2:
                    raise ValueError(
                        "Provide only one of engine_executable or engine_entry_point "
                        "for cli/subprocess backends."
                    )
                if not commands:
                    raise ValueError(
                        "engine_executable or engine_entry_point is required when "
                        "engine_backend is 'cli' or 'subprocess'."
                    )
        return self
b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py @@ -0,0 +1,88 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
class Mi3xxAnalyzer(DataAnalyzer[ServiceabilityDataModel, ServiceabilityAnalyzerArgs]):
    """Build AFID events from collected data and run the serviceability engine."""

    DATA_MODEL = ServiceabilityDataModel

    def analyze_data(
        self,
        data: ServiceabilityDataModel,
        args: Optional[ServiceabilityAnalyzerArgs] = None,
    ) -> TaskResult:
        """Derive AFID events and, unless skipped, feed them to the engine.

        ``data`` is updated in place: ``afid_events`` always receives the
        events used, and ``serviceability`` receives either an events-only
        block (``skip_engine``) or the block returned by ``run_se``.

        Args:
            data: Collected serviceability data. Events already present in
                ``data.afid_events`` are reused; otherwise they are built
                from the collected fields.
            args: Analyzer settings; required.

        Returns:
            ``self.result`` with status NOT_RAN (no args), ERROR (event
            building or engine run failed) or OK.
        """
        if args is None:
            self.result.status = ExecutionStatus.NOT_RAN
            self.result.message = "ServiceabilityAnalyzerArgs are required"
            return self.result

        # Event building parses raw endpoint payloads; report malformed input
        # through the task result, the same way engine failures are reported,
        # instead of letting the exception escape.
        try:
            events = data.afid_events or build_afid_events_from_data(data)
        except (TypeError, ValueError) as exc:
            self.result.status = ExecutionStatus.ERROR
            self.result.message = f"Failed to build AFID events: {exc}"
            return self.result
        data.afid_events = events

        if args.skip_engine:
            data.serviceability = ServiceabilityBlock(afid_events=events)
            self.result.status = ExecutionStatus.OK
            self.result.message = f"Built {len(events)} AFID event(s); engine skipped"
            return self.result

        try:
            block = run_se(
                engine_backend=args.engine_backend,
                engine_python_module=args.engine_python_module,
                engine_executable=args.engine_executable,
                engine_entry_point=args.engine_entry_point,
                afid_events=events,
                # Optional on the model; the args validator requires it
                # whenever skip_engine is False.
                afid_sag_path=args.afid_sag_path,  # type: ignore[arg-type]
                extra_args=args.engine_extra_args or None,
                timeout_seconds=args.engine_timeout_seconds,
            )
        except (SeRunError, ValueError) as exc:
            self.result.status = ExecutionStatus.ERROR
            self.result.message = str(exc)
            return self.result

        data.serviceability = block
        self.result.status = ExecutionStatus.OK
        self.result.message = (
            f"Serviceability engine: {len(block.solution)} solution(s) "
            f"from {len(events)} event(s)"
        )
        return self.result
_EVENT_TIMESTAMP_KEYS = ("Created", "EventTimestamp", "Timestamp")


class Mi3xxCollector(ServiceabilityCollectorBase[Mi3xxCollectorArgs]):
    """Serviceability collector for MI3xx systems over OOB Redfish."""

    def satisfies_reference_time(
        self,
        candidate: str,
        args: Mi3xxCollectorArgs,
    ) -> bool:
        """Check a timestamp against the optional reference-time filter.

        Returns True when no filter is configured (either the reference time
        or the operator is unset); otherwise defers to ``satisfies_time_check``.
        """
        filter_active = args.reference_time is not None and args.time_operator is not None
        if not filter_active:
            return True
        return satisfies_time_check(candidate, args.reference_time, args.time_operator)

    def filter_event_members(
        self,
        members: list[Any],
        args: Mi3xxCollectorArgs,
    ) -> list[Any]:
        """Keep the members that pass the reference-time filter.

        Non-dict members and events without a recognised timestamp are always
        kept; input order is preserved.
        """

        def passes(member: Any) -> bool:
            if not isinstance(member, dict):
                return True
            stamp = self._event_timestamp(member)
            return stamp is None or self.satisfies_reference_time(stamp, args)

        return [member for member in members if passes(member)]

    def is_cper_event(self, event: dict) -> bool:
        """Tell whether an event looks CPER-related.

        Matches case-insensitively on "cper" in ``MessageId`` or ``Message``,
        or "diagnostic" in ``MessageId``.
        """
        identifier = str(event.get("MessageId", "")).lower()
        text = str(event.get("Message", "")).lower()
        if "cper" in identifier:
            return True
        if "cper" in text:
            return True
        return "diagnostic" in identifier

    def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]:
        """Return CPER data for the events; this implementation yields none."""
        return {}

    def parse_assembly_entry(
        self,
        designation: str,
        assembly_member_entry: dict[str, Any],
        args: Mi3xxCollectorArgs,
    ) -> DeviceInfo:
        """Map one assembly member onto a DeviceInfo.

        ``designation`` is used as the name when the entry has no truthy
        ``Name``.
        """
        lookup = assembly_member_entry.get
        device_name = lookup("Name") or designation
        return DeviceInfo(
            name=device_name,
            part_number=lookup("PartNumber"),
            production_date=lookup("ProductionDate"),
            serial_number=lookup("SerialNumber"),
            version=lookup("Version"),
        )

    def extract_component_details(
        self,
        firmware_inventory_payload: dict[str, Any],
        args: Mi3xxCollectorArgs,
    ) -> Optional[str]:
        """Return the payload's ``Details`` value as a string, or None if absent."""
        details = firmware_inventory_payload.get("Details")
        return None if details is None else str(details)

    @staticmethod
    def _event_timestamp(event: dict[str, Any]) -> Optional[str]:
        """Return the first non-blank timestamp among the known keys, stripped."""
        for key in _EVENT_TIMESTAMP_KEYS:
            raw = event.get(key)
            if raw is None:
                continue
            stamp = str(raw).strip()
            if stamp:
                return stamp
        return None
@@ -107,7 +107,7 @@ def _require_event_log_uri(self) -> OobRedfishCollectorArgs: return self @model_validator(mode="after") - def _assembly_consistency(self) -> OobRedfishCollectorArgs: + def _assembly_consistency(self) -> Mi3xxCollectorArgs: has_tpl = bool( self.rf_assembly_uri_template and "{device}" in self.rf_assembly_uri_template ) @@ -120,7 +120,7 @@ def _assembly_consistency(self) -> OobRedfishCollectorArgs: return self @model_validator(mode="after") - def _reference_time_requires_operator(self) -> OobRedfishCollectorArgs: + def _reference_time_requires_operator(self) -> Mi3xxCollectorArgs: has_ref = self.reference_time is not None has_op = self.time_operator is not None if has_ref != has_op: @@ -128,11 +128,7 @@ def _reference_time_requires_operator(self) -> OobRedfishCollectorArgs: return self def resolved_event_log_uri(self) -> str: - """Resolve the configured event log URI. - - Returns: - Non-empty URI from uri or rf_event_log_uri, or an empty string. - """ + """Return uri or rf_event_log_uri.""" for candidate in (self.uri, self.rf_event_log_uri): if candidate and str(candidate).strip(): return str(candidate).strip() diff --git a/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_data.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_data.py similarity index 95% rename from nodescraper/plugins/serviceability/oob_redfish/oob_redfish_data.py rename to nodescraper/plugins/serviceability/mi3xx/mi3xx_data.py index 6ad69a7b..6c9c268f 100644 --- a/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_data.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_data.py @@ -34,7 +34,7 @@ from nodescraper.models import DataModel -class OobRedfishDeviceInfo(BaseModel): +class Mi3xxDeviceInfo(BaseModel): """Device identity with separate board and product fields.""" board_product_name: Optional[str] = Field( @@ -78,7 +78,7 @@ class OobRedfishDeviceInfo(BaseModel): ) -class OobRedfishResult(BaseModel): +class Mi3xxResult(BaseModel): """Structured 
serviceability report output.""" node: Optional[str] = None @@ -108,7 +108,7 @@ class OobRedfishResult(BaseModel): ) -def build_oob_redfish_reporting_version_fields( +def build_mi3xx_reporting_version_fields( *, plugin_name: Optional[str] = None, plugin_version: Optional[str] = None, @@ -136,14 +136,14 @@ def build_oob_redfish_reporting_version_fields( } -class OobRedfishDataModel(DataModel): +class Mi3xxDataModel(DataModel): """Collected OOB Redfish serviceability data model.""" collected_data: Dict[str, Any] = Field( default_factory=dict, description="Arbitrary keyed payloads from the collector implementation.", ) - device_info: Dict[str, OobRedfishDeviceInfo] = Field( + device_info: Dict[str, Mi3xxDeviceInfo] = Field( default_factory=dict, description="Optional device identity keyed by implementer-defined labels.", ) @@ -156,7 +156,7 @@ class OobRedfishDataModel(DataModel): description="Optional host or service endpoint label (not necessarily a BMC).", ) log_path: Optional[str] = None - result: Optional[OobRedfishResult] = None + result: Optional[Mi3xxResult] = None def log_model(self, log_path: str) -> None: """Write artifact files and a JSON summary under the log directory. 
@@ -174,7 +174,7 @@ def log_model(self, log_path: str) -> None: artifact_path = os.path.join(log_path, str(filename).strip()) with open(artifact_path, "w", encoding="utf-8") as handle: json.dump(payload, handle, indent=2) - summary_path = os.path.join(log_path, "oob_redfish_data.json") + summary_path = os.path.join(log_path, "mi3xx_data.json") with open(summary_path, "w", encoding="utf-8") as handle: json.dump( self.model_dump( diff --git a/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py b/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py new file mode 100644 index 00000000..ee0c510b --- /dev/null +++ b/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py @@ -0,0 +1,44 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from nodescraper.plugins.serviceability.serviceability_data import ( + ServiceabilityDataModel, +) +from nodescraper.plugins.serviceability.serviceability_plugin_base import ( + ServiceabilityPluginBase, +) + +from .mi3xx_analyzer import Mi3xxAnalyzer +from .mi3xx_collector import Mi3xxCollector +from .mi3xx_collector_args import Mi3xxCollectorArgs + + +class ServiceabilityPluginMI3XX(ServiceabilityPluginBase): + """MI3xx OOB Redfish serviceability plugin.""" + + DATA_MODEL = ServiceabilityDataModel + COLLECTOR = Mi3xxCollector + ANALYZER = Mi3xxAnalyzer + COLLECTOR_ARGS = Mi3xxCollectorArgs diff --git a/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector.py b/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector.py deleted file mode 100644 index 503d7103..00000000 --- a/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector.py +++ /dev/null @@ -1,76 +0,0 @@ -############################################################################### -# -# MIT License -# -# Copyright (c) 2026 Advanced Micro Devices, Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -############################################################################### -from __future__ import annotations - -from typing import Any, Optional - -from nodescraper.base import RedfishDataCollector -from nodescraper.enums import ExecutionStatus -from nodescraper.models import TaskResult -from nodescraper.plugins.serviceability.time_utils import satisfies_time_check - -from .oob_redfish_collector_args import OobRedfishCollectorArgs -from .oob_redfish_data import OobRedfishDataModel - - -class OobRedfishCollector( - RedfishDataCollector[OobRedfishDataModel, OobRedfishCollectorArgs], -): - """Collect OOB Redfish serviceability data.""" - - DATA_MODEL = OobRedfishDataModel - - def __init__(self, **kwargs: Any) -> None: - self._log_path: Optional[str] = kwargs.pop("log_path", None) - super().__init__(**kwargs) - - def satisfies_reference_time( - self, - candidate: str, - args: OobRedfishCollectorArgs, - ) -> bool: - """Test a timestamp against optional reference-time filter settings. - - Args: - candidate: Timestamp string to test. - args: Collector arguments that may define reference_time and time_operator. - - Returns: - True when no filter is configured or the comparison succeeds. - """ - if args.reference_time is None or args.time_operator is None: - return True - return satisfies_time_check(candidate, args.reference_time, args.time_operator) - - def _missing_args_result(self) -> tuple[TaskResult, None]: - """Build a not-ran result when collector arguments are missing. - - Returns: - Task result with NOT_RAN status and no data model. 
- """ - self.result.status = ExecutionStatus.NOT_RAN - self.result.message = "OobRedfishCollectorArgs are required" - return self.result, None diff --git a/nodescraper/plugins/serviceability/se_adapter.py b/nodescraper/plugins/serviceability/se_adapter.py new file mode 100644 index 00000000..37b5d74c --- /dev/null +++ b/nodescraper/plugins/serviceability/se_adapter.py @@ -0,0 +1,137 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +"""Map node-scraper serviceability models to/from the AMD serviceability-engine API.""" +from __future__ import annotations + +from collections import defaultdict +from typing import Any + +from .se_models import AfidEvent, ServiceabilityBlock, ServiceabilitySolution + +_DEFAULT_SOLUTION_TIERS = ( + "primary_fru_events", + "secondary_actions", +) + + +def afid_events_to_engine_input(afid_events: list[AfidEvent]) -> list[dict[str, Any]]: + """Convert plugin AFID events to serviceability-engine wire-format dicts. + + The engine triages on (afid, location, count). Duplicate (afid, unit) pairs + are merged by summing counts. Timestamp is preserved only on the plugin side. + """ + counts: dict[tuple[str, str], int] = defaultdict(int) + for event in afid_events: + key = (str(event.afid), event.serviceable_unit) + counts[key] += 1 + return [ + {"afid": afid, "location": location, "count": count} + for (afid, location), count in sorted(counts.items()) + ] + + +def recommendations_from_report_dict( + report: dict[str, Any], + *, + solution_tiers: tuple[str, ...] 
= _DEFAULT_SOLUTION_TIERS, +) -> list[dict[str, Any]]: + """Derive grouped recommendations from an :func:`serviceability_engine.api.analyze` report.""" + if "recommendations" in report: + return list(report["recommendations"]) + + grouped: dict[tuple[int, int], list[str]] = defaultdict(list) + for tier in solution_tiers: + for row in report.get(tier, []): + if not isinstance(row, dict): + continue + afid = int(row.get("afid", 0)) + location = str(row.get("location", "")).strip() + action_num = _action_num_from_row(row) + if not location or action_num is None: + continue + key = (afid, action_num) + if location not in grouped[key]: + grouped[key].append(location) + + return [ + { + "afid": afid, + "locations": locations, + "service_action_num": action_num, + } + for (afid, action_num), locations in sorted(grouped.items()) + ] + + +def serviceability_block_from_engine( + afid_events: list[AfidEvent], + report: dict[str, Any], + *, + recommendations: list[dict[str, Any]] | None = None, +) -> ServiceabilityBlock: + """Build the ANC ``serviceability`` block from an engine analysis report.""" + recs = ( + recommendations if recommendations is not None else recommendations_from_report_dict(report) + ) + solutions = [ + ServiceabilitySolution( + afid=int(item["afid"]), + serviceable_unit=list(item["locations"]), + service_action_num=int(item["service_action_num"]), + ) + for item in recs + ] + reasoning = _build_solution_reasoning(afid_events, solutions, report) + return ServiceabilityBlock( + afid_events=list(afid_events), + solution=solutions, + solution_reasoning=reasoning, + ) + + +def _action_num_from_row(row: dict[str, Any]) -> int | None: + if "service_action_num" in row: + return int(row["service_action_num"]) + service_action = row.get("service_action") + if isinstance(service_action, dict) and "id" in service_action: + return int(service_action["id"]) + afid_entry = row.get("afid_entry") + if isinstance(afid_entry, dict) and "service_action_num" in afid_entry: + 
return int(afid_entry["service_action_num"]) + return None + + +def _build_solution_reasoning( + afid_events: list[AfidEvent], + solutions: list[ServiceabilitySolution], + report: dict[str, Any], +) -> str: + sag_pid = report.get("sag_pid") or "unknown" + sag_revision = report.get("sag_revision") or "unknown" + return ( + f"Serviceability engine (SAG {sag_pid} rev {sag_revision}): " + f"{len(solutions)} recommendation(s) from {len(afid_events)} input event(s)." + ) diff --git a/nodescraper/plugins/serviceability/se_models.py b/nodescraper/plugins/serviceability/se_models.py new file mode 100644 index 00000000..75919fc3 --- /dev/null +++ b/nodescraper/plugins/serviceability/se_models.py @@ -0,0 +1,85 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from __future__ import annotations + +from typing import List, Optional + +from pydantic import BaseModel, Field, field_validator + + +class AfidEvent(BaseModel): + """Serviceability engine input: one AFID occurrence on a serviceable unit.""" + + afid: int = Field(description="AMD Fault ID.") + serviceable_unit: str = Field( + description="Unit label (e.g. gpu02); standardized per platform.", + ) + time: str = Field( + description="First-occurrence timestamp (SE format, e.g. 2026-05-07 12:50:42.096-07:00).", + ) + + @field_validator("serviceable_unit") + @classmethod + def _strip_serviceable_unit(cls, value: str) -> str: + text = str(value).strip() + if not text: + raise ValueError("serviceable_unit must be non-empty") + return text + + +class ServiceabilitySolution(BaseModel): + """Serviceability engine output: recommended action for an AFID.""" + + afid: int + serviceable_unit: List[str] = Field( + description="Affected serviceable units for this AFID and service action.", + ) + service_action_num: int = Field( + description="Service action number from AFID_SAG.json.", + ) + + +class ServiceabilityBlock(BaseModel): + """ANC-style serviceability section: SE input, output, and optional reasoning.""" + + afid_events: List[AfidEvent] = Field( + default_factory=list, + description="Input events passed to the serviceability engine.", + ) + solution: List[ServiceabilitySolution] = Field( + default_factory=list, + description="Engine output: recommended service actions.", + ) + solution_reasoning: Optional[str] = Field( + default=None, + description="Human-readable summary of how the engine reached its conclusions.", + ) + + +class SeInputPayload(BaseModel): + """JSON written to the SE ``--input`` file.""" + + afid_events: List[AfidEvent] = Field(default_factory=list) diff --git a/nodescraper/plugins/serviceability/se_runner.py b/nodescraper/plugins/serviceability/se_runner.py new file 
mode 100644 index 00000000..df28426f --- /dev/null +++ b/nodescraper/plugins/serviceability/se_runner.py @@ -0,0 +1,269 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +"""Run the AMD serviceability engine (Python API, CLI, or custom subprocess).""" +from __future__ import annotations + +import importlib +import json +import shlex +import subprocess +import tempfile +from pathlib import Path +from typing import Literal, Optional + +from .se_adapter import afid_events_to_engine_input, serviceability_block_from_engine +from .se_models import AfidEvent, SeInputPayload, ServiceabilityBlock + +EngineBackend = Literal["python", "cli", "subprocess"] + + +class SeRunError(RuntimeError): + """Raised when the serviceability engine fails or returns invalid output.""" + + +def resolve_engine_command( + *, + engine_executable: Optional[str] = None, + engine_entry_point: Optional[str] = None, +) -> list[str]: + """Build the argv prefix for a subprocess or CLI-backed SE invocation.""" + has_exe = bool(engine_executable and str(engine_executable).strip()) + has_entry = bool(engine_entry_point and str(engine_entry_point).strip()) + if has_exe and has_entry: + raise ValueError("Provide only one of engine_executable or engine_entry_point.") + if not has_exe and not has_entry: + raise ValueError("Provide engine_executable or engine_entry_point.") + if has_exe: + return [str(engine_executable).strip()] + return shlex.split(str(engine_entry_point).strip()) + + +def run_se( + *, + engine_backend: EngineBackend = "python", + engine_python_module: str = "serviceability_engine", + engine_executable: Optional[str] = None, + engine_entry_point: Optional[str] = None, + afid_events: list[AfidEvent], + afid_sag_path: str, + extra_args: Optional[list[str]] = None, + timeout_seconds: int = 600, + work_dir: Optional[str] = None, +) -> ServiceabilityBlock: + """Run the SE and return a :class:`ServiceabilityBlock`.""" + sag_path = Path(afid_sag_path) + if not sag_path.is_file(): + raise SeRunError(f"AFID_SAG file not found: {afid_sag_path}") + + if engine_backend == "python": + 
return _run_se_python( + engine_python_module=engine_python_module, + afid_events=afid_events, + afid_sag_path=str(sag_path), + ) + if engine_backend == "cli": + return _run_se_cli( + engine_executable=engine_executable, + engine_entry_point=engine_entry_point, + afid_events=afid_events, + afid_sag_path=str(sag_path), + extra_args=extra_args, + timeout_seconds=timeout_seconds, + work_dir=work_dir, + ) + return _run_se_subprocess( + engine_executable=engine_executable, + engine_entry_point=engine_entry_point, + afid_events=afid_events, + afid_sag_path=str(sag_path), + extra_args=extra_args, + timeout_seconds=timeout_seconds, + work_dir=work_dir, + ) + + +def _run_se_python( + *, + engine_python_module: str, + afid_events: list[AfidEvent], + afid_sag_path: str, +) -> ServiceabilityBlock: + try: + se = importlib.import_module(engine_python_module) + SagDocument = se.SagDocument + ServiceabilityEngine = se.ServiceabilityEngine + EventRecord = se.EventRecord + except (ImportError, AttributeError) as exc: + raise SeRunError( + f"Cannot import {engine_python_module} bindings — install serviceability-engine " + f"and build the Python extension (uv build)." 
+ ) from exc + + wire_events = afid_events_to_engine_input(afid_events) + try: + sag = SagDocument.from_file(afid_sag_path) + records = [ + EventRecord( + afid=str(item["afid"]), + location=str(item["location"]), + count=int(item["count"]), + ) + for item in wire_events + ] + analysis = ServiceabilityEngine(sag).analyze(records) + report = analysis.to_dict() + except Exception as exc: + raise SeRunError(f"Serviceability engine analyze() failed: {exc}") from exc + + return serviceability_block_from_engine(afid_events, report) + + +def _run_se_cli( + *, + engine_executable: Optional[str], + engine_entry_point: Optional[str], + afid_events: list[AfidEvent], + afid_sag_path: str, + extra_args: Optional[list[str]], + timeout_seconds: int, + work_dir: Optional[str], +) -> ServiceabilityBlock: + """Invoke an external engine CLI ``analyze --sag … --input …`` and map stdout JSON.""" + command = resolve_engine_command( + engine_executable=engine_executable, + engine_entry_point=engine_entry_point, + ) + wire_events = afid_events_to_engine_input(afid_events) + + with tempfile.TemporaryDirectory(prefix="nodescraper_se_cli_", dir=work_dir) as tmp: + input_path = Path(tmp) / "events.json" + input_path.write_text(json.dumps(wire_events, indent=2), encoding="utf-8") + argv = [ + *command, + "analyze", + "--sag", + afid_sag_path, + "--input", + str(input_path), + ] + if extra_args: + argv.extend(extra_args) + completed = _run_subprocess(argv, timeout_seconds=timeout_seconds) + + try: + report = json.loads(completed.stdout or "{}") + except json.JSONDecodeError as exc: + raise SeRunError(f"Invalid JSON from serviceability engine CLI: {exc}") from exc + + from .se_adapter import recommendations_from_report_dict + + return serviceability_block_from_engine( + afid_events, + report, + recommendations=recommendations_from_report_dict(report), + ) + + +def _run_se_subprocess( + *, + engine_executable: Optional[str], + engine_entry_point: Optional[str], + afid_events: list[AfidEvent], + 
afid_sag_path: str, + extra_args: Optional[list[str]], + timeout_seconds: int, + work_dir: Optional[str], +) -> ServiceabilityBlock: + """Custom subprocess protocol: ``--input`` / ``--output`` / ``--afid-sag``.""" + command = resolve_engine_command( + engine_executable=engine_executable, + engine_entry_point=engine_entry_point, + ) + payload = SeInputPayload(afid_events=afid_events) + + with tempfile.TemporaryDirectory(prefix="nodescraper_se_", dir=work_dir) as tmp: + tmp_path = Path(tmp) + input_path = tmp_path / "se_input.json" + output_path = tmp_path / "se_output.json" + input_path.write_text( + json.dumps(payload.model_dump(mode="json"), indent=2), + encoding="utf-8", + ) + argv = [ + *command, + "--input", + str(input_path), + "--output", + str(output_path), + "--afid-sag", + str(Path(afid_sag_path).resolve()), + ] + if extra_args: + argv.extend(extra_args) + _run_subprocess(argv, timeout_seconds=timeout_seconds) + + if not output_path.is_file(): + raise SeRunError(f"Serviceability engine did not write output file: {output_path}") + try: + raw = json.loads(output_path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + raise SeRunError(f"Invalid JSON from serviceability engine: {exc}") from exc + + block = ServiceabilityBlock.model_validate(raw) + if not block.afid_events: + block.afid_events = list(afid_events) + return block + + +def _run_subprocess(argv: list[str], *, timeout_seconds: int) -> subprocess.CompletedProcess: + exe = Path(argv[0]) + if not exe.is_file() and not _command_on_path(argv[0]): + raise SeRunError(f"Serviceability engine not found or not executable: {argv[0]!r}") + try: + completed = subprocess.run( + argv, + capture_output=True, + text=True, + timeout=timeout_seconds, + check=False, + ) + except subprocess.TimeoutExpired as exc: + raise SeRunError(f"Serviceability engine timed out after {timeout_seconds}s") from exc + except OSError as exc: + raise SeRunError(f"Failed to start serviceability engine: {exc}") from exc + 
+ if completed.returncode != 0: + stderr = (completed.stderr or "").strip() + stdout = (completed.stdout or "").strip() + detail = stderr or stdout or f"exit code {completed.returncode}" + raise SeRunError(f"Serviceability engine failed: {detail}") + return completed + + +def _command_on_path(name: str) -> bool: + from shutil import which + + return which(name) is not None diff --git a/nodescraper/plugins/serviceability/serviceability_collector.py b/nodescraper/plugins/serviceability/serviceability_collector.py new file mode 100644 index 00000000..961afdf9 --- /dev/null +++ b/nodescraper/plugins/serviceability/serviceability_collector.py @@ -0,0 +1,197 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from __future__ import annotations + +import abc +from typing import Any, Generic, Optional, Protocol, TypeVar, cast +from urllib.parse import urlparse + +from nodescraper.base import RedfishDataCollector +from nodescraper.connection.redfish import RF_MEMBERS, RF_MEMBERS_COUNT +from nodescraper.enums import ExecutionStatus +from nodescraper.models import CollectorArgs, TaskResult + +from .serviceability_data import DeviceInfo, ServiceabilityDataModel + + +class _ServiceabilityCollectArg(Protocol): + follow_next_link: bool + max_pages: int + top: Optional[int] + rf_assembly_uri_template: Optional[str] + rf_chassis_devices: Optional[list[str]] + rf_firmware_bundle_uri: Optional[str] + + def resolved_event_log_uri(self) -> str: ... + + +TServiceabilityCollectArg = TypeVar("TServiceabilityCollectArg", bound=_ServiceabilityCollectArg) + + +class ServiceabilityCollectorBase( + RedfishDataCollector[ServiceabilityDataModel, CollectorArgs], + Generic[TServiceabilityCollectArg], +): + """OOB Redfish collection skeleton; subclasses implement filtering, CPER handling, and JSON parsing.""" + + DATA_MODEL = ServiceabilityDataModel + + def __init__(self, **kwargs: Any) -> None: + self._log_path: Optional[str] = kwargs.get("log_path") + super().__init__(**kwargs) + + @abc.abstractmethod + def filter_event_members( + self, + members: list[Any], + args: TServiceabilityCollectArg, + ) -> list[Any]: + """Return the event list to retain for downstream analysis.""" + + @abc.abstractmethod + def is_cper_event(self, event: dict) -> bool: + """Return whether a Redfish event entry should be treated as diagnostic-backed.""" + + @abc.abstractmethod + def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: + """Fetch and decode diagnostic attachments for qualifying events (subclass-defined).""" + + @abc.abstractmethod + def parse_assembly_entry( + self, + designation: str, + assembly_member_entry: 
dict[str, Any], + args: TServiceabilityCollectArg, + ) -> DeviceInfo: + """Map one Assemblies[] member dict into DeviceInfo.""" + + @abc.abstractmethod + def extract_component_details( + self, + firmware_inventory_payload: dict[str, Any], + args: TServiceabilityCollectArg, + ) -> Optional[str]: + """Derive component-details text from a firmware inventory GET payload, or None.""" + + def _fetch_event_log(self, args: TServiceabilityCollectArg, uri: str): + if args.follow_next_link: + return self._run_redfish_get_paged(uri, max_pages=args.max_pages) + return self._run_redfish_get(uri, log_artifact=True) + + def collect_data( + self, args: Optional[CollectorArgs] = None + ) -> tuple[TaskResult, Optional[ServiceabilityDataModel]]: + if args is None: + self.result.status = ExecutionStatus.NOT_RAN + self.result.message = "Collector args are required" + return self.result, None + + svc_args = cast(TServiceabilityCollectArg, args) + event_uri = svc_args.resolved_event_log_uri() + if svc_args.top is not None: + res = self._fetch_top(svc_args, svc_args.top, svc_args.max_pages) + else: + res = self._fetch_event_log(svc_args, event_uri) + + if not res.success or res.data is None: + self.result.status = ExecutionStatus.ERROR + self.result.message = f"Redfish GET failed for {event_uri}: {res.error}" + return self.result, None + + members = res.data.get(RF_MEMBERS, []) + responses = {res.path: res.data} + raw_base_url = getattr(self.connection, "base_url", None) + bmc_host = urlparse(raw_base_url).hostname if raw_base_url else None + + try: + filtered_members = self.filter_event_members(members, svc_args) + except ValueError as exc: + self.result.status = ExecutionStatus.ERROR + self.result.message = f"Event filter failed: {exc}" + return self.result, None + + assembly_info: dict[str, DeviceInfo] = {} + tpl = svc_args.rf_assembly_uri_template + devices = svc_args.rf_chassis_devices + if tpl and devices: + for device in devices: + uri_asm = tpl.format(device=device) + assembly_res 
= self._run_redfish_get(uri_asm, log_artifact=True) + if not assembly_res.success or assembly_res.data is None: + continue + responses[assembly_res.path] = assembly_res.data + + assemblies = assembly_res.data.get("Assemblies", []) + if not assemblies: + continue + + entry = assemblies[0] + assembly_info[device] = self.parse_assembly_entry(device, entry, svc_args) + + cper_data = self.collect_cper_data(filtered_members or []) + + data = ServiceabilityDataModel( + responses=responses, + rf_events=filtered_members or [], + assembly_info=assembly_info, + cper_data=cper_data, + component_details=self._fetch_component_details(responses, svc_args), + log_path=self._log_path, + bmc_host=bmc_host, + ) + self.result.status = ExecutionStatus.OK + self.result.message = f"Collected {len(members)} event log member(s)" + return self.result, data + + def _fetch_component_details( + self, responses: dict[str, Any], args: TServiceabilityCollectArg + ) -> Optional[str]: + fw_uri = args.rf_firmware_bundle_uri + if not fw_uri or not str(fw_uri).strip(): + return None + fw_uri = str(fw_uri).strip() + fw_res = self._run_redfish_get(fw_uri, log_artifact=True) + if not fw_res.success or fw_res.data is None: + return None + responses[fw_res.path] = fw_res.data + return self.extract_component_details(fw_res.data, args) + + def _fetch_top(self, args: TServiceabilityCollectArg, top: int, max_pages: int): + event_uri = args.resolved_event_log_uri() + probe = self._run_redfish_get(f"{event_uri}?$top=1", log_artifact=True) + if not probe.success or probe.data is None: + return probe + + count = probe.data.get(RF_MEMBERS_COUNT, 0) + + if count <= top: + return self._fetch_event_log(args, event_uri) + + skip = count - top + skip_uri = f"{event_uri}?$skip={skip}" + if args.follow_next_link: + return self._run_redfish_get_paged(skip_uri, max_pages=max_pages) + return self._run_redfish_get(skip_uri, log_artifact=True) diff --git a/nodescraper/plugins/serviceability/serviceability_data.py 
b/nodescraper/plugins/serviceability/serviceability_data.py new file mode 100644 index 00000000..68a7daea --- /dev/null +++ b/nodescraper/plugins/serviceability/serviceability_data.py @@ -0,0 +1,100 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from __future__ import annotations + +import json +import os +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel, Field + +from nodescraper.models import DataModel + +from .se_models import AfidEvent, ServiceabilityBlock + + +class DeviceInfo(BaseModel): + """Chassis fields from Assembly parsing; extra vendor keys belong in oem_extensions.""" + + name: Optional[str] = None + part_number: Optional[str] = None + production_date: Optional[str] = None + serial_number: Optional[str] = None + version: Optional[str] = None + oem_extensions: Dict[str, Any] = Field( + default_factory=dict, + description="Opaque vendor/product extensions parsed by the concrete collector.", + ) + + +class ServiceabilityResult(BaseModel): + """Structured serviceability output (typically populated by a downstream analyzer).""" + + node: Optional[str] = None + service_recommendations: Dict[str, List[dict]] = {} + service_action_definitions: Dict[str, dict] = {} + afid_sag_metadata: Dict[str, Any] = {} + node_info: Dict[str, Any] = {} + + +class ServiceabilityDataModel(DataModel): + """Collected Redfish responses and intermediate serviceability fields.""" + + responses: dict[str, Any] = {} + rf_events: list[Any] = [] + assembly_info: Dict[str, DeviceInfo] = {} + cper_data: Dict[str, Any] = {} + component_details: Optional[str] = None + log_path: Optional[str] = None + bmc_host: Optional[str] = None + afid_events: List[AfidEvent] = Field( + default_factory=list, + description="Serviceability engine input; built during analysis when not pre-filled.", + ) + serviceability: Optional[ServiceabilityBlock] = Field( + default=None, + description="ANC-style serviceability block (SE input + output).", + ) + result: Optional[ServiceabilityResult] = None + + def log_model(self, log_path: str) -> None: + """Write collector artifacts and optional serviceability.json under log_path.""" + 
os.makedirs(log_path, exist_ok=True) + responses_path = os.path.join(log_path, "redfish_responses.json") + with open(responses_path, "w", encoding="utf-8") as f: + json.dump(self.responses, f, indent=2) + if self.cper_data: + cper_path = os.path.join(log_path, "cper_data.json") + with open(cper_path, "w", encoding="utf-8") as f: + json.dump(self.cper_data, f, indent=2) + if self.serviceability is not None: + serviceability_path = os.path.join(log_path, "serviceability.json") + with open(serviceability_path, "w", encoding="utf-8") as f: + json.dump( + self.serviceability.model_dump(mode="json"), + f, + indent=2, + ) diff --git a/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_plugin.py b/nodescraper/plugins/serviceability/serviceability_plugin_base.py similarity index 69% rename from nodescraper/plugins/serviceability/oob_redfish/oob_redfish_plugin.py rename to nodescraper/plugins/serviceability/serviceability_plugin_base.py index b891c522..991a2f99 100644 --- a/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_plugin.py +++ b/nodescraper/plugins/serviceability/serviceability_plugin_base.py @@ -24,17 +24,22 @@ # ############################################################################### from nodescraper.base import OOBandDataPlugin +from nodescraper.models import CollectorArgs -from .oob_redfish_collector import OobRedfishCollector -from .oob_redfish_collector_args import OobRedfishCollectorArgs -from .oob_redfish_data import OobRedfishDataModel +from .analyzer_args import ServiceabilityAnalyzerArgs +from .serviceability_collector import ServiceabilityCollectorBase +from .serviceability_data import ServiceabilityDataModel -class OobRedfishPlugin( - OOBandDataPlugin[OobRedfishDataModel, OobRedfishCollectorArgs, None], +class ServiceabilityPluginBase( + OOBandDataPlugin[ + ServiceabilityDataModel, + CollectorArgs, + ServiceabilityAnalyzerArgs, + ], ): - """OOB Redfish serviceability plugin base.""" + """OOB Redfish plugin stub; subclass with a 
concrete COLLECTOR and COLLECTOR_ARGS.""" - DATA_MODEL = OobRedfishDataModel - COLLECTOR = OobRedfishCollector - COLLECTOR_ARGS = OobRedfishCollectorArgs + DATA_MODEL = ServiceabilityDataModel + COLLECTOR = ServiceabilityCollectorBase + ANALYZER_ARGS = ServiceabilityAnalyzerArgs diff --git a/nodescraper/plugins/serviceability/time_utils.py b/nodescraper/plugins/serviceability/time_utils.py index 8bbc8a83..5653f4a9 100644 --- a/nodescraper/plugins/serviceability/time_utils.py +++ b/nodescraper/plugins/serviceability/time_utils.py @@ -49,11 +49,33 @@ def is_valid_iso_datetime(value: str) -> bool: return True +def normalize_se_timestamp(value: str) -> str: + """Normalize a timestamp to the serviceability engine wire format. + + Accepts ISO-8601 (``2026-05-07T12:50:42``) and SE-style strings with a space + separator (``2026-05-07 12:50:42.096-07:00``). + """ + text = str(value).strip() + if not text: + raise ValueError("Empty datetime string") + if " " in text and "T" not in text: + return text + parsed = parse_iso_datetime(text) + micro = parsed.microsecond + base = parsed.strftime("%Y-%m-%d %H:%M:%S") + if micro: + base = f"{base}.{micro:06d}".rstrip("0").rstrip(".") + offset = parsed.strftime("%z") + if offset: + return f"{base}{offset[:3]}:{offset[3:]}" + return base + + def parse_iso_datetime(value: str) -> datetime: - """Parse an ISO-8601 date or date-time string. + """Parse an ISO-8601 or SE-style date-time string. Args: - value: Date (e.g. 2026-05-17) or date-time (e.g. 2026-05-17T13:01:00). + value: Date (e.g. 2026-05-17), ISO date-time, or SE format with a space separator. Returns: Parsed datetime. 
@@ -63,11 +85,13 @@ def parse_iso_datetime(value: str) -> datetime: raise ValueError("Empty datetime string") if text.endswith("Z"): text = f"{text[:-1]}+00:00" + if " " in text and "T" not in text: + text = text.replace(" ", "T", 1) try: parsed = datetime.fromisoformat(text) except ValueError as exc: raise ValueError(f"Not ISO-8601 compliant: {value!r}") from exc - if "T" not in text and "+" not in text and text.count("-") == 2: + if "T" not in value and "+" not in value and value.count("-") == 2: return parsed.replace(hour=0, minute=0, second=0, microsecond=0) return parsed diff --git a/test/unit/plugin/test_oob_redfish_collector.py b/test/unit/plugin/test_oob_redfish_collector.py deleted file mode 100644 index e729cedc..00000000 --- a/test/unit/plugin/test_oob_redfish_collector.py +++ /dev/null @@ -1,181 +0,0 @@ -############################################################################### -# -# MIT License -# -# Copyright (c) 2026 Advanced Micro Devices, Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -############################################################################### -from typing import Optional - -import pytest -from pydantic import ValidationError - -from nodescraper.base import OOBandDataPlugin -from nodescraper.connection.redfish import RedfishConnectionManager -from nodescraper.enums import ExecutionStatus -from nodescraper.plugins.serviceability import ( - OobRedfishCollector, - OobRedfishCollectorArgs, - OobRedfishDataModel, - OobRedfishDeviceInfo, - OobRedfishPlugin, - OobRedfishResult, - build_oob_redfish_reporting_version_fields, - compare_iso_datetime, - is_valid_iso_datetime, - satisfies_time_check, -) - -EVENT_URI = "/redfish/v1/Systems/1/LogServices/SEL/Entries" - - -class _StubOobRedfishCollector(OobRedfishCollector): - def collect_data(self, args: Optional[OobRedfishCollectorArgs] = None): - if args is None: - return self._missing_args_result() - data = OobRedfishDataModel( - collected_data={"events": []}, - log_path=self._log_path, - ) - self.result.status = ExecutionStatus.OK - self.result.message = "stub collection complete" - return self.result, data - - -@pytest.fixture -def stub_oob_redfish_collector(system_info, redfish_conn_mock): - return _StubOobRedfishCollector( - system_info=system_info, - connection=redfish_conn_mock, - log_path="/tmp/oob_redfish.log", - ) - - -def test_oob_redfish_collector_args_requires_event_log_uri(): - with pytest.raises(ValidationError): - OobRedfishCollectorArgs() - - -def test_oob_redfish_collector_args_uri_alias(): - args = OobRedfishCollectorArgs(uri=" /events ", rf_event_log_uri="/other") - assert args.resolved_event_log_uri() == "/events" - - -def 
test_oob_redfish_collector_args_assembly_requires_both_template_and_devices(): - with pytest.raises(ValidationError): - OobRedfishCollectorArgs( - rf_event_log_uri=EVENT_URI, - rf_assembly_uri_template="/redfish/v1/Chassis/{device}/Assembly", - ) - with pytest.raises(ValidationError): - OobRedfishCollectorArgs( - rf_event_log_uri=EVENT_URI, - rf_chassis_devices=["C1"], - ) - - -def test_oob_redfish_collector_args_reference_time_requires_operator(): - with pytest.raises(ValidationError): - OobRedfishCollectorArgs( - rf_event_log_uri=EVENT_URI, - reference_time="2026-05-17", - ) - - -def test_oob_redfish_collector_args_accepts_iso_date_and_datetime(): - date_args = OobRedfishCollectorArgs( - rf_event_log_uri=EVENT_URI, - reference_time="2026-05-17", - time_operator=">=", - ) - assert date_args.reference_time == "2026-05-17" - - -def test_time_utils_iso_validation_and_comparison(): - assert is_valid_iso_datetime("2026-05-17") - assert satisfies_time_check("2026-05-18", "2026-05-17", ">") - assert compare_iso_datetime("2026-05-17T13:01:00", "2026-05-17T13:01:00", "==") - - -def test_oob_redfish_plugin_wiring(): - assert issubclass(OobRedfishPlugin, OOBandDataPlugin) - assert OobRedfishPlugin.DATA_MODEL is OobRedfishDataModel - assert OobRedfishPlugin.COLLECTOR is OobRedfishCollector - assert OobRedfishPlugin.COLLECTOR_ARGS is OobRedfishCollectorArgs - assert OobRedfishPlugin.CONNECTION_TYPE is RedfishConnectionManager - assert OobRedfishPlugin.ANALYZER is None - - -def test_stub_collector_no_args(stub_oob_redfish_collector): - result, data = stub_oob_redfish_collector.collect_data() - assert result.status == ExecutionStatus.NOT_RAN - assert "required" in result.message.lower() - assert data is None - - -def test_stub_collector_success_minimal(stub_oob_redfish_collector): - args = OobRedfishCollectorArgs(rf_event_log_uri=EVENT_URI) - result, data = stub_oob_redfish_collector.collect_data(args=args) - assert result.status == ExecutionStatus.OK - assert data is not None - 
assert data.collected_data == {"events": []} - - -def test_collector_satisfies_reference_time_helper(stub_oob_redfish_collector): - args = OobRedfishCollectorArgs( - rf_event_log_uri=EVENT_URI, - reference_time="2026-05-17", - time_operator=">=", - ) - assert stub_oob_redfish_collector.satisfies_reference_time("2026-05-18", args) - assert not stub_oob_redfish_collector.satisfies_reference_time("2026-05-16", args) - - -def test_oob_redfish_device_info_fields(): - info = OobRedfishDeviceInfo( - board_product_name="Board-A", - board_serial_number="BSN-1", - product_version="1.0", - ) - assert info.board_product_name == "Board-A" - assert info.product_version == "1.0" - - -def test_oob_redfish_result_reporting_versions(): - version_fields = build_oob_redfish_reporting_version_fields( - plugin_name="example_oob_redfish", - plugin_version="0.1.0", - node_scraper_version="1.2.3", - isa_version="9.8.7", - ) - result = OobRedfishResult(node="node-1", **version_fields) - assert result.plugin_name == "example_oob_redfish" - assert result.reporter_extensions["isa_version"] == "9.8.7" - - -def test_oob_redfish_data_model_log_model(tmp_path): - model = OobRedfishDataModel( - collected_data={"events": [{"id": 1}]}, - artifacts={"events.json": [{"id": 1}]}, - ) - model.log_model(str(tmp_path)) - assert (tmp_path / "events.json").is_file() - assert (tmp_path / "oob_redfish_data.json").is_file() diff --git a/test/unit/plugin/test_serviceability_collector.py b/test/unit/plugin/test_serviceability_collector.py new file mode 100644 index 00000000..d7496288 --- /dev/null +++ b/test/unit/plugin/test_serviceability_collector.py @@ -0,0 +1,329 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import json +from typing import Any, Optional + +import pytest +from pydantic import ValidationError + +from nodescraper.connection.redfish import ( + RF_MEMBERS, + RF_MEMBERS_COUNT, + RedfishGetResult, +) +from nodescraper.enums import ExecutionStatus +from nodescraper.models import CollectorArgs +from nodescraper.plugins.serviceability import ( + DeviceInfo, + Mi3xxCollectorArgs, + ServiceabilityAnalyzerArgs, + ServiceabilityDataModel, + ServiceabilityPluginBase, +) +from nodescraper.plugins.serviceability.serviceability_collector import ( + ServiceabilityCollectorBase, +) + +EVENT_URI = "/redfish/v1/Systems/1/LogServices/SEL/Entries" + + +class _StubServiceabilityCollector(ServiceabilityCollectorBase[Mi3xxCollectorArgs]): + def filter_event_members( + self, + members: list[Any], + args: Mi3xxCollectorArgs, + ) -> list[Any]: + return members + + def is_cper_event(self, event: dict) -> bool: + return False + + def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: + return {} + + def parse_assembly_entry( + self, + designation: str, + assembly_member_entry: dict[str, Any], + args: Mi3xxCollectorArgs, + ) -> DeviceInfo: + return DeviceInfo(name=designation, serial_number=assembly_member_entry.get("SerialNumber")) + + def extract_component_details( + self, + firmware_inventory_payload: dict[str, Any], + args: Mi3xxCollectorArgs, + ) -> Optional[str]: + return firmware_inventory_payload.get("Details") + + +@pytest.fixture +def stub_serviceability_collector(system_info, redfish_conn_mock): + redfish_conn_mock.base_url = "https://bmc.example/redfish/v1" + return _StubServiceabilityCollector( + system_info=system_info, + connection=redfish_conn_mock, + log_path="/tmp/serviceability.log", + ) + + +def test_mi3xx_collector_args_requires_event_log_uri(): + with pytest.raises(ValidationError): + Mi3xxCollectorArgs() + + +def 
test_mi3xx_collector_args_uri_alias_prefers_uri_over_rf_event_log_uri(): + args = Mi3xxCollectorArgs(uri=" /events ", rf_event_log_uri="/other") + assert args.resolved_event_log_uri() == "/events" + + +def test_mi3xx_collector_args_assembly_requires_both_template_and_devices(): + with pytest.raises(ValidationError): + Mi3xxCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_assembly_uri_template="/redfish/v1/Chassis/{device}/Assembly", + ) + with pytest.raises(ValidationError): + Mi3xxCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_chassis_devices=["C1"], + ) + + +def test_mi3xx_collector_args_assembly_template_must_include_device_placeholder(): + with pytest.raises(ValidationError): + Mi3xxCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_assembly_uri_template="/redfish/v1/Chassis/C1/Assembly", + rf_chassis_devices=["C1"], + ) + + +def test_mi3xx_collector_args_assembly_optional_when_omitted(): + args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI) + assert args.rf_assembly_uri_template is None + assert args.rf_chassis_devices is None + + +def test_serviceability_plugin_base_wiring(): + assert ServiceabilityPluginBase.DATA_MODEL is ServiceabilityDataModel + assert ServiceabilityPluginBase.COLLECTOR is ServiceabilityCollectorBase + assert getattr(ServiceabilityPluginBase, "COLLECTOR_ARGS", CollectorArgs) is CollectorArgs + assert ServiceabilityPluginBase.ANALYZER_ARGS is ServiceabilityAnalyzerArgs + assert ServiceabilityPluginBase.ANALYZER is None + + +def test_stub_collector_no_args(stub_serviceability_collector): + result, data = stub_serviceability_collector.collect_data() + assert result.status == ExecutionStatus.NOT_RAN + assert "required" in result.message.lower() + assert data is None + + +def test_stub_collector_event_log_get_fails(stub_serviceability_collector, redfish_conn_mock): + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=False, + error="timeout", + status_code=None, + ) + args = 
Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.ERROR + assert EVENT_URI in result.message + assert data is None + + +def test_stub_collector_success_minimal(stub_serviceability_collector, redfish_conn_mock): + members = [{"Id": "1"}] + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=True, + data={RF_MEMBERS: members}, + status_code=200, + ) + args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.rf_events == members + assert EVENT_URI in data.responses + assert data.bmc_host == "bmc.example" + assert data.log_path == "/tmp/serviceability.log" + redfish_conn_mock.run_get_paged.assert_called_once() + + +def test_stub_collector_filter_raises_maps_to_error( + stub_serviceability_collector, redfish_conn_mock +): + class _BadFilter(_StubServiceabilityCollector): + def filter_event_members(self, members, args): + raise ValueError("bad filter") + + collector = _BadFilter( + system_info=stub_serviceability_collector.system_info, + connection=redfish_conn_mock, + ) + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=True, + data={RF_MEMBERS: []}, + status_code=200, + ) + args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = collector.collect_data(args=args) + assert result.status == ExecutionStatus.ERROR + assert "Event filter failed" in result.message + assert data is None + + +def test_stub_collector_assembly_and_firmware_paths( + stub_serviceability_collector, redfish_conn_mock +): + tpl = "/redfish/v1/Chassis/{device}/Assembly" + asm_uri = tpl.format(device="C1") + fw_uri = "/redfish/v1/UpdateService/FirmwareInventory" + + def run_get_side_effect(path: str, *_args, **_kwargs): + if path == EVENT_URI: + 
return RedfishGetResult( + path=EVENT_URI, + success=True, + data={RF_MEMBERS: []}, + status_code=200, + ) + if path == asm_uri: + return RedfishGetResult( + path=asm_uri, + success=True, + data={"Assemblies": [{"SerialNumber": "SN-ASM"}]}, + status_code=200, + ) + if path == fw_uri: + return RedfishGetResult( + path=fw_uri, + success=True, + data={"Details": "fw-summary"}, + status_code=200, + ) + raise AssertionError(f"unexpected Redfish GET path: {path!r}") + + redfish_conn_mock.run_get.side_effect = run_get_side_effect + + def run_get_paged_forbidden(*_args, **_kwargs): + raise AssertionError("run_get_paged must not run when follow_next_link=False") + + redfish_conn_mock.run_get_paged.side_effect = run_get_paged_forbidden + + args = Mi3xxCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_assembly_uri_template=tpl, + rf_chassis_devices=["C1"], + rf_firmware_bundle_uri=fw_uri, + follow_next_link=False, + ) + result, data = stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert "C1" in data.assembly_info + assert data.assembly_info["C1"].serial_number == "SN-ASM" + assert data.component_details == "fw-summary" + assert asm_uri in data.responses + + +def test_stub_collector_top_when_count_exceeds_top_uses_skip_and_paged( + stub_serviceability_collector, redfish_conn_mock +): + probe = RedfishGetResult( + path=f"{EVENT_URI}?$top=1", + success=True, + data={RF_MEMBERS_COUNT: 100}, + status_code=200, + ) + window = RedfishGetResult( + path=f"{EVENT_URI}?$skip=90", + success=True, + data={RF_MEMBERS: [{"Id": "last"}]}, + status_code=200, + ) + redfish_conn_mock.run_get.return_value = probe + redfish_conn_mock.run_get_paged.return_value = window + args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI, top=10) + result, data = stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.rf_events == [{"Id": "last"}] + 
redfish_conn_mock.run_get.assert_called_once() + assert "?$top=1" in redfish_conn_mock.run_get.call_args[0][0] + redfish_conn_mock.run_get_paged.assert_called_once_with( + f"{EVENT_URI}?$skip=90", max_pages=args.max_pages + ) + + +def test_stub_collector_top_when_count_within_top_fetches_full_log( + stub_serviceability_collector, redfish_conn_mock +): + probe = RedfishGetResult( + path=f"{EVENT_URI}?$top=1", + success=True, + data={RF_MEMBERS_COUNT: 3}, + status_code=200, + ) + full = RedfishGetResult( + path=EVENT_URI, + success=True, + data={RF_MEMBERS: [{"Id": "a"}, {"Id": "b"}]}, + status_code=200, + ) + redfish_conn_mock.run_get.return_value = probe + redfish_conn_mock.run_get_paged.return_value = full + args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI, top=50) + result, data = stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert len(data.rf_events) == 2 + redfish_conn_mock.run_get_paged.assert_called_once_with(EVENT_URI, max_pages=args.max_pages) + + +def test_serviceability_data_model_log_model_writes_json(tmp_path): + model = ServiceabilityDataModel( + responses={"/x": {"ok": True}}, + cper_data={"slot": {"raw": "data"}}, + ) + model.log_model(str(tmp_path)) + responses_file = tmp_path / "redfish_responses.json" + cper_file = tmp_path / "cper_data.json" + assert responses_file.is_file() + assert cper_file.is_file() + assert json.loads(responses_file.read_text(encoding="utf-8")) == {"/x": {"ok": True}} + assert json.loads(cper_file.read_text(encoding="utf-8")) == {"slot": {"raw": "data"}} + + +def test_serviceability_data_model_log_model_skips_cper_when_empty(tmp_path): + model = ServiceabilityDataModel(responses={}) + model.log_model(str(tmp_path)) + assert (tmp_path / "redfish_responses.json").is_file() + assert not (tmp_path / "cper_data.json").exists() From af9a02d23c32a8fe2d0990aec30229dd7b6eebac Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 26 May 2026 
14:32:47 -0500 Subject: [PATCH 05/19] rename --- .../plugins/serviceability/analyzer_args.py | 8 +++--- .../serviceability/mi3xx/mi3xx_analyzer.py | 5 ++-- .../plugins/serviceability/se_adapter.py | 8 +++--- .../plugins/serviceability/se_models.py | 6 ++-- .../plugins/serviceability/se_runner.py | 28 +++++++++---------- .../serviceability/serviceability_data.py | 2 +- .../plugins/serviceability/time_utils.py | 2 +- 7 files changed, 29 insertions(+), 30 deletions(-) diff --git a/nodescraper/plugins/serviceability/analyzer_args.py b/nodescraper/plugins/serviceability/analyzer_args.py index c20366db..d9fa09bb 100644 --- a/nodescraper/plugins/serviceability/analyzer_args.py +++ b/nodescraper/plugins/serviceability/analyzer_args.py @@ -40,13 +40,13 @@ class ServiceabilityAnalyzerArgs(AnalyzerArgs): engine_backend: EngineBackend = Field( default="python", description=( - "How to invoke the SE: 'python' (serviceability_engine bindings), " + "How to invoke the SE: 'python' (service_hub bindings), " "'cli' (external analyze subcommand), or 'subprocess' (--input/--output protocol)." 
), ) engine_python_module: str = Field( - default="serviceability_engine", - description="Python package providing ServiceabilityEngine bindings (python backend).", + default="service_hub", + description="Python package providing ServiceHub bindings (python backend).", ) engine_executable: Optional[str] = Field( default=None, @@ -91,7 +91,7 @@ def _require_engine_config_when_running(self) -> ServiceabilityAnalyzerArgs: if self.skip_engine: return self if not self.afid_sag_path: - raise ValueError("afid_sag_path is required when running the serviceability engine.") + raise ValueError("afid_sag_path is required when running Service Hub.") if self.engine_backend == "python": return self has_exe = self.engine_executable is not None diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py index cd67bb58..d74f297a 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py @@ -40,7 +40,7 @@ class Mi3xxAnalyzer(DataAnalyzer[ServiceabilityDataModel, ServiceabilityAnalyzerArgs]): - """Build AFID events from collected data and run the serviceability engine.""" + """Build AFID events from collected data and run Service Hub.""" DATA_MODEL = ServiceabilityDataModel @@ -82,7 +82,6 @@ def analyze_data( data.serviceability = block self.result.status = ExecutionStatus.OK self.result.message = ( - f"Serviceability engine: {len(block.solution)} solution(s) " - f"from {len(events)} event(s)" + f"Service Hub: {len(block.solution)} solution(s) " f"from {len(events)} event(s)" ) return self.result diff --git a/nodescraper/plugins/serviceability/se_adapter.py b/nodescraper/plugins/serviceability/se_adapter.py index 37b5d74c..4b4c7a2e 100644 --- a/nodescraper/plugins/serviceability/se_adapter.py +++ b/nodescraper/plugins/serviceability/se_adapter.py @@ -23,7 +23,7 @@ # SOFTWARE. 
# ############################################################################### -"""Map node-scraper serviceability models to/from the AMD serviceability-engine API.""" +"""Map node-scraper serviceability models to/from the AMD Service Hub API.""" from __future__ import annotations from collections import defaultdict @@ -38,7 +38,7 @@ def afid_events_to_engine_input(afid_events: list[AfidEvent]) -> list[dict[str, Any]]: - """Convert plugin AFID events to serviceability-engine wire-format dicts. + """Convert plugin AFID events to Service Hub wire-format dicts. The engine triages on (afid, location, count). Duplicate (afid, unit) pairs are merged by summing counts. Timestamp is preserved only on the plugin side. @@ -58,7 +58,7 @@ def recommendations_from_report_dict( *, solution_tiers: tuple[str, ...] = _DEFAULT_SOLUTION_TIERS, ) -> list[dict[str, Any]]: - """Derive grouped recommendations from an :func:`serviceability_engine.api.analyze` report.""" + """Derive grouped recommendations from an :func:`service_hub.api.analyze` report.""" if "recommendations" in report: return list(report["recommendations"]) @@ -132,6 +132,6 @@ def _build_solution_reasoning( sag_pid = report.get("sag_pid") or "unknown" sag_revision = report.get("sag_revision") or "unknown" return ( - f"Serviceability engine (SAG {sag_pid} rev {sag_revision}): " + f"Service Hub (SAG {sag_pid} rev {sag_revision}): " f"{len(solutions)} recommendation(s) from {len(afid_events)} input event(s)." 
) diff --git a/nodescraper/plugins/serviceability/se_models.py b/nodescraper/plugins/serviceability/se_models.py index 75919fc3..f5fc54bb 100644 --- a/nodescraper/plugins/serviceability/se_models.py +++ b/nodescraper/plugins/serviceability/se_models.py @@ -31,7 +31,7 @@ class AfidEvent(BaseModel): - """Serviceability engine input: one AFID occurrence on a serviceable unit.""" + """Service Hub input: one AFID occurrence on a serviceable unit.""" afid: int = Field(description="AMD Fault ID.") serviceable_unit: str = Field( @@ -51,7 +51,7 @@ def _strip_serviceable_unit(cls, value: str) -> str: class ServiceabilitySolution(BaseModel): - """Serviceability engine output: recommended action for an AFID.""" + """Service Hub output: recommended action for an AFID.""" afid: int serviceable_unit: List[str] = Field( @@ -67,7 +67,7 @@ class ServiceabilityBlock(BaseModel): afid_events: List[AfidEvent] = Field( default_factory=list, - description="Input events passed to the serviceability engine.", + description="Input events passed to Service Hub.", ) solution: List[ServiceabilitySolution] = Field( default_factory=list, diff --git a/nodescraper/plugins/serviceability/se_runner.py b/nodescraper/plugins/serviceability/se_runner.py index df28426f..0fda2e5e 100644 --- a/nodescraper/plugins/serviceability/se_runner.py +++ b/nodescraper/plugins/serviceability/se_runner.py @@ -23,7 +23,7 @@ # SOFTWARE. 
# ############################################################################### -"""Run the AMD serviceability engine (Python API, CLI, or custom subprocess).""" +"""Run the AMD Service Hub (Python API, CLI, or custom subprocess).""" from __future__ import annotations import importlib @@ -41,7 +41,7 @@ class SeRunError(RuntimeError): - """Raised when the serviceability engine fails or returns invalid output.""" + """Raised when Service Hub fails or returns invalid output.""" def resolve_engine_command( @@ -64,7 +64,7 @@ def resolve_engine_command( def run_se( *, engine_backend: EngineBackend = "python", - engine_python_module: str = "serviceability_engine", + engine_python_module: str = "service_hub", engine_executable: Optional[str] = None, engine_entry_point: Optional[str] = None, afid_events: list[AfidEvent], @@ -114,11 +114,11 @@ def _run_se_python( try: se = importlib.import_module(engine_python_module) SagDocument = se.SagDocument - ServiceabilityEngine = se.ServiceabilityEngine + ServiceHub = se.ServiceHub EventRecord = se.EventRecord except (ImportError, AttributeError) as exc: raise SeRunError( - f"Cannot import {engine_python_module} bindings — install serviceability-engine " + f"Cannot import {engine_python_module} bindings — install service-hub " f"and build the Python extension (uv build)." 
) from exc @@ -133,10 +133,10 @@ def _run_se_python( ) for item in wire_events ] - analysis = ServiceabilityEngine(sag).analyze(records) + analysis = ServiceHub(sag).analyze(records) report = analysis.to_dict() except Exception as exc: - raise SeRunError(f"Serviceability engine analyze() failed: {exc}") from exc + raise SeRunError(f"Service Hub analyze() failed: {exc}") from exc return serviceability_block_from_engine(afid_events, report) @@ -176,7 +176,7 @@ def _run_se_cli( try: report = json.loads(completed.stdout or "{}") except json.JSONDecodeError as exc: - raise SeRunError(f"Invalid JSON from serviceability engine CLI: {exc}") from exc + raise SeRunError(f"Invalid JSON from Service Hub CLI: {exc}") from exc from .se_adapter import recommendations_from_report_dict @@ -226,11 +226,11 @@ def _run_se_subprocess( _run_subprocess(argv, timeout_seconds=timeout_seconds) if not output_path.is_file(): - raise SeRunError(f"Serviceability engine did not write output file: {output_path}") + raise SeRunError(f"Service Hub did not write output file: {output_path}") try: raw = json.loads(output_path.read_text(encoding="utf-8")) except json.JSONDecodeError as exc: - raise SeRunError(f"Invalid JSON from serviceability engine: {exc}") from exc + raise SeRunError(f"Invalid JSON from Service Hub: {exc}") from exc block = ServiceabilityBlock.model_validate(raw) if not block.afid_events: @@ -241,7 +241,7 @@ def _run_se_subprocess( def _run_subprocess(argv: list[str], *, timeout_seconds: int) -> subprocess.CompletedProcess: exe = Path(argv[0]) if not exe.is_file() and not _command_on_path(argv[0]): - raise SeRunError(f"Serviceability engine not found or not executable: {argv[0]!r}") + raise SeRunError(f"Service Hub not found or not executable: {argv[0]!r}") try: completed = subprocess.run( argv, @@ -251,15 +251,15 @@ def _run_subprocess(argv: list[str], *, timeout_seconds: int) -> subprocess.Comp check=False, ) except subprocess.TimeoutExpired as exc: - raise 
SeRunError(f"Serviceability engine timed out after {timeout_seconds}s") from exc + raise SeRunError(f"Service Hub timed out after {timeout_seconds}s") from exc except OSError as exc: - raise SeRunError(f"Failed to start serviceability engine: {exc}") from exc + raise SeRunError(f"Failed to start Service Hub: {exc}") from exc if completed.returncode != 0: stderr = (completed.stderr or "").strip() stdout = (completed.stdout or "").strip() detail = stderr or stdout or f"exit code {completed.returncode}" - raise SeRunError(f"Serviceability engine failed: {detail}") + raise SeRunError(f"Service Hub failed: {detail}") return completed diff --git a/nodescraper/plugins/serviceability/serviceability_data.py b/nodescraper/plugins/serviceability/serviceability_data.py index 68a7daea..0c387940 100644 --- a/nodescraper/plugins/serviceability/serviceability_data.py +++ b/nodescraper/plugins/serviceability/serviceability_data.py @@ -72,7 +72,7 @@ class ServiceabilityDataModel(DataModel): bmc_host: Optional[str] = None afid_events: List[AfidEvent] = Field( default_factory=list, - description="Serviceability engine input; built during analysis when not pre-filled.", + description="Service Hub input; built during analysis when not pre-filled.", ) serviceability: Optional[ServiceabilityBlock] = Field( default=None, diff --git a/nodescraper/plugins/serviceability/time_utils.py b/nodescraper/plugins/serviceability/time_utils.py index 5653f4a9..166bca14 100644 --- a/nodescraper/plugins/serviceability/time_utils.py +++ b/nodescraper/plugins/serviceability/time_utils.py @@ -50,7 +50,7 @@ def is_valid_iso_datetime(value: str) -> bool: def normalize_se_timestamp(value: str) -> str: - """Normalize a timestamp to the serviceability engine wire format. + """Normalize a timestamp to the Service Hub wire format. Accepts ISO-8601 (``2026-05-07T12:50:42``) and SE-style strings with a space separator (``2026-05-07 12:50:42.096-07:00``). 
From a5bfaac2517c672dcf7a9925c7df733182ac6196 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 27 May 2026 11:40:50 -0500 Subject: [PATCH 06/19] cleanup + updates --- nodescraper/interfaces/dataanalyzertask.py | 2 +- nodescraper/interfaces/dataplugin.py | 6 +- .../plugins/serviceability/__init__.py | 45 ++- .../plugins/serviceability/analyzer_args.py | 63 +--- .../plugins/serviceability/mi3xx/__init__.py | 24 +- .../serviceability/mi3xx/mi3xx_analyzer.py | 34 ++- .../serviceability/mi3xx/mi3xx_collector.py | 14 +- .../mi3xx/mi3xx_collector_args.py | 12 +- .../serviceability/mi3xx/mi3xx_data.py | 12 +- .../mi3xx/serviceability_plugin_mi3xx.py | 19 +- .../plugins/serviceability/se_adapter.py | 145 ++++----- .../plugins/serviceability/se_models.py | 12 +- .../plugins/serviceability/se_runner.py | 276 +++++------------- .../taskresulthooks/filesystemloghook.py | 6 +- nodescraper/utils.py | 31 +- .../unit/plugin/fixtures/afid_sag_sample.json | 8 + .../plugin/fixtures/mock_python_engine.py | 40 +++ test/unit/plugin/serviceability_dummy_data.py | 22 ++ test/unit/plugin/test_mi3xx_collector.py | 213 ++++++++++++++ test/unit/plugin/test_se_runner.py | 257 ++++++++++++++++ .../plugin/test_serviceability_collector.py | 76 +++-- 21 files changed, 844 insertions(+), 473 deletions(-) create mode 100644 test/unit/plugin/fixtures/afid_sag_sample.json create mode 100644 test/unit/plugin/fixtures/mock_python_engine.py create mode 100644 test/unit/plugin/serviceability_dummy_data.py create mode 100644 test/unit/plugin/test_mi3xx_collector.py create mode 100644 test/unit/plugin/test_se_runner.py diff --git a/nodescraper/interfaces/dataanalyzertask.py b/nodescraper/interfaces/dataanalyzertask.py index 0e6b3b06..fd6cc284 100644 --- a/nodescraper/interfaces/dataanalyzertask.py +++ b/nodescraper/interfaces/dataanalyzertask.py @@ -99,7 +99,7 @@ def wrapper( result = analyzer.result result.finalize(analyzer.logger) - analyzer._run_hooks(result) + analyzer._run_hooks(result, 
data=data) return result diff --git a/nodescraper/interfaces/dataplugin.py b/nodescraper/interfaces/dataplugin.py index ed632fb4..43bc3d83 100644 --- a/nodescraper/interfaces/dataplugin.py +++ b/nodescraper/interfaces/dataplugin.py @@ -44,7 +44,7 @@ SystemInfo, TaskResult, ) -from nodescraper.utils import pascal_to_snake +from nodescraper.utils import resolve_log_dir_name from .connectionmanager import TConnectArg, TConnectionManager from .task import SystemCompatibilityError @@ -412,8 +412,8 @@ def find_datamodel_path_in_run(cls, run_path: str) -> Optional[str]: return None collector_dir = os.path.join( run_path, - pascal_to_snake(cls.__name__), - pascal_to_snake(collector_cls.__name__), + resolve_log_dir_name(cls.__name__), + resolve_log_dir_name(collector_cls.__name__), ) if not os.path.isdir(collector_dir): return None diff --git a/nodescraper/plugins/serviceability/__init__.py b/nodescraper/plugins/serviceability/__init__.py index ae190bca..36671691 100644 --- a/nodescraper/plugins/serviceability/__init__.py +++ b/nodescraper/plugins/serviceability/__init__.py @@ -26,23 +26,21 @@ from .afid_events import build_afid_events_from_data from .analyzer_args import ServiceabilityAnalyzerArgs from .mi3xx import ( - Mi3xxAnalyzer, - Mi3xxCollector, - Mi3xxCollectorArgs, - Mi3xxDataModel, - Mi3xxDeviceInfo, - Mi3xxResult, + MI3XXAnalyzer, + MI3XXCollector, + MI3XXCollectorArgs, + MI3XXDataModel, + MI3XXDeviceInfo, + MI3XXResult, ServiceabilityPluginMI3XX, build_mi3xx_reporting_version_fields, ) -from .se_adapter import afid_events_to_engine_input, serviceability_block_from_engine -from .se_models import ( - AfidEvent, - SeInputPayload, - ServiceabilityBlock, - ServiceabilitySolution, +from .se_adapter import ( + format_serviceability_solution_lines, + serviceability_block_from_service_result, ) -from .se_runner import EngineBackend, SeRunError, resolve_engine_command, run_se +from .se_models import AfidEvent, ServiceabilityBlock, ServiceabilitySolution +from .se_runner 
import SeRunError, run_service_engine from .serviceability_collector import ServiceabilityCollectorBase from .serviceability_data import ( DeviceInfo, @@ -62,14 +60,12 @@ __all__ = [ "AfidEvent", "DeviceInfo", - "EngineBackend", - "Mi3xxAnalyzer", - "Mi3xxCollector", - "Mi3xxCollectorArgs", - "Mi3xxDataModel", - "Mi3xxDeviceInfo", - "Mi3xxResult", - "SeInputPayload", + "MI3XXAnalyzer", + "MI3XXCollector", + "MI3XXCollectorArgs", + "MI3XXDataModel", + "MI3XXDeviceInfo", + "MI3XXResult", "SeRunError", "ServiceabilityAnalyzerArgs", "ServiceabilityBlock", @@ -80,15 +76,14 @@ "ServiceabilityResult", "ServiceabilitySolution", "TimeOperator", - "afid_events_to_engine_input", "build_afid_events_from_data", - "serviceability_block_from_engine", "build_mi3xx_reporting_version_fields", "compare_iso_datetime", + "format_serviceability_solution_lines", "is_valid_iso_datetime", "normalize_se_timestamp", "parse_iso_datetime", - "resolve_engine_command", - "run_se", + "run_service_engine", + "serviceability_block_from_service_result", "satisfies_time_check", ] diff --git a/nodescraper/plugins/serviceability/analyzer_args.py b/nodescraper/plugins/serviceability/analyzer_args.py index d9fa09bb..679743dd 100644 --- a/nodescraper/plugins/serviceability/analyzer_args.py +++ b/nodescraper/plugins/serviceability/analyzer_args.py @@ -25,62 +25,39 @@ ############################################################################### from __future__ import annotations -from typing import List, Literal, Optional +from typing import Optional from pydantic import Field, field_validator, model_validator from nodescraper.models import AnalyzerArgs -EngineBackend = Literal["python", "cli", "subprocess"] - class ServiceabilityAnalyzerArgs(AnalyzerArgs): - """Analyzer args for serviceability plugins.""" + """Analyzer args for MI3XX serviceability (Python engine via plugin config).""" - engine_backend: EngineBackend = Field( - default="python", + engine_python_module: Optional[str] = Field( + 
default=None, description=( - "How to invoke the SE: 'python' (service_hub bindings), " - "'cli' (external analyze subcommand), or 'subprocess' (--input/--output protocol)." + "Importable Python module providing a service engine class with " + "get_service_info(rf_events, cper_data=...)." ), ) - engine_python_module: str = Field( - default="service_hub", - description="Python package providing ServiceHub bindings (python backend).", - ) - engine_executable: Optional[str] = Field( - default=None, - description="Path to the SE binary (cli or subprocess backends).", - ) - engine_entry_point: Optional[str] = Field( + engine_display_name: Optional[str] = Field( default=None, - description=( - "Command for cli/subprocess backends: executable path or argv prefix on PATH. " - "Required when engine_backend is 'cli' or 'subprocess'." - ), + description="Optional label for analyzer status messages.", ) afid_sag_path: Optional[str] = Field( default=None, description="Path to AFID_SAG.json.", ) - engine_extra_args: List[str] = Field( - default_factory=list, - description="Extra CLI arguments (cli/subprocess backends).", - ) - engine_timeout_seconds: int = Field( - default=600, - ge=1, - le=86_400, - description="Subprocess timeout (cli/subprocess backends).", - ) skip_engine: bool = Field( default=False, - description="If True, only build afid_events without running the SE.", + description="If True, only build afid_events without running the service engine.", ) - @field_validator("engine_executable", "engine_entry_point", "afid_sag_path") + @field_validator("afid_sag_path", "engine_python_module", "engine_display_name") @classmethod - def _strip_optional_paths(cls, value: Optional[str]) -> Optional[str]: + def _strip_optional_strings(cls, value: Optional[str]) -> Optional[str]: if value is None: return None text = str(value).strip() @@ -91,19 +68,7 @@ def _require_engine_config_when_running(self) -> ServiceabilityAnalyzerArgs: if self.skip_engine: return self if not 
self.afid_sag_path: - raise ValueError("afid_sag_path is required when running Service Hub.") - if self.engine_backend == "python": - return self - has_exe = self.engine_executable is not None - has_entry = self.engine_entry_point is not None - if has_exe and has_entry: - raise ValueError( - "Provide only one of engine_executable or engine_entry_point " - "for cli/subprocess backends." - ) - if not has_exe and not has_entry: - raise ValueError( - "engine_executable or engine_entry_point is required when " - "engine_backend is 'cli' or 'subprocess'." - ) + raise ValueError("afid_sag_path is required when running the service engine.") + if not self.engine_python_module: + raise ValueError("engine_python_module is required when running the service engine.") return self diff --git a/nodescraper/plugins/serviceability/mi3xx/__init__.py b/nodescraper/plugins/serviceability/mi3xx/__init__.py index 25e83a07..b97928b3 100644 --- a/nodescraper/plugins/serviceability/mi3xx/__init__.py +++ b/nodescraper/plugins/serviceability/mi3xx/__init__.py @@ -23,24 +23,24 @@ # SOFTWARE. 
# ############################################################################### -from .mi3xx_analyzer import Mi3xxAnalyzer -from .mi3xx_collector import Mi3xxCollector -from .mi3xx_collector_args import Mi3xxCollectorArgs +from .mi3xx_analyzer import MI3XXAnalyzer +from .mi3xx_collector import MI3XXCollector +from .mi3xx_collector_args import MI3XXCollectorArgs from .mi3xx_data import ( - Mi3xxDataModel, - Mi3xxDeviceInfo, - Mi3xxResult, + MI3XXDataModel, + MI3XXDeviceInfo, + MI3XXResult, build_mi3xx_reporting_version_fields, ) from .serviceability_plugin_mi3xx import ServiceabilityPluginMI3XX __all__ = [ - "Mi3xxAnalyzer", - "Mi3xxCollector", - "Mi3xxCollectorArgs", - "Mi3xxDataModel", - "Mi3xxDeviceInfo", - "Mi3xxResult", + "MI3XXAnalyzer", + "MI3XXCollector", + "MI3XXCollectorArgs", + "MI3XXDataModel", + "MI3XXDeviceInfo", + "MI3XXResult", "ServiceabilityPluginMI3XX", "build_mi3xx_reporting_version_fields", ] diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py index d74f297a..ab001184 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py @@ -7,7 +7,7 @@ # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # @@ -32,15 +32,18 @@ from nodescraper.models import TaskResult from nodescraper.plugins.serviceability.afid_events import build_afid_events_from_data from nodescraper.plugins.serviceability.analyzer_args import ServiceabilityAnalyzerArgs +from
nodescraper.plugins.serviceability.se_adapter import ( + format_serviceability_solution_lines, +) from nodescraper.plugins.serviceability.se_models import ServiceabilityBlock -from nodescraper.plugins.serviceability.se_runner import SeRunError, run_se +from nodescraper.plugins.serviceability.se_runner import SeRunError, run_service_engine from nodescraper.plugins.serviceability.serviceability_data import ( ServiceabilityDataModel, ) -class Mi3xxAnalyzer(DataAnalyzer[ServiceabilityDataModel, ServiceabilityAnalyzerArgs]): - """Build AFID events from collected data and run Service Hub.""" +class MI3XXAnalyzer(DataAnalyzer[ServiceabilityDataModel, ServiceabilityAnalyzerArgs]): + """Build AFID events from collected data and run the configured service engine.""" DATA_MODEL = ServiceabilityDataModel @@ -61,18 +64,17 @@ def analyze_data( data.serviceability = ServiceabilityBlock(afid_events=events) self.result.status = ExecutionStatus.OK self.result.message = f"Built {len(events)} AFID event(s); engine skipped" + self._log_serviceability_solutions(data.serviceability) return self.result try: - block = run_se( - engine_backend=args.engine_backend, - engine_python_module=args.engine_python_module, - engine_executable=args.engine_executable, - engine_entry_point=args.engine_entry_point, + block = run_service_engine( + engine_python_module=args.engine_python_module, # type: ignore[arg-type] + engine_display_name=args.engine_display_name, afid_events=events, afid_sag_path=args.afid_sag_path, # type: ignore[arg-type] - extra_args=args.engine_extra_args or None, - timeout_seconds=args.engine_timeout_seconds, + rf_events=data.rf_events, + cper_data=data.cper_data or None, ) except (SeRunError, ValueError) as exc: self.result.status = ExecutionStatus.ERROR @@ -80,8 +82,16 @@ def analyze_data( return self.result data.serviceability = block + self._log_serviceability_solutions(block) + engine_label = args.engine_display_name or args.engine_python_module self.result.status = 
ExecutionStatus.OK self.result.message = ( - f"Service Hub: {len(block.solution)} solution(s) " f"from {len(events)} event(s)" + f"{engine_label}: {len(block.solution)} solution(s) " + f"from {len(data.rf_events)} Redfish event(s)" ) return self.result + + def _log_serviceability_solutions(self, block: ServiceabilityBlock) -> None: + parent = self.parent or self.__class__.__name__ + for line in format_serviceability_solution_lines(block): + self.logger.info("(%s) %s", parent, line) diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py index 8f73941c..63e23e21 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py @@ -33,18 +33,18 @@ from nodescraper.plugins.serviceability.serviceability_data import DeviceInfo from nodescraper.plugins.serviceability.time_utils import satisfies_time_check -from .mi3xx_collector_args import Mi3xxCollectorArgs +from .mi3xx_collector_args import MI3XXCollectorArgs _EVENT_TIMESTAMP_KEYS = ("Created", "EventTimestamp", "Timestamp") -class Mi3xxCollector(ServiceabilityCollectorBase[Mi3xxCollectorArgs]): - """MI3xx OOB Redfish serviceability collector.""" +class MI3XXCollector(ServiceabilityCollectorBase[MI3XXCollectorArgs]): + """MI3XX OOB Redfish serviceability collector.""" def satisfies_reference_time( self, candidate: str, - args: Mi3xxCollectorArgs, + args: MI3XXCollectorArgs, ) -> bool: """Test a timestamp against optional reference-time filter settings.""" if args.reference_time is None or args.time_operator is None: @@ -54,7 +54,7 @@ def satisfies_reference_time( def filter_event_members( self, members: list[Any], - args: Mi3xxCollectorArgs, + args: MI3XXCollectorArgs, ) -> list[Any]: filtered: list[Any] = [] for member in members: @@ -78,7 +78,7 @@ def parse_assembly_entry( self, designation: str, assembly_member_entry: dict[str, Any], - args: Mi3xxCollectorArgs, + 
args: MI3XXCollectorArgs, ) -> DeviceInfo: return DeviceInfo( name=assembly_member_entry.get("Name") or designation, @@ -91,7 +91,7 @@ def parse_assembly_entry( def extract_component_details( self, firmware_inventory_payload: dict[str, Any], - args: Mi3xxCollectorArgs, + args: MI3XXCollectorArgs, ) -> Optional[str]: details = firmware_inventory_payload.get("Details") if details is not None: diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector_args.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector_args.py index ae7555d7..1e95a81b 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector_args.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector_args.py @@ -36,11 +36,11 @@ ) -class Mi3xxCollectorArgs(CollectorArgs): - """MI3xx OOB Redfish serviceability collector arguments.""" +class MI3XXCollectorArgs(CollectorArgs): + """MI3XX OOB Redfish serviceability collector arguments.""" uri: Optional[str] = Field( - default=None, + default="/redfish/v1/Systems/UBB/LogServices/EventLog/Entries", description="Optional alias for ``rf_event_log_uri`` (non-empty string).", ) rf_event_log_uri: Optional[str] = Field( @@ -99,7 +99,7 @@ def _validate_reference_time_iso(cls, value: Optional[str]) -> Optional[str]: return text @model_validator(mode="after") - def _require_event_log_uri(self) -> Mi3xxCollectorArgs: + def _require_event_log_uri(self) -> MI3XXCollectorArgs: if not self.resolved_event_log_uri(): raise ValueError( "Provide a non-empty rf_event_log_uri or uri for the event log collection." 
@@ -107,7 +107,7 @@ def _require_event_log_uri(self) -> Mi3xxCollectorArgs: return self @model_validator(mode="after") - def _assembly_consistency(self) -> Mi3xxCollectorArgs: + def _assembly_consistency(self) -> MI3XXCollectorArgs: has_tpl = bool( self.rf_assembly_uri_template and "{device}" in self.rf_assembly_uri_template ) @@ -120,7 +120,7 @@ def _assembly_consistency(self) -> Mi3xxCollectorArgs: return self @model_validator(mode="after") - def _reference_time_requires_operator(self) -> Mi3xxCollectorArgs: + def _reference_time_requires_operator(self) -> MI3XXCollectorArgs: has_ref = self.reference_time is not None has_op = self.time_operator is not None if has_ref != has_op: diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_data.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_data.py index 6c9c268f..17a60eaa 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_data.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_data.py @@ -34,7 +34,7 @@ from nodescraper.models import DataModel -class Mi3xxDeviceInfo(BaseModel): +class MI3XXDeviceInfo(BaseModel): """Device identity with separate board and product fields.""" board_product_name: Optional[str] = Field( @@ -78,7 +78,7 @@ class Mi3xxDeviceInfo(BaseModel): ) -class Mi3xxResult(BaseModel): +class MI3XXResult(BaseModel): """Structured serviceability report output.""" node: Optional[str] = None @@ -136,14 +136,14 @@ def build_mi3xx_reporting_version_fields( } -class Mi3xxDataModel(DataModel): +class MI3XXDataModel(DataModel): """Collected OOB Redfish serviceability data model.""" collected_data: Dict[str, Any] = Field( default_factory=dict, description="Arbitrary keyed payloads from the collector implementation.", ) - device_info: Dict[str, Mi3xxDeviceInfo] = Field( + device_info: Dict[str, MI3XXDeviceInfo] = Field( default_factory=dict, description="Optional device identity keyed by implementer-defined labels.", ) @@ -156,7 +156,7 @@ class Mi3xxDataModel(DataModel): description="Optional 
host or service endpoint label (not necessarily a BMC).", ) log_path: Optional[str] = None - result: Optional[Mi3xxResult] = None + result: Optional[MI3XXResult] = None def log_model(self, log_path: str) -> None: """Write artifact files and a JSON summary under the log directory. @@ -174,7 +174,7 @@ def log_model(self, log_path: str) -> None: artifact_path = os.path.join(log_path, str(filename).strip()) with open(artifact_path, "w", encoding="utf-8") as handle: json.dump(payload, handle, indent=2) - summary_path = os.path.join(log_path, "mi3xx_data.json") + summary_path = os.path.join(log_path, "MI3XX_data.json") with open(summary_path, "w", encoding="utf-8") as handle: json.dump( self.model_dump( diff --git a/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py b/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py index ee0c510b..2f38783f 100644 --- a/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py +++ b/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py @@ -29,16 +29,21 @@ from nodescraper.plugins.serviceability.serviceability_plugin_base import ( ServiceabilityPluginBase, ) +from nodescraper.utils import register_log_dir_name -from .mi3xx_analyzer import Mi3xxAnalyzer -from .mi3xx_collector import Mi3xxCollector -from .mi3xx_collector_args import Mi3xxCollectorArgs +from .mi3xx_analyzer import MI3XXAnalyzer +from .mi3xx_collector import MI3XXCollector +from .mi3xx_collector_args import MI3XXCollectorArgs + +register_log_dir_name("ServiceabilityPluginMI3XX", "serviceability_plugin_MI3XX") +register_log_dir_name("MI3XXCollector", "MI3XX_collector") +register_log_dir_name("MI3XXAnalyzer", "MI3XX_analyzer") class ServiceabilityPluginMI3XX(ServiceabilityPluginBase): - """MI3xx OOB Redfish serviceability plugin.""" + """MI3XX OOB Redfish serviceability plugin.""" DATA_MODEL = ServiceabilityDataModel - COLLECTOR = Mi3xxCollector - ANALYZER = Mi3xxAnalyzer - COLLECTOR_ARGS = 
Mi3xxCollectorArgs + COLLECTOR = MI3XXCollector + ANALYZER = MI3XXAnalyzer + COLLECTOR_ARGS = MI3XXCollectorArgs diff --git a/nodescraper/plugins/serviceability/se_adapter.py b/nodescraper/plugins/serviceability/se_adapter.py index 4b4c7a2e..243b2d7d 100644 --- a/nodescraper/plugins/serviceability/se_adapter.py +++ b/nodescraper/plugins/serviceability/se_adapter.py @@ -23,7 +23,7 @@ # SOFTWARE. # ############################################################################### -"""Map node-scraper serviceability models to/from the AMD Service Hub API.""" +"""Map serviceability plugin models to/from Python service engine results.""" from __future__ import annotations from collections import defaultdict @@ -31,107 +31,76 @@ from .se_models import AfidEvent, ServiceabilityBlock, ServiceabilitySolution -_DEFAULT_SOLUTION_TIERS = ( - "primary_fru_events", - "secondary_actions", -) - -def afid_events_to_engine_input(afid_events: list[AfidEvent]) -> list[dict[str, Any]]: - """Convert plugin AFID events to Service Hub wire-format dicts. - - The engine triages on (afid, location, count). Duplicate (afid, unit) pairs - are merged by summing counts. Timestamp is preserved only on the plugin side. 
- """ - counts: dict[tuple[str, str], int] = defaultdict(int) - for event in afid_events: - key = (str(event.afid), event.serviceable_unit) - counts[key] += 1 - return [ - {"afid": afid, "location": location, "count": count} - for (afid, location), count in sorted(counts.items()) - ] +def format_serviceability_solution_lines(block: ServiceabilityBlock) -> list[str]: + """Human-readable lines for logging or console output.""" + lines: list[str] = [] + if block.solution_reasoning: + lines.append(block.solution_reasoning) + if not block.solution: + lines.append("No service actions recommended.") + return lines + for index, solution in enumerate(block.solution, start=1): + units = ", ".join(solution.serviceable_unit) + lines.append( + f"[{index}] AFID {solution.afid}, " + f"service action {solution.service_action_num}, " + f"units: [{units}]" + ) + return lines -def recommendations_from_report_dict( - report: dict[str, Any], +def serviceability_block_from_service_result( + afid_events: list[AfidEvent], + result: Any, *, - solution_tiers: tuple[str, ...] 
= _DEFAULT_SOLUTION_TIERS, -) -> list[dict[str, Any]]: - """Derive grouped recommendations from an :func:`service_hub.api.analyze` report.""" - if "recommendations" in report: - return list(report["recommendations"]) - + engine_label: str = "Service engine", + rf_event_count: int = 0, +) -> ServiceabilityBlock: + """Build a :class:`ServiceabilityBlock` from an engine result with ``service_info``.""" grouped: dict[tuple[int, int], list[str]] = defaultdict(list) - for tier in solution_tiers: - for row in report.get(tier, []): - if not isinstance(row, dict): + service_info = getattr(result, "service_info", None) or {} + for designation, afid_map in service_info.items(): + if not isinstance(afid_map, dict): + continue + unit = str(designation).strip() if designation is not None else "" + for afid_raw, info in afid_map.items(): + if not isinstance(info, dict): continue - afid = int(row.get("afid", 0)) - location = str(row.get("location", "")).strip() - action_num = _action_num_from_row(row) - if not location or action_num is None: + san_raw = info.get("service_action_number") + if san_raw is None: continue - key = (afid, action_num) - if location not in grouped[key]: - grouped[key].append(location) - - return [ - { - "afid": afid, - "locations": locations, - "service_action_num": action_num, - } - for (afid, action_num), locations in sorted(grouped.items()) - ] - + try: + afid = int(afid_raw) + san = int(san_raw) + except (TypeError, ValueError): + continue + key = (afid, san) + if unit and unit not in grouped[key]: + grouped[key].append(unit) -def serviceability_block_from_engine( - afid_events: list[AfidEvent], - report: dict[str, Any], - *, - recommendations: list[dict[str, Any]] | None = None, -) -> ServiceabilityBlock: - """Build the ANC ``serviceability`` block from an engine analysis report.""" - recs = ( - recommendations if recommendations is not None else recommendations_from_report_dict(report) - ) solutions = [ ServiceabilitySolution( - 
afid=int(item["afid"]), - serviceable_unit=list(item["locations"]), - service_action_num=int(item["service_action_num"]), + afid=afid, + serviceable_unit=units, + service_action_num=san, ) - for item in recs + for (afid, san), units in sorted(grouped.items()) ] - reasoning = _build_solution_reasoning(afid_events, solutions, report) + metadata = getattr(result, "afid_sag_metadata", None) or {} + version_info = ( + getattr(result, "engine_version_info", None) or getattr(result, "version_info", None) or {} + ) + sag_pid = metadata.get("sag_pid") or metadata.get("pid") or "unknown" + sag_revision = metadata.get("sag_revision") or metadata.get("revision") or "unknown" + engine_version = version_info.get("version") or version_info.get("engine_version") + version_suffix = f", engine {engine_version}" if engine_version else "" + reasoning = ( + f"{engine_label} (SAG {sag_pid} rev {sag_revision}{version_suffix}): " + f"{len(solutions)} recommendation(s) from {rf_event_count} Redfish event(s)." + ) return ServiceabilityBlock( afid_events=list(afid_events), solution=solutions, solution_reasoning=reasoning, ) - - -def _action_num_from_row(row: dict[str, Any]) -> int | None: - if "service_action_num" in row: - return int(row["service_action_num"]) - service_action = row.get("service_action") - if isinstance(service_action, dict) and "id" in service_action: - return int(service_action["id"]) - afid_entry = row.get("afid_entry") - if isinstance(afid_entry, dict) and "service_action_num" in afid_entry: - return int(afid_entry["service_action_num"]) - return None - - -def _build_solution_reasoning( - afid_events: list[AfidEvent], - solutions: list[ServiceabilitySolution], - report: dict[str, Any], -) -> str: - sag_pid = report.get("sag_pid") or "unknown" - sag_revision = report.get("sag_revision") or "unknown" - return ( - f"Service Hub (SAG {sag_pid} rev {sag_revision}): " - f"{len(solutions)} recommendation(s) from {len(afid_events)} input event(s)." 
- ) diff --git a/nodescraper/plugins/serviceability/se_models.py b/nodescraper/plugins/serviceability/se_models.py index f5fc54bb..344ef7c7 100644 --- a/nodescraper/plugins/serviceability/se_models.py +++ b/nodescraper/plugins/serviceability/se_models.py @@ -31,7 +31,7 @@ class AfidEvent(BaseModel): - """Service Hub input: one AFID occurrence on a serviceable unit.""" + """One AFID occurrence on a serviceable unit.""" afid: int = Field(description="AMD Fault ID.") serviceable_unit: str = Field( @@ -51,7 +51,7 @@ def _strip_serviceable_unit(cls, value: str) -> str: class ServiceabilitySolution(BaseModel): - """Service Hub output: recommended action for an AFID.""" + """Recommended service action for an AFID.""" afid: int serviceable_unit: List[str] = Field( @@ -67,7 +67,7 @@ class ServiceabilityBlock(BaseModel): afid_events: List[AfidEvent] = Field( default_factory=list, - description="Input events passed to Service Hub.", + description="Summarized AFID events from collected data.", ) solution: List[ServiceabilitySolution] = Field( default_factory=list, @@ -77,9 +77,3 @@ class ServiceabilityBlock(BaseModel): default=None, description="Human-readable summary of how the engine reached its conclusions.", ) - - -class SeInputPayload(BaseModel): - """JSON written to the SE ``--input`` file.""" - - afid_events: List[AfidEvent] = Field(default_factory=list) diff --git a/nodescraper/plugins/serviceability/se_runner.py b/nodescraper/plugins/serviceability/se_runner.py index 0fda2e5e..aeec1eb7 100644 --- a/nodescraper/plugins/serviceability/se_runner.py +++ b/nodescraper/plugins/serviceability/se_runner.py @@ -23,247 +23,109 @@ # SOFTWARE. 
# ############################################################################### -"""Run the AMD Service Hub (Python API, CLI, or custom subprocess).""" +"""Invoke a configured Python service engine against collected Redfish events.""" from __future__ import annotations import importlib -import json -import shlex -import subprocess -import tempfile +import inspect from pathlib import Path -from typing import Literal, Optional +from typing import Any, Optional, Type -from .se_adapter import afid_events_to_engine_input, serviceability_block_from_engine -from .se_models import AfidEvent, SeInputPayload, ServiceabilityBlock +from .se_adapter import serviceability_block_from_service_result +from .se_models import AfidEvent, ServiceabilityBlock -EngineBackend = Literal["python", "cli", "subprocess"] +_ENGINE_METHOD = "get_service_info" class SeRunError(RuntimeError): - """Raised when Service Hub fails or returns invalid output.""" + """Raised when the service engine fails or returns invalid output.""" -def resolve_engine_command( +def run_service_engine( *, - engine_executable: Optional[str] = None, - engine_entry_point: Optional[str] = None, -) -> list[str]: - """Build the argv prefix for a subprocess or CLI-backed SE invocation.""" - has_exe = bool(engine_executable and str(engine_executable).strip()) - has_entry = bool(engine_entry_point and str(engine_entry_point).strip()) - if has_exe and has_entry: - raise ValueError("Provide only one of engine_executable or engine_entry_point.") - if not has_exe and not has_entry: - raise ValueError("Provide engine_executable or engine_entry_point.") - if has_exe: - return [str(engine_executable).strip()] - return shlex.split(str(engine_entry_point).strip()) - - -def run_se( - *, - engine_backend: EngineBackend = "python", - engine_python_module: str = "service_hub", - engine_executable: Optional[str] = None, - engine_entry_point: Optional[str] = None, + engine_python_module: str, + engine_display_name: Optional[str] = None, 
afid_events: list[AfidEvent], afid_sag_path: str, - extra_args: Optional[list[str]] = None, - timeout_seconds: int = 600, - work_dir: Optional[str] = None, + rf_events: list[Any], + cper_data: Optional[dict[str, Any]] = None, ) -> ServiceabilityBlock: - """Run the SE and return a :class:`ServiceabilityBlock`.""" + """Run a Python service engine and return a :class:`ServiceabilityBlock`.""" sag_path = Path(afid_sag_path) if not sag_path.is_file(): raise SeRunError(f"AFID_SAG file not found: {afid_sag_path}") - if engine_backend == "python": - return _run_se_python( - engine_python_module=engine_python_module, - afid_events=afid_events, - afid_sag_path=str(sag_path), - ) - if engine_backend == "cli": - return _run_se_cli( - engine_executable=engine_executable, - engine_entry_point=engine_entry_point, - afid_events=afid_events, - afid_sag_path=str(sag_path), - extra_args=extra_args, - timeout_seconds=timeout_seconds, - work_dir=work_dir, - ) - return _run_se_subprocess( - engine_executable=engine_executable, - engine_entry_point=engine_entry_point, - afid_events=afid_events, - afid_sag_path=str(sag_path), - extra_args=extra_args, - timeout_seconds=timeout_seconds, - work_dir=work_dir, - ) - - -def _run_se_python( - *, - engine_python_module: str, - afid_events: list[AfidEvent], - afid_sag_path: str, -) -> ServiceabilityBlock: - try: - se = importlib.import_module(engine_python_module) - SagDocument = se.SagDocument - ServiceHub = se.ServiceHub - EventRecord = se.EventRecord - except (ImportError, AttributeError) as exc: + if not rf_events: raise SeRunError( - f"Cannot import {engine_python_module} bindings — install service-hub " - f"and build the Python extension (uv build)." - ) from exc + "Collected Redfish events are required; re-run collection or use skip_engine." 
+ ) - wire_events = afid_events_to_engine_input(afid_events) + label = engine_display_name or engine_python_module try: - sag = SagDocument.from_file(afid_sag_path) - records = [ - EventRecord( - afid=str(item["afid"]), - location=str(item["location"]), - count=int(item["count"]), - ) - for item in wire_events - ] - analysis = ServiceHub(sag).analyze(records) - report = analysis.to_dict() - except Exception as exc: - raise SeRunError(f"Service Hub analyze() failed: {exc}") from exc - - return serviceability_block_from_engine(afid_events, report) + mod = importlib.import_module(engine_python_module) + except ImportError as exc: + raise SeRunError(f"Cannot import {engine_python_module}: {exc}") from exc - -def _run_se_cli( - *, - engine_executable: Optional[str], - engine_entry_point: Optional[str], - afid_events: list[AfidEvent], - afid_sag_path: str, - extra_args: Optional[list[str]], - timeout_seconds: int, - work_dir: Optional[str], -) -> ServiceabilityBlock: - """Invoke an external engine CLI ``analyze --sag … --input …`` and map stdout JSON.""" - command = resolve_engine_command( - engine_executable=engine_executable, - engine_entry_point=engine_entry_point, - ) - wire_events = afid_events_to_engine_input(afid_events) - - with tempfile.TemporaryDirectory(prefix="nodescraper_se_cli_", dir=work_dir) as tmp: - input_path = Path(tmp) / "events.json" - input_path.write_text(json.dumps(wire_events, indent=2), encoding="utf-8") - argv = [ - *command, - "analyze", - "--sag", - afid_sag_path, - "--input", - str(input_path), - ] - if extra_args: - argv.extend(extra_args) - completed = _run_subprocess(argv, timeout_seconds=timeout_seconds) + engine_cls = _resolve_engine_class(mod) try: - report = json.loads(completed.stdout or "{}") - except json.JSONDecodeError as exc: - raise SeRunError(f"Invalid JSON from Service Hub CLI: {exc}") from exc + instance = engine_cls(afid_sag=afid_sag_path) + analyze = getattr(instance, _ENGINE_METHOD) + result = analyze( + list(rf_events), 
+ cper_data=dict(cper_data) if cper_data else None, + ) + except Exception as exc: + raise SeRunError(f"{label} {_ENGINE_METHOD}() failed: {exc}") from exc - from .se_adapter import recommendations_from_report_dict + if result is None: + return ServiceabilityBlock( + afid_events=list(afid_events), + solution=[], + solution_reasoning=f"{label}: no service actions after event filtering.", + ) - return serviceability_block_from_engine( + return serviceability_block_from_service_result( afid_events, - report, - recommendations=recommendations_from_report_dict(report), - ) - - -def _run_se_subprocess( - *, - engine_executable: Optional[str], - engine_entry_point: Optional[str], - afid_events: list[AfidEvent], - afid_sag_path: str, - extra_args: Optional[list[str]], - timeout_seconds: int, - work_dir: Optional[str], -) -> ServiceabilityBlock: - """Custom subprocess protocol: ``--input`` / ``--output`` / ``--afid-sag``.""" - command = resolve_engine_command( - engine_executable=engine_executable, - engine_entry_point=engine_entry_point, + result, + engine_label=label, + rf_event_count=len(rf_events), ) - payload = SeInputPayload(afid_events=afid_events) - - with tempfile.TemporaryDirectory(prefix="nodescraper_se_", dir=work_dir) as tmp: - tmp_path = Path(tmp) - input_path = tmp_path / "se_input.json" - output_path = tmp_path / "se_output.json" - input_path.write_text( - json.dumps(payload.model_dump(mode="json"), indent=2), - encoding="utf-8", - ) - argv = [ - *command, - "--input", - str(input_path), - "--output", - str(output_path), - "--afid-sag", - str(Path(afid_sag_path).resolve()), - ] - if extra_args: - argv.extend(extra_args) - _run_subprocess(argv, timeout_seconds=timeout_seconds) - if not output_path.is_file(): - raise SeRunError(f"Service Hub did not write output file: {output_path}") - try: - raw = json.loads(output_path.read_text(encoding="utf-8")) - except json.JSONDecodeError as exc: - raise SeRunError(f"Invalid JSON from Service Hub: {exc}") from exc - 
block = ServiceabilityBlock.model_validate(raw) - if not block.afid_events: - block.afid_events = list(afid_events) - return block +def _is_engine_class(obj: Any) -> bool: + return inspect.isclass(obj) and callable(getattr(obj, _ENGINE_METHOD, None)) -def _run_subprocess(argv: list[str], *, timeout_seconds: int) -> subprocess.CompletedProcess: - exe = Path(argv[0]) - if not exe.is_file() and not _command_on_path(argv[0]): - raise SeRunError(f"Service Hub not found or not executable: {argv[0]!r}") - try: - completed = subprocess.run( - argv, - capture_output=True, - text=True, - timeout=timeout_seconds, - check=False, - ) - except subprocess.TimeoutExpired as exc: - raise SeRunError(f"Service Hub timed out after {timeout_seconds}s") from exc - except OSError as exc: - raise SeRunError(f"Failed to start Service Hub: {exc}") from exc +def _resolve_engine_class(mod: Any) -> Type[Any]: + """Find the engine class in ``mod`` that implements ``get_service_info``.""" + package = mod.__name__ + candidates: list[Type[Any]] = [] + seen: set[int] = set() - if completed.returncode != 0: - stderr = (completed.stderr or "").strip() - stdout = (completed.stdout or "").strip() - detail = stderr or stdout or f"exit code {completed.returncode}" - raise SeRunError(f"Service Hub failed: {detail}") - return completed + def add_candidate(obj: Any) -> None: + if not _is_engine_class(obj): + return + key = id(obj) + if key in seen: + return + seen.add(key) + candidates.append(obj) + for name in getattr(mod, "__all__", []) or []: + add_candidate(getattr(mod, name, None)) -def _command_on_path(name: str) -> bool: - from shutil import which + for _, obj in inspect.getmembers(mod, inspect.isclass): + obj_module = getattr(obj, "__module__", "") + if obj_module == package or obj_module.startswith(f"{package}."): + add_candidate(obj) - return which(name) is not None + if len(candidates) == 1: + return candidates[0] + if not candidates: + raise SeRunError( + f"No class with {_ENGINE_METHOD}() found 
in {package}; " + "check engine_python_module in analysis_args." + ) + names = ", ".join(cls.__name__ for cls in candidates) + raise SeRunError(f"Multiple classes with {_ENGINE_METHOD}() in {package}: {names}.") diff --git a/nodescraper/taskresulthooks/filesystemloghook.py b/nodescraper/taskresulthooks/filesystemloghook.py index 831e3fbe..50184b4e 100644 --- a/nodescraper/taskresulthooks/filesystemloghook.py +++ b/nodescraper/taskresulthooks/filesystemloghook.py @@ -28,7 +28,7 @@ from nodescraper.interfaces.taskresulthook import TaskResultHook from nodescraper.models import DataModel, TaskResult -from nodescraper.utils import pascal_to_snake +from nodescraper.utils import resolve_log_dir_name class FileSystemLogHook(TaskResultHook): @@ -43,9 +43,9 @@ def process_result(self, task_result: TaskResult, data: Optional[DataModel] = No """Log task result to the filesystem (single events.json per directory).""" log_path = self.log_base_path if task_result.parent: - log_path = os.path.join(log_path, pascal_to_snake(task_result.parent)) + log_path = os.path.join(log_path, resolve_log_dir_name(task_result.parent)) if task_result.task: - log_path = os.path.join(log_path, pascal_to_snake(task_result.task)) + log_path = os.path.join(log_path, resolve_log_dir_name(task_result.task)) task_result.log_result(log_path) diff --git a/nodescraper/utils.py b/nodescraper/utils.py index 3b9edf34..910f608f 100644 --- a/nodescraper/utils.py +++ b/nodescraper/utils.py @@ -187,18 +187,35 @@ def get_unique_filename(directory, filename) -> str: count += 1 -def pascal_to_snake(input_str: str) -> str: - """Convert PascalCase to snake_case +_LOG_DIR_NAME_OVERRIDES: dict[str, str] = {} - Args: - input_str (str): string to convert - Returns: - str: converted string +def register_log_dir_name(class_name: str, log_dir_name: str) -> None: + """Register a filesystem log directory name for a task or plugin class.""" + _LOG_DIR_NAME_OVERRIDES[class_name] = log_dir_name + + +def 
resolve_log_dir_name(class_name: str) -> str: + """Map a class name to its log directory (override or snake_case).""" + if class_name in _LOG_DIR_NAME_OVERRIDES: + return _LOG_DIR_NAME_OVERRIDES[class_name] + return pascal_to_snake(class_name) + + +def pascal_to_snake(input_str: str) -> str: + """Convert PascalCase to snake_case. + + Handles embedded acronyms with digits (e.g. ``ServiceabilityPluginMI3XX``, + ``MI3XXCollector``) without splitting into single-letter segments. """ + if not input_str: + return "" if input_str.isupper(): return input_str.lower() - return ("_").join(re.split("(?<=.)(?=[A-Z])", input_str)).lower() + normalized = re.sub(r"([A-Z][A-Z0-9]+)([A-Z][a-z])", r"\1_\2", input_str) + normalized = re.sub(r"([a-z])([A-Z][A-Z0-9]+)", r"\1_\2", normalized) + normalized = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", normalized) + return normalized.lower() def bytes_to_human_readable(input_bytes: int) -> str: diff --git a/test/unit/plugin/fixtures/afid_sag_sample.json b/test/unit/plugin/fixtures/afid_sag_sample.json new file mode 100644 index 00000000..952999e6 --- /dev/null +++ b/test/unit/plugin/fixtures/afid_sag_sample.json @@ -0,0 +1,8 @@ +{ + "9001": { + "service_action_num": 99 + }, + "9002": { + "service_action_num": 88 + } +} diff --git a/test/unit/plugin/fixtures/mock_python_engine.py b/test/unit/plugin/fixtures/mock_python_engine.py new file mode 100644 index 00000000..c45c4803 --- /dev/null +++ b/test/unit/plugin/fixtures/mock_python_engine.py @@ -0,0 +1,40 @@ +"""Mock Python service engine for unit tests.""" + +from __future__ import annotations + +from types import SimpleNamespace +from typing import Any, Optional + +from ..serviceability_dummy_data import ( + DUMMY_ENGINE_VERSION, + DUMMY_SAG_PID, + DUMMY_SAG_REVISION, + DUMMY_SERVICE_ACTION_NUM, + DUMMY_UNIT_A, +) + + +class MockServiceEngine: + def __init__(self, afid_sag: str) -> None: + self.afid_sag = afid_sag + + def get_service_info( + self, + rf_events: list[dict[str, Any]], + cper_data: 
Optional[dict[str, Any]] = None, + ) -> SimpleNamespace: + del cper_data + service_info: dict[str, dict[str, dict[str, str]]] = {} + for event in rf_events: + afid = event.get("Afid") + unit = event.get("serviceable_unit", DUMMY_UNIT_A) + if afid is None: + continue + service_info.setdefault(str(unit), {})[str(afid)] = { + "service_action_number": str(DUMMY_SERVICE_ACTION_NUM), + } + return SimpleNamespace( + service_info=service_info, + afid_sag_metadata={"sag_pid": DUMMY_SAG_PID, "sag_revision": DUMMY_SAG_REVISION}, + engine_version_info={"version": DUMMY_ENGINE_VERSION}, + ) diff --git a/test/unit/plugin/serviceability_dummy_data.py b/test/unit/plugin/serviceability_dummy_data.py new file mode 100644 index 00000000..c68b521f --- /dev/null +++ b/test/unit/plugin/serviceability_dummy_data.py @@ -0,0 +1,22 @@ +"""Shared dummy values for serviceability unit tests (not production data).""" + +DUMMY_AFID_A = 9001 +DUMMY_AFID_B = 9002 +DUMMY_AFID_C = 9003 +DUMMY_SERVICE_ACTION_NUM = 99 +DUMMY_UNIT_A = "dummy_unit_a" +DUMMY_UNIT_B = "dummy_unit_b" +DUMMY_UNIT_C = "dummy_unit_c" +DUMMY_DESIGNATION_A = "DUMMY_SLOT_A" +DUMMY_DESIGNATION_B = "DUMMY_SLOT_B" +DUMMY_EVENT_URI = "/redfish/v1/Systems/Dummy/LogServices/DummyEventLog/Entries" +DUMMY_EVENT_URI_ALT = "/redfish/v1/Systems/Dummy/LogServices/DummyEventLog/EntriesAlt" +DUMMY_TIMESTAMP = "2000-01-01T12:00:00+00:00" +DUMMY_TIMESTAMP_EARLIER = "1999-12-31T12:00:00+00:00" +DUMMY_TIMESTAMP_LATER = "2000-01-02T12:00:00+00:00" +DUMMY_RF_EVENT_COUNT = 2 +DUMMY_SAG_PID = "dummy-sag-pid" +DUMMY_SAG_REVISION = "dummy-rev-0" +DUMMY_ENGINE_VERSION = "0.0.0-dummy" +DUMMY_BMC_HOST = "dummy-bmc.example" +DUMMY_OEM_VENDOR = "DummyVendor" diff --git a/test/unit/plugin/test_mi3xx_collector.py b/test/unit/plugin/test_mi3xx_collector.py new file mode 100644 index 00000000..b89b1b71 --- /dev/null +++ b/test/unit/plugin/test_mi3xx_collector.py @@ -0,0 +1,213 @@ +############################################################################### 
+# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import pytest +from pydantic import ValidationError + +from nodescraper.connection.redfish import RF_MEMBERS, RedfishGetResult +from nodescraper.enums import ExecutionStatus +from nodescraper.plugins.serviceability import ( + MI3XXAnalyzer, + MI3XXCollector, + MI3XXCollectorArgs, + MI3XXDataModel, + MI3XXDeviceInfo, + MI3XXResult, + ServiceabilityDataModel, + ServiceabilityPluginBase, + ServiceabilityPluginMI3XX, + build_mi3xx_reporting_version_fields, + compare_iso_datetime, + is_valid_iso_datetime, + satisfies_time_check, +) +from test.unit.plugin.serviceability_dummy_data import ( + DUMMY_BMC_HOST, + DUMMY_EVENT_URI, + DUMMY_EVENT_URI_ALT, + DUMMY_TIMESTAMP_EARLIER, + DUMMY_TIMESTAMP_LATER, +) + +EVENT_URI = DUMMY_EVENT_URI + + +@pytest.fixture +def mi3xx_collector(system_info, redfish_conn_mock): + redfish_conn_mock.base_url = f"https://{DUMMY_BMC_HOST}/redfish/v1" + return MI3XXCollector( + system_info=system_info, + connection=redfish_conn_mock, + log_path="/tmp/mi3xx.log", + ) + + +def test_mi3xx_collector_args_default_event_log_uri(): + args = MI3XXCollectorArgs() + uri = args.resolved_event_log_uri() + assert uri.startswith("/redfish/") + assert "EventLog" in uri + + +def test_mi3xx_collector_args_requires_event_log_uri(): + with pytest.raises(ValidationError): + MI3XXCollectorArgs(uri="", rf_event_log_uri="") + + +def test_mi3xx_collector_args_uri_alias(): + args = MI3XXCollectorArgs(uri=f" {DUMMY_EVENT_URI_ALT} ", rf_event_log_uri=DUMMY_EVENT_URI) + assert args.resolved_event_log_uri() == DUMMY_EVENT_URI_ALT + + +def test_mi3xx_collector_args_assembly_requires_both_template_and_devices(): + with pytest.raises(ValidationError): + MI3XXCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_assembly_uri_template="/redfish/v1/Chassis/{device}/Assembly", + ) + with pytest.raises(ValidationError): + MI3XXCollectorArgs( + rf_event_log_uri=EVENT_URI, + 
rf_chassis_devices=["dummy-chassis"], + ) + + +def test_mi3xx_collector_args_reference_time_requires_operator(): + with pytest.raises(ValidationError): + MI3XXCollectorArgs( + rf_event_log_uri=EVENT_URI, + reference_time="2000-01-01", + ) + + +def test_mi3xx_collector_args_accepts_iso_date_and_datetime(): + date_args = MI3XXCollectorArgs( + rf_event_log_uri=EVENT_URI, + reference_time="2000-01-01", + time_operator=">=", + ) + assert date_args.reference_time == "2000-01-01" + + +def test_time_utils_iso_validation_and_comparison(): + assert is_valid_iso_datetime("2000-01-01") + assert satisfies_time_check("2000-01-02", "2000-01-01", ">") + assert compare_iso_datetime("2000-01-01T00:00:00", "2000-01-01T00:00:00", "==") + + +def test_serviceability_plugin_mi3xx_wiring(): + assert issubclass(ServiceabilityPluginMI3XX, ServiceabilityPluginBase) + assert ServiceabilityPluginMI3XX.DATA_MODEL is ServiceabilityDataModel + assert ServiceabilityPluginMI3XX.COLLECTOR is MI3XXCollector + assert ServiceabilityPluginMI3XX.COLLECTOR_ARGS is MI3XXCollectorArgs + assert ServiceabilityPluginMI3XX.ANALYZER is MI3XXAnalyzer + + +def test_mi3xx_collector_no_args(mi3xx_collector): + result, data = mi3xx_collector.collect_data() + assert result.status == ExecutionStatus.NOT_RAN + assert "required" in result.message.lower() + assert data is None + + +def test_mi3xx_collector_success_minimal(mi3xx_collector, redfish_conn_mock): + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=True, + data={RF_MEMBERS: [{"Id": "dummy-1", "Created": DUMMY_TIMESTAMP_LATER}]}, + status_code=200, + ) + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = mi3xx_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert len(data.rf_events) == 1 + assert data.bmc_host == DUMMY_BMC_HOST + assert data.log_path == "/tmp/mi3xx.log" + + +def 
test_mi3xx_collector_satisfies_reference_time_helper(mi3xx_collector): + args = MI3XXCollectorArgs( + rf_event_log_uri=EVENT_URI, + reference_time="2000-01-01", + time_operator=">=", + ) + assert mi3xx_collector.satisfies_reference_time(DUMMY_TIMESTAMP_LATER, args) + assert not mi3xx_collector.satisfies_reference_time(DUMMY_TIMESTAMP_EARLIER, args) + + +def test_mi3xx_collector_filters_events_by_reference_time(mi3xx_collector, redfish_conn_mock): + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=True, + data={ + RF_MEMBERS: [ + {"Id": "dummy-1", "Created": DUMMY_TIMESTAMP_LATER}, + {"Id": "dummy-2", "Created": DUMMY_TIMESTAMP_EARLIER}, + ] + }, + status_code=200, + ) + args = MI3XXCollectorArgs( + rf_event_log_uri=EVENT_URI, + reference_time="2000-01-01", + time_operator=">=", + ) + result, data = mi3xx_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert [event["Id"] for event in data.rf_events] == ["dummy-1"] + + +def test_mi3xx_device_info_fields(): + info = MI3XXDeviceInfo( + board_product_name="dummy-board", + board_serial_number="dummy-serial-001", + product_version="0.0-dummy", + ) + assert info.board_product_name == "dummy-board" + assert info.product_version == "0.0-dummy" + + +def test_mi3xx_result_reporting_versions(): + version_fields = build_mi3xx_reporting_version_fields( + plugin_name="dummy_plugin", + plugin_version="0.0-dummy", + node_scraper_version="0.0-dummy", + dummy_engine_version="0.0-dummy", + ) + result = MI3XXResult(node="dummy-node", **version_fields) + assert result.plugin_name == "dummy_plugin" + assert result.reporter_extensions["dummy_engine_version"] == "0.0-dummy" + + +def test_mi3xx_data_model_log_model(tmp_path): + model = MI3XXDataModel( + collected_data={"events": [{"id": 1}]}, + artifacts={"events.json": [{"id": 1}]}, + ) + model.log_model(str(tmp_path)) + assert (tmp_path / "events.json").is_file() + assert (tmp_path / 
"MI3XX_data.json").is_file() diff --git a/test/unit/plugin/test_se_runner.py b/test/unit/plugin/test_se_runner.py new file mode 100644 index 00000000..cbb5e714 --- /dev/null +++ b/test/unit/plugin/test_se_runner.py @@ -0,0 +1,257 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import json +from pathlib import Path +from types import SimpleNamespace + +import pytest +from pydantic import ValidationError + +from nodescraper.enums import ExecutionStatus +from nodescraper.plugins.serviceability import ( + AfidEvent, + MI3XXAnalyzer, + SeRunError, + ServiceabilityAnalyzerArgs, + ServiceabilityBlock, + ServiceabilityDataModel, + build_afid_events_from_data, + format_serviceability_solution_lines, + normalize_se_timestamp, + run_service_engine, + serviceability_block_from_service_result, +) +from nodescraper.plugins.serviceability.se_models import ServiceabilitySolution +from test.unit.plugin.serviceability_dummy_data import ( + DUMMY_AFID_A, + DUMMY_AFID_B, + DUMMY_AFID_C, + DUMMY_DESIGNATION_A, + DUMMY_DESIGNATION_B, + DUMMY_ENGINE_VERSION, + DUMMY_OEM_VENDOR, + DUMMY_RF_EVENT_COUNT, + DUMMY_SAG_PID, + DUMMY_SAG_REVISION, + DUMMY_SERVICE_ACTION_NUM, + DUMMY_TIMESTAMP, + DUMMY_UNIT_A, + DUMMY_UNIT_B, + DUMMY_UNIT_C, +) + +FIXTURES = Path(__file__).resolve().parent / "fixtures" +AFID_SAG = FIXTURES / "afid_sag_sample.json" +EXAMPLE_EVENTS = [ + AfidEvent(afid=DUMMY_AFID_A, serviceable_unit=DUMMY_UNIT_A, time=DUMMY_TIMESTAMP), + AfidEvent(afid=DUMMY_AFID_B, serviceable_unit=DUMMY_UNIT_B, time=DUMMY_TIMESTAMP), + AfidEvent(afid=DUMMY_AFID_C, serviceable_unit=DUMMY_UNIT_C, time=DUMMY_TIMESTAMP), +] + + +def test_afid_event_requires_non_empty_serviceable_unit(): + with pytest.raises(ValidationError): + AfidEvent(afid=1, serviceable_unit=" ", time=DUMMY_TIMESTAMP) + + +def test_normalize_se_timestamp_preserves_engine_format(): + sample = "2000-01-01 12:00:00.000+00:00" + assert normalize_se_timestamp(sample) == sample + + +def test_analyzer_args_require_engine_config(): + with pytest.raises(ValidationError): + ServiceabilityAnalyzerArgs() + with pytest.raises(ValidationError, match="engine_python_module"): + ServiceabilityAnalyzerArgs(afid_sag_path=str(AFID_SAG)) + 
args = ServiceabilityAnalyzerArgs( + engine_python_module="dummy.test.module", + afid_sag_path=str(AFID_SAG), + ) + assert args.engine_python_module == "dummy.test.module" + + +def test_format_serviceability_solution_lines(): + block = ServiceabilityBlock( + afid_events=EXAMPLE_EVENTS[:1], + solution=[ + ServiceabilitySolution( + afid=DUMMY_AFID_A, + serviceable_unit=[DUMMY_DESIGNATION_A, DUMMY_DESIGNATION_B], + service_action_num=DUMMY_SERVICE_ACTION_NUM, + ) + ], + solution_reasoning="Dummy test reasoning.", + ) + lines = format_serviceability_solution_lines(block) + assert lines[0] == "Dummy test reasoning." + assert f"AFID {DUMMY_AFID_A}" in lines[1] + assert DUMMY_DESIGNATION_A in lines[1] + + +def test_serviceability_block_from_service_result(): + result = SimpleNamespace( + service_info={ + DUMMY_DESIGNATION_A: { + str(DUMMY_AFID_A): { + "service_action_number": str(DUMMY_SERVICE_ACTION_NUM), + "error_category": "dummy_category", + "error_type": "dummy_type", + "title": "Dummy service action", + } + }, + DUMMY_DESIGNATION_B: { + str(DUMMY_AFID_A): { + "service_action_number": str(DUMMY_SERVICE_ACTION_NUM), + "error_category": "dummy_category", + "error_type": "dummy_type", + "title": "Dummy service action", + } + }, + }, + afid_sag_metadata={"sag_pid": DUMMY_SAG_PID, "sag_revision": DUMMY_SAG_REVISION}, + engine_version_info={"version": DUMMY_ENGINE_VERSION}, + ) + block = serviceability_block_from_service_result( + EXAMPLE_EVENTS[:1], + result, + engine_label="Dummy test engine", + rf_event_count=DUMMY_RF_EVENT_COUNT, + ) + assert len(block.solution) == 1 + assert block.solution[0].afid == DUMMY_AFID_A + assert block.solution[0].service_action_num == DUMMY_SERVICE_ACTION_NUM + assert set(block.solution[0].serviceable_unit) == {DUMMY_DESIGNATION_A, DUMMY_DESIGNATION_B} + assert f"{DUMMY_RF_EVENT_COUNT} Redfish event(s)" in block.solution_reasoning + assert "Dummy test engine" in block.solution_reasoning + + +def 
test_resolve_engine_class_finds_package_export(): + import types + + submodule = types.ModuleType("fake_engine.impl") + submodule.__dict__["EngineImpl"] = type( + "EngineImpl", + (), + {"get_service_info": lambda self, rf_events, cper_data=None: None}, + ) + package = types.ModuleType("fake_engine") + package.EngineImpl = submodule.EngineImpl # type: ignore[attr-defined] + package.__all__ = ["EngineImpl"] + + from nodescraper.plugins.serviceability.se_runner import _resolve_engine_class + + assert _resolve_engine_class(package) is submodule.EngineImpl + + +def test_run_service_engine_with_mock_module(): + rf_events = [ + {"Afid": DUMMY_AFID_A, "serviceable_unit": DUMMY_UNIT_A, "Created": DUMMY_TIMESTAMP}, + {"Afid": DUMMY_AFID_C, "serviceable_unit": DUMMY_UNIT_C, "Created": DUMMY_TIMESTAMP}, + ] + block = run_service_engine( + engine_python_module="test.unit.plugin.fixtures.mock_python_engine", + afid_events=EXAMPLE_EVENTS[:2], + afid_sag_path=str(AFID_SAG), + rf_events=rf_events, + ) + assert len(block.solution) == 2 + assert block.solution[0].afid == DUMMY_AFID_A + assert block.solution[0].service_action_num == DUMMY_SERVICE_ACTION_NUM + + +def test_run_service_engine_missing_sag_raises(): + with pytest.raises(SeRunError, match="AFID_SAG"): + run_service_engine( + engine_python_module="test.unit.plugin.fixtures.mock_python_engine", + afid_events=EXAMPLE_EVENTS, + afid_sag_path="/nonexistent/dummy_afid_sag.json", + rf_events=[{"Afid": DUMMY_AFID_A}], + ) + + +def test_build_afid_events_from_rf_members(): + data = ServiceabilityDataModel( + rf_events=[ + { + "Afid": DUMMY_AFID_A, + "serviceable_unit": DUMMY_UNIT_A, + "Created": DUMMY_TIMESTAMP, + }, + { + "Oem": { + DUMMY_OEM_VENDOR: { + "Afid": DUMMY_AFID_B, + "serviceable_unit": DUMMY_UNIT_B, + } + }, + "EventTimestamp": DUMMY_TIMESTAMP, + }, + ] + ) + events = build_afid_events_from_data(data) + assert len(events) == 2 + assert events[0].afid == DUMMY_AFID_A + assert events[1].afid == DUMMY_AFID_B + + +def 
test_mi3xx_analyzer_runs_python_engine(system_info): + data = ServiceabilityDataModel( + rf_events=[ + { + "Afid": DUMMY_AFID_A, + "serviceable_unit": DUMMY_UNIT_A, + "Created": DUMMY_TIMESTAMP, + }, + { + "Afid": DUMMY_AFID_C, + "serviceable_unit": DUMMY_UNIT_C, + "Created": DUMMY_TIMESTAMP, + }, + ] + ) + analyzer = MI3XXAnalyzer(system_info=system_info) + args = ServiceabilityAnalyzerArgs( + engine_python_module="test.unit.plugin.fixtures.mock_python_engine", + afid_sag_path=str(AFID_SAG), + ) + result = analyzer.analyze_data(data, args=args) + assert result.status == ExecutionStatus.OK + assert data.serviceability is not None + assert len(data.serviceability.solution) == 2 + + +def test_mi3xx_analyzer_writes_serviceability_json(tmp_path, system_info): + data = ServiceabilityDataModel( + afid_events=EXAMPLE_EVENTS[:1], + serviceability=ServiceabilityBlock( + afid_events=EXAMPLE_EVENTS[:1], + solution=[], + ), + ) + data.log_model(str(tmp_path)) + payload = json.loads((tmp_path / "serviceability.json").read_text(encoding="utf-8")) + assert payload["afid_events"][0]["afid"] == DUMMY_AFID_A diff --git a/test/unit/plugin/test_serviceability_collector.py b/test/unit/plugin/test_serviceability_collector.py index d7496288..da31e491 100644 --- a/test/unit/plugin/test_serviceability_collector.py +++ b/test/unit/plugin/test_serviceability_collector.py @@ -38,7 +38,7 @@ from nodescraper.models import CollectorArgs from nodescraper.plugins.serviceability import ( DeviceInfo, - Mi3xxCollectorArgs, + MI3XXCollectorArgs, ServiceabilityAnalyzerArgs, ServiceabilityDataModel, ServiceabilityPluginBase, @@ -46,15 +46,16 @@ from nodescraper.plugins.serviceability.serviceability_collector import ( ServiceabilityCollectorBase, ) +from test.unit.plugin.serviceability_dummy_data import DUMMY_BMC_HOST, DUMMY_EVENT_URI -EVENT_URI = "/redfish/v1/Systems/1/LogServices/SEL/Entries" +EVENT_URI = DUMMY_EVENT_URI -class 
_StubServiceabilityCollector(ServiceabilityCollectorBase[Mi3xxCollectorArgs]): +class _StubServiceabilityCollector(ServiceabilityCollectorBase[MI3XXCollectorArgs]): def filter_event_members( self, members: list[Any], - args: Mi3xxCollectorArgs, + args: MI3XXCollectorArgs, ) -> list[Any]: return members @@ -68,21 +69,21 @@ def parse_assembly_entry( self, designation: str, assembly_member_entry: dict[str, Any], - args: Mi3xxCollectorArgs, + args: MI3XXCollectorArgs, ) -> DeviceInfo: return DeviceInfo(name=designation, serial_number=assembly_member_entry.get("SerialNumber")) def extract_component_details( self, firmware_inventory_payload: dict[str, Any], - args: Mi3xxCollectorArgs, + args: MI3XXCollectorArgs, ) -> Optional[str]: return firmware_inventory_payload.get("Details") @pytest.fixture def stub_serviceability_collector(system_info, redfish_conn_mock): - redfish_conn_mock.base_url = "https://bmc.example/redfish/v1" + redfish_conn_mock.base_url = f"https://{DUMMY_BMC_HOST}/redfish/v1" return _StubServiceabilityCollector( system_info=system_info, connection=redfish_conn_mock, @@ -90,40 +91,53 @@ def stub_serviceability_collector(system_info, redfish_conn_mock): ) +def test_mi3xx_collector_args_default_event_log_uri(): + args = MI3XXCollectorArgs() + uri = args.resolved_event_log_uri() + assert uri.startswith("/redfish/") + assert "EventLog" in uri + + def test_mi3xx_collector_args_requires_event_log_uri(): with pytest.raises(ValidationError): - Mi3xxCollectorArgs() + MI3XXCollectorArgs(uri="", rf_event_log_uri="") def test_mi3xx_collector_args_uri_alias_prefers_uri_over_rf_event_log_uri(): - args = Mi3xxCollectorArgs(uri=" /events ", rf_event_log_uri="/other") - assert args.resolved_event_log_uri() == "/events" + args = MI3XXCollectorArgs( + uri=" /redfish/v1/Systems/Dummy/LogServices/DummyEventLog/EntriesAlt ", + rf_event_log_uri="/redfish/v1/Systems/Dummy/LogServices/DummyEventLog/Entries", + ) + assert ( + args.resolved_event_log_uri() + == 
"/redfish/v1/Systems/Dummy/LogServices/DummyEventLog/EntriesAlt" + ) def test_mi3xx_collector_args_assembly_requires_both_template_and_devices(): with pytest.raises(ValidationError): - Mi3xxCollectorArgs( + MI3XXCollectorArgs( rf_event_log_uri=EVENT_URI, rf_assembly_uri_template="/redfish/v1/Chassis/{device}/Assembly", ) with pytest.raises(ValidationError): - Mi3xxCollectorArgs( + MI3XXCollectorArgs( rf_event_log_uri=EVENT_URI, - rf_chassis_devices=["C1"], + rf_chassis_devices=["dummy-chassis"], ) def test_mi3xx_collector_args_assembly_template_must_include_device_placeholder(): with pytest.raises(ValidationError): - Mi3xxCollectorArgs( + MI3XXCollectorArgs( rf_event_log_uri=EVENT_URI, - rf_assembly_uri_template="/redfish/v1/Chassis/C1/Assembly", - rf_chassis_devices=["C1"], + rf_assembly_uri_template="/redfish/v1/Chassis/dummy-chassis/Assembly", + rf_chassis_devices=["dummy-chassis"], ) def test_mi3xx_collector_args_assembly_optional_when_omitted(): - args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI) + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) assert args.rf_assembly_uri_template is None assert args.rf_chassis_devices is None @@ -150,7 +164,7 @@ def test_stub_collector_event_log_get_fails(stub_serviceability_collector, redfi error="timeout", status_code=None, ) - args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI) + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) result, data = stub_serviceability_collector.collect_data(args=args) assert result.status == ExecutionStatus.ERROR assert EVENT_URI in result.message @@ -165,13 +179,13 @@ def test_stub_collector_success_minimal(stub_serviceability_collector, redfish_c data={RF_MEMBERS: members}, status_code=200, ) - args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI) + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) result, data = stub_serviceability_collector.collect_data(args=args) assert result.status == ExecutionStatus.OK assert data is not None assert data.rf_events == members assert 
EVENT_URI in data.responses - assert data.bmc_host == "bmc.example" + assert data.bmc_host == DUMMY_BMC_HOST assert data.log_path == "/tmp/serviceability.log" redfish_conn_mock.run_get_paged.assert_called_once() @@ -193,7 +207,7 @@ def filter_event_members(self, members, args): data={RF_MEMBERS: []}, status_code=200, ) - args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI) + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) result, data = collector.collect_data(args=args) assert result.status == ExecutionStatus.ERROR assert "Event filter failed" in result.message @@ -204,7 +218,7 @@ def test_stub_collector_assembly_and_firmware_paths( stub_serviceability_collector, redfish_conn_mock ): tpl = "/redfish/v1/Chassis/{device}/Assembly" - asm_uri = tpl.format(device="C1") + asm_uri = tpl.format(device="dummy-chassis") fw_uri = "/redfish/v1/UpdateService/FirmwareInventory" def run_get_side_effect(path: str, *_args, **_kwargs): @@ -219,14 +233,14 @@ def run_get_side_effect(path: str, *_args, **_kwargs): return RedfishGetResult( path=asm_uri, success=True, - data={"Assemblies": [{"SerialNumber": "SN-ASM"}]}, + data={"Assemblies": [{"SerialNumber": "dummy-asm-serial"}]}, status_code=200, ) if path == fw_uri: return RedfishGetResult( path=fw_uri, success=True, - data={"Details": "fw-summary"}, + data={"Details": "dummy-fw-summary"}, status_code=200, ) raise AssertionError(f"unexpected Redfish GET path: {path!r}") @@ -238,19 +252,19 @@ def run_get_paged_forbidden(*_args, **_kwargs): redfish_conn_mock.run_get_paged.side_effect = run_get_paged_forbidden - args = Mi3xxCollectorArgs( + args = MI3XXCollectorArgs( rf_event_log_uri=EVENT_URI, rf_assembly_uri_template=tpl, - rf_chassis_devices=["C1"], + rf_chassis_devices=["dummy-chassis"], rf_firmware_bundle_uri=fw_uri, follow_next_link=False, ) result, data = stub_serviceability_collector.collect_data(args=args) assert result.status == ExecutionStatus.OK assert data is not None - assert "C1" in data.assembly_info - assert 
data.assembly_info["C1"].serial_number == "SN-ASM" - assert data.component_details == "fw-summary" + assert "dummy-chassis" in data.assembly_info + assert data.assembly_info["dummy-chassis"].serial_number == "dummy-asm-serial" + assert data.component_details == "dummy-fw-summary" assert asm_uri in data.responses @@ -271,7 +285,7 @@ def test_stub_collector_top_when_count_exceeds_top_uses_skip_and_paged( ) redfish_conn_mock.run_get.return_value = probe redfish_conn_mock.run_get_paged.return_value = window - args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI, top=10) + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI, top=10) result, data = stub_serviceability_collector.collect_data(args=args) assert result.status == ExecutionStatus.OK assert data is not None @@ -300,7 +314,7 @@ def test_stub_collector_top_when_count_within_top_fetches_full_log( ) redfish_conn_mock.run_get.return_value = probe redfish_conn_mock.run_get_paged.return_value = full - args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI, top=50) + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI, top=50) result, data = stub_serviceability_collector.collect_data(args=args) assert result.status == ExecutionStatus.OK assert data is not None From 9ace7e6fea33e98e68b319e4f3109c515f5f3924 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 27 May 2026 12:37:58 -0500 Subject: [PATCH 07/19] updates --- nodescraper/interfaces/datacollectortask.py | 1 - nodescraper/interfaces/task.py | 23 ++- nodescraper/models/event.py | 19 ++- nodescraper/plugins/inband/rocm/rocmdata.py | 55 ++++--- .../plugins/serviceability/analyzer_args.py | 22 ++- .../plugins/serviceability/cper_decode.py | 145 ++++++++++++++++++ .../serviceability/mi3xx/mi3xx_analyzer.py | 58 ++++++- .../serviceability/mi3xx/mi3xx_collector.py | 56 ++++++- .../serviceability_collector.py | 8 +- .../serviceability/serviceability_data.py | 7 + test/unit/plugin/test_mi3xx_collector.py | 33 ++++ .../plugin/test_serviceability_collector.py | 2 +- 12 
files changed, 389 insertions(+), 40 deletions(-) create mode 100644 nodescraper/plugins/serviceability/cper_decode.py diff --git a/nodescraper/interfaces/datacollectortask.py b/nodescraper/interfaces/datacollectortask.py index 020bf053..18308a98 100644 --- a/nodescraper/interfaces/datacollectortask.py +++ b/nodescraper/interfaces/datacollectortask.py @@ -151,7 +151,6 @@ def __init__( Args: system_info (SystemInfo): system info object for target system for data collection system_interaction (SystemInteraction): enum to indicate the type of actions that can be performed when interacting with the system - event_reporter (str, optional): Described the reporter of the event. Defaults to DEFAULT_EVENT_REPORTER. logger (Optional[logging.Logger], optional): python logger object. Defaults to None. log_path (Optional[str], optional): file system log path. Defaults to None. """ diff --git a/nodescraper/interfaces/task.py b/nodescraper/interfaces/task.py index 8855a48a..3696673a 100644 --- a/nodescraper/interfaces/task.py +++ b/nodescraper/interfaces/task.py @@ -73,8 +73,10 @@ def __init__( if session_id is not None: try: uuid.UUID(str(session_id)) - except (ValueError, TypeError, AttributeError): - raise ValueError("session_id must be a valid UUID") from None + except (ValueError, TypeError, AttributeError) as e: + raise ValueError( + f"session_id must be a valid UUID string, got: {session_id}" + ) from e self.session_id: Optional[str] = str(session_id) if session_id is not None else None self.result: TaskResult = self._init_result() @@ -166,7 +168,22 @@ def _log_event( ) if console_log: - self.logger.log(getattr(logging, priority.name, logging.INFO), description) + level = getattr(logging, priority.name, logging.INFO) + prefix = "" + if data: + et = data.get("exception_type") + if et: + prefix = f"[{et}] " + self.logger.log(level, "%s%s", prefix, description) + if data: + tb = data.get("traceback") + if tb: + tb_text = "".join(tb) if isinstance(tb, list) else str(tb) + if 
tb_text.strip(): + self.logger.log(level, "Traceback:\n%s", tb_text.rstrip()) + det = data.get("details") + if det and not tb: + self.logger.log(level, "Details: %s", det) self.result.events.append(event) diff --git a/nodescraper/models/event.py b/nodescraper/models/event.py index 25315ef2..7c959d4e 100644 --- a/nodescraper/models/event.py +++ b/nodescraper/models/event.py @@ -114,13 +114,21 @@ def validate_category(cls, category: Optional[Union[str, Enum]]) -> str: @field_validator("priority", mode="before") @classmethod def validate_priority(cls, priority: Union[str, int, EventPriority]) -> EventPriority: - """Allow priority as EventPriority, enum name string, or IntEnum value (unknown int -> ERROR). + """Allow priority via :class:`EventPriority`, name string, or integer value. + + Integer values use :class:`~enum.IntEnum` construction (same numeric scale as + ``EventPriority``). Values outside the enum (e.g. foreign severity codes) map + to :attr:`EventPriority.ERROR`. Booleans are rejected (``bool`` is a subclass + of ``int`` in Python). Args: - priority: EventPriority, name string, integer matching a level, or unknown int (maps to ERROR). + priority: Enum, member name, or integer severity. Raises: - ValueError: if priority string is invalid, or if a boolean is passed. + ValueError: if *priority* is a boolean or an invalid string name. + + Returns: + Resolved :class:`EventPriority`. 
""" if type(priority) is bool: raise ValueError("priority must not be a boolean") @@ -138,7 +146,10 @@ def validate_priority(cls, priority: Union[str, int, EventPriority]) -> EventPri ) from e if isinstance(priority, EventPriority): return priority - raise ValueError("priority must be an EventPriority or its name as a string") + raise ValueError( + "priority must be an EventPriority, its name as a string, or an int " + "(unknown ints map to ERROR)" + ) @field_serializer("priority") def serialize_priority(self, priority: EventPriority, _info) -> str: diff --git a/nodescraper/plugins/inband/rocm/rocmdata.py b/nodescraper/plugins/inband/rocm/rocmdata.py index cd1b0537..eb1794c3 100644 --- a/nodescraper/plugins/inband/rocm/rocmdata.py +++ b/nodescraper/plugins/inband/rocm/rocmdata.py @@ -24,18 +24,24 @@ # ############################################################################### import re -from typing import ClassVar, List, Optional +from typing import List, Optional -from pydantic import computed_field, field_validator +from pydantic import field_validator from nodescraper.models import DataModel -_ROCM_VERSION_RE = re.compile(r"^(\d+(?:\.\d+){0,3})(?:-(\d+)(?:-gfx\w+(?:;gfx\w+)*)?)?$") +# e.g. 
7.13.0, 7.13.0-123, 7.13.0-123-gfx942, 7.13.0-123-gfx942;gfx950 +_ROCM_VERSION_RE = re.compile(r"^\d+(?:\.\d+){0,3}(?:-\d+)?(?:-gfx\d+(?:;gfx\d+)*)?$") +_ROCM_BUILD_NUMBER_RE = re.compile(r"^\d+(?:\.\d+){0,3}-(\d+)") -class RocmDataModel(DataModel): - ROCM_VERSION_FILENAME: ClassVar[str] = "version-rocm" +def _validate_rocm_version_string(rocm_version: str) -> str: + if not _ROCM_VERSION_RE.match(rocm_version): + raise ValueError(f"ROCm version has invalid format: {rocm_version}") + return rocm_version + +class RocmDataModel(DataModel): rocm_version: str rocm_sub_versions: dict[str, str] = {} rocminfo: List[str] = [] @@ -47,28 +53,33 @@ class RocmDataModel(DataModel): clinfo: List[str] = [] kfd_proc: List[str] = [] - @staticmethod - def _validate_version_string(version: str) -> str: - if not _ROCM_VERSION_RE.match(version): - raise ValueError(f"ROCm version has invalid format: {version}") - return version - @field_validator("rocm_version") @classmethod def validate_rocm_version(cls, rocm_version: str) -> str: - return cls._validate_version_string(rocm_version) + """ + Validate the ROCm version format. + + Args: + rocm_version (str): The ROCm version string to validate. + + Raises: + ValueError: If the ROCm version does not match the expected format. + + Returns: + str: The validated ROCm version string. 
+ """ + return _validate_rocm_version_string(rocm_version) @field_validator("rocm_sub_versions") @classmethod - def validate_rocm_sub_versions(cls, sub_versions: dict[str, str]) -> dict[str, str]: - for version in sub_versions.values(): - cls._validate_version_string(version) - return sub_versions + def validate_rocm_sub_versions(cls, rocm_sub_versions: dict[str, str]) -> dict[str, str]: + for value in rocm_sub_versions.values(): + _validate_rocm_version_string(value) + return rocm_sub_versions - @computed_field + @property def build_number(self) -> Optional[str]: - """Build tag from version-rocm sub-version, or rocm_version when absent.""" - rocm_version = self.rocm_sub_versions.get(self.ROCM_VERSION_FILENAME, self.rocm_version) - if "-" in rocm_version: - return rocm_version.split("-")[1] - return None + """ROCm package build number from version-rocm sub-version or rocm_version.""" + version_str = self.rocm_sub_versions.get("version-rocm") or self.rocm_version + match = _ROCM_BUILD_NUMBER_RE.match(version_str) + return match.group(1) if match else None diff --git a/nodescraper/plugins/serviceability/analyzer_args.py b/nodescraper/plugins/serviceability/analyzer_args.py index 679743dd..8d5deea1 100644 --- a/nodescraper/plugins/serviceability/analyzer_args.py +++ b/nodescraper/plugins/serviceability/analyzer_args.py @@ -54,8 +54,28 @@ class ServiceabilityAnalyzerArgs(AnalyzerArgs): default=False, description="If True, only build afid_events without running the service engine.", ) + cper_decode_module: Optional[str] = Field( + default=None, + description=( + "Import path of the Python module that decodes CPER blobs (e.g. " + "vendor.package.cdump_analyzer). Required when collected events " + "include CPER attachments to decode before running the service engine." + ), + ) + cper_decode_method: str = Field( + default="analyze_cper", + description=( + "Name of the callable on cper_decode_module. 
It must accept a " + "binary file-like CPER payload and return (return_code, decode_dict)." + ), + ) - @field_validator("afid_sag_path", "engine_python_module", "engine_display_name") + @field_validator( + "afid_sag_path", + "engine_python_module", + "engine_display_name", + "cper_decode_module", + ) @classmethod def _strip_optional_strings(cls, value: Optional[str]) -> Optional[str]: if value is None: diff --git a/nodescraper/plugins/serviceability/cper_decode.py b/nodescraper/plugins/serviceability/cper_decode.py new file mode 100644 index 00000000..6982407a --- /dev/null +++ b/nodescraper/plugins/serviceability/cper_decode.py @@ -0,0 +1,145 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +"""Decode collected CPER attachments via a configured Python decode module.""" +from __future__ import annotations + +import base64 +import binascii +import importlib +import io +import logging +from typing import Any, Callable, Optional + + +class CperDecodeError(RuntimeError): + """Raised when the configured CPER decode module cannot be loaded or decoding fails.""" + + +def _load_decode_callable( + cper_decode_module: str, + cper_decode_method: str, +) -> Callable[[io.BytesIO], tuple[int, Any]]: + """Import a decode callable from analysis_args (module + method name).""" + try: + module = importlib.import_module(cper_decode_module) + except ImportError as exc: + raise CperDecodeError( + f"Cannot import cper_decode_module {cper_decode_module!r}: {exc}" + ) from exc + + decode_fn = getattr(module, cper_decode_method, None) + if decode_fn is None: + raise CperDecodeError( + f"Module {cper_decode_module!r} has no callable {cper_decode_method!r}" + ) + if not callable(decode_fn): + raise CperDecodeError(f"{cper_decode_module!r}.{cper_decode_method!r} is not callable") + return decode_fn + + +def count_ras_err_entries(decode_payload: Any) -> int: + """Count RasErr* keys in a decoded CPER triage_result dict.""" + if not isinstance(decode_payload, dict): + return 0 + triage_result = decode_payload.get("triage_result", {}) + if not isinstance(triage_result, dict): + return 0 + return sum(1 for key in triage_result if str(key).startswith("RasErr")) + + +def decode_cper_raw_attachments( + cper_raw: dict[str, str], + *, + cper_decode_module: str, + cper_decode_method: str = "analyze_cper", + logger: Optional[logging.Logger] = None, +) -> dict[str, Any]: + """Decode base64 CPER blobs keyed by Redfish event Id. + + The decode callable must accept a binary file-like object and return + ``(return_code, decode_dict)``. 
Results are passed to the service engine as + ``cper_data``; the engine does not perform CPER decoding itself. + + Returns ``{event_id: {"return_code": int, "decode": dict}}``. + """ + if not cper_raw: + return {} + + decode_fn = _load_decode_callable(cper_decode_module, cper_decode_method) + + decoded: dict[str, Any] = {} + errors: list[str] = [] + + for event_id, payload_b64 in cper_raw.items(): + try: + raw = base64.b64decode(payload_b64, validate=True) + except (binascii.Error, ValueError) as exc: + errors.append(f"event {event_id}: invalid base64 ({exc})") + continue + + try: + return_code, decode_payload = decode_fn(io.BytesIO(raw)) + except Exception as exc: # noqa: BLE001 + msg = f"event {event_id}: {exc}" + errors.append(msg) + if logger is not None: + logger.warning("CPER decode failed for Redfish event %s: %s", event_id, exc) + continue + + if return_code != 0: + errors.append(f"event {event_id}: decode return code {return_code}") + + decoded[str(event_id)] = { + "return_code": return_code, + "decode": decode_payload, + } + if logger is not None: + ras_count = count_ras_err_entries(decode_payload) + if return_code == 0: + logger.info( + "CPER decoded for Redfish event %s (return_code=0, %d RasErr entr%s)", + event_id, + ras_count, + "y" if ras_count == 1 else "ies", + ) + else: + logger.warning( + "CPER decoded for Redfish event %s with non-zero return_code=%s " + "(%d RasErr entr%s)", + event_id, + return_code, + ras_count, + "y" if ras_count == 1 else "ies", + ) + + if errors and not decoded: + raise CperDecodeError("; ".join(errors)) + + if logger is not None and errors: + for msg in errors: + logger.warning("CPER decode issue: %s", msg) + + return decoded diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py index ab001184..b8fc8373 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py @@ 
-32,6 +32,10 @@ from nodescraper.models import TaskResult from nodescraper.plugins.serviceability.afid_events import build_afid_events_from_data from nodescraper.plugins.serviceability.analyzer_args import ServiceabilityAnalyzerArgs +from nodescraper.plugins.serviceability.cper_decode import ( + CperDecodeError, + decode_cper_raw_attachments, +) from nodescraper.plugins.serviceability.se_adapter import ( format_serviceability_solution_lines, ) @@ -67,6 +71,51 @@ def analyze_data( self._log_serviceability_solutions(data.serviceability) return self.result + parent = self.parent or self.__class__.__name__ + cper_data = data.cper_data or {} + if data.cper_raw and not cper_data: + if not args.cper_decode_module: + self.logger.warning( + "(%s) %d CPER attachment(s) collected but cper_decode_module is " + "not set in analysis_args; skipping CPER decode", + parent, + len(data.cper_raw), + ) + else: + self.logger.info( + "(%s) Decoding %d CPER attachment(s) via %s.%s", + parent, + len(data.cper_raw), + args.cper_decode_module, + args.cper_decode_method, + ) + try: + cper_data = decode_cper_raw_attachments( + data.cper_raw, + cper_decode_module=args.cper_decode_module, + cper_decode_method=args.cper_decode_method, + logger=self.logger, + ) + data.cper_data = cper_data + self.logger.info( + "(%s) CPER decode finished: %d of %d attachment(s) decoded", + parent, + len(cper_data), + len(data.cper_raw), + ) + except CperDecodeError as exc: + self.logger.warning( + "(%s) %s; continuing without decoded CPER", + parent, + exc, + ) + elif cper_data: + self.logger.info( + "(%s) Using %d pre-decoded CPER record(s) from collection", + parent, + len(cper_data), + ) + try: block = run_service_engine( engine_python_module=args.engine_python_module, # type: ignore[arg-type] @@ -74,7 +123,7 @@ def analyze_data( afid_events=events, afid_sag_path=args.afid_sag_path, # type: ignore[arg-type] rf_events=data.rf_events, - cper_data=data.cper_data or None, + cper_data=cper_data or None, ) except 
(SeRunError, ValueError) as exc: self.result.status = ExecutionStatus.ERROR @@ -85,9 +134,14 @@ def analyze_data( self._log_serviceability_solutions(block) engine_label = args.engine_display_name or args.engine_python_module self.result.status = ExecutionStatus.OK + cper_summary = "" + if cper_data: + cper_summary = f", {len(cper_data)} decoded CPER(s)" + elif data.cper_raw: + cper_summary = f", {len(data.cper_raw)} CPER attachment(s) not decoded" self.result.message = ( f"{engine_label}: {len(block.solution)} solution(s) " - f"from {len(data.rf_events)} Redfish event(s)" + f"from {len(data.rf_events)} Redfish event(s){cper_summary}" ) return self.result diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py index 63e23e21..44594aee 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py @@ -25,6 +25,7 @@ ############################################################################### from __future__ import annotations +import base64 from typing import Any, Optional from nodescraper.plugins.serviceability.serviceability_collector import ( @@ -67,12 +68,63 @@ def filter_event_members( return filtered def is_cper_event(self, event: dict) -> bool: + if "CPER" in event: + return True + if str(event.get("DiagnosticDataType", "")).upper() == "CPER": + return True + if event.get("AdditionalDataURI"): + return True message_id = str(event.get("MessageId", "")).lower() message = str(event.get("Message", "")).lower() return "cper" in message_id or "cper" in message or "diagnostic" in message_id - def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: - return {} + def collect_cper_attachments(self, rf_events: list[Any]) -> dict[str, str]: + """Fetch CPER binaries from BMC; decoding runs in the analyzer.""" + parent = self.parent or self.__class__.__name__ + attachments: dict[str, str] = {} + for event 
in rf_events: + if not isinstance(event, dict) or not self.is_cper_event(event): + continue + uri = event.get("AdditionalDataURI") + event_id = event.get("Id") + if not uri or not event_id: + continue + + try: + resp = self.connection.get_response(uri) + except Exception as exc: # noqa: BLE001 + self.logger.warning( + "(%s) Failed to fetch CPER attachment for event %s: %s", + parent, + event_id, + exc, + ) + continue + if not resp.ok: + self.logger.warning( + "(%s) Failed to fetch CPER attachment for event %s: HTTP %s", + parent, + event_id, + resp.status_code, + ) + continue + + size_bytes = len(resp.content) + attachments[str(event_id)] = base64.b64encode(resp.content).decode("ascii") + self.logger.info( + "(%s) Fetched CPER attachment for Redfish event %s (%d bytes)", + parent, + event_id, + size_bytes, + ) + + if attachments: + self.logger.info( + "(%s) Collected %d CPER attachment(s) for analyzer decode", + parent, + len(attachments), + ) + return attachments def parse_assembly_entry( self, diff --git a/nodescraper/plugins/serviceability/serviceability_collector.py b/nodescraper/plugins/serviceability/serviceability_collector.py index 961afdf9..3278c113 100644 --- a/nodescraper/plugins/serviceability/serviceability_collector.py +++ b/nodescraper/plugins/serviceability/serviceability_collector.py @@ -76,8 +76,8 @@ def is_cper_event(self, event: dict) -> bool: """Return whether a Redfish event entry should be treated as diagnostic-backed.""" @abc.abstractmethod - def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: - """Fetch and decode diagnostic attachments for qualifying events (subclass-defined).""" + def collect_cper_attachments(self, rf_events: list[Any]) -> dict[str, str]: + """Fetch CPER binary attachments for qualifying events (base64 by event Id).""" @abc.abstractmethod def parse_assembly_entry( @@ -151,13 +151,13 @@ def collect_data( entry = assemblies[0] assembly_info[device] = self.parse_assembly_entry(device, entry, svc_args) - 
cper_data = self.collect_cper_data(filtered_members or []) + cper_raw = self.collect_cper_attachments(filtered_members or []) data = ServiceabilityDataModel( responses=responses, rf_events=filtered_members or [], assembly_info=assembly_info, - cper_data=cper_data, + cper_raw=cper_raw, component_details=self._fetch_component_details(responses, svc_args), log_path=self._log_path, bmc_host=bmc_host, diff --git a/nodescraper/plugins/serviceability/serviceability_data.py b/nodescraper/plugins/serviceability/serviceability_data.py index 0c387940..b275c579 100644 --- a/nodescraper/plugins/serviceability/serviceability_data.py +++ b/nodescraper/plugins/serviceability/serviceability_data.py @@ -66,6 +66,13 @@ class ServiceabilityDataModel(DataModel): responses: dict[str, Any] = {} rf_events: list[Any] = [] assembly_info: Dict[str, DeviceInfo] = {} + cper_raw: Dict[str, str] = Field( + default_factory=dict, + description=( + "Base64-encoded CPER attachment bytes keyed by Redfish event Id; " + "populated during collection and decoded in the analyzer." 
+ ), + ) cper_data: Dict[str, Any] = {} component_details: Optional[str] = None log_path: Optional[str] = None diff --git a/test/unit/plugin/test_mi3xx_collector.py b/test/unit/plugin/test_mi3xx_collector.py index b89b1b71..91ff0ed0 100644 --- a/test/unit/plugin/test_mi3xx_collector.py +++ b/test/unit/plugin/test_mi3xx_collector.py @@ -158,6 +158,39 @@ def test_mi3xx_collector_satisfies_reference_time_helper(mi3xx_collector): assert not mi3xx_collector.satisfies_reference_time(DUMMY_TIMESTAMP_EARLIER, args) +def test_mi3xx_collector_fetches_cper_attachments(mi3xx_collector, redfish_conn_mock): + import base64 + from unittest.mock import MagicMock + + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=True, + data={ + RF_MEMBERS: [ + { + "Id": "cper-evt-1", + "Created": DUMMY_TIMESTAMP_LATER, + "DiagnosticDataType": "CPER", + "AdditionalDataURI": "/redfish/v1/Systems/UBB/LogServices/EventLog/Attachments/1", + } + ] + }, + status_code=200, + ) + response = MagicMock() + response.ok = True + response.status_code = 200 + response.content = b"\x01\x02dummy-cper" + redfish_conn_mock.get_response.return_value = response + + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = mi3xx_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.cper_raw["cper-evt-1"] == base64.b64encode(b"\x01\x02dummy-cper").decode("ascii") + assert data.cper_data == {} + + def test_mi3xx_collector_filters_events_by_reference_time(mi3xx_collector, redfish_conn_mock): redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( path=EVENT_URI, diff --git a/test/unit/plugin/test_serviceability_collector.py b/test/unit/plugin/test_serviceability_collector.py index da31e491..603cf08a 100644 --- a/test/unit/plugin/test_serviceability_collector.py +++ b/test/unit/plugin/test_serviceability_collector.py @@ -62,7 +62,7 @@ def filter_event_members( def is_cper_event(self, event: 
dict) -> bool: return False - def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: + def collect_cper_attachments(self, rf_events: list[Any]) -> dict[str, str]: return {} def parse_assembly_entry( From eeb98889178e670787ac1f0622db2685a9bbad17 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 27 May 2026 12:42:36 -0500 Subject: [PATCH 08/19] undid rocmdata changes --- nodescraper/plugins/inband/rocm/rocmdata.py | 55 +++++++++------------ 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/nodescraper/plugins/inband/rocm/rocmdata.py b/nodescraper/plugins/inband/rocm/rocmdata.py index eb1794c3..cd1b0537 100644 --- a/nodescraper/plugins/inband/rocm/rocmdata.py +++ b/nodescraper/plugins/inband/rocm/rocmdata.py @@ -24,24 +24,18 @@ # ############################################################################### import re -from typing import List, Optional +from typing import ClassVar, List, Optional -from pydantic import field_validator +from pydantic import computed_field, field_validator from nodescraper.models import DataModel -# e.g. 
7.13.0, 7.13.0-123, 7.13.0-123-gfx942, 7.13.0-123-gfx942;gfx950 -_ROCM_VERSION_RE = re.compile(r"^\d+(?:\.\d+){0,3}(?:-\d+)?(?:-gfx\d+(?:;gfx\d+)*)?$") -_ROCM_BUILD_NUMBER_RE = re.compile(r"^\d+(?:\.\d+){0,3}-(\d+)") - - -def _validate_rocm_version_string(rocm_version: str) -> str: - if not _ROCM_VERSION_RE.match(rocm_version): - raise ValueError(f"ROCm version has invalid format: {rocm_version}") - return rocm_version +_ROCM_VERSION_RE = re.compile(r"^(\d+(?:\.\d+){0,3})(?:-(\d+)(?:-gfx\w+(?:;gfx\w+)*)?)?$") class RocmDataModel(DataModel): + ROCM_VERSION_FILENAME: ClassVar[str] = "version-rocm" + rocm_version: str rocm_sub_versions: dict[str, str] = {} rocminfo: List[str] = [] @@ -53,33 +47,28 @@ class RocmDataModel(DataModel): clinfo: List[str] = [] kfd_proc: List[str] = [] + @staticmethod + def _validate_version_string(version: str) -> str: + if not _ROCM_VERSION_RE.match(version): + raise ValueError(f"ROCm version has invalid format: {version}") + return version + @field_validator("rocm_version") @classmethod def validate_rocm_version(cls, rocm_version: str) -> str: - """ - Validate the ROCm version format. - - Args: - rocm_version (str): The ROCm version string to validate. - - Raises: - ValueError: If the ROCm version does not match the expected format. - - Returns: - str: The validated ROCm version string. 
- """ - return _validate_rocm_version_string(rocm_version) + return cls._validate_version_string(rocm_version) @field_validator("rocm_sub_versions") @classmethod - def validate_rocm_sub_versions(cls, rocm_sub_versions: dict[str, str]) -> dict[str, str]: - for value in rocm_sub_versions.values(): - _validate_rocm_version_string(value) - return rocm_sub_versions + def validate_rocm_sub_versions(cls, sub_versions: dict[str, str]) -> dict[str, str]: + for version in sub_versions.values(): + cls._validate_version_string(version) + return sub_versions - @property + @computed_field def build_number(self) -> Optional[str]: - """ROCm package build number from version-rocm sub-version or rocm_version.""" - version_str = self.rocm_sub_versions.get("version-rocm") or self.rocm_version - match = _ROCM_BUILD_NUMBER_RE.match(version_str) - return match.group(1) if match else None + """Build tag from version-rocm sub-version, or rocm_version when absent.""" + rocm_version = self.rocm_sub_versions.get(self.ROCM_VERSION_FILENAME, self.rocm_version) + if "-" in rocm_version: + return rocm_version.split("-")[1] + return None From 1e235c1e68389cecf8ac41fd901f8ccfc35a3a65 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 15 Jun 2026 15:02:59 -0500 Subject: [PATCH 09/19] updates --- nodescraper/interfaces/datacollectortask.py | 3 +- .../plugins/serviceability/__init__.py | 4 +- .../plugins/serviceability/analyzer_args.py | 96 ++++++++-- .../serviceability/mi3xx/mi3xx_analyzer.py | 19 +- .../plugins/serviceability/se_adapter.py | 100 ++++++++-- .../plugins/serviceability/se_models.py | 16 +- .../plugins/serviceability/se_runner.py | 107 ++++++++--- test/unit/mock_python_engine.py | 43 +++++ test/unit/plugin/test_se_runner.py | 172 ++++++++++++++++-- 9 files changed, 477 insertions(+), 83 deletions(-) create mode 100644 test/unit/mock_python_engine.py diff --git a/nodescraper/interfaces/datacollectortask.py b/nodescraper/interfaces/datacollectortask.py index 3c30a6ea..60826b16 
100644 --- a/nodescraper/interfaces/datacollectortask.py +++ b/nodescraper/interfaces/datacollectortask.py @@ -204,7 +204,8 @@ def __init_subclass__(cls, **kwargs) -> None: if not issubclass(cls.DATA_MODEL, DataModel): raise TypeError(f"DATA_MODEL must be a subclass of DataModel in {cls.__name__}") if hasattr(cls, "collect_data"): - cls.collect_data = collect_decorator(cls.collect_data) + if "collect_data" in vars(cls): + cls.collect_data = collect_decorator(cls.collect_data) else: raise TypeError(f"Data collector {cls.__name__} must implement collect_data") diff --git a/nodescraper/plugins/serviceability/__init__.py b/nodescraper/plugins/serviceability/__init__.py index 36671691..c5e9f857 100644 --- a/nodescraper/plugins/serviceability/__init__.py +++ b/nodescraper/plugins/serviceability/__init__.py @@ -40,7 +40,7 @@ serviceability_block_from_service_result, ) from .se_models import AfidEvent, ServiceabilityBlock, ServiceabilitySolution -from .se_runner import SeRunError, run_service_engine +from .se_runner import SeRunError, run_service_hub from .serviceability_collector import ServiceabilityCollectorBase from .serviceability_data import ( DeviceInfo, @@ -83,7 +83,7 @@ "is_valid_iso_datetime", "normalize_se_timestamp", "parse_iso_datetime", - "run_service_engine", + "run_service_hub", "serviceability_block_from_service_result", "satisfies_time_check", ] diff --git a/nodescraper/plugins/serviceability/analyzer_args.py b/nodescraper/plugins/serviceability/analyzer_args.py index 8d5deea1..2aa27ccd 100644 --- a/nodescraper/plugins/serviceability/analyzer_args.py +++ b/nodescraper/plugins/serviceability/analyzer_args.py @@ -25,7 +25,7 @@ ############################################################################### from __future__ import annotations -from typing import Optional +from typing import Any, Optional from pydantic import Field, field_validator, model_validator @@ -33,14 +33,11 @@ class ServiceabilityAnalyzerArgs(AnalyzerArgs): - """Analyzer args for MI3XX 
serviceability (Python engine via plugin config).""" + """Analyzer args for serviceability plugins that run a configurable Python hub.""" engine_python_module: Optional[str] = Field( default=None, - description=( - "Importable Python module providing a service engine class with " - "get_service_info(rf_events, cper_data=...)." - ), + description="Import path for the hub module (class implements engine_analyze_method); hub_options forwards kwargs.", ) engine_display_name: Optional[str] = Field( default=None, @@ -48,27 +45,86 @@ class ServiceabilityAnalyzerArgs(AnalyzerArgs): ) afid_sag_path: Optional[str] = Field( default=None, - description="Path to AFID_SAG.json.", + description="Path to hub config (e.g. AFID_SAG.json); passed as engine_init_path_kwarg.", + ) + engine_init_path_kwarg: str = Field( + default="afid_sag", + description="Hub __init__ keyword that receives afid_sag_path.", + ) + engine_analyze_method: str = Field( + default="get_service_info", + description="Hub method called with rf_events first (default get_service_info).", ) skip_engine: bool = Field( default=False, - description="If True, only build afid_events without running the service engine.", + description="If True, only build afid_events without running the service hub.", ) cper_decode_module: Optional[str] = Field( default=None, - description=( - "Import path of the Python module that decodes CPER blobs (e.g. " - "vendor.package.cdump_analyzer). Required when collected events " - "include CPER attachments to decode before running the service engine." - ), + description="Module import path for CPER decoding when events include CPER attachments.", ) cper_decode_method: str = Field( default="analyze_cper", - description=( - "Name of the callable on cper_decode_module. It must accept a " - "binary file-like CPER payload and return (return_code, decode_dict)." 
- ), + description="Callable on cper_decode_module: file-like CPER in, (return_code, decode_dict) out.", + ) + hub_options: Optional[dict[str, Any]] = Field( + default=None, + description="Extra kwargs for hub __init__ and analyze; collected cper_data overrides cper_data key.", + ) + from_ac_cycle: int = Field( + default=-1, + ge=-1, + description="from_ac_cycle kwarg for the hub analyze call (merged after hub_options).", + ) + from_date: Optional[str] = Field( + default=None, + description="Optional from_date for the hub analyze call (merged after hub_options).", ) + designation_serials: Optional[dict[str, str]] = Field( + default=None, + description="Optional designation_serials for the hub analyze call (merged after hub_options).", + ) + suppress_service_actions: Optional[list[str]] = Field( + default=None, + description="Optional suppress_service_actions for the hub analyze call (merged after hub_options).", + ) + + def resolved_hub_options(self) -> dict[str, Any]: + """Merge hub_options with from_ac_cycle, from_date, designation_serials, and suppress_service_actions.""" + merged = dict(self.hub_options or {}) + merged["from_ac_cycle"] = self.from_ac_cycle + if self.from_date is not None: + merged["from_date"] = self.from_date + if self.designation_serials is not None: + merged["designation_serials"] = self.designation_serials + if self.suppress_service_actions is not None: + merged["suppress_service_actions"] = self.suppress_service_actions + return merged + + @field_validator("engine_analyze_method", "engine_init_path_kwarg") + @classmethod + def _strip_non_empty_hub_hooks(cls, value: str) -> str: + text = str(value).strip() + if not text: + raise ValueError("must not be empty") + return text + + @field_validator("hub_options", mode="before") + @classmethod + def _none_empty_hub_options(cls, value: object) -> Optional[dict[str, Any]]: + if value is None: + return None + if isinstance(value, dict) and not value: + return None + return value # type: 
ignore[return-value] + + @field_validator("from_date", mode="before") + @classmethod + def _strip_from_date(cls, value: object) -> Optional[str]: + if value is None: + return None + text = str(value).strip() + return text or None @field_validator( "afid_sag_path", @@ -84,11 +140,11 @@ def _strip_optional_strings(cls, value: Optional[str]) -> Optional[str]: return text or None @model_validator(mode="after") - def _require_engine_config_when_running(self) -> ServiceabilityAnalyzerArgs: + def _require_hub_config_when_running(self) -> ServiceabilityAnalyzerArgs: if self.skip_engine: return self if not self.afid_sag_path: - raise ValueError("afid_sag_path is required when running the service engine.") + raise ValueError("afid_sag_path is required when running the service hub.") if not self.engine_python_module: - raise ValueError("engine_python_module is required when running the service engine.") + raise ValueError("engine_python_module is required when running the service hub.") return self diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py index b8fc8373..0424e8e2 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py @@ -40,14 +40,14 @@ format_serviceability_solution_lines, ) from nodescraper.plugins.serviceability.se_models import ServiceabilityBlock -from nodescraper.plugins.serviceability.se_runner import SeRunError, run_service_engine +from nodescraper.plugins.serviceability.se_runner import SeRunError, run_service_hub from nodescraper.plugins.serviceability.serviceability_data import ( ServiceabilityDataModel, ) class MI3XXAnalyzer(DataAnalyzer[ServiceabilityDataModel, ServiceabilityAnalyzerArgs]): - """Build AFID events from collected data and run the configured service engine.""" + """Build AFID events from collected data and run the configured service hub.""" DATA_MODEL = ServiceabilityDataModel 
@@ -67,7 +67,7 @@ def analyze_data( if args.skip_engine: data.serviceability = ServiceabilityBlock(afid_events=events) self.result.status = ExecutionStatus.OK - self.result.message = f"Built {len(events)} AFID event(s); engine skipped" + self.result.message = f"Built {len(events)} AFID event(s); hub skipped" self._log_serviceability_solutions(data.serviceability) return self.result @@ -117,13 +117,16 @@ def analyze_data( ) try: - block = run_service_engine( + block = run_service_hub( engine_python_module=args.engine_python_module, # type: ignore[arg-type] engine_display_name=args.engine_display_name, afid_events=events, afid_sag_path=args.afid_sag_path, # type: ignore[arg-type] rf_events=data.rf_events, cper_data=cper_data or None, + hub_options=args.resolved_hub_options(), + engine_analyze_method=args.engine_analyze_method, + engine_init_path_kwarg=args.engine_init_path_kwarg, ) except (SeRunError, ValueError) as exc: self.result.status = ExecutionStatus.ERROR @@ -139,9 +142,15 @@ def analyze_data( cper_summary = f", {len(cper_data)} decoded CPER(s)" elif data.cper_raw: cper_summary = f", {len(data.cper_raw)} CPER attachment(s) not decoded" + ver_bits: list[str] = [] + if block.hub_version: + ver_bits.append(f"hub {block.hub_version}") + if block.afid_sag_file_version: + ver_bits.append(f"AFID_SAG {block.afid_sag_file_version}") + ver_suffix = f" [{'; '.join(ver_bits)}]" if ver_bits else "" self.result.message = ( f"{engine_label}: {len(block.solution)} solution(s) " - f"from {len(data.rf_events)} Redfish event(s){cper_summary}" + f"from {len(data.rf_events)} Redfish event(s){cper_summary}{ver_suffix}" ) return self.result diff --git a/nodescraper/plugins/serviceability/se_adapter.py b/nodescraper/plugins/serviceability/se_adapter.py index 243b2d7d..0e31135a 100644 --- a/nodescraper/plugins/serviceability/se_adapter.py +++ b/nodescraper/plugins/serviceability/se_adapter.py @@ -23,30 +23,80 @@ # SOFTWARE. 
# ############################################################################### -"""Map serviceability plugin models to/from Python service engine results.""" +"""Map serviceability plugin models to/from Python service hub results.""" from __future__ import annotations from collections import defaultdict -from typing import Any +from typing import Any, Optional from .se_models import AfidEvent, ServiceabilityBlock, ServiceabilitySolution +def _hub_version_display(version_info: Any) -> Optional[str]: + """Pick a single hub version string from common hub result version dict layouts.""" + if not isinstance(version_info, dict) or not version_info: + return None + primary = ( + version_info.get("isa_version") + or version_info.get("version") + or version_info.get("engine_version") + or version_info.get("VERSION") + ) + if primary is None: + return None + text = str(primary).strip() + if not text: + return None + bd = version_info.get("build_date") + if bd and str(bd).strip(): + return f"{text} (build {str(bd).strip()})" + return text + + +def _afid_sag_file_version_display(metadata: Any) -> Optional[str]: + """Build a short AFID_SAG file identity string from hub ``afid_sag_metadata``.""" + if not isinstance(metadata, dict) or not metadata: + return None + pid = metadata.get("sag_pid") or metadata.get("pid") + rev = metadata.get("sag_revision") or metadata.get("revision") + extra = ( + metadata.get("sag_version") + or metadata.get("file_version") + or metadata.get("schema_version") + ) + parts: list[str] = [] + if pid and str(pid).strip(): + parts.append(f"PID {str(pid).strip()}") + if rev and str(rev).strip(): + parts.append(f"revision {str(rev).strip()}") + if extra and str(extra).strip(): + ex = str(extra).strip() + if ex not in (str(pid or "").strip(), str(rev or "").strip()): + parts.append(f"version {ex}") + if not parts: + return None + return ", ".join(parts) + + def format_serviceability_solution_lines(block: ServiceabilityBlock) -> list[str]: 
"""Human-readable lines for logging or console output.""" lines: list[str] = [] if block.solution_reasoning: lines.append(block.solution_reasoning) + if block.hub_version: + lines.append(f"Hub version: {block.hub_version}") + if block.afid_sag_file_version: + lines.append(f"AFID_SAG file: {block.afid_sag_file_version}") if not block.solution: lines.append("No service actions recommended.") return lines for index, solution in enumerate(block.solution, start=1): units = ", ".join(solution.serviceable_unit) - lines.append( - f"[{index}] AFID {solution.afid}, " - f"service action {solution.service_action_num}, " - f"units: [{units}]" - ) + title = (solution.service_action_title or "").strip() + action = f"service action {solution.service_action_num}" + if title: + action = f"{action} ({title})" + lines.append(f"[{index}] AFID {solution.afid}, {action}, units: [{units}]") return lines @@ -54,12 +104,22 @@ def serviceability_block_from_service_result( afid_events: list[AfidEvent], result: Any, *, - engine_label: str = "Service engine", + engine_label: str = "Service hub", rf_event_count: int = 0, ) -> ServiceabilityBlock: - """Build a :class:`ServiceabilityBlock` from an engine result with ``service_info``.""" + """Build a :class:`ServiceabilityBlock` from a hub result with ``service_info``.""" grouped: dict[tuple[int, int], list[str]] = defaultdict(list) + titles: dict[tuple[int, int], str] = {} service_info = getattr(result, "service_info", None) or {} + + def _action_title(info: dict[str, Any]) -> str: + raw = info.get("title") or info.get("service_action") or info.get("ServiceAction") + if raw is None: + return "" + if isinstance(raw, dict): + return str(raw.get("title") or raw.get("text") or raw.get("name") or "").strip() + return str(raw).strip() + for designation, afid_map in service_info.items(): if not isinstance(afid_map, dict): continue @@ -78,29 +138,33 @@ def serviceability_block_from_service_result( key = (afid, san) if unit and unit not in grouped[key]: 
grouped[key].append(unit) + label = _action_title(info) + if label and key not in titles: + titles[key] = label solutions = [ ServiceabilitySolution( afid=afid, serviceable_unit=units, service_action_num=san, + service_action_title=titles.get((afid, san)), ) for (afid, san), units in sorted(grouped.items()) ] metadata = getattr(result, "afid_sag_metadata", None) or {} version_info = ( - getattr(result, "engine_version_info", None) or getattr(result, "version_info", None) or {} - ) - sag_pid = metadata.get("sag_pid") or metadata.get("pid") or "unknown" - sag_revision = metadata.get("sag_revision") or metadata.get("revision") or "unknown" - engine_version = version_info.get("version") or version_info.get("engine_version") - version_suffix = f", engine {engine_version}" if engine_version else "" - reasoning = ( - f"{engine_label} (SAG {sag_pid} rev {sag_revision}{version_suffix}): " - f"{len(solutions)} recommendation(s) from {rf_event_count} Redfish event(s)." + getattr(result, "engine_version_info", None) + or getattr(result, "isa_version_info", None) + or getattr(result, "version_info", None) + or {} ) + hub_version = _hub_version_display(version_info) + afid_sag_file_version = _afid_sag_file_version_display(metadata) + reasoning = f"{engine_label}: {len(solutions)} recommendation(s) from {rf_event_count} Redfish event(s)." 
return ServiceabilityBlock( afid_events=list(afid_events), solution=solutions, solution_reasoning=reasoning, + hub_version=hub_version, + afid_sag_file_version=afid_sag_file_version, ) diff --git a/nodescraper/plugins/serviceability/se_models.py b/nodescraper/plugins/serviceability/se_models.py index 344ef7c7..60c34083 100644 --- a/nodescraper/plugins/serviceability/se_models.py +++ b/nodescraper/plugins/serviceability/se_models.py @@ -60,6 +60,10 @@ class ServiceabilitySolution(BaseModel): service_action_num: int = Field( description="Service action number from AFID_SAG.json.", ) + service_action_title: Optional[str] = Field( + default=None, + description=("Short service action label from the hub."), + ) class ServiceabilityBlock(BaseModel): @@ -71,9 +75,17 @@ class ServiceabilityBlock(BaseModel): ) solution: List[ServiceabilitySolution] = Field( default_factory=list, - description="Engine output: recommended service actions.", + description="Hub output: recommended service actions.", ) solution_reasoning: Optional[str] = Field( default=None, - description="Human-readable summary of how the engine reached its conclusions.", + description="Human-readable summary of recommendations (counts and hub label).", + ) + hub_version: Optional[str] = Field( + default=None, + description="Service hub package/build version string when the hub returned it.", + ) + afid_sag_file_version: Optional[str] = Field( + default=None, + description="AFID_SAG.json identity/revision string when the hub returned metadata.", ) diff --git a/nodescraper/plugins/serviceability/se_runner.py b/nodescraper/plugins/serviceability/se_runner.py index aeec1eb7..c141b6ec 100644 --- a/nodescraper/plugins/serviceability/se_runner.py +++ b/nodescraper/plugins/serviceability/se_runner.py @@ -23,25 +23,71 @@ # SOFTWARE. 
# ############################################################################### -"""Invoke a configured Python service engine against collected Redfish events.""" +"""Invoke a configured Python service hub against collected Redfish events.""" from __future__ import annotations import importlib import inspect from pathlib import Path -from typing import Any, Optional, Type +from typing import Any, Callable, Optional, Type from .se_adapter import serviceability_block_from_service_result from .se_models import AfidEvent, ServiceabilityBlock -_ENGINE_METHOD = "get_service_info" + +def _signature_accepts_var_keyword(sig: inspect.Signature) -> bool: + return any(p.kind == inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values()) + + +def _instantiate_hub( + hub_cls: Type[Any], + config_path: str, + init_path_kwarg: str, + hub_options: Optional[dict[str, Any]], +) -> Any: + """Construct the hub with ``config_path`` under ``init_path_kwarg``, plus matching options.""" + init_sig = inspect.signature(hub_cls.__init__) + kwargs: dict[str, Any] = {init_path_kwarg: config_path} + if not hub_options: + return hub_cls(**kwargs) + if _signature_accepts_var_keyword(init_sig): + merged = dict(hub_options) + merged[init_path_kwarg] = config_path + return hub_cls(**merged) + for key, val in hub_options.items(): + if key in init_sig.parameters: + kwargs[key] = val + kwargs[init_path_kwarg] = config_path + return hub_cls(**kwargs) + + +def _call_hub_analyze( + analyze: Callable[..., Any], + rf_events: list[Any], + cper_data: Optional[dict[str, Any]], + hub_options: Optional[dict[str, Any]], +) -> Any: + """Invoke the hub analyze callable with ``cper_data`` and per-parameter ``hub_options``.""" + sig = inspect.signature(analyze) + params = sig.parameters + eo = dict(hub_options or {}) + + if _signature_accepts_var_keyword(sig): + if "cper_data" in params: + eo["cper_data"] = dict(cper_data) if cper_data else None + return analyze(list(rf_events), **eo) + + kw = {k: v for k, v in 
eo.items() if k in params} + if "cper_data" in params: + kw["cper_data"] = dict(cper_data) if cper_data else None + return analyze(list(rf_events), **kw) class SeRunError(RuntimeError): - """Raised when the service engine fails or returns invalid output.""" + """Raised when the service hub fails or returns invalid output.""" -def run_service_engine( +def run_service_hub( *, engine_python_module: str, engine_display_name: Optional[str] = None, @@ -49,11 +95,21 @@ def run_service_engine( afid_sag_path: str, rf_events: list[Any], cper_data: Optional[dict[str, Any]] = None, + hub_options: Optional[dict[str, Any]] = None, + engine_analyze_method: str = "get_service_info", + engine_init_path_kwarg: str = "afid_sag", ) -> ServiceabilityBlock: - """Run a Python service engine and return a :class:`ServiceabilityBlock`.""" + """Run the configured Python service hub and return a :class:`ServiceabilityBlock`. + + The runner imports ``engine_python_module``, picks the unique class that implements + ``engine_analyze_method``, constructs it with the config file path passed as + ``engine_init_path_kwarg``, then calls the analyze method with ``rf_events`` and any + ``hub_options`` keys that match the method signature (plus ``cper_data`` when + supported). Result mapping is handled by :func:`serviceability_block_from_service_result`. 
+ """ sag_path = Path(afid_sag_path) if not sag_path.is_file(): - raise SeRunError(f"AFID_SAG file not found: {afid_sag_path}") + raise SeRunError(f"Hub config file not found: {afid_sag_path}") if not rf_events: raise SeRunError( @@ -66,17 +122,24 @@ def run_service_engine( except ImportError as exc: raise SeRunError(f"Cannot import {engine_python_module}: {exc}") from exc - engine_cls = _resolve_engine_class(mod) + hub_cls = _resolve_hub_class(mod, engine_analyze_method) try: - instance = engine_cls(afid_sag=afid_sag_path) - analyze = getattr(instance, _ENGINE_METHOD) - result = analyze( - list(rf_events), - cper_data=dict(cper_data) if cper_data else None, + instance = _instantiate_hub( + hub_cls, + afid_sag_path, + engine_init_path_kwarg, + hub_options, + ) + analyze = getattr(instance, engine_analyze_method) + result = _call_hub_analyze( + analyze, + rf_events, + cper_data, + hub_options, ) except Exception as exc: - raise SeRunError(f"{label} {_ENGINE_METHOD}() failed: {exc}") from exc + raise SeRunError(f"{label} {engine_analyze_method}() failed: {exc}") from exc if result is None: return ServiceabilityBlock( @@ -93,18 +156,18 @@ def run_service_engine( ) -def _is_engine_class(obj: Any) -> bool: - return inspect.isclass(obj) and callable(getattr(obj, _ENGINE_METHOD, None)) +def _is_hub_class(obj: Any, analyze_method: str = "get_service_info") -> bool: + return inspect.isclass(obj) and callable(getattr(obj, analyze_method, None)) -def _resolve_engine_class(mod: Any) -> Type[Any]: - """Find the engine class in ``mod`` that implements ``get_service_info``.""" +def _resolve_hub_class(mod: Any, analyze_method: str = "get_service_info") -> Type[Any]: + """Find the hub class in ``mod`` that implements ``analyze_method``.""" package = mod.__name__ candidates: list[Type[Any]] = [] seen: set[int] = set() def add_candidate(obj: Any) -> None: - if not _is_engine_class(obj): + if not _is_hub_class(obj, analyze_method): return key = id(obj) if key in seen: @@ -124,8 +187,8 
@@ def add_candidate(obj: Any) -> None: return candidates[0] if not candidates: raise SeRunError( - f"No class with {_ENGINE_METHOD}() found in {package}; " - "check engine_python_module in analysis_args." + f"No class with {analyze_method}() found in {package}; " + "check engine_python_module and engine_analyze_method in analysis_args." ) names = ", ".join(cls.__name__ for cls in candidates) - raise SeRunError(f"Multiple classes with {_ENGINE_METHOD}() in {package}: {names}.") + raise SeRunError(f"Multiple classes with {analyze_method}() in {package}: {names}.") diff --git a/test/unit/mock_python_engine.py b/test/unit/mock_python_engine.py new file mode 100644 index 00000000..515eea38 --- /dev/null +++ b/test/unit/mock_python_engine.py @@ -0,0 +1,43 @@ +"""Mock Python service engine for unit tests.""" + +from __future__ import annotations + +from types import SimpleNamespace +from typing import Any, Optional + +from serviceability_dummy_data import ( + DUMMY_ENGINE_VERSION, + DUMMY_SAG_PID, + DUMMY_SAG_REVISION, + DUMMY_SERVICE_ACTION_NUM, + DUMMY_SERVICE_ACTION_TITLE, + DUMMY_UNIT_A, +) + + +class MockServiceEngine: + def __init__(self, afid_sag: str) -> None: + self.afid_sag = afid_sag + + def get_service_info( + self, + rf_events: list[dict[str, Any]], + cper_data: Optional[dict[str, Any]] = None, + **kwargs: Any, + ) -> SimpleNamespace: + del cper_data, kwargs + service_info: dict[str, dict[str, dict[str, str]]] = {} + for event in rf_events: + afid = event.get("Afid") + unit = event.get("serviceable_unit", DUMMY_UNIT_A) + if afid is None: + continue + service_info.setdefault(str(unit), {})[str(afid)] = { + "service_action_number": str(DUMMY_SERVICE_ACTION_NUM), + "title": DUMMY_SERVICE_ACTION_TITLE, + } + return SimpleNamespace( + service_info=service_info, + afid_sag_metadata={"sag_pid": DUMMY_SAG_PID, "sag_revision": DUMMY_SAG_REVISION}, + engine_version_info={"version": DUMMY_ENGINE_VERSION}, + ) diff --git a/test/unit/plugin/test_se_runner.py 
b/test/unit/plugin/test_se_runner.py index fd5132f4..d6fdf0d9 100644 --- a/test/unit/plugin/test_se_runner.py +++ b/test/unit/plugin/test_se_runner.py @@ -26,6 +26,7 @@ import json from pathlib import Path from types import SimpleNamespace +from typing import Any import pytest from pydantic import ValidationError @@ -58,7 +59,7 @@ build_afid_events_from_data, format_serviceability_solution_lines, normalize_se_timestamp, - run_service_engine, + run_service_hub, serviceability_block_from_service_result, ) from nodescraper.plugins.serviceability.se_models import ServiceabilitySolution @@ -77,12 +78,12 @@ def test_afid_event_requires_non_empty_serviceable_unit(): AfidEvent(afid=1, serviceable_unit=" ", time=DUMMY_TIMESTAMP) -def test_normalize_se_timestamp_preserves_engine_format(): +def test_normalize_se_timestamp_preserves_format_value(): sample = "2000-01-01 12:00:00.000+00:00" assert normalize_se_timestamp(sample) == sample -def test_analyzer_args_require_engine_config(): +def test_analyzer_args_require_hub_config(): with pytest.raises(ValidationError): ServiceabilityAnalyzerArgs() with pytest.raises(ValidationError, match="engine_python_module"): @@ -94,6 +95,24 @@ def test_analyzer_args_require_engine_config(): assert args.engine_python_module == "dummy.test.module" +def test_resolved_hub_options_explicit_fields_override_options_bag(): + args = ServiceabilityAnalyzerArgs( + engine_python_module="dummy.test.module", + afid_sag_path=str(AFID_SAG), + engine_options={"from_ac_cycle": 9, "extra": 1}, + from_ac_cycle=3, + from_date="2025-01-01", + designation_serials={"U": "S"}, + suppress_service_actions=["99"], + ) + merged = args.resolved_hub_options() + assert merged["from_ac_cycle"] == 3 + assert merged["from_date"] == "2025-01-01" + assert merged["designation_serials"] == {"U": "S"} + assert merged["suppress_service_actions"] == ["99"] + assert merged["extra"] == 1 + + def test_format_serviceability_solution_lines(): block = ServiceabilityBlock( 
afid_events=EXAMPLE_EVENTS[:1], @@ -102,14 +121,20 @@ def test_format_serviceability_solution_lines(): afid=DUMMY_AFID_A, serviceable_unit=[DUMMY_DESIGNATION_A, DUMMY_DESIGNATION_B], service_action_num=DUMMY_SERVICE_ACTION_NUM, + service_action_title="RMA", ) ], solution_reasoning="Dummy test reasoning.", + hub_version="1.0.0-test", + afid_sag_file_version="PID sag-1, revision rev-a", ) lines = format_serviceability_solution_lines(block) assert lines[0] == "Dummy test reasoning." - assert f"AFID {DUMMY_AFID_A}" in lines[1] - assert DUMMY_DESIGNATION_A in lines[1] + assert lines[1] == "Hub version: 1.0.0-test" + assert lines[2] == "AFID_SAG file: PID sag-1, revision rev-a" + assert f"AFID {DUMMY_AFID_A}" in lines[3] + assert DUMMY_DESIGNATION_A in lines[3] + assert "service action 99 (RMA)" in lines[3] def test_serviceability_block_from_service_result(): @@ -144,12 +169,34 @@ def test_serviceability_block_from_service_result(): assert len(block.solution) == 1 assert block.solution[0].afid == DUMMY_AFID_A assert block.solution[0].service_action_num == DUMMY_SERVICE_ACTION_NUM + assert block.solution[0].service_action_title == "Dummy service action" assert set(block.solution[0].serviceable_unit) == {DUMMY_DESIGNATION_A, DUMMY_DESIGNATION_B} + assert block.hub_version == DUMMY_ENGINE_VERSION + assert block.afid_sag_file_version is not None + assert DUMMY_SAG_PID in block.afid_sag_file_version + assert DUMMY_SAG_REVISION in block.afid_sag_file_version assert f"{DUMMY_RF_EVENT_COUNT} Redfish event(s)" in block.solution_reasoning assert "Dummy test engine" in block.solution_reasoning -def test_resolve_engine_class_finds_package_export(): +def test_serviceability_block_from_service_result_isa_version_info(): + result = SimpleNamespace( + service_info={}, + afid_sag_metadata={"sag_pid": DUMMY_SAG_PID, "sag_revision": DUMMY_SAG_REVISION}, + isa_version_info={"VERSION": "1.2.3"}, + ) + block = serviceability_block_from_service_result( + EXAMPLE_EVENTS[:1], + result, + 
engine_label="ISA", + rf_event_count=1, + ) + assert block.hub_version == "1.2.3" + assert block.afid_sag_file_version is not None + assert DUMMY_SAG_PID in block.afid_sag_file_version + + +def test_resolve_hub_class_finds_package_export(): import types submodule = types.ModuleType("fake_engine.impl") @@ -162,17 +209,17 @@ def test_resolve_engine_class_finds_package_export(): package.EngineImpl = submodule.EngineImpl # type: ignore[attr-defined] package.__all__ = ["EngineImpl"] - from nodescraper.plugins.serviceability.se_runner import _resolve_engine_class + from nodescraper.plugins.serviceability.se_runner import _resolve_hub_class - assert _resolve_engine_class(package) is submodule.EngineImpl + assert _resolve_hub_class(package) is submodule.EngineImpl -def test_run_service_engine_with_mock_module(): +def test_run_service_hub_with_mock_module(): rf_events = [ {"Afid": DUMMY_AFID_A, "serviceable_unit": DUMMY_UNIT_A, "Created": DUMMY_TIMESTAMP}, {"Afid": DUMMY_AFID_C, "serviceable_unit": DUMMY_UNIT_C, "Created": DUMMY_TIMESTAMP}, ] - block = run_service_engine( + block = run_service_hub( engine_python_module="mock_python_engine", afid_events=EXAMPLE_EVENTS[:2], afid_sag_path=str(AFID_SAG), @@ -183,9 +230,107 @@ def test_run_service_engine_with_mock_module(): assert block.solution[0].service_action_num == DUMMY_SERVICE_ACTION_NUM -def test_run_service_engine_missing_sag_raises(): - with pytest.raises(SeRunError, match="AFID_SAG"): - run_service_engine( +def test_run_service_hub_custom_analyze_method_and_path_kwarg(): + import sys + import types + + init_log: list[tuple[str, bool]] = [] + analyze_log: list[Any] = [] + + class AltEngine: + def __init__(self, rulebook_path: str, debug: bool = False) -> None: + init_log.append((rulebook_path, debug)) + + def analyze_events(self, rf_events, cper_data=None): + analyze_log.append((list(rf_events), cper_data)) + return None + + mod = types.ModuleType("alt_service_engine") + mod.AltEngine = AltEngine + mod.__all__ = 
["AltEngine"] + sys.modules["alt_service_engine"] = mod + try: + run_service_hub( + engine_python_module="alt_service_engine", + afid_events=EXAMPLE_EVENTS[:1], + afid_sag_path=str(AFID_SAG), + rf_events=[{"Afid": 1}], + cper_data={"k": 1}, + engine_options={"debug": True}, + engine_analyze_method="analyze_events", + engine_init_path_kwarg="rulebook_path", + ) + finally: + del sys.modules["alt_service_engine"] + + assert init_log[0][0] == str(AFID_SAG) + assert init_log[0][1] is True + assert analyze_log[0][1] == {"k": 1} + + +def test_run_service_hub_accepts_engine_options(): + rf_events = [ + {"Afid": DUMMY_AFID_A, "serviceable_unit": DUMMY_UNIT_A, "Created": DUMMY_TIMESTAMP}, + ] + block = run_service_hub( + engine_python_module="mock_python_engine", + afid_events=EXAMPLE_EVENTS[:1], + afid_sag_path=str(AFID_SAG), + rf_events=rf_events, + engine_options={"reporting_level": "verbose"}, + ) + assert len(block.solution) == 1 + + +def test_run_service_hub_forwards_instinct_shaped_engine_options(): + from instinct_shaped_engine import clear_last_call, get_last_call + + clear_last_call() + rf_events = [ + {"Afid": DUMMY_AFID_A, "serviceable_unit": DUMMY_UNIT_A, "Created": DUMMY_TIMESTAMP}, + ] + run_service_hub( + engine_python_module="instinct_shaped_engine", + afid_events=EXAMPLE_EVENTS[:1], + afid_sag_path=str(AFID_SAG), + rf_events=rf_events, + cper_data={"decoded": True}, + engine_options={ + "from_ac_cycle": 2, + "from_date": "2024-06-01", + "designation_serials": {"GPU0": "SN1"}, + "suppress_service_actions": ["42"], + }, + ) + got = get_last_call() + assert got["from_ac_cycle"] == 2 + assert got["from_date"] == "2024-06-01" + assert got["cper_data"] == {"decoded": True} + assert got["designation_serials"] == {"GPU0": "SN1"} + assert got["suppress_service_actions"] == ["42"] + + +def test_run_service_hub_collected_cper_overrides_engine_options_cper_data(): + from instinct_shaped_engine import clear_last_call, get_last_call + + clear_last_call() + rf_events = [ 
+ {"Afid": DUMMY_AFID_A, "serviceable_unit": DUMMY_UNIT_A, "Created": DUMMY_TIMESTAMP}, + ] + run_service_hub( + engine_python_module="instinct_shaped_engine", + afid_events=EXAMPLE_EVENTS[:1], + afid_sag_path=str(AFID_SAG), + rf_events=rf_events, + cper_data={"from_collector": 1}, + engine_options={"cper_data": {"from_options": 2}, "from_ac_cycle": 0}, + ) + assert get_last_call()["cper_data"] == {"from_collector": 1} + + +def test_run_service_hub_missing_sag_raises(): + with pytest.raises(SeRunError, match="Engine config file not found"): + run_service_hub( engine_python_module="mock_python_engine", afid_events=EXAMPLE_EVENTS, afid_sag_path="/nonexistent/dummy_afid_sag.json", @@ -237,6 +382,7 @@ def test_mi3xx_analyzer_runs_python_engine(system_info): args = ServiceabilityAnalyzerArgs( engine_python_module="mock_python_engine", afid_sag_path=str(AFID_SAG), + engine_options={"include_raw_events": False}, ) result = analyzer.analyze_data(data, args=args) assert result.status == ExecutionStatus.OK From bf6bb2f3b1edf6782ff1c1084c57ff937ac82a01 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 16 Jun 2026 08:51:43 -0500 Subject: [PATCH 10/19] utest fix --- nodescraper/interfaces/plugin.py | 4 ++-- test/unit/framework/common/shared_utils.py | 19 +++++++++++++++++-- test/unit/plugin/test_se_runner.py | 20 ++++++++++---------- 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/nodescraper/interfaces/plugin.py b/nodescraper/interfaces/plugin.py index 06959b54..9e22d346 100644 --- a/nodescraper/interfaces/plugin.py +++ b/nodescraper/interfaces/plugin.py @@ -26,7 +26,7 @@ import abc import inspect import logging -from typing import Callable, Generic, Optional, Type, Union +from typing import Any, Callable, Generic, Optional, Type, Union from nodescraper.constants import DEFAULT_EVENT_REPORTER, DEFAULT_LOGGER from nodescraper.models import PluginResult, SystemInfo @@ -125,7 +125,7 @@ def _update_queue(self, queue_item: tuple) -> None: 
self.queue_callback(queue_item) @abc.abstractmethod - def run(self, **kwargs) -> PluginResult: + def run(self, **kwargs: Any) -> PluginResult: """Plugin run function Returns: diff --git a/test/unit/framework/common/shared_utils.py b/test/unit/framework/common/shared_utils.py index 05e77af3..7ba16c16 100644 --- a/test/unit/framework/common/shared_utils.py +++ b/test/unit/framework/common/shared_utils.py @@ -23,7 +23,7 @@ # SOFTWARE. # ############################################################################### -from typing import Any, Optional +from typing import Any, Dict, List, Optional from unittest.mock import MagicMock from nodescraper.constants import DEFAULT_EVENT_REPORTER @@ -87,12 +87,27 @@ class DummyDataModel(DataModel): some_version: str = "0" +# Module-level defaults so ``run`` signatures stay stable for ConfigBuilder tests. +_TEST_PLUGIN_A_LIST_DEFAULT: List[Any] = [1] +_TEST_PLUGIN_A_DICT_DEFAULT: Dict[str, Any] = {} +_TEST_PLUGIN_A_MODEL_DEFAULT = TestModelArg() + + class TestPluginA(PluginInterface[MockConnectionManager, None]): CONNECTION_TYPE = MockConnectionManager ANALYZER_ARGS = TestModelArg - def run(self, **kwargs: Any) -> PluginResult: + def run( + self, + test_bool_arg: bool = True, + test_str_arg: str = "test", + test_list_arg: List[Any] = _TEST_PLUGIN_A_LIST_DEFAULT, + test_dict_arg: Dict[str, Any] = _TEST_PLUGIN_A_DICT_DEFAULT, + test_model_arg: TestModelArg = _TEST_PLUGIN_A_MODEL_DEFAULT, + **kwargs: Any, + ) -> PluginResult: + _ = kwargs return PluginResult( source="testA", status=ExecutionStatus.ERROR, diff --git a/test/unit/plugin/test_se_runner.py b/test/unit/plugin/test_se_runner.py index d6fdf0d9..01f8c4bc 100644 --- a/test/unit/plugin/test_se_runner.py +++ b/test/unit/plugin/test_se_runner.py @@ -99,7 +99,7 @@ def test_resolved_hub_options_explicit_fields_override_options_bag(): args = ServiceabilityAnalyzerArgs( engine_python_module="dummy.test.module", afid_sag_path=str(AFID_SAG), - engine_options={"from_ac_cycle": 9, 
"extra": 1}, + hub_options={"from_ac_cycle": 9, "extra": 1}, from_ac_cycle=3, from_date="2025-01-01", designation_serials={"U": "S"}, @@ -256,7 +256,7 @@ def analyze_events(self, rf_events, cper_data=None): afid_sag_path=str(AFID_SAG), rf_events=[{"Afid": 1}], cper_data={"k": 1}, - engine_options={"debug": True}, + hub_options={"debug": True}, engine_analyze_method="analyze_events", engine_init_path_kwarg="rulebook_path", ) @@ -268,7 +268,7 @@ def analyze_events(self, rf_events, cper_data=None): assert analyze_log[0][1] == {"k": 1} -def test_run_service_hub_accepts_engine_options(): +def test_run_service_hub_accepts_hub_options(): rf_events = [ {"Afid": DUMMY_AFID_A, "serviceable_unit": DUMMY_UNIT_A, "Created": DUMMY_TIMESTAMP}, ] @@ -277,12 +277,12 @@ def test_run_service_hub_accepts_engine_options(): afid_events=EXAMPLE_EVENTS[:1], afid_sag_path=str(AFID_SAG), rf_events=rf_events, - engine_options={"reporting_level": "verbose"}, + hub_options={"reporting_level": "verbose"}, ) assert len(block.solution) == 1 -def test_run_service_hub_forwards_instinct_shaped_engine_options(): +def test_run_service_hub_forwards_full_hub_options_kwargs(): from instinct_shaped_engine import clear_last_call, get_last_call clear_last_call() @@ -295,7 +295,7 @@ def test_run_service_hub_forwards_instinct_shaped_engine_options(): afid_sag_path=str(AFID_SAG), rf_events=rf_events, cper_data={"decoded": True}, - engine_options={ + hub_options={ "from_ac_cycle": 2, "from_date": "2024-06-01", "designation_serials": {"GPU0": "SN1"}, @@ -310,7 +310,7 @@ def test_run_service_hub_forwards_instinct_shaped_engine_options(): assert got["suppress_service_actions"] == ["42"] -def test_run_service_hub_collected_cper_overrides_engine_options_cper_data(): +def test_run_service_hub_collected_cper_overrides_hub_options_cper_data(): from instinct_shaped_engine import clear_last_call, get_last_call clear_last_call() @@ -323,13 +323,13 @@ def 
test_run_service_hub_collected_cper_overrides_engine_options_cper_data(): afid_sag_path=str(AFID_SAG), rf_events=rf_events, cper_data={"from_collector": 1}, - engine_options={"cper_data": {"from_options": 2}, "from_ac_cycle": 0}, + hub_options={"cper_data": {"from_options": 2}, "from_ac_cycle": 0}, ) assert get_last_call()["cper_data"] == {"from_collector": 1} def test_run_service_hub_missing_sag_raises(): - with pytest.raises(SeRunError, match="Engine config file not found"): + with pytest.raises(SeRunError, match="Hub config file not found"): run_service_hub( engine_python_module="mock_python_engine", afid_events=EXAMPLE_EVENTS, @@ -382,7 +382,7 @@ def test_mi3xx_analyzer_runs_python_engine(system_info): args = ServiceabilityAnalyzerArgs( engine_python_module="mock_python_engine", afid_sag_path=str(AFID_SAG), - engine_options={"include_raw_events": False}, + hub_options={"include_raw_events": False}, ) result = analyzer.analyze_data(data, args=args) assert result.status == ExecutionStatus.OK From dafa092821757a15c977b2446c8a88500f4a2d4b Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 16 Jun 2026 08:58:08 -0500 Subject: [PATCH 11/19] utest fix --- nodescraper/configbuilder.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/nodescraper/configbuilder.py b/nodescraper/configbuilder.py index 7823b95a..bc8f1b8a 100644 --- a/nodescraper/configbuilder.py +++ b/nodescraper/configbuilder.py @@ -24,6 +24,7 @@ # ############################################################################### import enum +import inspect import logging from typing import Any, Optional, Type, Union @@ -64,9 +65,17 @@ def gen_config(self, plugin_names: list[str]) -> PluginConfig: @classmethod def _build_plugin_config(cls, plugin_class: Type[PluginInterface]) -> dict: type_map = TypeUtils.get_func_arg_types(plugin_class.run, plugin_class) + run_sig = inspect.signature(plugin_class.run) config = {} for arg, arg_data in type_map.items(): + param = run_sig.parameters.get(arg) + # 
abstraction level for the ServiceabilityPlugin to allow kwargs for hub call + if param is not None and param.kind in ( + inspect.Parameter.VAR_KEYWORD, + inspect.Parameter.VAR_POSITIONAL, + ): + continue cls._update_config(arg, arg_data, config) return config From 9ac384ddbc9cf9a075b896ad6d96ecb537074c12 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 17 Jun 2026 11:51:46 -0500 Subject: [PATCH 12/19] Parse AFID from Oem.AMDFieldIdentifiers[]+ resolve the serviceable unit from Links.OriginOfCondition so OpenBMC UBB log entries feed hub correctly. --- .../plugins/serviceability/afid_events.py | 50 +++++++- .../plugin/test_afid_events_bmc_schema.py | 110 ++++++++++++++++++ 2 files changed, 157 insertions(+), 3 deletions(-) create mode 100644 test/unit/plugin/test_afid_events_bmc_schema.py diff --git a/nodescraper/plugins/serviceability/afid_events.py b/nodescraper/plugins/serviceability/afid_events.py index 2138c0cf..a84af503 100644 --- a/nodescraper/plugins/serviceability/afid_events.py +++ b/nodescraper/plugins/serviceability/afid_events.py @@ -100,10 +100,33 @@ def _extract_afid(payload: dict[str, Any]) -> Optional[int]: oem = payload.get("Oem") if isinstance(oem, dict): for vendor_payload in oem.values(): - if isinstance(vendor_payload, dict): + found = _extract_afid_from_oem_fragment(vendor_payload) + if found is not None: + return found + return None + + +def _extract_afid_from_oem_fragment(vendor_payload: Any) -> Optional[int]: + """Resolve AFID from one ``Oem`` property value (dict or list of dicts, e.g. 
``AMDFieldIdentifiers``).""" + if isinstance(vendor_payload, dict): + for key in _AFID_KEYS: + if key in vendor_payload and vendor_payload[key] is not None: + return int(vendor_payload[key]) + elif isinstance(vendor_payload, list): + for item in vendor_payload: + if isinstance(item, dict): for key in _AFID_KEYS: - if key in vendor_payload and vendor_payload[key] is not None: - return int(vendor_payload[key]) + if key in item and item[key] is not None: + return int(item[key]) + return None + + +def _origin_dict_to_unit(value: Any) -> Optional[str]: + if not isinstance(value, dict): + return None + odata_id = value.get("@odata.id") or value.get("odata.id") + if odata_id: + return _unit_from_odata_id(str(odata_id)) return None @@ -119,6 +142,18 @@ def _extract_serviceable_unit(payload: dict[str, Any]) -> Optional[str]: text = str(value).strip() if text: return _unit_from_odata_id(text) if "/" in text else text + + links = payload.get("Links") or payload.get("links") + if isinstance(links, dict): + ooc = ( + links.get("OriginOfCondition") + or links.get("originOfCondition") + or links.get("OriginofCondition") + ) + unit = _origin_dict_to_unit(ooc) + if unit: + return unit + oem = payload.get("Oem") if isinstance(oem, dict): for vendor_payload in oem.values(): @@ -128,6 +163,15 @@ def _extract_serviceable_unit(payload: dict[str, Any]) -> Optional[str]: ) if unit is not None and str(unit).strip(): return str(unit).strip() + elif isinstance(vendor_payload, list): + for item in vendor_payload: + if not isinstance(item, dict): + continue + su = item.get("ServiceableUnits") or item.get("serviceable_units") + if isinstance(su, list) and su: + u = _origin_dict_to_unit(su[0]) + if u: + return u return None diff --git a/test/unit/plugin/test_afid_events_bmc_schema.py b/test/unit/plugin/test_afid_events_bmc_schema.py new file mode 100644 index 00000000..7c54364f --- /dev/null +++ b/test/unit/plugin/test_afid_events_bmc_schema.py @@ -0,0 +1,110 @@ 
+############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +############################################################################### +"""AFID / serviceable unit extraction for OpenBMC-style LogEntry payloads.""" +from __future__ import annotations + +from nodescraper.plugins.serviceability.afid_events import ( + _afid_event_from_rf_member, + build_afid_events_from_data, +) +from nodescraper.plugins.serviceability.serviceability_data import ( + ServiceabilityDataModel, +) + +# Shape from after_clear_rma_case.json: AFID under Oem.AMDFieldIdentifiers[], OOC under Links. +_SAMPLE_LOG_ENTRY = { + "@odata.id": "/redfish/v1/Systems/UBB/LogServices/EventLog/Entries/1", + "Created": "2026-06-16T20:25:22+00:00", + "Id": "1", + "Links": { + "OriginOfCondition": { + "@odata.id": "/redfish/v1/Chassis/OAM_7", + } + }, + "Oem": { + "AMDFieldIdentifiers": [ + { + "AFID": 22, + "Description": "On-die ECC, Uncorrected, Non-fatal", + "ServiceableUnits": [ + {"@odata.id": "/redfish/v1/Chassis/OAM_7"}, + ], + "ServiceableUnits@odata.count": 1, + } + ], + "AMDFieldIdentifiers@Members.count": 1, + }, +} + + +def test_afid_event_from_openbmc_log_entry_with_links_and_amd_field_identifiers(): + ev = _afid_event_from_rf_member(_SAMPLE_LOG_ENTRY) + assert ev is not None + assert ev.afid == 22 + assert ev.serviceable_unit == "OAM_7" + assert "2026-06-16" in ev.time + + +def test_serviceable_unit_from_oem_serviceable_units_when_no_links(): + member = { + "Created": "2026-06-16T20:25:22+00:00", + "Oem": { + "AMDFieldIdentifiers": [ + { + "AFID": 23, + "ServiceableUnits": [ + {"@odata.id": "/redfish/v1/Chassis/OAM_3"}, + ], + } + ], + }, + } + ev = _afid_event_from_rf_member(member) + assert ev is not None + assert ev.afid == 23 + assert ev.serviceable_unit == "OAM_3" + + +# Minimal slice of smci350 command_artifacts.json first CPER row (Links + AMDFieldIdentifiers[]). 
+_SMCI350_STYLE_ENTRY = { + "Created": "2026-06-16T18:53:21+00:00", + "Id": "1", + "Links": { + "OriginOfCondition": {"@odata.id": "/redfish/v1/Chassis/OAM_2"}, + }, + "Oem": { + "AMDFieldIdentifiers": [ + { + "AFID": 25, + "Description": "All Other HBM, Fatal", + "ServiceableUnits": [{"@odata.id": "/redfish/v1/Chassis/OAM_2"}], + "ServiceableUnits@odata.count": 1, + } + ], + "AMDFieldIdentifiers@Members.count": 1, + }, +} + + +def test_afid_event_smci350_style_fatal_hbm_entry(): + ev = _afid_event_from_rf_member(_SMCI350_STYLE_ENTRY) + assert ev is not None + assert ev.afid == 25 + assert ev.serviceable_unit == "OAM_2" + + +def test_build_afid_events_from_data_includes_openbmc_entries(): + data = ServiceabilityDataModel( + rf_events=[_SAMPLE_LOG_ENTRY, _SMCI350_STYLE_ENTRY], + cper_data={}, + ) + events = build_afid_events_from_data(data) + assert len(events) == 2 + by_afid_oam = {(e.afid, e.serviceable_unit) for e in events} + assert (22, "OAM_7") in by_afid_oam + assert (25, "OAM_2") in by_afid_oam From 03d0d1a1a5f29dbdc0f694dc362bf22a2e99c730 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 18 Jun 2026 16:27:07 -0500 Subject: [PATCH 13/19] updates --- .../serviceability/mi3xx/mi3xx_analyzer.py | 20 +- .../plugins/serviceability/se_adapter.py | 178 +++++++++++++++++- .../plugins/serviceability/se_models.py | 13 +- .../serviceability_collector.py | 77 +++++++- 4 files changed, 273 insertions(+), 15 deletions(-) diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py index 0424e8e2..931366df 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py @@ -25,7 +25,9 @@ ############################################################################### from __future__ import annotations -from typing import Optional +from typing import Any, ClassVar, Optional + +from pydantic import BaseModel, Field from 
nodescraper.enums import ExecutionStatus from nodescraper.interfaces import DataAnalyzer @@ -46,6 +48,14 @@ ) +class AfidSagMetadataArtifact(BaseModel): + """Hub AFID_SAG metadata snapshot; written to ``afid_sag_metadata.json``.""" + + ARTIFACT_LOG_BASENAME: ClassVar[str] = "afid_sag_metadata" + + metadata: dict[str, Any] = Field(default_factory=dict) + + class MI3XXAnalyzer(DataAnalyzer[ServiceabilityDataModel, ServiceabilityAnalyzerArgs]): """Build AFID events from collected data and run the configured service hub.""" @@ -134,6 +144,7 @@ def analyze_data( return self.result data.serviceability = block + self._append_afid_sag_metadata_artifact(block) self._log_serviceability_solutions(block) engine_label = args.engine_display_name or args.engine_python_module self.result.status = ExecutionStatus.OK @@ -154,6 +165,13 @@ def analyze_data( ) return self.result + def _append_afid_sag_metadata_artifact(self, block: ServiceabilityBlock) -> None: + if block.afid_sag_metadata is None: + return + self.result.artifacts.append( + AfidSagMetadataArtifact(metadata=dict(block.afid_sag_metadata)) + ) + def _log_serviceability_solutions(self, block: ServiceabilityBlock) -> None: parent = self.parent or self.__class__.__name__ for line in format_serviceability_solution_lines(block): diff --git a/nodescraper/plugins/serviceability/se_adapter.py b/nodescraper/plugins/serviceability/se_adapter.py index 0e31135a..04321c82 100644 --- a/nodescraper/plugins/serviceability/se_adapter.py +++ b/nodescraper/plugins/serviceability/se_adapter.py @@ -26,11 +26,34 @@ """Map serviceability plugin models to/from Python service hub results.""" from __future__ import annotations +import json from collections import defaultdict -from typing import Any, Optional +from typing import Any, Dict, List, Optional, Tuple from .se_models import AfidEvent, ServiceabilityBlock, ServiceabilitySolution +# Hub payload keys commonly holding a one-line human summary (not raw OEM metadata). 
+_SUMMARY_VALUE_KEYS: Tuple[str, ...] = ( + "short_service", + "short_service_info", + "summary", + "message", + "title", + "recommendation", + "solution", + "service_recommendation", + "action", +) +_UNIT_LABEL_KEYS: Tuple[str, ...] = ( + "oem", + "OEM", + "unit", + "serviceable_unit", + "designation", + "chassis", + "device", +) + def _hub_version_display(version_info: Any) -> Optional[str]: """Pick a single hub version string from common hub result version dict layouts.""" @@ -78,9 +101,151 @@ def _afid_sag_file_version_display(metadata: Any) -> Optional[str]: return ", ".join(parts) +def _human_summary_line_from_hub_value(value: Any) -> Optional[str]: + """Pick a single human-readable line from a hub fragment (string, number, or dict).""" + if value is None: + return None + if isinstance(value, str): + text = value.strip() + return text or None + if isinstance(value, (int, float)) and not isinstance(value, bool): + return str(value).strip() or None + if isinstance(value, dict): + for key in _SUMMARY_VALUE_KEYS: + if key not in value: + continue + got = _human_summary_line_from_hub_value(value[key]) + if got: + return got + for key in ("service_action", "ServiceAction"): + if key not in value: + continue + raw = value[key] + if isinstance(raw, dict): + inner = ( + raw.get("title") + or raw.get("text") + or raw.get("name") + or raw.get("service_action") + ) + if isinstance(inner, str) and inner.strip(): + return inner.strip() + got = _human_summary_line_from_hub_value(raw) + if got: + return got + else: + s = str(raw).strip() + if s: + return s + for alt in ("text", "name", "description", "details"): + if isinstance(value.get(alt), str) and str(value[alt]).strip(): + return str(value[alt]).strip() + return None + text = str(value).strip() + return text or None + + +def _unit_label_from_short_service_item(item: dict[str, Any]) -> str: + for key in _UNIT_LABEL_KEYS: + raw = item.get(key) + if raw is not None and str(raw).strip(): + return str(raw).strip() + return 
"" + + +def _maybe_unwrap_outer_unit_map(d: dict[str, Any]) -> dict[str, Any]: + """If the hub wraps {wrapper: {unit: {...}}}, return the inner unit map.""" + if len(d) != 1: + return d + _, inner = next(iter(d.items())) + if isinstance(inner, dict) and inner and all(isinstance(v, dict) for v in inner.values()): + return inner + return d + + +def _merged_short_service_lines_from_unit_messages(entries: List[Tuple[str, str]]) -> List[str]: + """Group (unit, message) rows by message; merge units when the message is identical.""" + by_message: dict[str, list[str]] = defaultdict(list) + for unit, msg in entries: + if not msg: + continue + by_message[msg].append(unit or "") + + lines: list[str] = [] + for msg in sorted(by_message.keys(), key=lambda m: (-len(by_message[m]), m.lower())): + units = sorted({u for u in by_message[msg] if u}) + if len(units) <= 1: + u = units[0] if units else "" + lines.append(f"{msg} ({u})" if u else msg) + else: + lines.append(f"{msg} — OEMs/units: {', '.join(units)}") + return lines + + +def _format_short_service_info_for_block(raw: Any) -> Optional[str]: + """Turn hub ``short_service_info`` into multiline log/LLM text (no JSON dump of unit maps).""" + if raw is None: + return None + if isinstance(raw, str): + text = raw.strip() + return text or None + if isinstance(raw, (list, tuple)): + if raw and all(isinstance(x, dict) for x in raw): + entries: list[tuple[str, str]] = [] + for item in raw: + assert isinstance(item, dict) + unit = _unit_label_from_short_service_item(item) + msg = _human_summary_line_from_hub_value( + item + ) or _human_summary_line_from_hub_value(item.get("short_service_info")) + if msg: + entries.append((unit, msg)) + lines = _merged_short_service_lines_from_unit_messages(entries) + out = "\n".join(lines).strip() + return out or None + parts = [str(x).strip() for x in raw if x is not None and str(x).strip()] + return "\n".join(parts) if parts else None + if isinstance(raw, dict): + d = _maybe_unwrap_outer_unit_map(raw) 
+ if d and all(isinstance(v, dict) for v in d.values()): + entries = [] + for unit_key, inner in d.items(): + msg = _human_summary_line_from_hub_value(inner) + if msg: + entries.append((str(unit_key).strip(), msg)) + lines = _merged_short_service_lines_from_unit_messages(entries) + out = "\n".join(lines).strip() + if out: + return out + flat_lines: list[str] = [] + for key in sorted(d.keys(), key=lambda x: str(x).lower()): + val = d[key] + if isinstance(val, dict): + msg = _human_summary_line_from_hub_value(val) + if msg: + flat_lines.append(f"{key}: {msg}") + elif val is not None and str(val).strip(): + flat_lines.append(f"{key}: {str(val).strip()}") + if flat_lines: + return "\n".join(flat_lines) + try: + compact = json.dumps(d, sort_keys=True) + except TypeError: + compact = str(d) + compact = compact.strip() + return compact or None + text = str(raw).strip() + return text or None + + def format_serviceability_solution_lines(block: ServiceabilityBlock) -> list[str]: """Human-readable lines for logging or console output.""" lines: list[str] = [] + if block.short_service_info: + lines.append("short_service_info:") + for part in block.short_service_info.splitlines(): + lines.append(f" {part}" if part else " ") + lines.append("") if block.solution_reasoning: lines.append(block.solution_reasoning) if block.hub_version: @@ -107,7 +272,7 @@ def serviceability_block_from_service_result( engine_label: str = "Service hub", rf_event_count: int = 0, ) -> ServiceabilityBlock: - """Build a :class:`ServiceabilityBlock` from a hub result with ``service_info``.""" + """Build a ``ServiceabilityBlock`` from a hub result with ``service_info``.""" grouped: dict[tuple[int, int], list[str]] = defaultdict(list) titles: dict[tuple[int, int], str] = {} service_info = getattr(result, "service_info", None) or {} @@ -151,7 +316,8 @@ def _action_title(info: dict[str, Any]) -> str: ) for (afid, san), units in sorted(grouped.items()) ] - metadata = getattr(result, "afid_sag_metadata", None) or 
{} + raw_metadata = getattr(result, "afid_sag_metadata", None) + metadata: Dict[str, Any] = raw_metadata if isinstance(raw_metadata, dict) else {} version_info = ( getattr(result, "engine_version_info", None) or getattr(result, "isa_version_info", None) @@ -161,10 +327,16 @@ def _action_title(info: dict[str, Any]) -> str: hub_version = _hub_version_display(version_info) afid_sag_file_version = _afid_sag_file_version_display(metadata) reasoning = f"{engine_label}: {len(solutions)} recommendation(s) from {rf_event_count} Redfish event(s)." + meta_out: Optional[dict[str, Any]] = dict(metadata) if isinstance(raw_metadata, dict) else None + short_service_info = _format_short_service_info_for_block( + getattr(result, "short_service_info", None) + ) return ServiceabilityBlock( afid_events=list(afid_events), solution=solutions, solution_reasoning=reasoning, hub_version=hub_version, afid_sag_file_version=afid_sag_file_version, + afid_sag_metadata=meta_out, + short_service_info=short_service_info, ) diff --git a/nodescraper/plugins/serviceability/se_models.py b/nodescraper/plugins/serviceability/se_models.py index 60c34083..8a3f50f3 100644 --- a/nodescraper/plugins/serviceability/se_models.py +++ b/nodescraper/plugins/serviceability/se_models.py @@ -25,7 +25,7 @@ ############################################################################### from __future__ import annotations -from typing import List, Optional +from typing import Any, List, Optional from pydantic import BaseModel, Field, field_validator @@ -89,3 +89,14 @@ class ServiceabilityBlock(BaseModel): default=None, description="AFID_SAG.json identity/revision string when the hub returned metadata.", ) + afid_sag_metadata: Optional[dict[str, Any]] = Field( + default=None, + description="Hub-reported AFID_SAG metadata dict when the engine exposes afid_sag_metadata.", + ) + short_service_info: Optional[str] = Field( + default=None, + description=( + "Brief hub summary derived from short_service_info (human-readable 
lines; " + "per-unit dict payloads are collapsed, identical messages merged with unit lists)." + ), + ) diff --git a/nodescraper/plugins/serviceability/serviceability_collector.py b/nodescraper/plugins/serviceability/serviceability_collector.py index 3278c113..0ad28643 100644 --- a/nodescraper/plugins/serviceability/serviceability_collector.py +++ b/nodescraper/plugins/serviceability/serviceability_collector.py @@ -26,17 +26,50 @@ from __future__ import annotations import abc -from typing import Any, Generic, Optional, Protocol, TypeVar, cast +from typing import Any, ClassVar, Generic, Literal, Optional, Protocol, TypeVar, cast from urllib.parse import urlparse +from pydantic import BaseModel, Field + from nodescraper.base import RedfishDataCollector -from nodescraper.connection.redfish import RF_MEMBERS, RF_MEMBERS_COUNT +from nodescraper.connection.redfish import ( + RF_MEMBERS, + RF_MEMBERS_COUNT, + RedfishGetResult, +) from nodescraper.enums import ExecutionStatus from nodescraper.models import CollectorArgs, TaskResult from .serviceability_data import DeviceInfo, ServiceabilityDataModel +class ServiceabilityUriManifestArtifact(BaseModel): + """Resolved Redfish URIs for this serviceability run (``serviceability_uri_manifest.json``).""" + + ARTIFACT_LOG_BASENAME: ClassVar[str] = "serviceability_uri_manifest" + + artifact_kind: Literal["ServiceabilityUriManifest"] = "ServiceabilityUriManifest" + event_log_uri: str + assembly_get_uris: list[str] = Field(default_factory=list) + firmware_inventory_uri: Optional[str] = None + + +class FirmwareInventoryArtifact(BaseModel): + """Firmware inventory Redfish GET; written to ``firmware_inventory.json`` with path, success, data, error, and status_code fields (same layout as a Redfish GET artifact row).""" + + ARTIFACT_LOG_BASENAME: ClassVar[str] = "firmware_inventory" + + path: str + success: bool + data: Optional[dict[str, Any]] = None + error: Optional[str] = None + status_code: Optional[int] = None + + @classmethod + def 
from_redfish_get(cls, res: RedfishGetResult) -> FirmwareInventoryArtifact: + return cls.model_validate(res.model_dump(mode="python")) + + class _ServiceabilityCollectArg(Protocol): follow_next_link: bool max_pages: int @@ -98,7 +131,7 @@ def extract_component_details( def _fetch_event_log(self, args: TServiceabilityCollectArg, uri: str): if args.follow_next_link: - return self._run_redfish_get_paged(uri, max_pages=args.max_pages) + return self._run_redfish_get_paged(uri, max_pages=args.max_pages, log_artifact=True) return self._run_redfish_get(uri, log_artifact=True) def collect_data( @@ -111,6 +144,11 @@ def collect_data( svc_args = cast(TServiceabilityCollectArg, args) event_uri = svc_args.resolved_event_log_uri() + self.logger.info( + "Serviceability: event log Redfish URI %s (follow_next_link=%s)", + event_uri, + svc_args.follow_next_link, + ) if svc_args.top is not None: res = self._fetch_top(svc_args, svc_args.top, svc_args.max_pages) else: @@ -134,11 +172,18 @@ def collect_data( return self.result, None assembly_info: dict[str, DeviceInfo] = {} + assembly_get_uris: list[str] = [] tpl = svc_args.rf_assembly_uri_template devices = svc_args.rf_chassis_devices if tpl and devices: for device in devices: uri_asm = tpl.format(device=device) + assembly_get_uris.append(uri_asm) + self.logger.info( + "Serviceability: assembly Redfish GET %s (chassis designation=%s)", + uri_asm, + device, + ) assembly_res = self._run_redfish_get(uri_asm, log_artifact=True) if not assembly_res.success or assembly_res.data is None: continue @@ -153,31 +198,43 @@ def collect_data( cper_raw = self.collect_cper_attachments(filtered_members or []) + component_details, firmware_uri_used = self._fetch_component_details(responses, svc_args) + data = ServiceabilityDataModel( responses=responses, rf_events=filtered_members or [], assembly_info=assembly_info, cper_raw=cper_raw, - component_details=self._fetch_component_details(responses, svc_args), + component_details=component_details, 
log_path=self._log_path, bmc_host=bmc_host, ) + self.result.artifacts.append( + ServiceabilityUriManifestArtifact( + event_log_uri=event_uri, + assembly_get_uris=assembly_get_uris, + firmware_inventory_uri=firmware_uri_used, + ) + ) self.result.status = ExecutionStatus.OK self.result.message = f"Collected {len(members)} event log member(s)" return self.result, data def _fetch_component_details( self, responses: dict[str, Any], args: TServiceabilityCollectArg - ) -> Optional[str]: + ) -> tuple[Optional[str], Optional[str]]: + """Return ``(component_details, firmware_uri)``; firmware_uri is set when a GET was attempted.""" fw_uri = args.rf_firmware_bundle_uri if not fw_uri or not str(fw_uri).strip(): - return None + return None, None fw_uri = str(fw_uri).strip() - fw_res = self._run_redfish_get(fw_uri, log_artifact=True) + self.logger.info("Serviceability: firmware inventory Redfish GET %s", fw_uri) + fw_res = self._run_redfish_get(fw_uri, log_artifact=False) + self.result.artifacts.append(FirmwareInventoryArtifact.from_redfish_get(fw_res)) if not fw_res.success or fw_res.data is None: - return None + return None, fw_uri responses[fw_res.path] = fw_res.data - return self.extract_component_details(fw_res.data, args) + return self.extract_component_details(fw_res.data, args), fw_uri def _fetch_top(self, args: TServiceabilityCollectArg, top: int, max_pages: int): event_uri = args.resolved_event_log_uri() @@ -193,5 +250,5 @@ def _fetch_top(self, args: TServiceabilityCollectArg, top: int, max_pages: int): skip = count - top skip_uri = f"{event_uri}?$skip={skip}" if args.follow_next_link: - return self._run_redfish_get_paged(skip_uri, max_pages=max_pages) + return self._run_redfish_get_paged(skip_uri, max_pages=max_pages, log_artifact=True) return self._run_redfish_get(skip_uri, log_artifact=True) From 45b2ce577a8afe78567737beddca4eb77b9f9b98 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 19 Jun 2026 12:36:07 -0500 Subject: [PATCH 14/19] avoiding decode calls 
when cper is decoded with event --- .../serviceability/mi3xx/mi3xx_analyzer.py | 47 ++++++- .../serviceability/mi3xx/mi3xx_collector.py | 11 ++ .../serviceability/mi3xx/mi3xx_cper_utils.py | 117 ++++++++++++++++++ test/unit/plugin/test_mi3xx_collector.py | 80 ++++++++++++ test/unit/plugin/test_mi3xx_cper_utils.py | 108 ++++++++++++++++ test/unit/serviceability_dummy_data.py | 23 ++++ 6 files changed, 380 insertions(+), 6 deletions(-) create mode 100644 nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py create mode 100644 test/unit/plugin/test_mi3xx_cper_utils.py create mode 100644 test/unit/serviceability_dummy_data.py diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py index 931366df..e0eab28c 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py @@ -47,6 +47,8 @@ ServiceabilityDataModel, ) +from .mi3xx_cper_utils import RF_CPER_AFID_MIN, should_skip_cper_fetch_or_decode + class AfidSagMetadataArtifact(BaseModel): """Hub AFID_SAG metadata snapshot; written to ``afid_sag_metadata.json``.""" @@ -83,25 +85,35 @@ def analyze_data( parent = self.parent or self.__class__.__name__ cper_data = data.cper_data or {} - if data.cper_raw and not cper_data: + cper_raw_to_decode = self._cper_raw_needing_decode(data) + skipped_cper = len(data.cper_raw or {}) - len(cper_raw_to_decode) + if skipped_cper: + self.logger.info( + "(%s) Skipping CPER decode for %d CPER attachment(s); Redfish log " + "already has usable ACA fields (AFID<%s or no serial on decode)", + parent, + skipped_cper, + RF_CPER_AFID_MIN, + ) + if cper_raw_to_decode and not cper_data: if not args.cper_decode_module: self.logger.warning( "(%s) %d CPER attachment(s) collected but cper_decode_module is " "not set in analysis_args; skipping CPER decode", parent, - len(data.cper_raw), + len(cper_raw_to_decode), ) else: self.logger.info( "(%s) 
Decoding %d CPER attachment(s) via %s.%s", parent, - len(data.cper_raw), + len(cper_raw_to_decode), args.cper_decode_module, args.cper_decode_method, ) try: cper_data = decode_cper_raw_attachments( - data.cper_raw, + cper_raw_to_decode, cper_decode_module=args.cper_decode_module, cper_decode_method=args.cper_decode_method, logger=self.logger, @@ -111,7 +123,7 @@ def analyze_data( "(%s) CPER decode finished: %d of %d attachment(s) decoded", parent, len(cper_data), - len(data.cper_raw), + len(cper_raw_to_decode), ) except CperDecodeError as exc: self.logger.warning( @@ -151,8 +163,10 @@ def analyze_data( cper_summary = "" if cper_data: cper_summary = f", {len(cper_data)} decoded CPER(s)" + elif cper_raw_to_decode: + cper_summary = f", {len(cper_raw_to_decode)} CPER attachment(s) not decoded" elif data.cper_raw: - cper_summary = f", {len(data.cper_raw)} CPER attachment(s) not decoded" + cper_summary = f", {len(data.cper_raw)} CPER attachment(s) omitted (ACA on log entry)" ver_bits: list[str] = [] if block.hub_version: ver_bits.append(f"hub {block.hub_version}") @@ -165,6 +179,27 @@ def analyze_data( ) return self.result + @staticmethod + def _cper_raw_needing_decode(data: ServiceabilityDataModel) -> dict[str, str]: + """Subset of ``cper_raw`` that still needs configured CPER decode (not already on the log).""" + raw = data.cper_raw or {} + if not raw: + return {} + by_id: dict[str, dict[str, Any]] = {} + for member in data.rf_events: + if not isinstance(member, dict): + continue + eid = member.get("Id") + if eid is not None: + by_id[str(eid)] = member + out: dict[str, str] = {} + for event_id, blob in raw.items(): + ev = by_id.get(str(event_id)) + if ev is not None and should_skip_cper_fetch_or_decode(ev): + continue + out[str(event_id)] = blob + return out + def _append_afid_sag_metadata_artifact(self, block: ServiceabilityBlock) -> None: if block.afid_sag_metadata is None: return diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py 
b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py index 44594aee..8921796c 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py @@ -35,6 +35,7 @@ from nodescraper.plugins.serviceability.time_utils import satisfies_time_check from .mi3xx_collector_args import MI3XXCollectorArgs +from .mi3xx_cper_utils import RF_CPER_AFID_MIN, should_skip_cper_fetch_or_decode _EVENT_TIMESTAMP_KEYS = ("Created", "EventTimestamp", "Timestamp") @@ -90,6 +91,16 @@ def collect_cper_attachments(self, rf_events: list[Any]) -> dict[str, str]: if not uri or not event_id: continue + if should_skip_cper_fetch_or_decode(event): + self.logger.info( + "(%s) Skipping CPER attachment fetch for Redfish event %s " + "(ACA decode already on log entry; AFID<%s check or no serial)", + parent, + event_id, + RF_CPER_AFID_MIN, + ) + continue + try: resp = self.connection.get_response(uri) except Exception as exc: # noqa: BLE001 diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py new file mode 100644 index 00000000..fe9661dc --- /dev/null +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py @@ -0,0 +1,117 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from __future__ import annotations + +from typing import Any + +# Redfish CPER (RF) style AFIDs start at this value; lower values are in-band / +# OEM-field AFIDs already reflected on the log entry. 
+RF_CPER_AFID_MIN = 10000 + +_SERIAL_KEYS = ("SerialNumber", "serial_number", "UbbSerial", "ubb_serial") + + +def event_afids_from_oem(event: dict[str, Any]) -> list[int]: + """AFIDs from ``Oem.AMDFieldIdentifiers`` (or similar list-of-dicts).""" + oem = event.get("Oem") + if not isinstance(oem, dict): + return [] + raw = oem.get("AMDFieldIdentifiers") + if not isinstance(raw, list): + return [] + out: list[int] = [] + for item in raw: + if not isinstance(item, dict): + continue + for key in ("AFID", "Afid", "afid"): + if key in item and item[key] is not None: + try: + out.append(int(item[key])) + except (TypeError, ValueError): + pass + break + return out + + +def _err_data_arr_entries(event: dict[str, Any]) -> list[dict[str, Any]]: + oem = event.get("Oem") + if not isinstance(oem, dict): + return [] + arr = oem.get("ErrDataArr") + if not isinstance(arr, list): + return [] + return [e for e in arr if isinstance(e, dict)] + + +def event_has_aca_decode(event: dict[str, Any]) -> bool: + """True when the log entry includes ACA-style ``DecodedData`` under ``ErrDataArr``.""" + for entry in _err_data_arr_entries(event): + decoded = entry.get("DecodedData") + if isinstance(decoded, dict) and decoded: + return True + return False + + +def _nonempty_serial_in_mapping(obj: Any) -> bool: + if not isinstance(obj, dict): + return False + for key in _SERIAL_KEYS: + val = obj.get(key) + if val is not None and str(val).strip(): + return True + return False + + +def event_aca_includes_serial(event: dict[str, Any]) -> bool: + """Serial (or UBB serial) present on any ``ErrDataArr`` row (typically ``MetaData``).""" + for entry in _err_data_arr_entries(event): + meta = entry.get("MetaData") + if _nonempty_serial_in_mapping(meta): + return True + decoded = entry.get("DecodedData") + if _nonempty_serial_in_mapping(decoded): + return True + return False + + +def should_skip_cper_fetch_or_decode(event: dict[str, Any]) -> bool: + """Whether to omit CPER binary fetch and configured CPER 
decode for this Redfish member. + + Skip when: + + * Every OEM-listed AFID is below ``RF_CPER_AFID_MIN`` (non-RF CPER range), + ACA ``DecodedData`` is present, and a serial is present on the entry; or + * ACA ``DecodedData`` is present but no serial — the CPER blob does not add + actionable identity beyond what is already missing from the log. + """ + if not event_has_aca_decode(event): + return False + if not event_aca_includes_serial(event): + return True + afids = event_afids_from_oem(event) + if not afids: + return False + return all(afid < RF_CPER_AFID_MIN for afid in afids) diff --git a/test/unit/plugin/test_mi3xx_collector.py b/test/unit/plugin/test_mi3xx_collector.py index d6aef464..625d1165 100644 --- a/test/unit/plugin/test_mi3xx_collector.py +++ b/test/unit/plugin/test_mi3xx_collector.py @@ -50,6 +50,7 @@ is_valid_iso_datetime, satisfies_time_check, ) +from nodescraper.plugins.serviceability.mi3xx.mi3xx_cper_utils import RF_CPER_AFID_MIN EVENT_URI = DUMMY_EVENT_URI @@ -201,6 +202,85 @@ def test_mi3xx_collector_fetches_cper_attachments(mi3xx_collector, redfish_conn_ assert data.cper_data == {} +def test_mi3xx_collector_skips_cper_when_aca_serial_and_low_afids( + mi3xx_collector, redfish_conn_mock +): + redfish_conn_mock.get_response.reset_mock() + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=True, + data={ + RF_MEMBERS: [ + { + "Id": "cper-evt-skip", + "Created": DUMMY_TIMESTAMP_LATER, + "DiagnosticDataType": "CPER", + "AdditionalDataURI": "/redfish/v1/Systems/UBB/LogServices/EventLog/Attachments/1", + "Oem": { + "AMDFieldIdentifiers": [{"AFID": 22}], + "ErrDataArr": [ + { + "DecodedData": {"error_type": "On-die ECC"}, + "MetaData": {"SerialNumber": "692545012569"}, + } + ], + }, + } + ] + }, + status_code=200, + ) + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = mi3xx_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert 
data.cper_raw == {} + redfish_conn_mock.get_response.assert_not_called() + + +def test_mi3xx_collector_fetches_cper_when_rf_afid(mi3xx_collector, redfish_conn_mock): + import base64 + from unittest.mock import MagicMock + + redfish_conn_mock.get_response.reset_mock() + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=True, + data={ + RF_MEMBERS: [ + { + "Id": "cper-evt-rf", + "Created": DUMMY_TIMESTAMP_LATER, + "DiagnosticDataType": "CPER", + "AdditionalDataURI": "/redfish/v1/Systems/UBB/LogServices/EventLog/Attachments/2", + "Oem": { + "AMDFieldIdentifiers": [{"AFID": RF_CPER_AFID_MIN}], + "ErrDataArr": [ + { + "DecodedData": {"error_type": "x"}, + "MetaData": {"SerialNumber": "692545012569"}, + } + ], + }, + } + ] + }, + status_code=200, + ) + response = MagicMock() + response.ok = True + response.status_code = 200 + response.content = b"\xaa\xbb" + redfish_conn_mock.get_response.return_value = response + + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = mi3xx_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.cper_raw["cper-evt-rf"] == base64.b64encode(b"\xaa\xbb").decode("ascii") + redfish_conn_mock.get_response.assert_called_once() + + def test_mi3xx_collector_filters_events_by_reference_time(mi3xx_collector, redfish_conn_mock): redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( path=EVENT_URI, diff --git a/test/unit/plugin/test_mi3xx_cper_utils.py b/test/unit/plugin/test_mi3xx_cper_utils.py new file mode 100644 index 00000000..e5de352d --- /dev/null +++ b/test/unit/plugin/test_mi3xx_cper_utils.py @@ -0,0 +1,108 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import pytest + +from nodescraper.plugins.serviceability.mi3xx.mi3xx_cper_utils import ( + RF_CPER_AFID_MIN, + event_aca_includes_serial, + event_afids_from_oem, + event_has_aca_decode, + should_skip_cper_fetch_or_decode, +) + +_DUMMY_META_SERIAL = "DUMMY-GPU-SERIAL-0001" +_DUMMY_DECODED_FIELD = "dummy_error_type" + + +def _oem_err_row(*, serial: bool = True, decoded: bool = True): + meta = {"SerialNumber": _DUMMY_META_SERIAL} if serial else {"GpuFw": "dummy-fw"} + dec = {"error_type": _DUMMY_DECODED_FIELD} if decoded else {} + return {"DecodedData": dec, "MetaData": meta} + + +def test_skip_when_afids_below_threshold_and_aca_has_serial(): + event = { + "Oem": { + "AMDFieldIdentifiers": [{"AFID": 22}], + "ErrDataArr": [_oem_err_row()], + } + } + assert event_afids_from_oem(event) == [22] + assert should_skip_cper_fetch_or_decode(event) is True + + +def test_no_skip_when_rf_range_afid_even_with_aca_serial(): + event = { + "Oem": { + "AMDFieldIdentifiers": [{"AFID": RF_CPER_AFID_MIN}], + "ErrDataArr": [_oem_err_row()], + } + } + assert should_skip_cper_fetch_or_decode(event) is False + + +def test_skip_when_aca_decode_without_serial(): + event = { + "Oem": { + "AMDFieldIdentifiers": [{"AFID": RF_CPER_AFID_MIN}], + "ErrDataArr": [_oem_err_row(serial=False)], + } + } + assert event_has_aca_decode(event) is True + assert event_aca_includes_serial(event) is False + assert should_skip_cper_fetch_or_decode(event) is True + + +def test_no_skip_when_no_err_data_decoded(): + event = { + "Oem": { + "AMDFieldIdentifiers": [{"AFID": 22}], + } + } + assert should_skip_cper_fetch_or_decode(event) is False + + +def test_no_skip_when_aca_serial_but_no_afid_list(): + event = { + "Oem": { + "ErrDataArr": [_oem_err_row()], + } + } + assert event_afids_from_oem(event) == [] + assert should_skip_cper_fetch_or_decode(event) is False + + +@pytest.mark.parametrize( + "afids,expect_skip", + [ + ([22, 28], 
True), + ([22, RF_CPER_AFID_MIN], False), + ], +) +def test_skip_requires_all_afids_below_rf_threshold(afids, expect_skip): + identifiers = [{"AFID": a} for a in afids] + event = {"Oem": {"AMDFieldIdentifiers": identifiers, "ErrDataArr": [_oem_err_row()]}} + assert should_skip_cper_fetch_or_decode(event) is expect_skip diff --git a/test/unit/serviceability_dummy_data.py b/test/unit/serviceability_dummy_data.py new file mode 100644 index 00000000..0542c866 --- /dev/null +++ b/test/unit/serviceability_dummy_data.py @@ -0,0 +1,23 @@ +"""Shared dummy values for serviceability unit tests (not production data).""" + +DUMMY_AFID_A = 9001 +DUMMY_AFID_B = 9002 +DUMMY_AFID_C = 9003 +DUMMY_SERVICE_ACTION_NUM = 99 +DUMMY_SERVICE_ACTION_TITLE = "Dummy service action" +DUMMY_UNIT_A = "dummy_unit_a" +DUMMY_UNIT_B = "dummy_unit_b" +DUMMY_UNIT_C = "dummy_unit_c" +DUMMY_DESIGNATION_A = "DUMMY_SLOT_A" +DUMMY_DESIGNATION_B = "DUMMY_SLOT_B" +DUMMY_EVENT_URI = "/redfish/v1/Systems/Dummy/LogServices/DummyEventLog/Entries" +DUMMY_EVENT_URI_ALT = "/redfish/v1/Systems/Dummy/LogServices/DummyEventLog/EntriesAlt" +DUMMY_TIMESTAMP = "2000-01-01T12:00:00+00:00" +DUMMY_TIMESTAMP_EARLIER = "1999-12-31T12:00:00+00:00" +DUMMY_TIMESTAMP_LATER = "2000-01-02T12:00:00+00:00" +DUMMY_RF_EVENT_COUNT = 2 +DUMMY_SAG_PID = "dummy-sag-pid" +DUMMY_SAG_REVISION = "dummy-rev-0" +DUMMY_ENGINE_VERSION = "0.0.0-dummy" +DUMMY_BMC_HOST = "dummy-bmc.example" +DUMMY_OEM_VENDOR = "DummyVendor" From a7dd97d98c1e0629ab9010a54db267c842435a64 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 22 Jun 2026 13:36:49 -0500 Subject: [PATCH 15/19] renames --- .../plugins/serviceability/analyzer_args.py | 26 +-- .../plugins/serviceability/cper_decode.py | 4 +- .../serviceability/mi3xx/mi3xx_analyzer.py | 14 +- .../plugins/serviceability/se_adapter.py | 6 +- .../plugins/serviceability/se_models.py | 2 +- .../plugins/serviceability/se_runner.py | 34 ++-- test/unit/mock_python_engine.py | 6 +- 
.../plugin/test_afid_events_bmc_schema.py | 118 +++++-------- test/unit/plugin/test_mi3xx_collector.py | 75 +++------ test/unit/plugin/test_mi3xx_cper_utils.py | 40 ++--- test/unit/plugin/test_se_runner.py | 40 ++--- test/unit/serviceability_dummy_data.py | 159 +++++++++++++++++- 12 files changed, 308 insertions(+), 216 deletions(-) diff --git a/nodescraper/plugins/serviceability/analyzer_args.py b/nodescraper/plugins/serviceability/analyzer_args.py index 2aa27ccd..639822cc 100644 --- a/nodescraper/plugins/serviceability/analyzer_args.py +++ b/nodescraper/plugins/serviceability/analyzer_args.py @@ -35,27 +35,27 @@ class ServiceabilityAnalyzerArgs(AnalyzerArgs): """Analyzer args for serviceability plugins that run a configurable Python hub.""" - engine_python_module: Optional[str] = Field( + hub_python_module: Optional[str] = Field( default=None, - description="Import path for the hub module (class implements engine_analyze_method); hub_options forwards kwargs.", + description="Import path for the hub module (class implements hub_analyze_method); hub_options forwards kwargs.", ) - engine_display_name: Optional[str] = Field( + hub_display_name: Optional[str] = Field( default=None, description="Optional label for analyzer status messages.", ) afid_sag_path: Optional[str] = Field( default=None, - description="Path to hub config (e.g. AFID_SAG.json); passed as engine_init_path_kwarg.", + description="Path to hub config (e.g. 
AFID_SAG.json); passed as hub_init_path_kwarg.", ) - engine_init_path_kwarg: str = Field( + hub_init_path_kwarg: str = Field( default="afid_sag", description="Hub __init__ keyword that receives afid_sag_path.", ) - engine_analyze_method: str = Field( + hub_analyze_method: str = Field( default="get_service_info", description="Hub method called with rf_events first (default get_service_info).", ) - skip_engine: bool = Field( + skip_hub: bool = Field( default=False, description="If True, only build afid_events without running the service hub.", ) @@ -101,7 +101,7 @@ def resolved_hub_options(self) -> dict[str, Any]: merged["suppress_service_actions"] = self.suppress_service_actions return merged - @field_validator("engine_analyze_method", "engine_init_path_kwarg") + @field_validator("hub_analyze_method", "hub_init_path_kwarg") @classmethod def _strip_non_empty_hub_hooks(cls, value: str) -> str: text = str(value).strip() @@ -128,8 +128,8 @@ def _strip_from_date(cls, value: object) -> Optional[str]: @field_validator( "afid_sag_path", - "engine_python_module", - "engine_display_name", + "hub_python_module", + "hub_display_name", "cper_decode_module", ) @classmethod @@ -141,10 +141,10 @@ def _strip_optional_strings(cls, value: Optional[str]) -> Optional[str]: @model_validator(mode="after") def _require_hub_config_when_running(self) -> ServiceabilityAnalyzerArgs: - if self.skip_engine: + if self.skip_hub: return self if not self.afid_sag_path: raise ValueError("afid_sag_path is required when running the service hub.") - if not self.engine_python_module: - raise ValueError("engine_python_module is required when running the service hub.") + if not self.hub_python_module: + raise ValueError("hub_python_module is required when running the service hub.") return self diff --git a/nodescraper/plugins/serviceability/cper_decode.py b/nodescraper/plugins/serviceability/cper_decode.py index 6982407a..d4e9b20e 100644 --- a/nodescraper/plugins/serviceability/cper_decode.py +++ 
b/nodescraper/plugins/serviceability/cper_decode.py @@ -80,8 +80,8 @@ def decode_cper_raw_attachments( """Decode base64 CPER blobs keyed by Redfish event Id. The decode callable must accept a binary file-like object and return - ``(return_code, decode_dict)``. Results are passed to the service engine as - ``cper_data``; the engine does not perform CPER decoding itself. + ``(return_code, decode_dict)``. Results are passed to the service hub as + ``cper_data``; the hub does not perform CPER decoding itself. Returns ``{event_id: {"return_code": int, "decode": dict}}``. """ diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py index e0eab28c..6150398e 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py @@ -76,7 +76,7 @@ def analyze_data( events = data.afid_events or build_afid_events_from_data(data) data.afid_events = events - if args.skip_engine: + if args.skip_hub: data.serviceability = ServiceabilityBlock(afid_events=events) self.result.status = ExecutionStatus.OK self.result.message = f"Built {len(events)} AFID event(s); hub skipped" @@ -140,15 +140,15 @@ def analyze_data( try: block = run_service_hub( - engine_python_module=args.engine_python_module, # type: ignore[arg-type] - engine_display_name=args.engine_display_name, + hub_python_module=args.hub_python_module, # type: ignore[arg-type] + hub_display_name=args.hub_display_name, afid_events=events, afid_sag_path=args.afid_sag_path, # type: ignore[arg-type] rf_events=data.rf_events, cper_data=cper_data or None, hub_options=args.resolved_hub_options(), - engine_analyze_method=args.engine_analyze_method, - engine_init_path_kwarg=args.engine_init_path_kwarg, + hub_analyze_method=args.hub_analyze_method, + hub_init_path_kwarg=args.hub_init_path_kwarg, ) except (SeRunError, ValueError) as exc: self.result.status = ExecutionStatus.ERROR @@ -158,7 +158,7 @@ 
def analyze_data( data.serviceability = block self._append_afid_sag_metadata_artifact(block) self._log_serviceability_solutions(block) - engine_label = args.engine_display_name or args.engine_python_module + hub_label = args.hub_display_name or args.hub_python_module self.result.status = ExecutionStatus.OK cper_summary = "" if cper_data: @@ -174,7 +174,7 @@ def analyze_data( ver_bits.append(f"AFID_SAG {block.afid_sag_file_version}") ver_suffix = f" [{'; '.join(ver_bits)}]" if ver_bits else "" self.result.message = ( - f"{engine_label}: {len(block.solution)} solution(s) " + f"{hub_label}: {len(block.solution)} solution(s) " f"from {len(data.rf_events)} Redfish event(s){cper_summary}{ver_suffix}" ) return self.result diff --git a/nodescraper/plugins/serviceability/se_adapter.py b/nodescraper/plugins/serviceability/se_adapter.py index 04321c82..bea1d4a0 100644 --- a/nodescraper/plugins/serviceability/se_adapter.py +++ b/nodescraper/plugins/serviceability/se_adapter.py @@ -269,7 +269,7 @@ def serviceability_block_from_service_result( afid_events: list[AfidEvent], result: Any, *, - engine_label: str = "Service hub", + hub_label: str = "Service hub", rf_event_count: int = 0, ) -> ServiceabilityBlock: """Build a ``ServiceabilityBlock`` from a hub result with ``service_info``.""" @@ -326,7 +326,9 @@ def _action_title(info: dict[str, Any]) -> str: ) hub_version = _hub_version_display(version_info) afid_sag_file_version = _afid_sag_file_version_display(metadata) - reasoning = f"{engine_label}: {len(solutions)} recommendation(s) from {rf_event_count} Redfish event(s)." + reasoning = ( + f"{hub_label}: {len(solutions)} recommendation(s) from {rf_event_count} Redfish event(s)." 
+ ) meta_out: Optional[dict[str, Any]] = dict(metadata) if isinstance(raw_metadata, dict) else None short_service_info = _format_short_service_info_for_block( getattr(result, "short_service_info", None) diff --git a/nodescraper/plugins/serviceability/se_models.py b/nodescraper/plugins/serviceability/se_models.py index 8a3f50f3..6aa855a3 100644 --- a/nodescraper/plugins/serviceability/se_models.py +++ b/nodescraper/plugins/serviceability/se_models.py @@ -91,7 +91,7 @@ class ServiceabilityBlock(BaseModel): ) afid_sag_metadata: Optional[dict[str, Any]] = Field( default=None, - description="Hub-reported AFID_SAG metadata dict when the engine exposes afid_sag_metadata.", + description="Hub-reported AFID_SAG metadata dict when the hub exposes afid_sag_metadata.", ) short_service_info: Optional[str] = Field( default=None, diff --git a/nodescraper/plugins/serviceability/se_runner.py b/nodescraper/plugins/serviceability/se_runner.py index c141b6ec..6ff8b60e 100644 --- a/nodescraper/plugins/serviceability/se_runner.py +++ b/nodescraper/plugins/serviceability/se_runner.py @@ -89,21 +89,21 @@ class SeRunError(RuntimeError): def run_service_hub( *, - engine_python_module: str, - engine_display_name: Optional[str] = None, + hub_python_module: str, + hub_display_name: Optional[str] = None, afid_events: list[AfidEvent], afid_sag_path: str, rf_events: list[Any], cper_data: Optional[dict[str, Any]] = None, hub_options: Optional[dict[str, Any]] = None, - engine_analyze_method: str = "get_service_info", - engine_init_path_kwarg: str = "afid_sag", + hub_analyze_method: str = "get_service_info", + hub_init_path_kwarg: str = "afid_sag", ) -> ServiceabilityBlock: """Run the configured Python service hub and return a :class:`ServiceabilityBlock`. 
- The runner imports ``engine_python_module``, picks the unique class that implements - ``engine_analyze_method``, constructs it with the config file path passed as - ``engine_init_path_kwarg``, then calls the analyze method with ``rf_events`` and any + The runner imports ``hub_python_module``, picks the unique class that implements + ``hub_analyze_method``, constructs it with the config file path passed as + ``hub_init_path_kwarg``, then calls the analyze method with ``rf_events`` and any ``hub_options`` keys that match the method signature (plus ``cper_data`` when supported). Result mapping is handled by :func:`serviceability_block_from_service_result`. """ @@ -113,25 +113,25 @@ def run_service_hub( if not rf_events: raise SeRunError( - "Collected Redfish events are required; re-run collection or use skip_engine." + "Collected Redfish events are required; re-run collection or use skip_hub." ) - label = engine_display_name or engine_python_module + label = hub_display_name or hub_python_module try: - mod = importlib.import_module(engine_python_module) + mod = importlib.import_module(hub_python_module) except ImportError as exc: - raise SeRunError(f"Cannot import {engine_python_module}: {exc}") from exc + raise SeRunError(f"Cannot import {hub_python_module}: {exc}") from exc - hub_cls = _resolve_hub_class(mod, engine_analyze_method) + hub_cls = _resolve_hub_class(mod, hub_analyze_method) try: instance = _instantiate_hub( hub_cls, afid_sag_path, - engine_init_path_kwarg, + hub_init_path_kwarg, hub_options, ) - analyze = getattr(instance, engine_analyze_method) + analyze = getattr(instance, hub_analyze_method) result = _call_hub_analyze( analyze, rf_events, @@ -139,7 +139,7 @@ def run_service_hub( hub_options, ) except Exception as exc: - raise SeRunError(f"{label} {engine_analyze_method}() failed: {exc}") from exc + raise SeRunError(f"{label} {hub_analyze_method}() failed: {exc}") from exc if result is None: return ServiceabilityBlock( @@ -151,7 +151,7 @@ def 
run_service_hub( return serviceability_block_from_service_result( afid_events, result, - engine_label=label, + hub_label=label, rf_event_count=len(rf_events), ) @@ -188,7 +188,7 @@ def add_candidate(obj: Any) -> None: if not candidates: raise SeRunError( f"No class with {analyze_method}() found in {package}; " - "check engine_python_module and engine_analyze_method in analysis_args." + "check hub_python_module and hub_analyze_method in analysis_args." ) names = ", ".join(cls.__name__ for cls in candidates) raise SeRunError(f"Multiple classes with {analyze_method}() in {package}: {names}.") diff --git a/test/unit/mock_python_engine.py b/test/unit/mock_python_engine.py index 515eea38..f48a7e43 100644 --- a/test/unit/mock_python_engine.py +++ b/test/unit/mock_python_engine.py @@ -1,4 +1,4 @@ -"""Mock Python service engine for unit tests.""" +"""Mock Python service hub for unit tests.""" from __future__ import annotations @@ -6,7 +6,7 @@ from typing import Any, Optional from serviceability_dummy_data import ( - DUMMY_ENGINE_VERSION, + DUMMY_HUB_VERSION, DUMMY_SAG_PID, DUMMY_SAG_REVISION, DUMMY_SERVICE_ACTION_NUM, @@ -39,5 +39,5 @@ def get_service_info( return SimpleNamespace( service_info=service_info, afid_sag_metadata={"sag_pid": DUMMY_SAG_PID, "sag_revision": DUMMY_SAG_REVISION}, - engine_version_info={"version": DUMMY_ENGINE_VERSION}, + engine_version_info={"version": DUMMY_HUB_VERSION}, ) diff --git a/test/unit/plugin/test_afid_events_bmc_schema.py b/test/unit/plugin/test_afid_events_bmc_schema.py index 7c54364f..8529577c 100644 --- a/test/unit/plugin/test_afid_events_bmc_schema.py +++ b/test/unit/plugin/test_afid_events_bmc_schema.py @@ -4,10 +4,41 @@ # # Copyright (c) 2026 Advanced Micro Devices, Inc. 
# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ############################################################################### """AFID / serviceable unit extraction for OpenBMC-style LogEntry payloads.""" from __future__ import annotations +from serviceability_dummy_data import ( + DUMMY_AFID_A, + DUMMY_AFID_BELOW_RF, + DUMMY_AFID_FATAL_HBM, + DUMMY_TIMESTAMP, + DUMMY_UNIT_A, + DUMMY_UNIT_B, + DUMMY_UNIT_C, + dummy_fatal_hbm_log_entry, + dummy_openbmc_log_entry, + dummy_openbmc_log_entry_serviceable_units_only, +) + from nodescraper.plugins.serviceability.afid_events import ( _afid_event_from_rf_member, build_afid_events_from_data, @@ -16,95 +47,36 @@ ServiceabilityDataModel, ) -# Shape from after_clear_rma_case.json: AFID under Oem.AMDFieldIdentifiers[], OOC under Links. 
-_SAMPLE_LOG_ENTRY = { - "@odata.id": "/redfish/v1/Systems/UBB/LogServices/EventLog/Entries/1", - "Created": "2026-06-16T20:25:22+00:00", - "Id": "1", - "Links": { - "OriginOfCondition": { - "@odata.id": "/redfish/v1/Chassis/OAM_7", - } - }, - "Oem": { - "AMDFieldIdentifiers": [ - { - "AFID": 22, - "Description": "On-die ECC, Uncorrected, Non-fatal", - "ServiceableUnits": [ - {"@odata.id": "/redfish/v1/Chassis/OAM_7"}, - ], - "ServiceableUnits@odata.count": 1, - } - ], - "AMDFieldIdentifiers@Members.count": 1, - }, -} - def test_afid_event_from_openbmc_log_entry_with_links_and_amd_field_identifiers(): - ev = _afid_event_from_rf_member(_SAMPLE_LOG_ENTRY) + ev = _afid_event_from_rf_member(dummy_openbmc_log_entry()) assert ev is not None - assert ev.afid == 22 - assert ev.serviceable_unit == "OAM_7" - assert "2026-06-16" in ev.time + assert ev.afid == DUMMY_AFID_BELOW_RF + assert ev.serviceable_unit == DUMMY_UNIT_A + assert DUMMY_TIMESTAMP[:10] in ev.time def test_serviceable_unit_from_oem_serviceable_units_when_no_links(): - member = { - "Created": "2026-06-16T20:25:22+00:00", - "Oem": { - "AMDFieldIdentifiers": [ - { - "AFID": 23, - "ServiceableUnits": [ - {"@odata.id": "/redfish/v1/Chassis/OAM_3"}, - ], - } - ], - }, - } - ev = _afid_event_from_rf_member(member) + ev = _afid_event_from_rf_member(dummy_openbmc_log_entry_serviceable_units_only()) assert ev is not None - assert ev.afid == 23 - assert ev.serviceable_unit == "OAM_3" - - -# Minimal slice of smci350 command_artifacts.json first CPER row (Links + AMDFieldIdentifiers[]). 
-_SMCI350_STYLE_ENTRY = { - "Created": "2026-06-16T18:53:21+00:00", - "Id": "1", - "Links": { - "OriginOfCondition": {"@odata.id": "/redfish/v1/Chassis/OAM_2"}, - }, - "Oem": { - "AMDFieldIdentifiers": [ - { - "AFID": 25, - "Description": "All Other HBM, Fatal", - "ServiceableUnits": [{"@odata.id": "/redfish/v1/Chassis/OAM_2"}], - "ServiceableUnits@odata.count": 1, - } - ], - "AMDFieldIdentifiers@Members.count": 1, - }, -} + assert ev.afid == DUMMY_AFID_A + assert ev.serviceable_unit == DUMMY_UNIT_B -def test_afid_event_smci350_style_fatal_hbm_entry(): - ev = _afid_event_from_rf_member(_SMCI350_STYLE_ENTRY) +def test_afid_event_fatal_hbm_log_entry(): + ev = _afid_event_from_rf_member(dummy_fatal_hbm_log_entry()) assert ev is not None - assert ev.afid == 25 - assert ev.serviceable_unit == "OAM_2" + assert ev.afid == DUMMY_AFID_FATAL_HBM + assert ev.serviceable_unit == DUMMY_UNIT_C def test_build_afid_events_from_data_includes_openbmc_entries(): data = ServiceabilityDataModel( - rf_events=[_SAMPLE_LOG_ENTRY, _SMCI350_STYLE_ENTRY], + rf_events=[dummy_openbmc_log_entry(), dummy_fatal_hbm_log_entry()], cper_data={}, ) events = build_afid_events_from_data(data) assert len(events) == 2 by_afid_oam = {(e.afid, e.serviceable_unit) for e in events} - assert (22, "OAM_7") in by_afid_oam - assert (25, "OAM_2") in by_afid_oam + assert (DUMMY_AFID_BELOW_RF, DUMMY_UNIT_A) in by_afid_oam + assert (DUMMY_AFID_FATAL_HBM, DUMMY_UNIT_C) in by_afid_oam diff --git a/test/unit/plugin/test_mi3xx_collector.py b/test/unit/plugin/test_mi3xx_collector.py index 625d1165..1cddc2f3 100644 --- a/test/unit/plugin/test_mi3xx_collector.py +++ b/test/unit/plugin/test_mi3xx_collector.py @@ -27,10 +27,17 @@ from pydantic import ValidationError from serviceability_dummy_data import ( DUMMY_BMC_HOST, + DUMMY_CPER_BYTES_BASIC, + DUMMY_CPER_BYTES_RF, + DUMMY_CPER_EVENT_ID_BASIC, + DUMMY_CPER_EVENT_ID_RF, DUMMY_EVENT_URI, DUMMY_EVENT_URI_ALT, DUMMY_TIMESTAMP_EARLIER, DUMMY_TIMESTAMP_LATER, + 
dummy_cper_basic_member, + dummy_cper_rf_member, + dummy_cper_skip_member, ) from nodescraper.connection.redfish import RF_MEMBERS, RedfishGetResult @@ -50,7 +57,6 @@ is_valid_iso_datetime, satisfies_time_check, ) -from nodescraper.plugins.serviceability.mi3xx.mi3xx_cper_utils import RF_CPER_AFID_MIN EVENT_URI = DUMMY_EVENT_URI @@ -176,29 +182,22 @@ def test_mi3xx_collector_fetches_cper_attachments(mi3xx_collector, redfish_conn_ redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( path=EVENT_URI, success=True, - data={ - RF_MEMBERS: [ - { - "Id": "cper-evt-1", - "Created": DUMMY_TIMESTAMP_LATER, - "DiagnosticDataType": "CPER", - "AdditionalDataURI": "/redfish/v1/Systems/UBB/LogServices/EventLog/Attachments/1", - } - ] - }, + data={RF_MEMBERS: [dummy_cper_basic_member()]}, status_code=200, ) response = MagicMock() response.ok = True response.status_code = 200 - response.content = b"\x01\x02dummy-cper" + response.content = DUMMY_CPER_BYTES_BASIC redfish_conn_mock.get_response.return_value = response args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) result, data = mi3xx_collector.collect_data(args=args) assert result.status == ExecutionStatus.OK assert data is not None - assert data.cper_raw["cper-evt-1"] == base64.b64encode(b"\x01\x02dummy-cper").decode("ascii") + assert data.cper_raw[DUMMY_CPER_EVENT_ID_BASIC] == base64.b64encode( + DUMMY_CPER_BYTES_BASIC + ).decode("ascii") assert data.cper_data == {} @@ -209,25 +208,7 @@ def test_mi3xx_collector_skips_cper_when_aca_serial_and_low_afids( redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( path=EVENT_URI, success=True, - data={ - RF_MEMBERS: [ - { - "Id": "cper-evt-skip", - "Created": DUMMY_TIMESTAMP_LATER, - "DiagnosticDataType": "CPER", - "AdditionalDataURI": "/redfish/v1/Systems/UBB/LogServices/EventLog/Attachments/1", - "Oem": { - "AMDFieldIdentifiers": [{"AFID": 22}], - "ErrDataArr": [ - { - "DecodedData": {"error_type": "On-die ECC"}, - "MetaData": {"SerialNumber": "692545012569"}, 
- } - ], - }, - } - ] - }, + data={RF_MEMBERS: [dummy_cper_skip_member()]}, status_code=200, ) args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) @@ -246,38 +227,22 @@ def test_mi3xx_collector_fetches_cper_when_rf_afid(mi3xx_collector, redfish_conn redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( path=EVENT_URI, success=True, - data={ - RF_MEMBERS: [ - { - "Id": "cper-evt-rf", - "Created": DUMMY_TIMESTAMP_LATER, - "DiagnosticDataType": "CPER", - "AdditionalDataURI": "/redfish/v1/Systems/UBB/LogServices/EventLog/Attachments/2", - "Oem": { - "AMDFieldIdentifiers": [{"AFID": RF_CPER_AFID_MIN}], - "ErrDataArr": [ - { - "DecodedData": {"error_type": "x"}, - "MetaData": {"SerialNumber": "692545012569"}, - } - ], - }, - } - ] - }, + data={RF_MEMBERS: [dummy_cper_rf_member()]}, status_code=200, ) response = MagicMock() response.ok = True response.status_code = 200 - response.content = b"\xaa\xbb" + response.content = DUMMY_CPER_BYTES_RF redfish_conn_mock.get_response.return_value = response args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) result, data = mi3xx_collector.collect_data(args=args) assert result.status == ExecutionStatus.OK assert data is not None - assert data.cper_raw["cper-evt-rf"] == base64.b64encode(b"\xaa\xbb").decode("ascii") + assert data.cper_raw[DUMMY_CPER_EVENT_ID_RF] == base64.b64encode(DUMMY_CPER_BYTES_RF).decode( + "ascii" + ) redfish_conn_mock.get_response.assert_called_once() @@ -319,11 +284,11 @@ def test_mi3xx_result_reporting_versions(): plugin_name="dummy_plugin", plugin_version="0.0-dummy", node_scraper_version="0.0-dummy", - dummy_engine_version="0.0-dummy", + dummy_hub_version="0.0-dummy", ) result = MI3XXResult(node="dummy-node", **version_fields) assert result.plugin_name == "dummy_plugin" - assert result.reporter_extensions["dummy_engine_version"] == "0.0-dummy" + assert result.reporter_extensions["dummy_hub_version"] == "0.0-dummy" def test_mi3xx_data_model_log_model(tmp_path): diff --git 
a/test/unit/plugin/test_mi3xx_cper_utils.py b/test/unit/plugin/test_mi3xx_cper_utils.py index e5de352d..b156b930 100644 --- a/test/unit/plugin/test_mi3xx_cper_utils.py +++ b/test/unit/plugin/test_mi3xx_cper_utils.py @@ -24,41 +24,37 @@ # ############################################################################### import pytest +from serviceability_dummy_data import ( + DUMMY_AFID_B, + DUMMY_AFID_BELOW_RF, + DUMMY_RF_CPER_AFID, + dummy_aca_err_row, +) from nodescraper.plugins.serviceability.mi3xx.mi3xx_cper_utils import ( - RF_CPER_AFID_MIN, event_aca_includes_serial, event_afids_from_oem, event_has_aca_decode, should_skip_cper_fetch_or_decode, ) -_DUMMY_META_SERIAL = "DUMMY-GPU-SERIAL-0001" -_DUMMY_DECODED_FIELD = "dummy_error_type" - - -def _oem_err_row(*, serial: bool = True, decoded: bool = True): - meta = {"SerialNumber": _DUMMY_META_SERIAL} if serial else {"GpuFw": "dummy-fw"} - dec = {"error_type": _DUMMY_DECODED_FIELD} if decoded else {} - return {"DecodedData": dec, "MetaData": meta} - def test_skip_when_afids_below_threshold_and_aca_has_serial(): event = { "Oem": { - "AMDFieldIdentifiers": [{"AFID": 22}], - "ErrDataArr": [_oem_err_row()], + "AMDFieldIdentifiers": [{"AFID": DUMMY_AFID_BELOW_RF}], + "ErrDataArr": [dummy_aca_err_row()], } } - assert event_afids_from_oem(event) == [22] + assert event_afids_from_oem(event) == [DUMMY_AFID_BELOW_RF] assert should_skip_cper_fetch_or_decode(event) is True def test_no_skip_when_rf_range_afid_even_with_aca_serial(): event = { "Oem": { - "AMDFieldIdentifiers": [{"AFID": RF_CPER_AFID_MIN}], - "ErrDataArr": [_oem_err_row()], + "AMDFieldIdentifiers": [{"AFID": DUMMY_RF_CPER_AFID}], + "ErrDataArr": [dummy_aca_err_row()], } } assert should_skip_cper_fetch_or_decode(event) is False @@ -67,8 +63,8 @@ def test_no_skip_when_rf_range_afid_even_with_aca_serial(): def test_skip_when_aca_decode_without_serial(): event = { "Oem": { - "AMDFieldIdentifiers": [{"AFID": RF_CPER_AFID_MIN}], - "ErrDataArr": 
[_oem_err_row(serial=False)], + "AMDFieldIdentifiers": [{"AFID": DUMMY_RF_CPER_AFID}], + "ErrDataArr": [dummy_aca_err_row(serial=False)], } } assert event_has_aca_decode(event) is True @@ -79,7 +75,7 @@ def test_skip_when_aca_decode_without_serial(): def test_no_skip_when_no_err_data_decoded(): event = { "Oem": { - "AMDFieldIdentifiers": [{"AFID": 22}], + "AMDFieldIdentifiers": [{"AFID": DUMMY_AFID_BELOW_RF}], } } assert should_skip_cper_fetch_or_decode(event) is False @@ -88,7 +84,7 @@ def test_no_skip_when_no_err_data_decoded(): def test_no_skip_when_aca_serial_but_no_afid_list(): event = { "Oem": { - "ErrDataArr": [_oem_err_row()], + "ErrDataArr": [dummy_aca_err_row()], } } assert event_afids_from_oem(event) == [] @@ -98,11 +94,11 @@ def test_no_skip_when_aca_serial_but_no_afid_list(): @pytest.mark.parametrize( "afids,expect_skip", [ - ([22, 28], True), - ([22, RF_CPER_AFID_MIN], False), + ([DUMMY_AFID_BELOW_RF, DUMMY_AFID_B], True), + ([DUMMY_AFID_BELOW_RF, DUMMY_RF_CPER_AFID], False), ], ) def test_skip_requires_all_afids_below_rf_threshold(afids, expect_skip): identifiers = [{"AFID": a} for a in afids] - event = {"Oem": {"AMDFieldIdentifiers": identifiers, "ErrDataArr": [_oem_err_row()]}} + event = {"Oem": {"AMDFieldIdentifiers": identifiers, "ErrDataArr": [dummy_aca_err_row()]}} assert should_skip_cper_fetch_or_decode(event) is expect_skip diff --git a/test/unit/plugin/test_se_runner.py b/test/unit/plugin/test_se_runner.py index 01f8c4bc..554f0ccc 100644 --- a/test/unit/plugin/test_se_runner.py +++ b/test/unit/plugin/test_se_runner.py @@ -36,7 +36,7 @@ DUMMY_AFID_C, DUMMY_DESIGNATION_A, DUMMY_DESIGNATION_B, - DUMMY_ENGINE_VERSION, + DUMMY_HUB_VERSION, DUMMY_OEM_VENDOR, DUMMY_RF_EVENT_COUNT, DUMMY_SAG_PID, @@ -86,18 +86,18 @@ def test_normalize_se_timestamp_preserves_format_value(): def test_analyzer_args_require_hub_config(): with pytest.raises(ValidationError): ServiceabilityAnalyzerArgs() - with pytest.raises(ValidationError, match="engine_python_module"): 
+ with pytest.raises(ValidationError, match="hub_python_module"): ServiceabilityAnalyzerArgs(afid_sag_path=str(AFID_SAG)) args = ServiceabilityAnalyzerArgs( - engine_python_module="dummy.test.module", + hub_python_module="dummy.test.module", afid_sag_path=str(AFID_SAG), ) - assert args.engine_python_module == "dummy.test.module" + assert args.hub_python_module == "dummy.test.module" def test_resolved_hub_options_explicit_fields_override_options_bag(): args = ServiceabilityAnalyzerArgs( - engine_python_module="dummy.test.module", + hub_python_module="dummy.test.module", afid_sag_path=str(AFID_SAG), hub_options={"from_ac_cycle": 9, "extra": 1}, from_ac_cycle=3, @@ -158,12 +158,12 @@ def test_serviceability_block_from_service_result(): }, }, afid_sag_metadata={"sag_pid": DUMMY_SAG_PID, "sag_revision": DUMMY_SAG_REVISION}, - engine_version_info={"version": DUMMY_ENGINE_VERSION}, + engine_version_info={"version": DUMMY_HUB_VERSION}, ) block = serviceability_block_from_service_result( EXAMPLE_EVENTS[:1], result, - engine_label="Dummy test engine", + hub_label="Dummy test hub", rf_event_count=DUMMY_RF_EVENT_COUNT, ) assert len(block.solution) == 1 @@ -171,12 +171,12 @@ def test_serviceability_block_from_service_result(): assert block.solution[0].service_action_num == DUMMY_SERVICE_ACTION_NUM assert block.solution[0].service_action_title == "Dummy service action" assert set(block.solution[0].serviceable_unit) == {DUMMY_DESIGNATION_A, DUMMY_DESIGNATION_B} - assert block.hub_version == DUMMY_ENGINE_VERSION + assert block.hub_version == DUMMY_HUB_VERSION assert block.afid_sag_file_version is not None assert DUMMY_SAG_PID in block.afid_sag_file_version assert DUMMY_SAG_REVISION in block.afid_sag_file_version assert f"{DUMMY_RF_EVENT_COUNT} Redfish event(s)" in block.solution_reasoning - assert "Dummy test engine" in block.solution_reasoning + assert "Dummy test hub" in block.solution_reasoning def test_serviceability_block_from_service_result_isa_version_info(): @@ -188,7 
+188,7 @@ def test_serviceability_block_from_service_result_isa_version_info(): block = serviceability_block_from_service_result( EXAMPLE_EVENTS[:1], result, - engine_label="ISA", + hub_label="ISA", rf_event_count=1, ) assert block.hub_version == "1.2.3" @@ -220,7 +220,7 @@ def test_run_service_hub_with_mock_module(): {"Afid": DUMMY_AFID_C, "serviceable_unit": DUMMY_UNIT_C, "Created": DUMMY_TIMESTAMP}, ] block = run_service_hub( - engine_python_module="mock_python_engine", + hub_python_module="mock_python_engine", afid_events=EXAMPLE_EVENTS[:2], afid_sag_path=str(AFID_SAG), rf_events=rf_events, @@ -251,14 +251,14 @@ def analyze_events(self, rf_events, cper_data=None): sys.modules["alt_service_engine"] = mod try: run_service_hub( - engine_python_module="alt_service_engine", + hub_python_module="alt_service_engine", afid_events=EXAMPLE_EVENTS[:1], afid_sag_path=str(AFID_SAG), rf_events=[{"Afid": 1}], cper_data={"k": 1}, hub_options={"debug": True}, - engine_analyze_method="analyze_events", - engine_init_path_kwarg="rulebook_path", + hub_analyze_method="analyze_events", + hub_init_path_kwarg="rulebook_path", ) finally: del sys.modules["alt_service_engine"] @@ -273,7 +273,7 @@ def test_run_service_hub_accepts_hub_options(): {"Afid": DUMMY_AFID_A, "serviceable_unit": DUMMY_UNIT_A, "Created": DUMMY_TIMESTAMP}, ] block = run_service_hub( - engine_python_module="mock_python_engine", + hub_python_module="mock_python_engine", afid_events=EXAMPLE_EVENTS[:1], afid_sag_path=str(AFID_SAG), rf_events=rf_events, @@ -290,7 +290,7 @@ def test_run_service_hub_forwards_full_hub_options_kwargs(): {"Afid": DUMMY_AFID_A, "serviceable_unit": DUMMY_UNIT_A, "Created": DUMMY_TIMESTAMP}, ] run_service_hub( - engine_python_module="instinct_shaped_engine", + hub_python_module="instinct_shaped_engine", afid_events=EXAMPLE_EVENTS[:1], afid_sag_path=str(AFID_SAG), rf_events=rf_events, @@ -318,7 +318,7 @@ def test_run_service_hub_collected_cper_overrides_hub_options_cper_data(): {"Afid": 
DUMMY_AFID_A, "serviceable_unit": DUMMY_UNIT_A, "Created": DUMMY_TIMESTAMP}, ] run_service_hub( - engine_python_module="instinct_shaped_engine", + hub_python_module="instinct_shaped_engine", afid_events=EXAMPLE_EVENTS[:1], afid_sag_path=str(AFID_SAG), rf_events=rf_events, @@ -331,7 +331,7 @@ def test_run_service_hub_collected_cper_overrides_hub_options_cper_data(): def test_run_service_hub_missing_sag_raises(): with pytest.raises(SeRunError, match="Hub config file not found"): run_service_hub( - engine_python_module="mock_python_engine", + hub_python_module="mock_python_engine", afid_events=EXAMPLE_EVENTS, afid_sag_path="/nonexistent/dummy_afid_sag.json", rf_events=[{"Afid": DUMMY_AFID_A}], @@ -363,7 +363,7 @@ def test_build_afid_events_from_rf_members(): assert events[1].afid == DUMMY_AFID_B -def test_mi3xx_analyzer_runs_python_engine(system_info): +def test_mi3xx_analyzer_runs_python_hub(system_info): data = ServiceabilityDataModel( rf_events=[ { @@ -380,7 +380,7 @@ def test_mi3xx_analyzer_runs_python_engine(system_info): ) analyzer = MI3XXAnalyzer(system_info=system_info) args = ServiceabilityAnalyzerArgs( - engine_python_module="mock_python_engine", + hub_python_module="mock_python_engine", afid_sag_path=str(AFID_SAG), hub_options={"include_raw_events": False}, ) diff --git a/test/unit/serviceability_dummy_data.py b/test/unit/serviceability_dummy_data.py index 0542c866..379727d1 100644 --- a/test/unit/serviceability_dummy_data.py +++ b/test/unit/serviceability_dummy_data.py @@ -1,8 +1,40 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### """Shared dummy values for serviceability unit tests (not production data).""" +from __future__ import annotations + +from typing import Any + DUMMY_AFID_A = 9001 DUMMY_AFID_B = 9002 DUMMY_AFID_C = 9003 +DUMMY_AFID_BELOW_RF = 22 +DUMMY_AFID_FATAL_HBM = 25 +DUMMY_RF_CPER_AFID = 10000 DUMMY_SERVICE_ACTION_NUM = 99 DUMMY_SERVICE_ACTION_TITLE = "Dummy service action" DUMMY_UNIT_A = "dummy_unit_a" @@ -12,12 +44,137 @@ DUMMY_DESIGNATION_B = "DUMMY_SLOT_B" DUMMY_EVENT_URI = "/redfish/v1/Systems/Dummy/LogServices/DummyEventLog/Entries" DUMMY_EVENT_URI_ALT = "/redfish/v1/Systems/Dummy/LogServices/DummyEventLog/EntriesAlt" +DUMMY_EVENT_LOG_BASE = "/redfish/v1/Systems/Dummy/LogServices/DummyEventLog" +DUMMY_CPER_ATTACHMENT_URI_1 = f"{DUMMY_EVENT_LOG_BASE}/Attachments/1" +DUMMY_CPER_ATTACHMENT_URI_2 = f"{DUMMY_EVENT_LOG_BASE}/Attachments/2" DUMMY_TIMESTAMP = "2000-01-01T12:00:00+00:00" DUMMY_TIMESTAMP_EARLIER = "1999-12-31T12:00:00+00:00" DUMMY_TIMESTAMP_LATER = "2000-01-02T12:00:00+00:00" DUMMY_RF_EVENT_COUNT = 2 DUMMY_SAG_PID = "dummy-sag-pid" DUMMY_SAG_REVISION = "dummy-rev-0" -DUMMY_ENGINE_VERSION = "0.0.0-dummy" +DUMMY_HUB_VERSION = "0.0.0-dummy" DUMMY_BMC_HOST = "dummy-bmc.example" DUMMY_OEM_VENDOR = "DummyVendor" +DUMMY_GPU_SERIAL_NUMBER = "DUMMY-GPU-SERIAL-0001" +DUMMY_DECODED_ERROR_TYPE = "dummy_error_type" +DUMMY_RF_EVENT_ID_1 = "dummy-rf-evt-1" +DUMMY_RF_EVENT_ID_2 = "dummy-rf-evt-2" +DUMMY_CPER_EVENT_ID_BASIC = "dummy-cper-evt-1" +DUMMY_CPER_EVENT_ID_SKIP = "dummy-cper-evt-skip" +DUMMY_CPER_EVENT_ID_RF = "dummy-cper-evt-rf" +DUMMY_CPER_BYTES_BASIC = b"\x01\x02dummy-cper" +DUMMY_CPER_BYTES_RF = b"\xaa\xbb" + + +def dummy_chassis_uri(unit: str) -> str: + return f"/redfish/v1/Chassis/{unit}" + + +def dummy_aca_err_row(*, serial: bool = True, decoded: bool = True) -> dict[str, Any]: + meta = {"SerialNumber": DUMMY_GPU_SERIAL_NUMBER} if serial else {"GpuFw": "dummy-fw"} + decoded_data = 
{"error_type": DUMMY_DECODED_ERROR_TYPE} if decoded else {} + return {"DecodedData": decoded_data, "MetaData": meta} + + +def dummy_cper_rf_member() -> dict[str, Any]: + """RF-range AFID with ACA decode + serial (CPER attachment fetch expected).""" + return { + "Id": DUMMY_CPER_EVENT_ID_RF, + "Created": DUMMY_TIMESTAMP_LATER, + "DiagnosticDataType": "CPER", + "AdditionalDataURI": DUMMY_CPER_ATTACHMENT_URI_2, + "Oem": { + "AMDFieldIdentifiers": [{"AFID": DUMMY_RF_CPER_AFID}], + "ErrDataArr": [dummy_aca_err_row()], + }, + } + + +def dummy_cper_skip_member() -> dict[str, Any]: + """Low AFID with ACA decode + serial (CPER attachment fetch skipped).""" + return { + "Id": DUMMY_CPER_EVENT_ID_SKIP, + "Created": DUMMY_TIMESTAMP_LATER, + "DiagnosticDataType": "CPER", + "AdditionalDataURI": DUMMY_CPER_ATTACHMENT_URI_1, + "Oem": { + "AMDFieldIdentifiers": [{"AFID": DUMMY_AFID_BELOW_RF}], + "ErrDataArr": [ + { + "DecodedData": {"error_type": "dummy_on_die_ecc"}, + "MetaData": {"SerialNumber": DUMMY_GPU_SERIAL_NUMBER}, + } + ], + }, + } + + +def dummy_cper_basic_member() -> dict[str, Any]: + """CPER event without OEM ACA block (attachment fetch expected).""" + return { + "Id": DUMMY_CPER_EVENT_ID_BASIC, + "Created": DUMMY_TIMESTAMP_LATER, + "DiagnosticDataType": "CPER", + "AdditionalDataURI": DUMMY_CPER_ATTACHMENT_URI_1, + } + + +def dummy_openbmc_log_entry() -> dict[str, Any]: + """OpenBMC-style LogEntry with Links OOC and AMDFieldIdentifiers[].""" + return { + "@odata.id": f"{DUMMY_EVENT_URI}/1", + "Created": DUMMY_TIMESTAMP, + "Id": DUMMY_RF_EVENT_ID_1, + "Links": { + "OriginOfCondition": {"@odata.id": dummy_chassis_uri(DUMMY_UNIT_A)}, + }, + "Oem": { + "AMDFieldIdentifiers": [ + { + "AFID": DUMMY_AFID_BELOW_RF, + "Description": "dummy on-die ECC, uncorrected, non-fatal", + "ServiceableUnits": [{"@odata.id": dummy_chassis_uri(DUMMY_UNIT_A)}], + "ServiceableUnits@odata.count": 1, + } + ], + "AMDFieldIdentifiers@Members.count": 1, + }, + } + + +def 
dummy_openbmc_log_entry_serviceable_units_only() -> dict[str, Any]: + """LogEntry with ServiceableUnits only (no Links OOC).""" + return { + "Created": DUMMY_TIMESTAMP, + "Oem": { + "AMDFieldIdentifiers": [ + { + "AFID": DUMMY_AFID_A, + "ServiceableUnits": [{"@odata.id": dummy_chassis_uri(DUMMY_UNIT_B)}], + } + ], + }, + } + + +def dummy_fatal_hbm_log_entry() -> dict[str, Any]: + """Minimal CPER-style row with Links + AMDFieldIdentifiers[].""" + return { + "Created": DUMMY_TIMESTAMP_LATER, + "Id": DUMMY_RF_EVENT_ID_2, + "Links": { + "OriginOfCondition": {"@odata.id": dummy_chassis_uri(DUMMY_UNIT_C)}, + }, + "Oem": { + "AMDFieldIdentifiers": [ + { + "AFID": DUMMY_AFID_FATAL_HBM, + "Description": "dummy fatal HBM", + "ServiceableUnits": [{"@odata.id": dummy_chassis_uri(DUMMY_UNIT_C)}], + "ServiceableUnits@odata.count": 1, + } + ], + "AMDFieldIdentifiers@Members.count": 1, + }, + } From f8e7e47e25fb84bc2069e4d8dbe91f03cb8e53a4 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 22 Jun 2026 17:08:35 -0500 Subject: [PATCH 16/19] Add instinct_shaped_engine test helper for hub options forwarding --- test/unit/instinct_shaped_engine.py | 68 +++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 test/unit/instinct_shaped_engine.py diff --git a/test/unit/instinct_shaped_engine.py b/test/unit/instinct_shaped_engine.py new file mode 100644 index 00000000..6fa7f234 --- /dev/null +++ b/test/unit/instinct_shaped_engine.py @@ -0,0 +1,68 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### + +from __future__ import annotations + +from typing import Any, Optional + +__all__ = ["InstinctShapedEngine"] + +_LAST_CALL: dict[str, Any] = {} + + +def clear_last_call() -> None: + _LAST_CALL.clear() + + +def get_last_call() -> dict[str, Any]: + return dict(_LAST_CALL) + + +class InstinctShapedEngine: + """Mirrors keyword parameters of ``InstinctServiceAssistant.get_service_info``.""" + + def __init__(self, afid_sag: str) -> None: + self.afid_sag = afid_sag + + def get_service_info( + self, + rf_events: list[Any], + from_ac_cycle: int = -1, + from_date: Optional[str] = None, + cper_data: Optional[dict[str, Any]] = None, + designation_serials: Optional[dict[str, str]] = None, + suppress_service_actions: Optional[list[str]] = None, + ) -> None: + _LAST_CALL.clear() + _LAST_CALL.update( + from_ac_cycle=from_ac_cycle, + from_date=from_date, + cper_data=cper_data, + designation_serials=designation_serials, + suppress_service_actions=suppress_service_actions, + rf_len=len(rf_events), + ) + return None From af8a3f3e756f66fe639f7ab71fbb8e43d24d0a35 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 22 Jun 2026 18:25:42 -0500 Subject: [PATCH 17/19] updated documentation to include serviceability plugin --- .mypy.ini | 1 + .pre-commit-config.yaml | 2 +- docs/PLUGIN_DOC.md | 90 +++++++++++++++++++ docs/generate_plugin_doc_bundle.py | 2 +- .../mi3xx/serviceability_plugin_mi3xx.py | 4 +- pyproject.toml | 2 + 6 files changed, 98 insertions(+), 3 deletions(-) diff --git a/.mypy.ini b/.mypy.ini index f9d68f19..cf6c2344 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -1,5 +1,6 @@ [mypy] # Global mypy configuration +mypy_path = test/unit [mypy-nodescraper.base.regexanalyzer] ignore_errors = True diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9919bb08..85a64e4f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,5 +28,5 @@ repos: rev: v1.15.0 
hooks: - id: mypy - args: [--install-types, --non-interactive, --allow-redefinition] + args: [--install-types, --non-interactive, --explicit-package-bases, --allow-redefinition] language: system diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md index 88c06e42..80d4e012 100644 --- a/docs/PLUGIN_DOC.md +++ b/docs/PLUGIN_DOC.md @@ -41,6 +41,8 @@ | OobBmcArchivePlugin | SSH (BMC) shell: tar+gzip archives for each path in collection_args (see PathSpec entries).
Uses sudo on the BMC when collection_args paths require elevated access. | - | **Collection Args:**
- `paths`: list[nodescraper.plugins.ooband.bmc_archive.collector_args.PathSpec] — Named BMC paths to archive with tar czf -. Configure in plugin config under plugins.OobBmcArchivePlugin.collection_ar...
- `sudo`: bool — Default sudo setting for paths that do not specify sudo.
- `timeout`: int — Default per-path tar timeout in seconds.
- `skip_if_missing`: bool — Skip paths that do not exist on the BMC instead of failing collection.
- `ignore_failed_read`: bool — When true, pass GNU tar's --ignore-failed-read when the remote tar supports it. | [BmcArchiveDataModel](#BmcArchiveDataModel-Model) | [BmcArchiveCollector](#Collector-Class-BmcArchiveCollector) | - | | RedfishEndpointPlugin | Redfish GET: explicit paths from collection_args.uris (parallel when max_workers>1).
Optional paged GET following the Members collection OData nextLink field when follow_next_link is true.
Redfish GET tree: when discover_tree is true, walks from api_root using OData resource id links and Members navigation (depth and endpoint caps from collection_args). | For each entry in analysis_args.checks, reads JSON paths in collected responses and compares values to constraints (eq, min/max, anyOf, regex, etc.).
URI key "*" runs checks against every collected response body.
**Analyzer Args:**
- `checks`: dict[str, dict[str, Union[int, float, str, bool, dict[str, Any]]]] — Map: URI or '*' -> { property_path: constraint }. URI keys must match a key in the collected responses (exact match).... | **Collection Args:**
- `uris`: list[str] — Redfish URIs to GET. Ignored when discover_tree is True.
- `discover_tree`: bool — If True, discover endpoints from the BMC Redfish tree (service root and links) instead of using uris.
- `tree_max_depth`: int — When discover_tree is True: max traversal depth (1=service root only, 2=root + collections, 3=+ members).
- `tree_max_endpoints`: int — When discover_tree is True: max endpoints to discover (0=no limit).
- `max_workers`: int — Max concurrent GETs (1=sequential). Use >1 for async endpoint fetches.
- `follow_next_link`: bool — If True, follow Redfish Members collection OData nextLink pagination for each URI and merge all pages into a single r...
- `max_pages`: int — When follow_next_link is True: safety cap on the number of pages to follow per URI (default 200). | [RedfishEndpointDataModel](#RedfishEndpointDataModel-Model) | [RedfishEndpointCollector](#Collector-Class-RedfishEndpointCollector) | [RedfishEndpointAnalyzer](#Data-Analyzer-Class-RedfishEndpointAnalyzer) | | RedfishOemDiagPlugin | Redfish LogService.CollectDiagnosticData for each entry in collection_args.oem_diagnostic_types (collection_args.log_service_path selects the LogService).
Optional binary archives under the plugin log path when log_path is set. | Summarizes success/failure per OEM diagnostic type from collected results.
When analysis_args.require_all_success is true, fails the run if any type failed collection.
**Analyzer Args:**
- `require_all_success`: bool — If True, analysis fails when any OEM type collection failed. | **Collection Args:**
- `log_service_path`: str — Redfish path to the LogService (e.g. DiagLogs).
- `oem_diagnostic_types_allowable`: Optional[list[str]] — Allowable OEM diagnostic types for this architecture/BMC. When set, used for validation and as default for oem_diagno...
- `oem_diagnostic_types`: list[str] — OEM diagnostic types to collect. When empty and oem_diagnostic_types_allowable is set, defaults to that list.
- `task_timeout_s`: int — Max seconds to wait for each BMC task. | [RedfishOemDiagDataModel](#RedfishOemDiagDataModel-Model) | [RedfishOemDiagCollector](#Collector-Class-RedfishOemDiagCollector) | [RedfishOemDiagAnalyzer](#Data-Analyzer-Class-RedfishOemDiagAnalyzer) | +| ServiceabilityPluginMI3XX | - | **Analyzer Args:**
- `hub_python_module`: Optional[str] — Import path for the hub module (class implements hub_analyze_method); hub_options forwards kwargs.
- `hub_display_name`: Optional[str] — Optional label for analyzer status messages.
- `afid_sag_path`: Optional[str] — Path to hub config (e.g. AFID_SAG.json); passed as hub_init_path_kwarg.
- `hub_init_path_kwarg`: str — Hub __init__ keyword that receives afid_sag_path.
- `hub_analyze_method`: str — Hub method called with rf_events first (default get_service_info).
- `skip_hub`: bool — If True, only build afid_events without running the service hub.
- `cper_decode_module`: Optional[str] — Module import path for CPER decoding when events include CPER attachments.
- `cper_decode_method`: str — Callable on cper_decode_module: file-like CPER in, (return_code, decode_dict) out.
- `hub_options`: Optional[dict[str, Any]] — Extra kwargs for hub __init__ and analyze; collected cper_data overrides cper_data key.
- `from_ac_cycle`: int — from_ac_cycle kwarg for the hub analyze call (merged after hub_options).
- `from_date`: Optional[str] — Optional from_date for the hub analyze call (merged after hub_options).
- `designation_serials`: Optional[dict[str, str]] — Optional designation_serials for the hub analyze call (merged after hub_options).
- `suppress_service_actions`: Optional[list[str]] — Optional suppress_service_actions for the hub analyze call (merged after hub_options). | **Collection Args:**
- `uri`: Optional[str] — Optional alias for ``rf_event_log_uri``. When both ``uri`` and ``rf_event_log_uri`` are explicitly set to non-empty v...
- `rf_event_log_uri`: str — Redfish URI for the event log ``Entries`` collection.
- `rf_chassis_devices`: Optional[List[str]] — Chassis designations for Assembly GETs; required with ``rf_assembly_uri_template``.
- `rf_assembly_uri_template`: Optional[str] — Redfish URI template containing ``{device}`` for each chassis Assembly resource.
- `rf_firmware_bundle_uri`: Optional[str] — Redfish URI for firmware bundle inventory when subclasses extract component details.
- `follow_next_link`: bool — If True, follow Members@odata.nextLink up to max_pages; else single GET.
- `max_pages`: int — Safety cap on the number of pages when following event log pagination.
- `top`: Optional[int] — Most recent N entries via $skip after count probe; None collects full window.
- `reference_time`: Optional[str] — Optional ISO-8601 date or date-time used with time_operator (e.g. 2026-05-17 or 2026-05-17T13:01:00).
- `time_operator`: Optional[Literal['>', '>=', '<', '<=', '==']] — Comparison operator applied when reference_time is set. | [ServiceabilityDataModel](#ServiceabilityDataModel-Model) | [MI3XXCollector](#Collector-Class-MI3XXCollector) | [MI3XXAnalyzer](#Data-Analyzer-Class-MI3XXAnalyzer) | +| ServiceabilityPluginBase | - | - | - | [ServiceabilityDataModel](#ServiceabilityDataModel-Model) | [ServiceabilityCollectorBase](#Collector-Class-ServiceabilityCollectorBase) | - | # Collectors @@ -1045,6 +1047,34 @@ RedfishOemDiagDataModel - Redfish LogService.CollectDiagnosticData for each entry in collection_args.oem_diagnostic_types (collection_args.log_service_path selects the LogService). - Optional binary archives under the plugin log path when log_path is set. +## Collector Class MI3XXCollector + +### Description + +MI3XX OOB Redfish serviceability collector. + +**Bases**: ['ServiceabilityCollectorBase'] + +**Link to code**: [mi3xx_collector.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py) + +### Provides Data + +ServiceabilityDataModel + +## Collector Class ServiceabilityCollectorBase + +### Description + +OOB Redfish collection skeleton; subclasses implement filtering, CPER handling, and JSON parsing. + +**Bases**: ['RedfishDataCollector', 'Generic'] + +**Link to code**: [serviceability_collector.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/serviceability/serviceability_collector.py) + +### Provides Data + +ServiceabilityDataModel + # Data Models ## GenericCollectionDataModel Model @@ -1549,6 +1579,30 @@ Collected Redfish OEM diagnostic log results: OEM type -> result (success, error - **results**: `dict[str, nodescraper.plugins.ooband.redfish_oem_diag.oem_diag_data.OemDiagTypeResult]` +## ServiceabilityDataModel Model + +### Description + +Collected Redfish responses and intermediate serviceability fields. 
+ +**Link to code**: [serviceability_data.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/serviceability/serviceability_data.py) + +**Bases**: ['DataModel'] + +### Model annotations and fields + +- **responses**: `dict[str, Any]` +- **rf_events**: `list[Any]` +- **assembly_info**: `Dict[str, DeviceInfo]` +- **cper_raw**: `Dict[str, str]` +- **cper_data**: `Dict[str, Any]` +- **component_details**: `Optional[str]` +- **log_path**: `Optional[str]` +- **bmc_host**: `Optional[str]` +- **afid_events**: `List[AfidEvent]` +- **serviceability**: `Optional[ServiceabilityBlock]` +- **result**: `Optional[ServiceabilityResult]` + # Data Analyzers ## Data Analyzer Class GenericAnalyzer @@ -1978,6 +2032,16 @@ Analyzes Redfish OEM diagnostic log collection results. - Summarizes success/failure per OEM diagnostic type from collected results. - When analysis_args.require_all_success is true, fails the run if any type failed collection. +## Data Analyzer Class MI3XXAnalyzer + +### Description + +Build AFID events from collected data and run the configured service hub. + +**Bases**: ['DataAnalyzer'] + +**Link to code**: [mi3xx_analyzer.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py) + # Analyzer Args ## Analyzer Args Class GenericAnalyzerArgs @@ -2300,3 +2364,29 @@ Analyzer args for Redfish OEM diagnostic log results. ### Annotations / fields - **require_all_success**: `bool` — If True, analysis fails when any OEM type collection failed. + +## Analyzer Args Class ServiceabilityAnalyzerArgs + +### Description + +Analyzer args for serviceability plugins that run a configurable Python hub. 
+ +**Bases**: ['AnalyzerArgs'] + +**Link to code**: [analyzer_args.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/serviceability/analyzer_args.py) + +### Annotations / fields + +- **hub_python_module**: `Optional[str]` — Import path for the hub module (class implements hub_analyze_method); hub_options forwards kwargs. +- **hub_display_name**: `Optional[str]` — Optional label for analyzer status messages. +- **afid_sag_path**: `Optional[str]` — Path to hub config (e.g. AFID_SAG.json); passed as hub_init_path_kwarg. +- **hub_init_path_kwarg**: `str` — Hub __init__ keyword that receives afid_sag_path. +- **hub_analyze_method**: `str` — Hub method called with rf_events first (default get_service_info). +- **skip_hub**: `bool` — If True, only build afid_events without running the service hub. +- **cper_decode_module**: `Optional[str]` — Module import path for CPER decoding when events include CPER attachments. +- **cper_decode_method**: `str` — Callable on cper_decode_module: file-like CPER in, (return_code, decode_dict) out. +- **hub_options**: `Optional[dict[str, Any]]` — Extra kwargs for hub __init__ and analyze; collected cper_data overrides cper_data key. +- **from_ac_cycle**: `int` — from_ac_cycle kwarg for the hub analyze call (merged after hub_options). +- **from_date**: `Optional[str]` — Optional from_date for the hub analyze call (merged after hub_options). +- **designation_serials**: `Optional[dict[str, str]]` — Optional designation_serials for the hub analyze call (merged after hub_options). +- **suppress_service_actions**: `Optional[list[str]]` — Optional suppress_service_actions for the hub analyze call (merged after hub_options). 
diff --git a/docs/generate_plugin_doc_bundle.py b/docs/generate_plugin_doc_bundle.py index 4d873ca5..cd9897b0 100644 --- a/docs/generate_plugin_doc_bundle.py +++ b/docs/generate_plugin_doc_bundle.py @@ -41,7 +41,7 @@ from typing import Any, Iterable, List, Optional, Type LINK_BASE_DEFAULT = "https://github.com/amd/node-scraper/blob/HEAD/" -REL_ROOT_DEFAULT = "nodescraper/plugins/inband" +REL_ROOT_DEFAULT = "nodescraper/plugins" # Import and document every concrete plugin under nodescraper.plugins (inband, ooband, # generic_collection, regex_search, serviceability, …). PACKAGE_PLUGINS_ROOT = "nodescraper.plugins" diff --git a/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py b/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py index 2f38783f..d578d949 100644 --- a/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py +++ b/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py @@ -23,6 +23,7 @@ # SOFTWARE. # ############################################################################### +from nodescraper.plugins.serviceability.analyzer_args import ServiceabilityAnalyzerArgs from nodescraper.plugins.serviceability.serviceability_data import ( ServiceabilityDataModel, ) @@ -41,9 +42,10 @@ class ServiceabilityPluginMI3XX(ServiceabilityPluginBase): - """MI3XX OOB Redfish serviceability plugin.""" + """MI3XX OOB Redfish serviceability: BMC event log, CPER attachments, and service hub analysis.""" DATA_MODEL = ServiceabilityDataModel COLLECTOR = MI3XXCollector ANALYZER = MI3XXAnalyzer COLLECTOR_ARGS = MI3XXCollectorArgs + ANALYZER_ARGS = ServiceabilityAnalyzerArgs diff --git a/pyproject.toml b/pyproject.toml index e3f0220a..8cf05b74 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ dev = [ "pytest-cov", "mypy", "types-paramiko", + "types-requests", "types-setuptools", ] @@ -83,6 +84,7 @@ ignore = ["E501", "N806"] [tool.mypy] python_version = "3.9" mypy_path = ["test/unit"] 
+explicit_package_bases = true [tool.setuptools_scm] version_scheme = "post-release" From 7ee4f921298d030d4c8cd41b0f4775901f51c030 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 23 Jun 2026 09:05:38 -0500 Subject: [PATCH 18/19] addresses reviews --- .../serviceability/mi3xx/mi3xx_analyzer.py | 6 +- .../serviceability/mi3xx/mi3xx_collector.py | 24 +++---- .../serviceability/mi3xx/mi3xx_cper_utils.py | 69 +++++++++++-------- test/unit/instinct_shaped_engine.py | 1 - test/unit/plugin/test_mi3xx_collector.py | 18 +++++ test/unit/plugin/test_mi3xx_cper_utils.py | 37 +++++++++- test/unit/serviceability_dummy_data.py | 3 + 7 files changed, 111 insertions(+), 47 deletions(-) diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py index 6150398e..b3e2644d 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py @@ -47,7 +47,7 @@ ServiceabilityDataModel, ) -from .mi3xx_cper_utils import RF_CPER_AFID_MIN, should_skip_cper_fetch_or_decode +from .mi3xx_cper_utils import CPER_METHOD_AFID_MAX, should_skip_cper_fetch_or_decode class AfidSagMetadataArtifact(BaseModel): @@ -90,10 +90,10 @@ def analyze_data( if skipped_cper: self.logger.info( "(%s) Skipping CPER decode for %d CPER attachment(s); Redfish log " - "already has usable ACA fields (AFID<%s or no serial on decode)", + "already has usable ACA fields (CPER-method AFID<=%s or no serial on decode)", parent, skipped_cper, - RF_CPER_AFID_MIN, + CPER_METHOD_AFID_MAX, ) if cper_raw_to_decode and not cper_data: if not args.cper_decode_module: diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py index 8921796c..d155f14a 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py @@ -35,13 +35,14 @@ from 
nodescraper.plugins.serviceability.time_utils import satisfies_time_check from .mi3xx_collector_args import MI3XXCollectorArgs -from .mi3xx_cper_utils import RF_CPER_AFID_MIN, should_skip_cper_fetch_or_decode +from .mi3xx_cper_utils import CPER_METHOD_AFID_MAX, should_skip_cper_fetch_or_decode _EVENT_TIMESTAMP_KEYS = ("Created", "EventTimestamp", "Timestamp") class MI3XXCollector(ServiceabilityCollectorBase[MI3XXCollectorArgs]): - """MI3XX OOB Redfish serviceability collector.""" + """Collect MI3XX BMC Redfish data: event log members (with pagination), firmware inventory, + CPER attachment bytes for qualifying events, and optional assembly/chassis metadata.""" def satisfies_reference_time( self, @@ -69,15 +70,12 @@ def filter_event_members( return filtered def is_cper_event(self, event: dict) -> bool: - if "CPER" in event: - return True - if str(event.get("DiagnosticDataType", "")).upper() == "CPER": - return True - if event.get("AdditionalDataURI"): - return True - message_id = str(event.get("MessageId", "")).lower() - message = str(event.get("Message", "")).lower() - return "cper" in message_id or "cper" in message or "diagnostic" in message_id + """True when the log entry is a Redfish CPER attachment event.""" + return ( + "CPER" in event + and str(event.get("DiagnosticDataType", "")).upper() == "CPER" + and bool(event.get("AdditionalDataURI")) + ) def collect_cper_attachments(self, rf_events: list[Any]) -> dict[str, str]: """Fetch CPER binaries from BMC; decoding runs in the analyzer.""" @@ -94,10 +92,10 @@ def collect_cper_attachments(self, rf_events: list[Any]) -> dict[str, str]: if should_skip_cper_fetch_or_decode(event): self.logger.info( "(%s) Skipping CPER attachment fetch for Redfish event %s " - "(ACA decode already on log entry; AFID<%s check or no serial)", + "(ACA decode already on log entry; CPER-method AFID<=%s or no serial)", parent, event_id, - RF_CPER_AFID_MIN, + CPER_METHOD_AFID_MAX, ) continue diff --git 
a/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py index fe9661dc..7aa047a9 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py @@ -27,29 +27,42 @@ from typing import Any -# Redfish CPER (RF) style AFIDs start at this value; lower values are in-band / -# OEM-field AFIDs already reflected on the log entry. -RF_CPER_AFID_MIN = 10000 +# CPER-method AFIDs <= 34; Redfish-method AFIDs >= 10000. +CPER_METHOD_AFID_MAX = 34 +REDFISH_METHOD_AFID_MIN = 10000 _SERIAL_KEYS = ("SerialNumber", "serial_number", "UbbSerial", "ubb_serial") -def event_afids_from_oem(event: dict[str, Any]) -> list[int]: - """AFIDs from ``Oem.AMDFieldIdentifiers`` (or similar list-of-dicts).""" +def _oem_dict(event: dict[str, Any]) -> dict[str, Any]: oem = event.get("Oem") - if not isinstance(oem, dict): - return [] - raw = oem.get("AMDFieldIdentifiers") - if not isinstance(raw, list): - return [] + return oem if isinstance(oem, dict) else {} + + +def _oem_list_field(oem: dict[str, Any], key: str) -> list[Any]: + """Return a list field from ``Oem`` or nested ``Oem.AMD`` (BMC layout varies).""" + raw = oem.get(key) + if isinstance(raw, list): + return raw + amd = oem.get("AMD") + if isinstance(amd, dict): + nested = amd.get(key) + if isinstance(nested, list): + return nested + return [] + + +def event_afids_from_oem(event: dict[str, Any]) -> list[int]: + """AFIDs from ``Oem.AMDFieldIdentifiers`` or ``Oem.AMD.AMDFieldIdentifiers``.""" + raw = _oem_list_field(_oem_dict(event), "AMDFieldIdentifiers") out: list[int] = [] for item in raw: if not isinstance(item, dict): continue for key in ("AFID", "Afid", "afid"): - if key in item and item[key] is not None: + if (v := item.get(key)) is not None: try: - out.append(int(item[key])) + out.append(int(v)) except (TypeError, ValueError): pass break @@ -57,12 +70,8 @@ def event_afids_from_oem(event: 
dict[str, Any]) -> list[int]: def _err_data_arr_entries(event: dict[str, Any]) -> list[dict[str, Any]]: - oem = event.get("Oem") - if not isinstance(oem, dict): - return [] - arr = oem.get("ErrDataArr") - if not isinstance(arr, list): - return [] + """``ErrDataArr`` rows from ``Oem.ErrDataArr`` or ``Oem.AMD.ErrDataArr``.""" + arr = _oem_list_field(_oem_dict(event), "ErrDataArr") return [e for e in arr if isinstance(e, dict)] @@ -86,24 +95,30 @@ def _nonempty_serial_in_mapping(obj: Any) -> bool: def event_aca_includes_serial(event: dict[str, Any]) -> bool: - """Serial (or UBB serial) present on any ``ErrDataArr`` row (typically ``MetaData``).""" + """Serial (or UBB serial) present on any ``ErrDataArr`` row ``MetaData``.""" for entry in _err_data_arr_entries(event): - meta = entry.get("MetaData") - if _nonempty_serial_in_mapping(meta): - return True - decoded = entry.get("DecodedData") - if _nonempty_serial_in_mapping(decoded): + if _nonempty_serial_in_mapping(entry.get("MetaData")): return True return False +def is_cper_method_afid(afid: int) -> bool: + """True for CPER-method AFIDs (<= ``CPER_METHOD_AFID_MAX``), including on RF log entries.""" + return afid <= CPER_METHOD_AFID_MAX + + +def is_redfish_method_afid(afid: int) -> bool: + """True for Redfish-method AFIDs in the 10k range.""" + return afid >= REDFISH_METHOD_AFID_MIN + + def should_skip_cper_fetch_or_decode(event: dict[str, Any]) -> bool: """Whether to omit CPER binary fetch and configured CPER decode for this Redfish member. Skip when: - * Every OEM-listed AFID is below ``RF_CPER_AFID_MIN`` (non-RF CPER range), - ACA ``DecodedData`` is present, and a serial is present on the entry; or + * Every OEM-listed AFID is CPER-method (<= ``CPER_METHOD_AFID_MAX``; may match + in-band CPER AFIDs), ACA ``DecodedData`` is present, and serial is on the entry; or * ACA ``DecodedData`` is present but no serial — the CPER blob does not add actionable identity beyond what is already missing from the log. 
""" @@ -114,4 +129,4 @@ def should_skip_cper_fetch_or_decode(event: dict[str, Any]) -> bool: afids = event_afids_from_oem(event) if not afids: return False - return all(afid < RF_CPER_AFID_MIN for afid in afids) + return all(is_cper_method_afid(afid) for afid in afids) diff --git a/test/unit/instinct_shaped_engine.py b/test/unit/instinct_shaped_engine.py index 6fa7f234..b5989a24 100644 --- a/test/unit/instinct_shaped_engine.py +++ b/test/unit/instinct_shaped_engine.py @@ -23,7 +23,6 @@ # SOFTWARE. # ############################################################################### - from __future__ import annotations from typing import Any, Optional diff --git a/test/unit/plugin/test_mi3xx_collector.py b/test/unit/plugin/test_mi3xx_collector.py index 1cddc2f3..96a9d556 100644 --- a/test/unit/plugin/test_mi3xx_collector.py +++ b/test/unit/plugin/test_mi3xx_collector.py @@ -175,6 +175,24 @@ def test_mi3xx_collector_satisfies_reference_time_helper(mi3xx_collector): assert not mi3xx_collector.satisfies_reference_time(DUMMY_TIMESTAMP_EARLIER, args) +def test_mi3xx_collector_is_cper_event_requires_cper_block_type_and_uri(mi3xx_collector): + assert mi3xx_collector.is_cper_event(dummy_cper_basic_member()) + assert not mi3xx_collector.is_cper_event( + { + "Id": "non-cper", + "AdditionalDataURI": DUMMY_EVENT_URI, + "MessageId": "ResourceEvent.1.2.1.ResourceErrorsDetectedOEM", + } + ) + assert not mi3xx_collector.is_cper_event( + { + "Id": "partial-cper", + "CPER": {"NotificationType": "dummy"}, + "DiagnosticDataType": "CPER", + } + ) + + def test_mi3xx_collector_fetches_cper_attachments(mi3xx_collector, redfish_conn_mock): import base64 from unittest.mock import MagicMock diff --git a/test/unit/plugin/test_mi3xx_cper_utils.py b/test/unit/plugin/test_mi3xx_cper_utils.py index b156b930..e4e2965e 100644 --- a/test/unit/plugin/test_mi3xx_cper_utils.py +++ b/test/unit/plugin/test_mi3xx_cper_utils.py @@ -25,16 +25,19 @@ 
############################################################################### import pytest from serviceability_dummy_data import ( - DUMMY_AFID_B, DUMMY_AFID_BELOW_RF, + DUMMY_AFID_FATAL_HBM, DUMMY_RF_CPER_AFID, dummy_aca_err_row, ) from nodescraper.plugins.serviceability.mi3xx.mi3xx_cper_utils import ( + CPER_METHOD_AFID_MAX, event_aca_includes_serial, event_afids_from_oem, event_has_aca_decode, + is_cper_method_afid, + is_redfish_method_afid, should_skip_cper_fetch_or_decode, ) @@ -50,6 +53,34 @@ def test_skip_when_afids_below_threshold_and_aca_has_serial(): assert should_skip_cper_fetch_or_decode(event) is True +def test_event_afids_from_oem_nested_amd_block(): + event = { + "Oem": { + "AMD": { + "AMDFieldIdentifiers": [{"AFID": DUMMY_AFID_BELOW_RF}], + "ErrDataArr": [dummy_aca_err_row()], + } + } + } + assert event_afids_from_oem(event) == [DUMMY_AFID_BELOW_RF] + assert event_has_aca_decode(event) is True + assert should_skip_cper_fetch_or_decode(event) is True + + +def test_err_data_arr_entries_nested_amd_block(): + event = {"Oem": {"AMD": {"ErrDataArr": [dummy_aca_err_row()]}}} + assert event_has_aca_decode(event) is True + assert event_aca_includes_serial(event) is True + + +def test_afid_method_ranges(): + assert is_cper_method_afid(DUMMY_AFID_BELOW_RF) + assert is_cper_method_afid(CPER_METHOD_AFID_MAX) + assert not is_cper_method_afid(CPER_METHOD_AFID_MAX + 1) + assert is_redfish_method_afid(DUMMY_RF_CPER_AFID) + assert not is_redfish_method_afid(DUMMY_AFID_BELOW_RF) + + def test_no_skip_when_rf_range_afid_even_with_aca_serial(): event = { "Oem": { @@ -94,11 +125,11 @@ def test_no_skip_when_aca_serial_but_no_afid_list(): @pytest.mark.parametrize( "afids,expect_skip", [ - ([DUMMY_AFID_BELOW_RF, DUMMY_AFID_B], True), + ([DUMMY_AFID_BELOW_RF, DUMMY_AFID_FATAL_HBM], True), ([DUMMY_AFID_BELOW_RF, DUMMY_RF_CPER_AFID], False), ], ) -def test_skip_requires_all_afids_below_rf_threshold(afids, expect_skip): +def test_skip_requires_all_afids_cper_method(afids, 
expect_skip): identifiers = [{"AFID": a} for a in afids] event = {"Oem": {"AMDFieldIdentifiers": identifiers, "ErrDataArr": [dummy_aca_err_row()]}} assert should_skip_cper_fetch_or_decode(event) is expect_skip diff --git a/test/unit/serviceability_dummy_data.py b/test/unit/serviceability_dummy_data.py index 379727d1..22e883e8 100644 --- a/test/unit/serviceability_dummy_data.py +++ b/test/unit/serviceability_dummy_data.py @@ -82,6 +82,7 @@ def dummy_cper_rf_member() -> dict[str, Any]: return { "Id": DUMMY_CPER_EVENT_ID_RF, "Created": DUMMY_TIMESTAMP_LATER, + "CPER": {"NotificationType": "dummy-notification-type"}, "DiagnosticDataType": "CPER", "AdditionalDataURI": DUMMY_CPER_ATTACHMENT_URI_2, "Oem": { @@ -96,6 +97,7 @@ def dummy_cper_skip_member() -> dict[str, Any]: return { "Id": DUMMY_CPER_EVENT_ID_SKIP, "Created": DUMMY_TIMESTAMP_LATER, + "CPER": {"NotificationType": "dummy-notification-type"}, "DiagnosticDataType": "CPER", "AdditionalDataURI": DUMMY_CPER_ATTACHMENT_URI_1, "Oem": { @@ -115,6 +117,7 @@ def dummy_cper_basic_member() -> dict[str, Any]: return { "Id": DUMMY_CPER_EVENT_ID_BASIC, "Created": DUMMY_TIMESTAMP_LATER, + "CPER": {"NotificationType": "dummy-notification-type"}, "DiagnosticDataType": "CPER", "AdditionalDataURI": DUMMY_CPER_ATTACHMENT_URI_1, } From 782d1562838fef3272e6b8bbff0c155e8c7672ba Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 23 Jun 2026 11:53:37 -0500 Subject: [PATCH 19/19] addressed reviews --- .../serviceability/mi3xx/mi3xx_cper_utils.py | 63 ++++++++++--------- test/unit/plugin/test_mi3xx_cper_utils.py | 19 ++++++ 2 files changed, 51 insertions(+), 31 deletions(-) diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py index 7aa047a9..bdc4ce15 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py @@ -27,34 +27,36 @@ from typing import Any -# CPER-method 
AFIDs <= 34; Redfish-method AFIDs >= 10000. +# CPER-method AFIDs <= 34; MI3XX Redfish-method AFIDs 10000–10999. CPER_METHOD_AFID_MAX = 34 REDFISH_METHOD_AFID_MIN = 10000 +REDFISH_METHOD_AFID_MAX = 10999 _SERIAL_KEYS = ("SerialNumber", "serial_number", "UbbSerial", "ubb_serial") -def _oem_dict(event: dict[str, Any]) -> dict[str, Any]: - oem = event.get("Oem") - return oem if isinstance(oem, dict) else {} +def get_amd_oem_dict(event: dict[str, Any]) -> dict[str, Any]: + """Return the AMD OEM payload dict for a Redfish log member. + + BMC layouts vary: fields may live on Oem directly or under Oem.AMD. + When AMD is absent, returns Oem; when present, returns AMD if it is a dict. + """ + if not isinstance(oem := event.get("Oem"), dict): + return {} + if (amd := oem.get("AMD")) is None: + return oem + return amd if isinstance(amd, dict) else {} -def _oem_list_field(oem: dict[str, Any], key: str) -> list[Any]: - """Return a list field from ``Oem`` or nested ``Oem.AMD`` (BMC layout varies).""" - raw = oem.get(key) - if isinstance(raw, list): - return raw - amd = oem.get("AMD") - if isinstance(amd, dict): - nested = amd.get(key) - if isinstance(nested, list): - return nested - return [] +def _oem_list_field(oem_dict: dict[str, Any], key: str) -> list[Any]: + """Return a list field from the resolved AMD OEM dict.""" + raw = oem_dict.get(key) + return raw if isinstance(raw, list) else [] def event_afids_from_oem(event: dict[str, Any]) -> list[int]: - """AFIDs from ``Oem.AMDFieldIdentifiers`` or ``Oem.AMD.AMDFieldIdentifiers``.""" - raw = _oem_list_field(_oem_dict(event), "AMDFieldIdentifiers") + """AFIDs from Oem.AMDFieldIdentifiers or Oem.AMD.AMDFieldIdentifiers.""" + raw = _oem_list_field(get_amd_oem_dict(event), "AMDFieldIdentifiers") out: list[int] = [] for item in raw: if not isinstance(item, dict): @@ -70,13 +72,13 @@ def event_afids_from_oem(event: dict[str, Any]) -> list[int]: def _err_data_arr_entries(event: dict[str, Any]) -> list[dict[str, Any]]: - 
"""``ErrDataArr`` rows from ``Oem.ErrDataArr`` or ``Oem.AMD.ErrDataArr``.""" - arr = _oem_list_field(_oem_dict(event), "ErrDataArr") + """ErrDataArr rows from Oem.ErrDataArr or Oem.AMD.ErrDataArr.""" + arr = _oem_list_field(get_amd_oem_dict(event), "ErrDataArr") return [e for e in arr if isinstance(e, dict)] def event_has_aca_decode(event: dict[str, Any]) -> bool: - """True when the log entry includes ACA-style ``DecodedData`` under ``ErrDataArr``.""" + """True when the log entry includes ACA-style DecodedData under ErrDataArr.""" for entry in _err_data_arr_entries(event): decoded = entry.get("DecodedData") if isinstance(decoded, dict) and decoded: @@ -95,21 +97,20 @@ def _nonempty_serial_in_mapping(obj: Any) -> bool: def event_aca_includes_serial(event: dict[str, Any]) -> bool: - """Serial (or UBB serial) present on any ``ErrDataArr`` row ``MetaData``.""" - for entry in _err_data_arr_entries(event): - if _nonempty_serial_in_mapping(entry.get("MetaData")): - return True - return False + """Serial (or UBB serial) present on any ErrDataArr row MetaData.""" + return any( + _nonempty_serial_in_mapping(entry.get("MetaData")) for entry in _err_data_arr_entries(event) + ) def is_cper_method_afid(afid: int) -> bool: - """True for CPER-method AFIDs (<= ``CPER_METHOD_AFID_MAX``), including on RF log entries.""" + """True for CPER-method AFIDs (<= CPER_METHOD_AFID_MAX), including on RF log entries.""" return afid <= CPER_METHOD_AFID_MAX def is_redfish_method_afid(afid: int) -> bool: - """True for Redfish-method AFIDs in the 10k range.""" - return afid >= REDFISH_METHOD_AFID_MIN + """True for MI3XX Redfish-method AFIDs in the 10k range (10000–10999).""" + return REDFISH_METHOD_AFID_MIN <= afid <= REDFISH_METHOD_AFID_MAX def should_skip_cper_fetch_or_decode(event: dict[str, Any]) -> bool: @@ -117,9 +118,9 @@ def should_skip_cper_fetch_or_decode(event: dict[str, Any]) -> bool: Skip when: - * Every OEM-listed AFID is CPER-method (<= ``CPER_METHOD_AFID_MAX``; may match - in-band 
CPER AFIDs), ACA ``DecodedData`` is present, and serial is on the entry; or - * ACA ``DecodedData`` is present but no serial — the CPER blob does not add + * Every OEM-listed AFID is CPER-method (<= CPER_METHOD_AFID_MAX; may match + in-band CPER AFIDs), ACA DecodedData is present, and serial is on the entry; or + * ACA DecodedData is present but no serial — the CPER blob does not add actionable identity beyond what is already missing from the log. """ if not event_has_aca_decode(event): diff --git a/test/unit/plugin/test_mi3xx_cper_utils.py b/test/unit/plugin/test_mi3xx_cper_utils.py index e4e2965e..105ca203 100644 --- a/test/unit/plugin/test_mi3xx_cper_utils.py +++ b/test/unit/plugin/test_mi3xx_cper_utils.py @@ -33,15 +33,31 @@ from nodescraper.plugins.serviceability.mi3xx.mi3xx_cper_utils import ( CPER_METHOD_AFID_MAX, + REDFISH_METHOD_AFID_MAX, + REDFISH_METHOD_AFID_MIN, event_aca_includes_serial, event_afids_from_oem, event_has_aca_decode, + get_amd_oem_dict, is_cper_method_afid, is_redfish_method_afid, should_skip_cper_fetch_or_decode, ) +def test_get_amd_oem_dict_layouts(): + flat = {"Oem": {"AMDFieldIdentifiers": [{"AFID": 1}]}} + assert get_amd_oem_dict(flat) == {"AMDFieldIdentifiers": [{"AFID": 1}]} + + nested = {"Oem": {"AMD": {"ErrDataArr": []}}} + assert get_amd_oem_dict(nested) == {"ErrDataArr": []} + + assert get_amd_oem_dict({}) == {} + assert get_amd_oem_dict({"Oem": None}) == {} + assert get_amd_oem_dict({"Oem": "bad"}) == {} + assert get_amd_oem_dict({"Oem": {"AMD": "bad"}}) == {} + + def test_skip_when_afids_below_threshold_and_aca_has_serial(): event = { "Oem": { @@ -78,6 +94,9 @@ def test_afid_method_ranges(): assert is_cper_method_afid(CPER_METHOD_AFID_MAX) assert not is_cper_method_afid(CPER_METHOD_AFID_MAX + 1) assert is_redfish_method_afid(DUMMY_RF_CPER_AFID) + assert is_redfish_method_afid(REDFISH_METHOD_AFID_MAX) + assert not is_redfish_method_afid(REDFISH_METHOD_AFID_MIN - 1) + assert not is_redfish_method_afid(REDFISH_METHOD_AFID_MAX + 
1) assert not is_redfish_method_afid(DUMMY_AFID_BELOW_RF)