Merge pull request #177 from AFM-SPM/ns-rse/work-with-topostats-classes

ns-rse · web-flow · commit 062b98453fb6 · 2026-01-26T12:07:35.000Z
fix: decode numpy arrays; feature: topostats versions
diff --git a/.pylintrc b/.pylintrc
@@ -62,7 +62,7 @@ py-version=3.9
 
 # When enabled, pylint would attempt to guess common misconfiguration and emit
 # user-friendly hints instead of false-positive error messages.
-suggestion-mode=yes
+# suggestion-mode=yes
 
 # Allow loading of arbitrary C extensions. Extensions are imported into the
 # active Python interpreter and may run arbitrary code.
diff --git a/AFMReader/io.py b/AFMReader/io.py
@@ -5,6 +5,7 @@
 from typing import BinaryIO
 
 import h5py
+import numpy as np
 from loguru import logger
 from ruamel.yaml import YAML, YAMLError
 
@@ -255,7 +256,14 @@ def unpack_hdf5(open_hdf5_file: h5py.File, group_path: str = "/") -> dict:
         # Decode byte strings to utf-8. The data type "O" is a byte string.
         elif isinstance(item, h5py.Dataset) and item.dtype == "O":
             # Byte string
-            data[key] = item[()].decode("utf-8")
+            # Read the dataset once; every ``item[()]`` reads it from the file again
+            value = item[()]
+            if isinstance(value, np.ndarray):
+                # Numpy arrays of byte strings cannot be decoded directly, decode each element
+                data[key] = [_item.decode("utf-8") for _item in value]  # type: ignore
+            else:
+                # Single byte string
+                data[key] = value.decode("utf-8")
         else:
             # Another type of dataset
             data[key] = item[()]
diff --git a/AFMReader/topostats.py b/AFMReader/topostats.py
@@ -5,6 +5,7 @@
 
 import h5py
 
+from packaging.version import parse as parse_version
 from AFMReader.io import unpack_hdf5
 from AFMReader.logging import logger
 
@@ -41,10 +42,15 @@ def load_topostats(file_path: Path | str) -> dict[str, Any]:
     try:
         with h5py.File(file_path, "r") as f:
             data = unpack_hdf5(open_hdf5_file=f, group_path="/")
-            if str(data["topostats_file_version"]) >= "0.2":
+            # Version key is "topostats_file_version" (file versions <=0.3) or "topostats_version" (newer files)
+            version = (
+                data["topostats_file_version"]
+                if "topostats_file_version" in data
+                else data["topostats_version"]
+            )
+            if parse_version(str(version)) > parse_version("0.2"):
                 data["img_path"] = Path(data["img_path"])
-            file_version = data["topostats_file_version"]
-            logger.info(f"[{filename}] TopoStats file version : {file_version}")
+            logger.info(f"[{filename}] TopoStats file version : {version}")
 
     except OSError as e:
         if "Unable to open file" in str(e):
diff --git a/tests/test_io.py b/tests/test_io.py
@@ -201,6 +201,46 @@ def test_unpack_hdf5_nested_dict_group_path(tmp_path: Path) -> None:
     np.testing.assert_equal(result, expected)
 
 
+def test_unpack_hdf5_list_of_bytes(tmp_path: Path) -> None:
+    """Test that a dataset holding a list of strings is unpacked as a list of decoded strings."""
+    to_save = {
+        "config": {
+            "grainstats": {
+                "class_names": ["DNA", "Protein"],
+                "edge_detection_method": "binary_erosion",
+                "extract_height_profile": True,
+                "run": True,
+            }
+        }
+    }
+    group_path = "/config/grainstats/"
+    expected = {
+        "class_names": ["DNA", "Protein"],
+        "edge_detection_method": "binary_erosion",
+        "extract_height_profile": True,
+        "run": True,
+    }
+    # Manually save the dictionary to HDF5 format
+    with h5py.File(tmp_path / "hdf5_file_list_of_strings", "w") as f:
+        config = f.create_group("config")
+        grainstats = config.create_group("grainstats")
+        # Variable-length strings give the dataset the object ("O") dtype that needs per-item decoding
+        class_names = to_save["config"]["grainstats"]["class_names"]
+        grainstats.create_dataset("class_names", data=class_names, dtype=h5py.string_dtype())
+        grainstats.create_dataset(
+            "edge_detection_method", data=to_save["config"]["grainstats"]["edge_detection_method"]
+        )
+        grainstats.create_dataset(
+            "extract_height_profile", data=to_save["config"]["grainstats"]["extract_height_profile"]
+        )
+        grainstats.create_dataset("run", data=to_save["config"]["grainstats"]["run"])
+
+    # Load it back in and check if the list is the same
+    with h5py.File(tmp_path / "hdf5_file_list_of_strings", "r") as f:
+        result = unpack_hdf5(open_hdf5_file=f, group_path=group_path)
+    np.testing.assert_equal(result, expected)
+
+
 def test_read_yaml() -> None:
     """Test reading of YAML file."""
     sample_config = read_yaml(RESOURCES / "test.yaml")
diff --git a/tests/test_topostats.py b/tests/test_topostats.py
@@ -99,7 +99,7 @@ def test_load_topostats(
     assert topostats_data["pixel_to_nm_scaling"] == pytest.approx(pixel_to_nm_scaling)
     assert topostats_data["image"].shape == image_shape
     assert topostats_data["image"].sum() == pytest.approx(image_sum)
-    if version >= "0.2":
+    if version > "0.2":
         assert isinstance(topostats_data["img_path"], Path)