From e92c9bb77b074bebef7ed91fb222361bdc633d06 Mon Sep 17 00:00:00 2001
From: Bjoern Ludwig <bjoern.ludwig@ptb.de>
Date: Thu, 29 Dec 2022 21:46:43 -0500
Subject: [PATCH] feat(dataset): turn dataset provider into class and fix
 normalization

---
 src/zema_emc_annotated/dataset.py | 274 +++++++++++++++++++-----------
 1 file changed, 178 insertions(+), 96 deletions(-)

diff --git a/src/zema_emc_annotated/dataset.py b/src/zema_emc_annotated/dataset.py
index ce1b935..86f83d6 100644
--- a/src/zema_emc_annotated/dataset.py
+++ b/src/zema_emc_annotated/dataset.py
@@ -2,8 +2,8 @@
 __all__ = [
     "ExtractionDataType",
-    "provide_zema_samples",
     "LOCAL_ZEMA_DATASET_PATH",
+    "ZeMASamples",
     "ZEMA_DATASET_HASH",
     "ZEMA_DATASET_URL",
     "ZEMA_QUANTITIES",
 ]
@@ -12,6 +12,7 @@ __all__ = [
 import operator
 import os
 import pickle
+from dataclasses import dataclass
 from enum import Enum
 from functools import reduce
 from os.path import dirname, exists
@@ -22,9 +22,9 @@ from os.path import dirname, exists
 import h5py
 import numpy as np
 from h5py import Dataset
 from numpy._typing import NDArray
 from pooch import retrieve
-from zema_emc_annotated.data_types import UncertainArray
+from zema_emc_annotated.data_types import RealMatrix, RealVector, UncertainArray
 
 LOCAL_ZEMA_DATASET_PATH = Path(dirname(__file__), "datasets")
 ZEMA_DATASET_HASH = (
@@ -47,19 +47,18 @@ class ExtractionDataType(Enum):
 
     Attributes
     ----------
-    UNCERTAINTIES : str
-        with value ``qudt:standardUncertainty``
     VALUES : str
         with value ``qudt:value``
+    UNCERTAINTIES : str
+        with value ``qudt:standardUncertainty``
     """
 
-    UNCERTAINTIES = "qudt:standardUncertainty"
     VALUES = "qudt:value"
+    UNCERTAINTIES = "qudt:standardUncertainty"
 
 
-def provide_zema_samples(
-    n_samples: int = 1, size_scaler: int = 1, normalize: bool = False
-) -> UncertainArray:
+@dataclass
+class ZeMASamples:
     """Extracts requested number of samples of values with associated uncertainties
 
     The underlying dataset is the annotated "Sensor data set of one electromechanical
 
     Parameters
     ----------
     n_samples : int, optional
-        number of samples each containing size_scaler readings from each of the eleven
-        sensors with associated uncertainties, defaults to 1
+        number of samples each containing size_scaler readings from each of the
+        eleven sensors with associated uncertainties, defaults to 1
     size_scaler : int, optional
         number of sensor readings from each of the individual sensors per sample,
         defaults to 1
     normalize : bool, optional
         if ``True``, then data is centered around zero and scaled to unit std,
         defaults to False
 
-    Returns
-    -------
-    UncertainArray
-        The collection of samples of values with associated uncertainties, will be of
-        shape (n_samples, 11 x size_scaler)
+    Attributes
+    ----------
+    uncertain_values : UncertainArray
+        The collection of samples of values with associated uncertainties,
+        will be of shape (n_samples, 11 x size_scaler)
     """
 
-    def _normalize_if_requested(data: Dataset) -> NDArray[np.double]:
-        _potentially_normalized_data = data[np.s_[1 : size_scaler + 1, :n_samples]]
-        if normalize:
-            _potentially_normalized_data -= np.mean(data[:, :n_samples], axis=0)
-            _potentially_normalized_data /= np.std(data[:, :n_samples], axis=0)
-        return _potentially_normalized_data.transpose()
-
-    def _append_to_extraction(
-        append_to: NDArray[np.double], appendix: NDArray[np.double]
-    ) -> NDArray[np.double]:
-        return np.append(append_to, appendix, axis=1)
-
-    if cached_data := _check_and_load_cache(n_samples):
-        return cached_data
-    dataset_full_path = retrieve(
-        url=ZEMA_DATASET_URL,
-        known_hash=ZEMA_DATASET_HASH,
-        path=LOCAL_ZEMA_DATASET_PATH,
-        progressbar=True,
-    )
-    assert exists(dataset_full_path)
-    uncertainties = np.empty((n_samples, 0))
-    values = np.empty((n_samples, 0))
-    relevant_datasets = (
-        ["ZeMA_DAQ", quantity, datatype.value]
-        for quantity in ZEMA_QUANTITIES
-        for datatype in ExtractionDataType
-    )
-    with h5py.File(dataset_full_path, "r") as h5f:
-        for dataset_descriptor in relevant_datasets:
-            dataset = cast(Dataset, reduce(operator.getitem, dataset_descriptor, h5f))
-            if ExtractionDataType.UNCERTAINTIES.value in dataset.name:
-                extracted_data = uncertainties
-                print(f" Extract uncertainties from {dataset.name}")
-            elif ExtractionDataType.VALUES.value in dataset.name:
-                extracted_data = values
-                print(f" Extract values from {dataset.name}")
-            else:
-                raise RuntimeError(
-                    "Somehow there is unexpected data in the dataset to be processed. "
-                    f"Did not expect to find {dataset.name}"
-                )
-            if dataset.shape[0] == 3:
-                for sensor in dataset:
-                    extracted_data = _append_to_extraction(
-                        extracted_data, _normalize_if_requested(sensor)
-                    )
-            else:
-                extracted_data = _append_to_extraction(
-                    extracted_data, _normalize_if_requested(dataset)
-                )
-            if ExtractionDataType.UNCERTAINTIES.value in dataset.name:
-                uncertainties = extracted_data
-                print(" Uncertainties extracted")
-            elif ExtractionDataType.VALUES.value in dataset.name:
-                values = extracted_data
-                print(" Values extracted")
-    uncertain_values = UncertainArray(np.array(values), np.array(uncertainties))
-    _store_cache(uncertain_values)
-    return uncertain_values
-
-
-def _check_and_load_cache(n_samples: int) -> UncertainArray | None:
-    """Checks if corresponding file for n_samples exists and loads it with pickle"""
-    if os.path.exists(cache_path := _cache_path(n_samples)):
-        with open(cache_path, "rb") as cache_file:
-            return cast(UncertainArray, pickle.load(cache_file))
-    return None
-
-
-def _cache_path(n_samples: int) -> Path:
-    """Local file system path for a cache file containing n ZeMA samples
-
-    The result does not guarantee, that the file at the specified location exists,
-    but can be used to check for existence or creation.
-    """
-    return LOCAL_ZEMA_DATASET_PATH.joinpath(f"{str(n_samples)}_samples.pickle")
-
-
-def _store_cache(uncertain_values: UncertainArray) -> None:
-    """Dumps provided uncertain tenor to corresponding pickle file"""
-    with open(_cache_path(len(uncertain_values.values)), "wb") as cache_file:
-        pickle.dump(uncertain_values, cache_file)
+    uncertain_values: UncertainArray
+
+    def __init__(
+        self, n_samples: int = 1, size_scaler: int = 1, normalize: bool = False
+    ):
+        self.normalize = normalize
+        self.n_samples = n_samples
+        self.size_scaler = size_scaler
+        if cached_data := ZeMASamples._check_and_load_cache(n_samples, size_scaler):
+            self.uncertain_values = cached_data
+            return
+        dataset_full_path = retrieve(
+            url=ZEMA_DATASET_URL,
+            known_hash=ZEMA_DATASET_HASH,
+            path=LOCAL_ZEMA_DATASET_PATH,
+            progressbar=True,
+        )
+        assert exists(dataset_full_path)
+        self._uncertainties = np.empty((n_samples, 0))
+        self._values = np.empty((n_samples, 0))
+        relevant_datasets = (
+            ["ZeMA_DAQ", quantity, datatype.value]
+            for quantity in ZEMA_QUANTITIES
+            for datatype in ExtractionDataType
+        )
+        self._treating_uncertainties: bool = False
+        self._treating_values: bool = False
+        self._normalization_divisors: dict[str, NDArray[np.double] | float] = {}
+        with h5py.File(dataset_full_path, "r") as h5f:
+            for dataset_descriptor in relevant_datasets:
+                self._current_dataset: Dataset = cast(
+                    Dataset, reduce(operator.getitem, dataset_descriptor, h5f)
+                )
+                if ExtractionDataType.VALUES.value in self._current_dataset.name:
+                    self._treating_values = True
+                    print(f" Extract values from {self._current_dataset.name}")
+                elif (
+                    ExtractionDataType.UNCERTAINTIES.value in self._current_dataset.name
+                ):
+                    self._treating_values = False
+                    print(
+                        f" Extract uncertainties from {self._current_dataset.name}"
+                    )
+                else:
+                    raise RuntimeError(
+                        "Somehow there is unexpected data in the dataset to be "
+                        f"processed. Did not expect to find "
+                        f"{self._current_dataset.name}"
+                    )
+                if self._current_dataset.shape[0] == 3:
+                    for idx, sensor in enumerate(self._current_dataset):
+                        self._normalize_if_requested_and_append(
+                            sensor, self._extract_sub_dataset_name(idx)
+                        )
+                else:
+                    self._normalize_if_requested_and_append(
+                        self._current_dataset,
+                        self._strip_data_type_from_dataset_descriptor(),
+                    )
+                if self._treating_values:
+                    print(" Values extracted")
+                else:
+                    print(" Uncertainties extracted")
+        self._store_cache(
+            uncertain_values := UncertainArray(self._values, self._uncertainties)
+        )
+        self.uncertain_values = uncertain_values
+
+    def _normalize_if_requested_and_append(
+        self, data: Dataset, dataset_descriptor: str
+    ) -> None:
+        """Normalize the provided data and append according to current state"""
+        _potentially_normalized_data = data[
+            np.s_[1 : self.size_scaler + 1, : self.n_samples]
+        ]
+        if self._treating_values:
+            if self.normalize:
+                _potentially_normalized_data -= np.mean(
+                    data[:, : self.n_samples], axis=0
+                )
+                data_std = np.std(data[:, : self.n_samples], axis=0)
+                data_std[data_std == 0] = 1.0
+                self._normalization_divisors[dataset_descriptor] = data_std
+                _potentially_normalized_data /= self._normalization_divisors[
+                    dataset_descriptor
+                ]
+            self._values = np.append(
+                self._values, _potentially_normalized_data.transpose(), axis=1
+            )
+        else:
+            if self.normalize:
+                _potentially_normalized_data /= self._normalization_divisors[
+                    dataset_descriptor
+                ]
+            self._uncertainties = np.append(
+                self._uncertainties, _potentially_normalized_data.transpose(), axis=1
+            )
+
+    def _extract_sub_dataset_name(self, idx: int) -> str:
+        return str(
+            self._strip_data_type_from_dataset_descriptor()
+            + self._current_dataset.attrs["si:label"]
+            .split(",")[idx]
+            .strip("[")
+            .strip("]")
+            .replace(" ", "")
+            .replace('"', "")
+            .replace("uncertainty", "")
+        ).replace("\n", "")
+
+    def _strip_data_type_from_dataset_descriptor(self) -> str:
+        return str(
+            self._current_dataset.name.replace(
+                ExtractionDataType.UNCERTAINTIES.value, ""
+            ).replace(ExtractionDataType.VALUES.value, "")
+        )
+
+    @property
+    def values(self) -> RealVector:
+        """The values of the stored :class:`UncertainArray` object"""
+        return self.uncertain_values.values
+
+    @property
+    def uncertainties(self) -> RealMatrix | RealVector:
+        """The uncertainties of the stored :class:`UncertainArray` object"""
+        return self.uncertain_values.uncertainties
+
+    @staticmethod
+    def _check_and_load_cache(
+        n_samples: int, size_scaler: int
+    ) -> UncertainArray | None:
+        """Checks if corresponding file for n_samples exists and loads it with pickle"""
+        if os.path.exists(
+            cache_path := ZeMASamples._cache_path(n_samples, size_scaler)
+        ):
+            with open(cache_path, "rb") as cache_file:
+                return cast(UncertainArray, pickle.load(cache_file))
+        return None
+
+    @staticmethod
+    def _cache_path(n_samples: int, size_scaler: int) -> Path:
+        """Local file system path for a cache file containing n ZeMA samples
+
+        The result does not guarantee that the file at the specified location
+        exists, but it can be used to check for existence or creation.
+        """
+        return LOCAL_ZEMA_DATASET_PATH.joinpath(
+            f"{str(n_samples)}_samples_with_{str(size_scaler)}_values_per_sensor.pickle"
+        )
+
+    @staticmethod
+    def _store_cache(uncertain_values: UncertainArray) -> None:
+        """Dumps the provided uncertain tensor to the corresponding pickle file"""
+        with open(
+            ZeMASamples._cache_path(
+                uncertain_values.values.shape[0],
+                int(uncertain_values.values.shape[1] / 11),
+            ),
+            "wb",
+        ) as cache_file:
+            pickle.dump(uncertain_values, cache_file)
--
GitLab
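
Usage sketch (not part of the commit itself): the snippet below illustrates how
the new class-based API replaces the removed provide_zema_samples() call. It
assumes the patch is applied and the package is importable; the parameter
values are purely illustrative, and the expected shapes follow from the class
docstring, which promises arrays of shape (n_samples, 11 x size_scaler).

    from zema_emc_annotated.dataset import ZeMASamples

    # Request 10 samples with 5 readings per sensor, centered around zero and
    # scaled to unit standard deviation (normalize=True).
    samples = ZeMASamples(n_samples=10, size_scaler=5, normalize=True)

    # The values and uncertainties properties expose the stored UncertainArray.
    print(samples.values.shape)         # (10, 55)
    print(samples.uncertainties.shape)  # (10, 55)

Repeated instantiations with the same n_samples and size_scaler should be
served from the pickle cache in LOCAL_ZEMA_DATASET_PATH instead of re-reading
the HDF5 file.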