From b7e1f7caedbf059b5fe72c5aea3347731b20fd69 Mon Sep 17 00:00:00 2001
From: Joerg Martin <joerg.martin@ptb.de>
Date: Fri, 10 Dec 2021 10:21:35 +0100
Subject: [PATCH] Errors in coverage metrics now computed from sampling theta

Previously, this was done by looking at the average of the predictions
over the posterior predictive. This was removed, as this didn't make
much sense.
---
 EIVPackage/EIVArchitectures/Networks.py    |  34 +++---
 EIVPackage/EIVGeneral/coverage_metrices.py | 124 +++++++++++++--------
 Experiments/evaluate_tabular.py            |  37 +++---
 3 files changed, 121 insertions(+), 74 deletions(-)

diff --git a/EIVPackage/EIVArchitectures/Networks.py b/EIVPackage/EIVArchitectures/Networks.py
index ccae848..6acace5 100644
--- a/EIVPackage/EIVArchitectures/Networks.py
+++ b/EIVPackage/EIVArchitectures/Networks.py
@@ -158,8 +158,6 @@ class FNNEIV(nn.Module):
                 regularization += self.main[0].neg_x_evidence(x)
         return regularization 
 
-    # TODO: repetition
-    # + Test via breakpoints
     def predict(self, x, number_of_draws=[100,5], number_of_parameter_chunks = None,
             remove_graph=True
             , take_average_of_prediction=True):
@@ -217,17 +215,16 @@ class FNNEIV(nn.Module):
             sigma = torch.mean(sigma, dim=1)
         return pred, sigma
 
-    # TODO: repetition
-    # + Implement
-    # + Test via breakpoints
-    def predictive_logdensity(self, x, y, number_of_draws=[100, 5], number_of_parameter_chunks = None, remove_graph=True,
+    def predictive_logdensity(self, x_or_predictions, y, number_of_draws=[100, 5], number_of_parameter_chunks = None, remove_graph=True,
             average_batch_dimension=True, scale_labels=None,
             decouple_dimensions=False):
         """
         Computes the logarithm of the predictive density evaluated at `y`. If
         `average_batch_dimension` is `True` these values will be averaged over
         the batch dimension.
-        :param x: A torch.tensor, the input
+        :param x_or_predictions: Either a torch.tensor or a tuple. If
+        `x_or_predictions' is a tensor, it will be used as input. If it is a
+        tuple, it will be interpreted as the output of `predict` with `take_average_of_prediction` set to False.
         :param y: A torch.tensor, labels on which to evaluate the density
         :param number_of_draws: An integer or a list. If an integer
         `number_of_draws`, will be converted internally to
@@ -250,10 +247,13 @@ class FNNEIV(nn.Module):
         average, to make results comparable with the literature. Defaults to
         False.
         """
-        out, sigmas = self.predict(x, number_of_draws=number_of_draws,
+        if type(x_or_predictions) is torch.tensor:
+            out, sigmas = self.predict(x_or_predictions, number_of_draws=number_of_draws,
                 number_of_parameter_chunks=number_of_parameter_chunks,
                 remove_graph=remove_graph,
                 take_average_of_prediction=False)
+        else:
+            out, sigmas = x_or_predictions
         # Add "repetition" dimension to y and sigmas
         y = y[:,None,...]
         sigmas = sigmas[:,None,...]
@@ -427,14 +427,16 @@ class FNNBer(nn.Module):
             sigma = torch.mean(sigma, dim=1)
         return pred, sigma
 
-    def predictive_logdensity(self, x, y, number_of_draws=100, remove_graph=True,
+    def predictive_logdensity(self, x_or_predictions, y, number_of_draws=100, remove_graph=True,
             average_batch_dimension=True, scale_labels=None,
             decouple_dimensions=False):
         """
         Computes the logarithm of the predictive density evaluated at `y`. If
         `average_batch_dimension` is `True` these values will be averaged over
         the batch dimension.
-        :param x: A torch.tensor, the input
+        :param x_or_predictions: Either a torch.tensor or a tuple. If
+        `x_or_predictions' is a tensor, it will be used as input. If it is a
+        tuple, it will be interpreted as the output of `predict` with `take_average_of_prediction` set to False.
         :param y: A torch.tensor, labels on which to evaluate the density
         :param number_of_draws: Number of draws to obtain from x
         :param remove_graph: If True (default) the output will 
@@ -448,14 +450,20 @@ class FNNBer(nn.Module):
         average, to make results comparable with the literature. Defaults to
         False.
         """
-        out, sigmas = self.predict(x, number_of_draws=number_of_draws,
-                take_average_of_prediction=False, remove_graph=remove_graph)
-        # Add "repetition" dimension to y and out
+        if type(x_or_predictions) is torch.tensor:
+            out, sigmas = self.predict(x_or_predictions,
+                    number_of_draws=number_of_draws,
+                    take_average_of_prediction=False, remove_graph=remove_graph)
+        else:
+            out, sigmas = x_or_predictions
+        # Add "repetition" dimension to y and sigmas
         y = y[:,None,...]
         sigmas = sigmas[:,None,...]
         if len(y.shape) <= 2:
             # add an output axis if necessary
             y = y[...,None]
+        if len(sigmas.shape) <= 2:
+            # add an output axis if necessary
             sigmas = sigmas[...,None]
         # squeeze last dimensions into one
         y = y.view((*y.shape[:2], -1))
diff --git a/EIVPackage/EIVGeneral/coverage_metrices.py b/EIVPackage/EIVGeneral/coverage_metrices.py
index a984e77..3bafa96 100644
--- a/EIVPackage/EIVGeneral/coverage_metrices.py
+++ b/EIVPackage/EIVGeneral/coverage_metrices.py
@@ -28,55 +28,73 @@ def multivariate_interval_length(dim, q=0.95):
     :param q: Float, should be between 0 and 1. Defaults to 0.95.
     """
     # use independence of components to reduce to a univariate quantile
-    univariate_quantile = 0.5 * (1+0.95**(1/dim))
+    univariate_quantile = 0.5 * (1+q**(1/dim))
     return scipy.stats.norm.ppf(univariate_quantile)
 
 
-def epistemic_coverage(prediction_triple,  y, q=0.95, normalize_errors=False):
+def epistemic_coverage(not_averaged_predictions,  y, q=0.95, normalize_errors=False):
     """
     Returns the average coverage of `y` by the interval 
     "prefactor * (predictions + q-Interval)",
-    where "q-Interval" is the interval of measure `q` under the standard normal, 
-    "predictions" is the first component of `prediction_triple` and prefactor is either
-    the epistemic uncertainty, given by the second component of `prediction_triple`, if
-    `normalize_errors` is False, or 1 if it is true. The coverage is returned
-    as given by `y` and as a theoretical_coverage computed from the epistemic
-    uncertainty (second component of `prediction_triple`) and the aleatoric uncertainty
-    (third component of `prediction_triple`)
-    :param prediction_triple: A triple of tensors containing (in this order): the
-    predictions of the neural net (the average under the posterior), the
-    epistemic uncertainty (the standard deviation under the posterior) and
-    the aleatoric uncertainty. All tensors are expected to have two dimensions:
-    a batch and a feature dimension.
-    :param y: A `torch.tensor` of the same shape then the first two components
-    of `prediction_triple`. If the feature dimension is missing, it is added.
+    where 
+    - "q-Interval" is the interval of measure `q` under the standard normal, 
+    where
+    - "predictions" are the entries of the first component of the tuple
+      `not_averaged_predictions`, 
+    - "prefactor either equals the epistemic uncertainty, computed from the
+      first component of `not_averaged_predictions`,if
+    `normalize_errors` is set to False, or 1 if it is true. 
+    The coverage is returned as given by the `y` and as a theoretical_coverage
+    computed from the epistemic uncertainty  and the aleatoric uncertainty
+    (second component of `not_averaged_predictions`).
+    :param not_averaged_predictions: A tuple of tensors as in the output of
+    `FNNEIV.predict` with `take_average_of_prediction` set to `False`, i.e.:
+    the predictions of the neural net not averaged over the first dimension 
+    (the repetition dimension in `FNNEIV.predict`) and
+    the aleatoric uncertainty with a batch dimension and a feature dimension. 
+    :param y: A `torch.tensor` of the same shape then the second components
+    of `not_averaged_predictions`. If the feature dimension is missing, it is added.
     :param q: A float between 0 and 1. Defaults to 0.95.
     :param normalize_errors: If True, the deviations between predictions and
     `y` are normalized by the total uncertainty, computed from the aleatoric
     and epistemic uncertainty and the coverage w.r.t. q-interval is computed.
     :returns: numerical_coverage, theoretical_coverage
     """
-    mean, epis_unc, aleat_unc = prediction_triple
-    assert epis_unc.shape == aleat_unc.shape
-    assert mean.shape == epis_unc.shape
-    # Add feature dimension to y if missing
-    if len(y.shape) <= 1:
-        y = y.view((-1,1))
-    assert y.shape == mean.shape
+    out, sigmas = not_averaged_predictions
+    y = y[:,None,...]
+    sigmas = sigmas[:,None,...]
+    # add an output axis if necessary
+    if len(y.shape) <= 2:
+        y = y[...,None]
+    if len(sigmas.shape) <= 2:
+        sigmas = sigmas[...,None]
+    # squeeze last dimensions into one
+    y = y.view((*y.shape[:2], -1))
+    sigmas = sigmas.view((*sigmas.shape[:2], -1))
+    out = out.view((*out.shape[:2], -1))
+    # check if dimensions consistent
+    assert y.shape == sigmas.shape
+    assert y.shape[0] == out.shape[0]
+    assert y.shape[2] == out.shape[2]
+    # compute epistemic uncertainty
+    epis_unc = torch.std(out, dim=1, keepdim=True)
+    # compute total uncertainty
+    assert epis_unc.shape == sigmas.shape
+    total_unc = torch.sqrt(epis_unc**2 + sigmas **2)
     # fix interval based on epis_unc
     if not normalize_errors:
         interval_length = multivariate_interval_length(dim=y.shape[1], q=q) \
                 * epis_unc
     else:
         interval_length = multivariate_interval_length(dim=y.shape[1], q=q) 
-    total_unc = torch.sqrt(epis_unc**2 + aleat_unc **2)
     # numerical computation
-    errors = mean - y
+    errors = out - y
     if normalize_errors:
-        assert errors.shape == total_unc.shape
+        assert errors.shape[0] == total_unc.shape[0]
+        assert errors.shape[2] == total_unc.shape[2]
         errors /= total_unc
     check_if_in_interval = logical_and_along_dimension(
-            torch.abs(errors) <= interval_length, dim=1)
+            torch.abs(errors) <= interval_length, dim=2)
     numerical_coverage = torch.mean(
             check_if_in_interval.to(torch.float32)
             ).cpu().detach().item()
@@ -85,34 +103,52 @@ def epistemic_coverage(prediction_triple,  y, q=0.95, normalize_errors=False):
         cdf_args = (interval_length/total_unc).detach().cpu().numpy()
         cdf_values = scipy.stats.norm.cdf(cdf_args)
         prob_values = 2*cdf_values -1
-        assert len(cdf_values.shape) == 2
-        theoretical_coverage = np.mean(np.prod(prob_values, axis=1)).item()
+        assert len(cdf_values.shape) == 3
+        assert cdf_values.shape[1] == 1
+        # take product over feature dimension 
+        # and average over batch dimension
+        theoretical_coverage = np.mean(np.prod(prob_values, axis=2)).item()
     else:
         theoretical_coverage = q
     return numerical_coverage, theoretical_coverage
 
-def normalized_std(prediction_triple, y):
+def normalized_std(not_averaged_predictions, y):
     """
-    Returns the standard deviation of normalized residuals. In theory this
+    Returns the standard deviation of normalized residuals, averaged over the
+    feature dimension. In theory this
     number should be equal to 1.0.
-    :param prediction_triple: A triple of tensors containing (in this order): the
+    :param not_averaged_predictions: A triple of tensors containing (in this order): the
     predictions of the neural net (the average under the posterior), the
     epistemic uncertainty (the standard deviation under the posterior) and
     the aleatoric uncertainty.
     :param y: A `torch.tensor` of the same shape then the first two components
-    of `prediction_triple`. If the feature dimension is missing, it is added.
+    of `not_averaged_predictions`. If the feature dimension is missing, it is added.
     :returns: numerical_coverage, theoretical_coverage
     """
-    mean, epis_unc, aleat_unc = prediction_triple
-    assert epis_unc.shape == aleat_unc.shape
-    assert mean.shape == epis_unc.shape
-    # Add feature dimension to y if missing
-    if len(y.shape) <= 1:
-        y = y.view((-1,1))
-    assert y.shape == mean.shape
-    total_unc = torch.sqrt(epis_unc**2 + aleat_unc **2)
+    out = not_averaged_predictions[0]
+    sigmas = not_averaged_predictions[1]
+    # add repetition dimension
+    y = y[:,None,...]
+    sigmas = sigmas[:,None,...]
+    # add an output axis if necessary
+    if len(y.shape) <= 2:
+        y = y[...,None]
+    if len(sigmas.shape) <= 2:
+        sigmas = sigmas[...,None]
+    # squeeze last dimensions into one
+    y = y.view((*y.shape[:2], -1))
+    sigmas = sigmas.view((*sigmas.shape[:2], -1))
+    out = out.view((*out.shape[:2], -1))
+    # check if dimensions consistent
+    assert y.shape == sigmas.shape
+    assert y.shape[0] == out.shape[0]
+    assert y.shape[2] == out.shape[2]
+    # compute epistemic uncertainty
+    epis_unc = torch.std(out, dim=1, keepdim=True)
+    assert epis_unc.shape == sigmas.shape
+    total_unc = torch.sqrt(epis_unc**2 + sigmas **2)
     # numerical computation
-    errors = mean - y
-    assert errors.shape == total_unc.shape
+    errors = out - y
     errors /= total_unc
-    return torch.mean(torch.std(errors, dim=0)).item()
+    # average over feature dimension
+    return torch.mean(torch.std(errors, dim=(0,1))).item()
diff --git a/Experiments/evaluate_tabular.py b/Experiments/evaluate_tabular.py
index 6a738bd..40b3f85 100644
--- a/Experiments/evaluate_tabular.py
+++ b/Experiments/evaluate_tabular.py
@@ -67,13 +67,13 @@ def collect_metrics(x,y, seed=0,
     # RMSE
     training_state = net.training
     net.train()
-    ######
-    prediction_triple =\
-            net.predict_mean_and_unc(x, number_of_draws=noneiv_number_of_draws)
+    not_averaged_predictions = net.predict(x, number_of_draws=noneiv_number_of_draws, 
+            take_average_of_prediction=False)
+    noneiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
     if len(y.shape) <= 1:
         y = y.view((-1,1))
-    assert y.shape == prediction_triple[0].shape
-    res = y-prediction_triple[0]
+    assert y.shape == noneiv_mean.shape
+    res = y-noneiv_mean
     if scale_outputs:
         scale = train_data.dataset.std_labels.to(device)
         scaled_res = res * scale.view((1,-1))
@@ -83,10 +83,10 @@ def collect_metrics(x,y, seed=0,
     noneiv_metrics['rmse'] = np.sqrt(np.mean(scaled_res**2))
     noneiv_metrics['bias'] = np.mean(scaled_res)
     noneiv_metrics['coverage_numerical'], noneiv_metrics['coverage_theory'] =\
-            epistemic_coverage(prediction_triple, y, normalize_errors=False)
+            epistemic_coverage(not_averaged_predictions, y, normalize_errors=False)
     noneiv_metrics['coverage_normalized'],_ =\
-            epistemic_coverage(prediction_triple, y, normalize_errors=True)
-    noneiv_metrics['res_std'] = normalized_std(prediction_triple, y)
+            epistemic_coverage(not_averaged_predictions, y, normalize_errors=True)
+    noneiv_metrics['res_std'] = normalized_std(not_averaged_predictions, y)
     
 
 
@@ -95,7 +95,8 @@ def collect_metrics(x,y, seed=0,
         scale_labels = train_data.dataset.std_labels.view((-1,)).to(device)
     else:
         scale_labels = None
-    noneiv_metrics['logdens' ]= net.predictive_logdensity(x, y,
+    noneiv_metrics['logdens' ]= net.predictive_logdensity(
+            not_averaged_predictions, y,
             number_of_draws=100,
             decouple_dimensions=decouple_dimensions,
             scale_labels=scale_labels).mean().detach().cpu().numpy()
@@ -127,12 +128,13 @@ def collect_metrics(x,y, seed=0,
     noise_state = net.noise_is_on
     net.train()
     net.noise_on()
-    prediction_triple =\
-            net.predict_mean_and_unc(x, number_of_draws=eiv_number_of_draws)
+    not_averaged_predictions = net.predict(x, number_of_draws=noneiv_number_of_draws, 
+            take_average_of_prediction=False)
+    eiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
     if len(y.shape) <= 1:
         y = y.view((-1,1))
-    assert y.shape == prediction_triple[0].shape
-    res = y-prediction_triple[0]
+    assert y.shape == eiv_mean.shape
+    res = y-eiv_mean
     scale = train_data.dataset.std_labels.to(device)
     if scale_outputs:
         scale = train_data.dataset.std_labels.to(device)
@@ -143,10 +145,10 @@ def collect_metrics(x,y, seed=0,
     eiv_metrics['rmse' ]= np.sqrt(np.mean(scaled_res**2))
     eiv_metrics['bias' ]= np.mean(scaled_res)
     eiv_metrics['coverage_numerical'], eiv_metrics['coverage_theory'] =\
-            epistemic_coverage(prediction_triple, y, normalize_errors=False)
+            epistemic_coverage(not_averaged_predictions, y, normalize_errors=False)
     eiv_metrics['coverage_normalized'],_ =\
-            epistemic_coverage(prediction_triple, y, normalize_errors=True)
-    eiv_metrics['res_std' ]= normalized_std(prediction_triple, y)
+            epistemic_coverage(not_averaged_predictions, y, normalize_errors=True)
+    eiv_metrics['res_std' ]= normalized_std(not_averaged_predictions, y)
 
 
     # NLL
@@ -154,7 +156,8 @@ def collect_metrics(x,y, seed=0,
         scale_labels = train_data.dataset.std_labels.view((-1,)).to(device)
     else:
         scale_labels = None
-    eiv_metrics['logdens' ]= net.predictive_logdensity(x, y,
+    eiv_metrics['logdens' ]= net.predictive_logdensity(
+            not_averaged_predictions, y,
             number_of_draws=eiv_number_of_draws,
             decouple_dimensions=decouple_dimensions,
             scale_labels=scale_labels).mean().detach().cpu().numpy()
-- 
GitLab