Commit 0a001305 authored by Jörg Martin

Included true_y in evaluate_metrics

parent a4f80144
import torch
import sys
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import TensorDataset
total_number_of_datapoints = 2000
input_range = [-1,1]
@@ -31,8 +31,9 @@ def load_data(seed=0, splitting_part=0.8, normalize=True,
:param return_ground_truth: Boolean. If True, the unnoisy ground truth will
also be returned. Defaults to False.
:returns: linear_trainset, linear_testset if return_ground_truth is False,
else linear_trainset, linear_testset, (true_train_x, true_train_y),
(true_test_x, true_test_y)
else linear_trainset, linear_testset, true_linear_trainset,
true_linear_testset. The latter two contain **four tensors** each: the
true x, y and their noisy counterparts.
"""
random_generator = torch.Generator().manual_seed(seed)
# draw different seeds for noise and splitting
@@ -65,9 +66,13 @@ def load_data(seed=0, splitting_part=0.8, normalize=True,
noisy_train_y, noisy_test_y = torch.split(noisy_y, [train_len, test_len])
linear_trainset = TensorDataset(noisy_train_x, noisy_train_y)
linear_testset = TensorDataset(noisy_test_x, noisy_test_y)
true_linear_trainset = TensorDataset(true_train_x, true_train_y,
noisy_train_x, noisy_train_y)
true_linear_testset = TensorDataset(true_test_x, true_test_y,
noisy_test_x, noisy_test_y)
if not return_ground_truth:
return linear_trainset, linear_testset
else:
return linear_trainset, linear_testset, (true_train_x, true_train_y),\
(true_test_x, true_test_y)
return linear_trainset, linear_testset, true_linear_trainset,\
true_linear_testset
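For orientation, here is a minimal sketch of the four-tensor layout that true_linear_trainset / true_linear_testset now carry; the tensors and noise strengths below are placeholders, not the module's actual values.

import torch
from torch.utils.data import TensorDataset

true_x = torch.linspace(-1, 1, 5).unsqueeze(1)
true_y = 2 * true_x + 1                              # placeholder ground truth
noisy_x = true_x + 0.05 * torch.randn_like(true_x)   # placeholder noise strengths
noisy_y = true_y + 0.1 * torch.randn_like(true_y)

true_linear_trainset = TensorDataset(true_x, true_y, noisy_x, noisy_y)
# each element is a 4-tuple (true_x, true_y, noisy_x, noisy_y)
sample = true_linear_trainset[0]
assert len(sample) == 4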
import torch
import sys
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import TensorDataset
total_number_of_datapoints = 2000
input_range = [-1,1]
@@ -11,7 +11,8 @@ y_noise_strength = 0.1
def get_normalization(*args):
"""
Returns the mean and standard deviations (in tuples) of the tensors in *args.
Returns the mean and standard deviations (in tuples) of the tensors in
*args.
"""
normalization_collection = []
for t in args:
@@ -30,9 +31,10 @@ def load_data(seed=0, splitting_part=0.8, normalize=True,
:param normalize: Whether to normalize the data, defaults to True.
:param return_ground_truth: Boolean. If True, the unnoisy ground truth will
also be returned. Defaults to False.
:returns: linear_trainset, linear_testset if return_ground_truth is False,
else linear_trainset, linear_testset, (true_train_x, true_train_y),
(true_test_x, true_test_y)
:returns: quadratic_trainset, quadratic_testset if return_ground_truth is False,
else quadratic_trainset, quadratic_testset, true_quadratic_trainset,
true_quadratic_testset. The latter two contain **four tensors** each: the
true x, y and their noisy counterparts.
"""
random_generator = torch.Generator().manual_seed(seed)
# draw different seeds for noise and splitting
@@ -63,11 +65,15 @@ def load_data(seed=0, splitting_part=0.8, normalize=True,
true_train_y, true_test_y = torch.split(true_y, [train_len, test_len])
noisy_train_x, noisy_test_x = torch.split(noisy_x, [train_len, test_len])
noisy_train_y, noisy_test_y = torch.split(noisy_y, [train_len, test_len])
linear_trainset = TensorDataset(noisy_train_x, noisy_train_y)
linear_testset = TensorDataset(noisy_test_x, noisy_test_y)
quadratic_trainset = TensorDataset(noisy_train_x, noisy_train_y)
quadratic_testset = TensorDataset(noisy_test_x, noisy_test_y)
true_quadratic_trainset = TensorDataset(true_train_x, true_train_y,
noisy_train_x, noisy_train_y)
true_quadratic_testset = TensorDataset(true_test_x, true_test_y,
noisy_test_x, noisy_test_y)
if not return_ground_truth:
return linear_trainset, linear_testset
return quadratic_trainset, quadratic_testset
else:
return linear_trainset, linear_testset, (true_train_x, true_train_y),\
(true_test_x, true_test_y)
return quadratic_trainset, quadratic_testset, true_quadratic_trainset,\
true_quadratic_testset
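Analogously for the quadratic data: wrapping the ground-truth dataset in a DataLoader yields four tensors per batch, which is what the adapted evaluation loop below relies on. The dummy tensors here only illustrate the unpacking.

import torch
from torch.utils.data import DataLoader, TensorDataset

# stand-in for true_quadratic_testset: (true_x, true_y, noisy_x, noisy_y)
dummy = TensorDataset(torch.zeros(10, 1), torch.zeros(10, 1),
                      torch.zeros(10, 1), torch.zeros(10, 1))
loader = DataLoader(dummy, batch_size=4, shuffle=True)
for true_x, true_y, noisy_x, noisy_y in loader:
    print(true_x.shape, noisy_y.shape)  # each batch unpacks into four tensors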
@@ -24,7 +24,7 @@ def multivariate_interval_length(dim, q=0.95):
Returns the half side length of multivariate cube, symmetrically centered
around 0 such that its measure under a standard normal distribution equals
`q`.
:param dim: A non-negative integer, the dimension.
:param dim: A positive integer, the dimension.
:param q: Float, should be between 0 and 1. Defaults to 0.95.
"""
# use independence of components to reduce to a univariate quantile
@@ -32,11 +32,12 @@ def multivariate_interval_length(dim, q=0.95):
return scipy.stats.norm.ppf(univariate_quantile)
def epistemic_coverage(not_averaged_predictions, y, q=0.95, normalize_errors=False):
def epistemic_coverage(not_averaged_predictions, y, q=0.95,
normalize_errors=False,
noisy_y=True):
"""
Returns the average coverage of `y` by the interval
"prefactor * (predictions + q-Interval)",
where
"predictions + prefactor * q-Interval", where
- "q-Interval" is the interval of measure `q` under the standard normal,
where
- "predictions" are the entries of the first component of the tuple
@@ -45,8 +46,11 @@ def epistemic_coverage(not_averaged_predictions, y, q=0.95, normalize_errors=Fa
first component of `not_averaged_predictions`, if
`normalize_errors` is set to False, or 1 if it is True.
The coverage is returned as given by the `y` and as a theoretical_coverage
computed from the epistemic uncertainty and the aleatoric uncertainty
(second component of `not_averaged_predictions`).
**Note**: If `noisy_y` is True (the default), `y` will be treated as noisy;
if it is False, `y` will be treated as the unnoisy ground truth. For real
data, only use `noisy_y=True`.
:param not_averaged_predictions: A tuple of tensors as in the output of
`FNNEIV.predict` with `take_average_of_prediction` set to `False`, i.e.:
the predictions of the neural net not averaged over the first dimension
@@ -58,6 +62,9 @@ def epistemic_coverage(not_averaged_predictions, y, q=0.95, normalize_errors=Fa
:param normalize_errors: If True, the deviations between predictions and
`y` are normalized by the total uncertainty (computed from the aleatoric
and epistemic uncertainty) and the coverage w.r.t. the q-interval is computed.
:param noisy_y: Boolean. If True (the default), `y` is treated as noisy and
the total uncertainty is considered. If False, `y` is treated as the
unnoisy ground truth.
:returns: numerical_coverage, theoretical_coverage
"""
out, sigmas = not_averaged_predictions
@@ -72,21 +79,26 @@ def epistemic_coverage(not_averaged_predictions, y, q=0.95, normalize_errors=Fa
y = y.view((*y.shape[:2], -1))
sigmas = sigmas.view((*sigmas.shape[:2], -1))
out = out.view((*out.shape[:2], -1))
# check if dimensions consistent
# check if dimensions are consistent
assert y.shape == sigmas.shape
assert y.shape[0] == out.shape[0]
assert y.shape[2] == out.shape[2]
# compute epistemic uncertainty
epis_unc = torch.std(out, dim=1, keepdim=True)
# compute total uncertainty
assert epis_unc.shape == sigmas.shape
total_unc = torch.sqrt(epis_unc**2 + sigmas **2)
# compute total uncertainty
if noisy_y:
total_unc = torch.sqrt(epis_unc**2 + sigmas **2)
else:
# for unnoisy y, the aleatoric uncertainty is treated as 0
total_unc = epis_unc
# fix interval based on epis_unc
out_dim = y.shape[2]
if not normalize_errors:
interval_length = multivariate_interval_length(dim=y.shape[1], q=q) \
interval_length = multivariate_interval_length(dim=out_dim, q=q) \
* epis_unc
else:
interval_length = multivariate_interval_length(dim=y.shape[1], q=q)
interval_length = multivariate_interval_length(dim=out_dim, q=q)
# numerical computation
errors = out - y
if normalize_errors:
......
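As a reading aid for the hunks above: `multivariate_interval_length` reduces the cube quantile to a univariate one by using the independence of the components, P(|Z_i| <= a for all i) = (2*Phi(a) - 1)**dim = q, hence a = Phi^{-1}((1 + q**(1/dim)) / 2). The computation of `univariate_quantile` is elided in this hunk but presumably follows this identity; a minimal sketch:

import scipy.stats

def interval_half_length(dim, q=0.95):
    # reduce to a univariate quantile via independence of the components:
    # (2 * Phi(a) - 1)**dim = q  =>  Phi(a) = (1 + q**(1/dim)) / 2
    univariate_quantile = 0.5 * (1 + q**(1 / dim))
    return scipy.stats.norm.ppf(univariate_quantile)

# e.g. interval_half_length(1, q=0.95) is roughly 1.96

In `epistemic_coverage`, this half length is then scaled by the epistemic uncertainty (unless `normalize_errors` is set), and with `noisy_y=False` the total uncertainty reduces to the epistemic part, matching the new branch above.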
@@ -59,15 +59,17 @@ except KeyError:
device = torch.device('cpu')
def collect_metrics(x,y, seed=0,
def collect_metrics(x_y_pairs, seed=0,
noneiv_number_of_draws=100, eiv_number_of_draws=[100,5],
decouple_dimensions=False, device=device,
scale_outputs=scale_outputs):
"""
Compute various metrics for EiV and non-EiV. Will be returned as
dictionaries.
:param x: A torch.tensor, taken as input
:param y: A torch.tensor, taken as output
:param x_y_pairs: A tuple of the shape (None, None, x, y) or
(x_true, y_true, x, y), containing torch.Tensors or None. x and y are
taken as input and the corresponding labels. If the first two components
are not None, they are taken to be the unnoisy counterparts of x and y.
:param seed: Integer. The seed used for loading, defaults to 0.
:param noneiv_number_of_draws: Number of draws for non-EiV model
for sampling from the posterior predictive. Defaults to 100.
@@ -82,8 +84,13 @@ def collect_metrics(x,y, seed=0,
the log-dens to make them comparable with the literature.
:returns: Dictionaries noneiv_metrics, eiv_metrics
"""
true_x, true_y, x, y = x_y_pairs
x,y = x.to(device), y.to(device)
if true_x is not None:
assert true_y is not None
true_x,true_y = true_x.to(device), true_y.to(device)
else:
assert true_y is None
# non-EiV
noneiv_metrics = {}
@@ -126,6 +133,11 @@ def collect_metrics(x,y, seed=0,
noneiv_metrics['res_std'] = normalized_std(not_averaged_predictions, y)
# metrics that need a ground truth
if true_x is not None:
noneiv_metrics['true_coverage_numerical'],\
noneiv_metrics['true_coverage_theory'] =\
epistemic_coverage(not_averaged_predictions, true_y,
normalize_errors=False, noisy_y=False)
# NLL
@@ -187,6 +199,14 @@ def collect_metrics(x,y, seed=0,
epistemic_coverage(not_averaged_predictions, y, normalize_errors=True)
eiv_metrics['res_std' ]= normalized_std(not_averaged_predictions, y)
# metrics that need a ground truth
if true_x is not None:
eiv_metrics['true_coverage_numerical'],\
eiv_metrics['true_coverage_theory'] =\
epistemic_coverage(not_averaged_predictions, true_y,
normalize_errors=False, noisy_y=False)
# NLL
if scale_outputs:
@@ -209,13 +229,9 @@ def collect_metrics(x,y, seed=0,
return noneiv_metrics, eiv_metrics
collection_keys = ['rmse','logdens','bias','coverage_numerical',
'coverage_theory','coverage_normalized','res_std']
noneiv_metrics_collection = {}
eiv_metrics_collection = {}
for key in collection_keys:
noneiv_metrics_collection[key] = []
eiv_metrics_collection[key] = []
collection_keys = []
num_test_epochs = 10
assert noneiv_conf_dict["seed_range"] == eiv_conf_dict["seed_range"]
seed_list = range(noneiv_conf_dict["seed_range"][0],
@@ -223,22 +239,37 @@ seed_list = range(noneiv_conf_dict["seed_range"][0],
max_batch_number = 2
for seed in tqdm(seed_list):
try:
train_data, test_data, (true_train_x, true_train_y),\
(true_test_x, true_test_y) \
train_data, test_data, true_train_data, true_test_data \
= load_data(seed=seed, return_ground_truth=True)
except TypeError:
train_data, test_data = load_data(seed=seed)
(true_train_x, true_train_y), (true_test_x, true_test_y)\
= (None,None), (None,None)
test_dataloader = DataLoader(test_data,
true_train_data, true_test_data = None, None
if true_test_data is None:
test_dataloader = DataLoader(test_data,
batch_size=int(np.min((len(test_data),
800))), shuffle=True)
else:
test_dataloader = DataLoader(true_test_data,
batch_size=int(np.min((len(true_test_data), 800))), shuffle=True)
for i in tqdm(range(num_test_epochs)):
for j, (x,y) in enumerate(test_dataloader):
for j, x_y_pairs in enumerate(test_dataloader):
if j > max_batch_number:
break
noneiv_metrics, eiv_metrics = collect_metrics(x,y, seed=seed)
# fill in ground truth with None if it does not exist
if true_test_data is None:
x_y_pairs = (None, None, *x_y_pairs)
# should contain (true_x,true_y,x,y) or (None,None,x,y)
assert len(x_y_pairs) == 4
noneiv_metrics, eiv_metrics = collect_metrics(x_y_pairs,
seed=seed)
if i==0 and j==0:
# fill collection keys
assert eiv_metrics.keys() == noneiv_metrics.keys()
collection_keys = list(eiv_metrics.keys())
for key in collection_keys:
noneiv_metrics_collection[key] = []
eiv_metrics_collection[key] = []
# collect results
for key in collection_keys:
noneiv_metrics_collection[key].append(noneiv_metrics[key])
eiv_metrics_collection[key].append(eiv_metrics[key])
......
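A minimal sketch of the new `collect_metrics` calling convention introduced in this commit; the tensors below are placeholders, and a real call also needs the trained networks that `collect_metrics` loads internally, which is why the call itself is left commented out.

import torch

x, y = torch.randn(8, 1), torch.randn(8, 1)            # noisy batch from the test loader
true_x, true_y = torch.randn(8, 1), torch.randn(8, 1)  # unnoisy ground truth, if available

x_y_pairs_without_gt = (None, None, x, y)    # datasets without ground truth
x_y_pairs_with_gt = (true_x, true_y, x, y)   # datasets with ground truth

# noneiv_metrics, eiv_metrics = collect_metrics(x_y_pairs_with_gt, seed=0)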