diff --git a/EIVPackage/EIVData/linear.py b/EIVPackage/EIVData/linear.py
index 89dd7a18a26f72293de31b39b1eb5f01ae88d0fe..9b69b6b310cf61382e8d9f5204f6ae18a2d1c2fc 100644
--- a/EIVPackage/EIVData/linear.py
+++ b/EIVPackage/EIVData/linear.py
@@ -1,6 +1,6 @@
 import torch
 import sys
-from torch.utils.data import TensorDataset, random_split
+from torch.utils.data import TensorDataset
 
 total_number_of_datapoints = 2000
 input_range = [-1,1]
@@ -31,8 +31,9 @@ def load_data(seed=0, splitting_part=0.8, normalize=True,
     :param return_ground_truth: Boolean. If True, the unnoisy ground truth will
     also be returned. Defaults to False.
     :returns: linear_trainset, linear_testset if return_ground_truth is False,
-    else linear_trainset, linear_testset, (true_train_x, true_train_y),
-        (true_test_x, true_test_y)
+    else linear_trainset, linear_testset, true_linear_trainset,
+    true_linear_testset. The latter two return **four tensors**: the true x, y
+    and their noisy counterparts.
     """
     random_generator = torch.Generator().manual_seed(seed)
     # draw different seeds for noise and splitting
@@ -65,9 +66,13 @@ def load_data(seed=0, splitting_part=0.8, normalize=True,
     noisy_train_y, noisy_test_y = torch.split(noisy_y, [train_len, test_len])
     linear_trainset = TensorDataset(noisy_train_x, noisy_train_y)
     linear_testset = TensorDataset(noisy_test_x, noisy_test_y)
+    true_linear_trainset = TensorDataset(true_train_x, true_train_y,
+            noisy_train_x, noisy_train_y)
+    true_linear_testset = TensorDataset(true_test_x, true_test_y,
+            noisy_test_x, noisy_test_y)
     if not return_ground_truth:
         return linear_trainset, linear_testset
     else:
-        return linear_trainset, linear_testset, (true_train_x, true_train_y),\
-            (true_test_x, true_test_y)
+        return linear_trainset, linear_testset, true_linear_trainset,\
+            true_linear_testset
 
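For orientation, a minimal sketch of how the changed return signature is consumed (assuming the EIVData package under EIVPackage/ is importable; names are illustrative): with return_ground_truth=True the last two returned TensorDatasets now bundle four tensors per item.

    import torch
    from EIVData.linear import load_data

    trainset, testset, true_trainset, true_testset = load_data(
            seed=0, return_ground_truth=True)
    # each item of the ground-truth datasets is (true_x, true_y, noisy_x, noisy_y)
    true_x, true_y, noisy_x, noisy_y = true_trainset[0]
    # the noisy slots are the same tensors that back the plain train set
    assert torch.equal(noisy_x, trainset[0][0])
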
diff --git a/EIVPackage/EIVData/quadratic.py b/EIVPackage/EIVData/quadratic.py
index eab8b6c6810371573d7885f13561d5c74e84fddb..aa4f4605094822116d36e12f64e854c38849a406 100644
--- a/EIVPackage/EIVData/quadratic.py
+++ b/EIVPackage/EIVData/quadratic.py
@@ -1,6 +1,6 @@
 import torch
 import sys
-from torch.utils.data import TensorDataset, random_split
+from torch.utils.data import TensorDataset
 
 total_number_of_datapoints = 2000
 input_range = [-1,1]
@@ -11,7 +11,8 @@ y_noise_strength = 0.1
 
 def get_normalization(*args):
     """
-    Returns the mean and standard deviations (in tuples) of the tensors in *args.
+    Returns the mean and standard deviations (in tuples) of the tensors in
+    *args.
     """
     normalization_collection = []
     for t in args:
@@ -30,9 +31,10 @@ def load_data(seed=0, splitting_part=0.8, normalize=True,
     :param normalize: Whether to normalize the data, defaults to True.
     :param return_ground_truth: Boolean. If True, the unnoisy ground truth will
     also be returned. Defaults to False.
-    :returns: linear_trainset, linear_testset if return_ground_truth is False,
-    else linear_trainset, linear_testset, (true_train_x, true_train_y),
-        (true_test_x, true_test_y)
+    :returns: quadratic_trainset, quadratic_testset if return_ground_truth is
+    False, else quadratic_trainset, quadratic_testset, true_quadratic_trainset,
+    true_quadratic_testset. The latter two return **four tensors**: the true
+    x, y and their noisy counterparts.
     """
     random_generator = torch.Generator().manual_seed(seed)
     # draw different seeds for noise and splitting
@@ -63,11 +65,15 @@ def load_data(seed=0, splitting_part=0.8, normalize=True,
     true_train_y, true_test_y = torch.split(true_y, [train_len, test_len])
     noisy_train_x, noisy_test_x = torch.split(noisy_x, [train_len, test_len])
     noisy_train_y, noisy_test_y = torch.split(noisy_y, [train_len, test_len])
-    linear_trainset = TensorDataset(noisy_train_x, noisy_train_y)
-    linear_testset = TensorDataset(noisy_test_x, noisy_test_y)
+    quadratic_trainset = TensorDataset(noisy_train_x, noisy_train_y)
+    quadratic_testset = TensorDataset(noisy_test_x, noisy_test_y)
+    true_quadratic_trainset = TensorDataset(true_train_x, true_train_y,
+            noisy_train_x, noisy_train_y)
+    true_quadratic_testset = TensorDataset(true_test_x, true_test_y,
+            noisy_test_x, noisy_test_y)
     if not return_ground_truth:
-        return linear_trainset, linear_testset
+        return quadratic_trainset, quadratic_testset
     else:
-        return linear_trainset, linear_testset, (true_train_x, true_train_y),\
-            (true_test_x, true_test_y)
+        return quadratic_trainset, quadratic_testset, true_quadratic_trainset,\
+            true_quadratic_testset
 
diff --git a/EIVPackage/EIVGeneral/coverage_metrics.py b/EIVPackage/EIVGeneral/coverage_metrics.py
index 3bafa96fcf3cb72e17ac075dff300309759dd9e9..ee1b6515c6713db1342208091f6ca0afd20ec017 100644
--- a/EIVPackage/EIVGeneral/coverage_metrics.py
+++ b/EIVPackage/EIVGeneral/coverage_metrics.py
@@ -24,7 +24,7 @@ def multivariate_interval_length(dim, q=0.95):
     Returns the half side length of multivariate cube, symmetrically centered
     around 0 such that its measure under a standard normal distribution equals
     `q`.
-    :param dim: A non-negative integer, the dimension.
+    :param dim: A positive integer, the dimension.
     :param q: Float, should be between 0 and 1. Defaults to 0.95.
     """
     # use independence of components to reduce to a univariate quantile
@@ -32,11 +32,12 @@ def multivariate_interval_length(dim, q=0.95):
     return scipy.stats.norm.ppf(univariate_quantile)
 
 
-def epistemic_coverage(not_averaged_predictions,  y, q=0.95, normalize_errors=False):
+def epistemic_coverage(not_averaged_predictions,  y, q=0.95,
+        normalize_errors=False,
+        noisy_y=True):
     """
     Returns the average coverage of `y` by the interval 
-    "prefactor * (predictions + q-Interval)",
-    where 
+    "predictions + prefactor * q-Interval", where 
     - "q-Interval" is the interval of measure `q` under the standard normal, 
     where
     - "predictions" are the entries of the first component of the tuple
@@ -45,8 +46,11 @@ def epistemic_coverage(not_averaged_predictions,  y, q=0.95, normalize_errors=Fa
       first component of `not_averaged_predictions`,if
     `normalize_errors` is set to False, or 1 if it is true. 
     The coverage is returned as given by the `y` and as a theoretical_coverage
-    computed from the epistemic uncertainty  and the aleatoric uncertainty
+    computed from the epistemic uncertainty and the aleatoric uncertainty
     (second component of `not_averaged_predictions`).
+    **Note**: If `noisy_y` is True (the default), `y` will be treated as
+    noisy. If it is False, it will be treated as the unnoisy ground truth. For
+    real data only use the former.
     :param not_averaged_predictions: A tuple of tensors as in the output of
     `FNNEIV.predict` with `take_average_of_prediction` set to `False`, i.e.:
     the predictions of the neural net not averaged over the first dimension 
@@ -58,6 +62,9 @@ def epistemic_coverage(not_averaged_predictions,  y, q=0.95, normalize_errors=Fa
     :param normalize_errors: If True, the deviations between predictions and
     `y` are normalized by the total uncertainty, computed from the aleatoric
     and epistemic uncertainty and the coverage w.r.t. q-interval is computed.
+    :param noisy_y: Boolean. If True (the default), `y` is treated as noisy and
+    the total (epistemic plus aleatoric) uncertainty is used. If False, `y` is
+    treated as the unnoisy ground truth and only the epistemic part is used.
     :returns: numerical_coverage, theoretical_coverage
     """
     out, sigmas = not_averaged_predictions
@@ -72,21 +79,26 @@ def epistemic_coverage(not_averaged_predictions,  y, q=0.95, normalize_errors=Fa
     y = y.view((*y.shape[:2], -1))
     sigmas = sigmas.view((*sigmas.shape[:2], -1))
     out = out.view((*out.shape[:2], -1))
-    # check if dimensions consistent
+    # check if dimensions are consistent
     assert y.shape == sigmas.shape
     assert y.shape[0] == out.shape[0]
     assert y.shape[2] == out.shape[2]
     # compute epistemic uncertainty
     epis_unc = torch.std(out, dim=1, keepdim=True)
-    # compute total uncertainty
     assert epis_unc.shape == sigmas.shape
-    total_unc = torch.sqrt(epis_unc**2 + sigmas **2)
+    # compute total uncertainty
+    if noisy_y:
+        total_unc = torch.sqrt(epis_unc**2 + sigmas **2)
+    else:
+        # for unnoisy y, the aleatoric uncertainty is treated as 0
+        total_unc = epis_unc
     # fix interval based on epis_unc
+    out_dim = y.shape[2]
     if not normalize_errors:
-        interval_length = multivariate_interval_length(dim=y.shape[1], q=q) \
+        interval_length = multivariate_interval_length(dim=out_dim, q=q) \
                 * epis_unc
     else:
-        interval_length = multivariate_interval_length(dim=y.shape[1], q=q) 
+        interval_length = multivariate_interval_length(dim=out_dim, q=q) 
     # numerical computation
     errors = out - y
     if normalize_errors:
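A standalone sketch of what the new noisy_y switch changes (synthetic tensors with assumed shapes; EIVGeneral importable): for noisy_y=False the aleatoric term is dropped from the total uncertainty, and the q-interval is now sized by the output dimension rather than y.shape[1].

    import torch
    from EIVGeneral.coverage_metrics import multivariate_interval_length

    q = 0.95
    out = torch.randn(8, 100, 1)         # assumed (batch, number_of_draws, out_dim)
    sigmas = 0.1 * torch.ones(8, 1, 1)   # assumed aleatoric std, broadcastable to epis_unc
    epis_unc = torch.std(out, dim=1, keepdim=True)
    total_unc_noisy = torch.sqrt(epis_unc**2 + sigmas**2)   # noisy_y=True
    total_unc_true = epis_unc                               # noisy_y=False
    out_dim = out.shape[2]
    half_length = multivariate_interval_length(dim=out_dim, q=q) * epis_unc
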
diff --git a/Experiments/evaluate_metrics.py b/Experiments/evaluate_metrics.py
index 31af058a19075c616231689acacdba39c053f14c..db8d25d525bb2ccf019beb4e3f7e72409b57205f 100644
--- a/Experiments/evaluate_metrics.py
+++ b/Experiments/evaluate_metrics.py
@@ -59,15 +59,17 @@ except KeyError:
     device = torch.device('cpu')
 
 
-def collect_metrics(x,y, seed=0,
+def collect_metrics(x_y_pairs, seed=0,
     noneiv_number_of_draws=100, eiv_number_of_draws=[100,5],
     decouple_dimensions=False, device=device,
     scale_outputs=scale_outputs):
     """
     Compute various metrics for EiV and non-EiV. Will be returned as
     dictionaries.
-    :param x: A torch.tensor, taken as input
-    :param y: A torch.tensor, taken as output
+    :param x_y_pairs: A tuple of the form (None,None,x,y) or
+    (true_x,true_y,x,y), containing torch.tensors or None. x and y are
+    taken as input and the corresponding label. If the first two components
+    are not None, they are taken to be the unnoisy counterparts of x and y.
     :param seed: Integer. The seed used for loading, defaults to 0.
     :param noneiv_number_of_draws: Number of draws for non-EiV model
     for sampling from the posterior predictive. Defaults to 100.
@@ -82,8 +84,13 @@ def collect_metrics(x,y, seed=0,
     the log-dens to make them comparable with the literature.
     :returns: Dictionaries noneiv_metrics, eiv_metrics
     """
+    true_x, true_y, x, y = x_y_pairs
     x,y = x.to(device), y.to(device)
-
+    if true_x is not None:
+        assert true_y is not None
+        true_x,true_y = true_x.to(device), true_y.to(device)
+    else:
+        assert true_y is None
 
     # non-EiV
     noneiv_metrics = {}
@@ -126,6 +133,11 @@ def collect_metrics(x,y, seed=0,
     noneiv_metrics['res_std'] = normalized_std(not_averaged_predictions, y)
 
     # metrics that need a ground truth
+    if true_x is not None:
+        noneiv_metrics['true_coverage_numerical'],\
+                noneiv_metrics['true_coverage_theory'] =\
+                epistemic_coverage(not_averaged_predictions, true_y,
+                        normalize_errors=False, noisy_y=False)
 
 
     # NLL
@@ -187,6 +199,14 @@ def collect_metrics(x,y, seed=0,
             epistemic_coverage(not_averaged_predictions, y, normalize_errors=True)
     eiv_metrics['res_std' ]= normalized_std(not_averaged_predictions, y)
 
+    # metrics that need a ground truth
+    if true_x is not None:
+        eiv_metrics['true_coverage_numerical'],\
+                eiv_metrics['true_coverage_theory'] =\
+                epistemic_coverage(not_averaged_predictions, true_y,
+                        normalize_errors=False, noisy_y=False)
+
+
 
     # NLL
     if scale_outputs:
@@ -209,13 +229,9 @@ def collect_metrics(x,y, seed=0,
     return noneiv_metrics, eiv_metrics
 
 
-collection_keys = ['rmse','logdens','bias','coverage_numerical',
-        'coverage_theory','coverage_normalized','res_std']
 noneiv_metrics_collection = {}
 eiv_metrics_collection = {}
-for key in collection_keys:
-    noneiv_metrics_collection[key] = []
-    eiv_metrics_collection[key] = []
+collection_keys = []
 num_test_epochs = 10
 assert noneiv_conf_dict["seed_range"] == eiv_conf_dict["seed_range"]
 seed_list = range(noneiv_conf_dict["seed_range"][0],
@@ -223,22 +239,37 @@ seed_list = range(noneiv_conf_dict["seed_range"][0],
 max_batch_number = 2
 for seed in tqdm(seed_list):
     try:
-        train_data, test_data, (true_train_x, true_train_y),\
-                (true_test_x, true_test_y) \
+        train_data, test_data, true_train_data, true_test_data \
                 = load_data(seed=seed, return_ground_truth=True)
     except TypeError:
         train_data, test_data = load_data(seed=seed)
-        (true_train_x, true_train_y), (true_test_x, true_test_y)\
-                = (None,None), (None,None)
-    test_dataloader = DataLoader(test_data,
+        true_train_data, true_test_data = None, None
+    if true_test_data is None:
+        test_dataloader = DataLoader(test_data,
             batch_size=int(np.min((len(test_data),
         800))), shuffle=True)
+    else:
+        test_dataloader = DataLoader(true_test_data,
+                batch_size=int(np.min((len(true_test_data), 800))), shuffle=True)
     for i in tqdm(range(num_test_epochs)):
-        for j, (x,y) in enumerate(test_dataloader):
+        for j, x_y_pairs in enumerate(test_dataloader):
             if j > max_batch_number:
                 break
-
-            noneiv_metrics, eiv_metrics = collect_metrics(x,y, seed=seed)
+            # fill in ground truth with None if it does not exist
+            if true_test_data is None:
+                x_y_pairs = (None, None, *x_y_pairs)
+            # should contain (true_x,true_y,x,y) or (None,None,x,y)
+            assert len(x_y_pairs) == 4
+            noneiv_metrics, eiv_metrics = collect_metrics(x_y_pairs,
+                    seed=seed)
+            if i==0 and j==0:
+                # fill collection keys
+                assert eiv_metrics.keys() == noneiv_metrics.keys()
+                collection_keys = list(eiv_metrics.keys())
+                for key in collection_keys:
+                    noneiv_metrics_collection[key] = []
+                    eiv_metrics_collection[key] = []
+            # collect results
             for key in collection_keys:
                 noneiv_metrics_collection[key].append(noneiv_metrics[key])
                 eiv_metrics_collection[key].append(eiv_metrics[key])
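To make the x_y_pairs contract of collect_metrics concrete, a hedged sketch that could run inside the seed loop of Experiments/evaluate_metrics.py (variable names follow the script; the batch comes from the loader built on true_test_data):

    true_x, true_y, x, y = next(iter(test_dataloader))
    _, eiv_metrics = collect_metrics((true_x, true_y, x, y), seed=seed)
    # without ground truth the first two slots are padded with None
    _, eiv_metrics_no_gt = collect_metrics((None, None, x, y), seed=seed)
    # the true_coverage_* keys only appear when ground truth was passed
    assert 'true_coverage_numerical' in eiv_metrics
    assert 'true_coverage_numerical' not in eiv_metrics_no_gt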