Commit 01529051 authored by Jörg Martin
Included computation of full_seed_range metrics

By this we mean metrics that can only be computed once all seeds have been
evaluated, so that, in particular, no std can be reported for them. The only
metric of this kind at this point is the average of the absolute values of
the x-dependent biases ('avg_bias'). These metrics will only be computed
for datasets with a dataloader of the type
EIVData.repeated_sampling.repeated_sampling.
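
For illustration, a minimal sketch of the aggregation this metric performs
(hypothetical tensor shapes and names; the actual computation is in
collect_full_seed_range_metrics in the diff below):

    import torch

    # one tensor of scaled residuals per seed, each of shape
    # (number_of_test_points, output_dim)
    residuals_per_seed = [torch.randn(500, 1) for _ in range(10)]
    # stack along a new (last) seed dimension and average over seeds,
    # giving the x-dependent bias
    bias_per_x = torch.mean(
        torch.stack(tuple(residuals_per_seed), dim=-1), dim=-1)
    # 'avg_bias' is the mean absolute value of that bias
    avg_bias = torch.mean(torch.abs(bias_per_x))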
parents 947c258d 3493ff60
@@ -3,7 +3,7 @@ import glob
import argparse
import json
metrics_to_display = ['rmse','logdens','bias','true_coverage_numerical']
metrics_to_display = ['rmse','logdens','bias','true_coverage_numerical','avg_bias']
show_incomplete = True
list_of_result_files = glob.glob(os.path.join('results','*.json'))
@@ -19,7 +19,13 @@ def save_readout(dictionary, key):
the latter doesn't exist, in which case (None,None) is returned.
"""
try:
return dictionary[key]
readout = dictionary[key]
if type(readout) is list:
assert len(readout) == 2
return readout
else:
readout = float(readout)
return (readout, None)
except KeyError:
return (None,None)
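As a hedged sketch of the new behaviour (key names and values are only
placeholders): two-element lists stored for per-seed metrics pass through
unchanged, scalar full-seed-range metrics are wrapped with a missing std,
and absent keys still yield (None, None).

    # per-seed metric stored as [mean, std] -> returned as is
    save_readout({'rmse': [0.123, 0.004]}, 'rmse')   # [0.123, 0.004]
    # full-seed-range metric stored as a plain float -> std is None
    save_readout({'avg_bias': 0.05}, 'avg_bias')     # (0.05, None)
    # missing key -> (None, None)
    save_readout({}, 'logdens')                      # (None, None)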
@@ -40,7 +46,10 @@ for data in results.keys():
if metric_mean is None:
noneiv_results_string += ' None (None)'
else:
noneiv_results_string += f' {metric_mean:.3f} ({metric_std:.3f})'
if metric_std is not None:
noneiv_results_string += f' {metric_mean:.3f} ({metric_std:.3f})'
else:
noneiv_results_string += f' {metric_mean:.3f} (NaN)'
print(noneiv_results_string)
eiv_results = [save_readout(results[data]['eiv'],metric)
for metric in metrics_to_display]
@@ -50,7 +59,10 @@ for data in results.keys():
if metric_mean is None:
eiv_results_string += ' None (None)'
else:
eiv_results_string += f' {metric_mean:.3f} ({metric_std:.3f})'
if metric_std is not None:
eiv_results_string += f' {metric_mean:.3f} ({metric_std:.3f})'
else:
eiv_results_string += f' {metric_mean:.3f} (NaN)'
print(eiv_results_string)
print(offset * '_' + 70 * '_')
@@ -16,10 +16,11 @@ from tqdm import tqdm
from EIVArchitectures import Networks
from EIVTrainingRoutines import train_and_store
from EIVGeneral.coverage_metrics import epistemic_coverage, normalized_std
from EIVData.repeated_sampling import repeated_sampling
# read in data via --data option
parser = argparse.ArgumentParser()
parser.add_argument("--data", help="Loads data", default='linear')
parser.add_argument("--data", help="Loads data", default='replin')
parser.add_argument("--no-autoindent", help="",
action="store_true") # to avoid conflicts in IPython
args = parser.parse_args()
@@ -64,8 +65,8 @@ def collect_metrics(x_y_pairs, seed=0,
decouple_dimensions=False, device=device,
scale_outputs=scale_outputs):
"""
Compute various metrics for EiV and non-EiV. Will be returned as
dictionaries.
Compute various metrics for EiV and non-EiV for single seeds. Will be
returned as dictionaries.
:param x_y_pairs: A tuple of either the shape (None,None,x,y) or
(x_true,y_true,x,y) containing torch.tensor or None. x and y are
considered as input and corresponding label. If the first two components
@@ -235,6 +236,215 @@
return noneiv_metrics, eiv_metrics
def collect_full_seed_range_metrics(load_data,
seed_range,test_batch_size = 100, test_samples = 10,
noneiv_number_of_draws=100, eiv_number_of_draws=[100,5], device=device,
scale_outputs=scale_outputs):
"""
Collect metrics that need all seeds for their computation.
:param load_data: The load_data callable; it should take `seed` as an
argument and, optionally, `return_ground_truth`.
:param seed_range: An iterable of seeds.
:param test_batch_size: An integer, used for drawing samples from the test
data.
:param test_samples: Number of test samples with batch size
`test_batch_size` to take.
:param noneiv_number_of_draws: Number of samples to take for the prediction
of the non-EiV model. Defaults to 100.
:param eiv_number_of_draws: Number of samples to take for the prediction
of the EiV model. Defaults to [100,5].
:param device: The torch.device to use
:param scale_outputs: Boolean, scale the outputs for some metrics. Defaults
to False.
:returns: Dictionaries noneiv_metrics, eiv_metrics
"""
noneiv_metrics = {}
eiv_metrics = {}
noneiv_residual_collection = []
eiv_residual_collection = []
for i, seed in enumerate(seed_range):
# load data according to seed
try:
train_data, test_data, true_train_data, true_test_data \
= load_data(seed=seed, return_ground_truth=True)
except TypeError:
train_data, test_data = load_data(seed=seed)
true_train_data, true_test_data = None, None
## Compute x-dependent bias
# only for repeated_sampling datasets
if type(load_data) == repeated_sampling:
# only if there is a ground truth
if true_test_data is not None:
# non-EiV
init_std_y = noneiv_conf_dict["init_std_y_list"][0]
unscaled_reg = noneiv_conf_dict["unscaled_reg"]
p = noneiv_conf_dict["p"]
hidden_layers = noneiv_conf_dict["hidden_layers"]
saved_file = os.path.join('saved_networks',
f'noneiv_{short_dataname}'\
f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
f'_p_{p:.2f}_seed_{seed}.pkl')
net = Networks.FNNBer(p=p, init_std_y=init_std_y,
h=[input_dim, *hidden_layers, output_dim]).to(device)
# load network
train_and_store.open_stored_training(saved_file=saved_file,
net=net, device=device)
true_test_dataloader = DataLoader(true_test_data,
batch_size=int(np.min((len(test_data), test_batch_size))),
shuffle=False)
# to collect x-dependent residuals
true_scaled_res_collection = []
# variable to be used for checking
# that we loop over the same true_x for each seed
noneiv_true_x_sum = 0
for j, (true_x, true_y, noisy_x, _) in\
enumerate(true_test_dataloader):
if j >= test_samples:
break
# store the sum of the true_x
noneiv_true_x_sum += true_x.abs().sum().item()
true_x, true_y, noisy_x =\
true_x.to(device), true_y.to(device),\
noisy_x.to(device)
# Residuals
training_state = net.training
net.train()
not_averaged_predictions = net.predict(noisy_x,\
number_of_draws=noneiv_number_of_draws,
take_average_of_prediction=False)
noneiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
if len(true_y.shape) <= 1:
true_y = true_y.view((-1,1))
assert true_y.shape == noneiv_mean.shape
true_res = true_y - noneiv_mean
if scale_outputs:
scale = train_data.dataset.std_labels.to(device)
true_scaled_res = true_res * scale.view((1,-1))
else:
true_scaled_res = true_res
# append residual
true_scaled_res_collection.append(true_scaled_res)
# restore net
if training_state:
net.train()
else:
net.eval()
if i>0:
# check that the used true x are the same for each
# seed, by comparing their sum
assert noneiv_true_x_sum == old_noneiv_true_x_sum
old_noneiv_true_x_sum = noneiv_true_x_sum
# concatenate batches along batch dimension
true_scaled_res_collection =\
torch.concat(true_scaled_res_collection, dim=0)
noneiv_residual_collection.append(true_scaled_res_collection)
# EiV
init_std_y = eiv_conf_dict["init_std_y_list"][0]
unscaled_reg = eiv_conf_dict["unscaled_reg"]
p = eiv_conf_dict["p"]
hidden_layers = eiv_conf_dict["hidden_layers"]
fixed_std_x = eiv_conf_dict["fixed_std_x"]
saved_file = os.path.join('saved_networks',
f'eiv_{short_dataname}'\
f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
f'_p_{p:.2f}_fixed_std_x_{fixed_std_x:.3f}'\
f'_seed_{seed}.pkl')
net = Networks.FNNEIV(p=p, init_std_y=init_std_y,
h=[input_dim, *hidden_layers, output_dim],
fixed_std_x=fixed_std_x).to(device)
# load network
train_and_store.open_stored_training(saved_file=saved_file,
net=net, device=device)
# reinitialize dataloader to get the same true_x
true_test_dataloader = DataLoader(true_test_data,
batch_size=int(np.min((len(test_data), test_batch_size))),
shuffle=False)
true_scaled_res_collection = []
# variable to be used for checking
# that we loop over the same true_x for each seed
eiv_true_x_sum = 0
for j, (true_x, true_y, noisy_x, _) in\
enumerate(true_test_dataloader):
if j >= test_samples:
break
# store the sum of the true_x
eiv_true_x_sum += true_x.abs().sum().item()
true_x, true_y, noisy_x =\
true_x.to(device), true_y.to(device),\
noisy_x.to(device)
# Residuals
training_state = net.training
noise_state = net.noise_is_on
net.train()
net.noise_on()
not_averaged_predictions = net.predict(noisy_x,\
number_of_draws=eiv_number_of_draws,
take_average_of_prediction=False)
eiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
if len(true_y.shape) <= 1:
true_y = true_y.view((-1,1))
assert true_y.shape == eiv_mean.shape
true_res = true_y - eiv_mean
if scale_outputs:
scale = train_data.dataset.std_labels.to(device)
true_scaled_res = true_res * scale.view((1,-1))
else:
true_scaled_res = true_res
# append residuals
true_scaled_res_collection.append(true_scaled_res)
# restore net
if training_state:
net.train()
else:
net.eval()
if noise_state:
net.noise_on()
else:
net.noise_off()
# check whether EiV and non-EiV used the same true_x for each
# seed by comparing their sum
assert eiv_true_x_sum == noneiv_true_x_sum
if i>0:
assert eiv_true_x_sum == old_eiv_true_x_sum
old_eiv_true_x_sum = eiv_true_x_sum
# concatenate batches along batch dimension
true_scaled_res_collection =\
torch.concat(true_scaled_res_collection, dim=0)
eiv_residual_collection.append(true_scaled_res_collection)
## Store quantities
# Compute and store (averaged) x-dependent bias
if type(load_data) == repeated_sampling and\
len(noneiv_residual_collection) > 0 and\
len(eiv_residual_collection) > 0:
noneiv_residual_collection = torch.stack(\
tuple(noneiv_residual_collection), dim=-1)
bias_per_x = torch.mean(noneiv_residual_collection, dim=-1)
avg_bias = torch.mean(torch.abs(bias_per_x))
noneiv_metrics['avg_bias'] = avg_bias
eiv_residual_collection = torch.stack(tuple(eiv_residual_collection),\
dim=-1)
bias_per_x = torch.mean(eiv_residual_collection, dim=-1)
avg_bias = torch.mean(torch.abs(bias_per_x))
eiv_metrics['avg_bias'] = avg_bias
return noneiv_metrics, eiv_metrics
# single seed metrics
noneiv_metrics_collection = {}
eiv_metrics_collection = {}
collection_keys = []
@@ -242,6 +452,15 @@ num_test_epochs = 10
assert noneiv_conf_dict["seed_range"] == eiv_conf_dict["seed_range"]
seed_list = range(noneiv_conf_dict["seed_range"][0],
noneiv_conf_dict["seed_range"][1])
max_batch_number = 2
for seed in tqdm(seed_list):
try:
@@ -280,22 +499,51 @@ for seed in tqdm(seed_list):
noneiv_metrics_collection[key].append(noneiv_metrics[key])
eiv_metrics_collection[key].append(eiv_metrics[key])
# full seed range metrics
print('Computing metrics that use all seeds at once...')
noneiv_full_seed_range_metrics, eiv_full_seed_range_metrics =\
collect_full_seed_range_metrics(load_data=load_data,\
seed_range=seed_list)
# add keys to collection_keys
assert noneiv_full_seed_range_metrics.keys() ==\
eiv_full_seed_range_metrics.keys()
full_seed_range_collection_keys = list(noneiv_full_seed_range_metrics.keys())
collection_keys += full_seed_range_collection_keys
results_dict = {}
print('Non-EiV:\n-----')
results_dict['noneiv'] = {}
for key in collection_keys:
metric_mean = float(np.mean(noneiv_metrics_collection[key]))
metric_std = float(np.std(noneiv_metrics_collection[key])/np.sqrt(num_test_epochs*len(seed_list)))
results_dict['noneiv'][key] = (metric_mean, metric_std)
print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
if key not in full_seed_range_collection_keys:
# per seed metrics
metric_mean = float(np.mean(noneiv_metrics_collection[key]))
metric_std = float(np.std(noneiv_metrics_collection[key])/\
np.sqrt(num_test_epochs*len(seed_list)))
results_dict['noneiv'][key] = (metric_mean, metric_std)
print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
else:
# full seed range metrics (without a std)
metric = float(noneiv_full_seed_range_metrics[key])
results_dict['noneiv'][key] = metric
print(f'{key}: {metric:.5f} (NaN)')
print('\n')
print('EiV:\n-----')
results_dict['eiv'] = {}
for key in collection_keys:
metric_mean = float(np.mean(eiv_metrics_collection[key]))
metric_std = float(np.std(eiv_metrics_collection[key])/np.sqrt(num_test_epochs*len(seed_list)))
print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
results_dict['eiv'][key] = (metric_mean, metric_std)
if key not in full_seed_range_collection_keys:
# per seed metrics
metric_mean = float(np.mean(eiv_metrics_collection[key]))
metric_std = float(np.std(eiv_metrics_collection[key])/\
np.sqrt(num_test_epochs*len(seed_list)))
print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
results_dict['eiv'][key] = (metric_mean, metric_std)
else:
# full seed range metrics (without a std)
metric = float(eiv_full_seed_range_metrics[key])
results_dict['eiv'][key] = metric
print(f'{key}: {metric:.5f} (NaN)')
# write results to a JSON file in the results folder
with open(os.path.join('results',f'metrics_{short_dataname}.json'), 'w') as f:
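After these changes the file results/metrics_<short_dataname>.json holds, per
model variant, (mean, std) pairs for the per-seed metrics and plain floats for
the full-seed-range ones; a hedged sketch with placeholder values (tuples are
serialized as JSON lists, which is the format save_readout in the first file
reads back):

    results_dict = {
        'noneiv': {'rmse': (0.123, 0.004), 'avg_bias': 0.051},
        'eiv': {'rmse': (0.119, 0.003), 'avg_bias': 0.032},
    }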