Included full_seed range quantities in evaluate_metrics.

This now makes it possible to compute (the average of) an x-dependent bias.

Included full_seed range quantities in evaluate_metrics.
c44c4524 · Jörg Martin · 947c258d · c44c4524
Commit c44c4524 authored 3 years ago by Jörg Martin
--- a/Experiments/evaluate_metrics.py
+++ b/Experiments/evaluate_metrics.py
@@ -16,10 +16,11 @@ from tqdm import tqdm
 from EIVArchitectures import Networks
 from EIVTrainingRoutines import train_and_store
 from EIVGeneral.coverage_metrics import epistemic_coverage, normalized_std
+from EIVData.repeated_sampling import repeated_sampling

 # read in data via --data option
 parser = argparse.ArgumentParser()
-parser.add_argument("--data", help="Loads data", default='linear')
+parser.add_argument("--data", help="Loads data", default='replin')
 parser.add_argument("--no-autoindent", help="",
        action="store_true") # to avoid conflics in IPython
 args = parser.parse_args()
@@ -64,8 +65,8 @@ def collect_metrics(x_y_pairs, seed=0,
    decouple_dimensions=False, device=device,
    scale_outputs=scale_outputs):
    """
-    Compute various metrics for EiV and non-EiV. Will be returned as
-    dictionaries.
+    Compute various metrics for EiV and non-EiV for single seeds. Will be
+    returned as dictionaries.
    :param x_y_pairs: A tuple of either the shape (None,None,x,y) or 
    (x_true,y_true,x,y) containing torch.tensor or None. x and y are
    considered as input and corresponding label. If the first two components
@@ -235,6 +236,215 @@ def collect_metrics(x_y_pairs, seed=0,
    return noneiv_metrics, eiv_metrics


+
+def collect_full_seed_range_metrics(load_data,
+        seed_range,test_batch_size = 100, test_samples = 10,
+        noneiv_number_of_draws=100, eiv_number_of_draws=[100,5], device=device,
+        scale_outputs=scale_outputs):
+    """
+    Collect metrics that need all seeds for their computation.
+    :param load_data: load_data map should take seed as an argument and,
+    optionally, `return_ground_truth`.
+    :param seed_range: iterable of seeds.
+    :param test_batch_size: An integer, used for drawing samples from the test
+    data.
+    :param test_samples: Number of test samples with batch size
+    `test_batch_size` to take.
+    :param noneiv_number_of_draws: Number of samples to take for the prediction
+    of the non-EiV model. Defaults to 100.
+    :param eiv_number_of_draws: Number of samples to take for the prediction
+    of the EiV model. Defaults to [100,5].
+    :param device: The torch.device to use
+    :param scale_outputs: Boolean, scale the outputs for some metrics. Defaults
+    to the module-level `scale_outputs`.
+    :returns: Dictionaries noneiv_metrics, eiv_metrics
+    """
+    noneiv_metrics = {}
+    eiv_metrics = {}
+    noneiv_residual_collection = []
+    eiv_residual_collection = []
+    for i, seed in enumerate(seed_range):
+        # load data according to seed
+        try:
+            train_data, test_data, true_train_data, true_test_data \
+                    = load_data(seed=seed, return_ground_truth=True)
+        except TypeError:
+            train_data, test_data = load_data(seed=seed)
+            true_train_data, true_test_data = None, None
+
+        ## Compute x-dependent bias
+
+        # only if load_data is exactly of type repeated_sampling
+        if type(load_data) == repeated_sampling:
+            # only if there is a ground truth
+            if true_test_data is not None:
+                # non-EiV
+                init_std_y = noneiv_conf_dict["init_std_y_list"][0]
+                unscaled_reg = noneiv_conf_dict["unscaled_reg"]
+                p = noneiv_conf_dict["p"]
+                hidden_layers = noneiv_conf_dict["hidden_layers"]
+                saved_file = os.path.join('saved_networks',
+                            f'noneiv_{short_dataname}'\
+                                    f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
+                                    f'_p_{p:.2f}_seed_{seed}.pkl')
+                net = Networks.FNNBer(p=p, init_std_y=init_std_y,
+                        h=[input_dim, *hidden_layers, output_dim]).to(device)
+                # load network
+                train_and_store.open_stored_training(saved_file=saved_file,
+                        net=net, device=device)
+
+                true_test_dataloader = DataLoader(true_test_data,
+                    batch_size=int(np.min((len(test_data), test_batch_size))),
+                    shuffle=False)
+                # to collect x-dependent residuals
+                true_scaled_res_collection = []
+                # variable to be used for checking
+                # that we loop over the same true_x for each seed
+                noneiv_true_x_sum = 0
+                for j, (true_x, true_y, noisy_x, _) in\
+                        enumerate(true_test_dataloader):
+                    if j >= test_samples:
+                        break
+                    # store the sum of the true_x
+                    noneiv_true_x_sum += true_x.abs().sum().item()
+                    
+                    true_x, true_y, noisy_x =\
+                            true_x.to(device), true_y.to(device),\
+                            noisy_x.to(device)
+                
+                    # Residuals
+                    training_state = net.training
+                    net.train()
+                    not_averaged_predictions = net.predict(noisy_x,\
+                            number_of_draws=noneiv_number_of_draws, 
+                            take_average_of_prediction=False)
+                    noneiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
+                    if len(true_y.shape) <= 1:
+                        true_y = true_y.view((-1,1))
+                    assert true_y.shape == noneiv_mean.shape
+                    true_res = true_y - noneiv_mean
+                    if scale_outputs:
+                        scale = train_data.dataset.std_labels.to(device)
+                        true_scaled_res = true_res * scale.view((1,-1))
+                    else:
+                        true_scaled_res = true_res
+
+                    # append residual
+                    true_scaled_res_collection.append(true_scaled_res)
+
+                    # restore net
+                    if training_state:
+                        net.train()
+                    else:
+                        net.eval()
+                if i>0:
+                    # check that the used true x are the same for each
+                    # seed, by comparing their sum
+                    assert noneiv_true_x_sum == old_noneiv_true_x_sum
+                old_noneiv_true_x_sum = noneiv_true_x_sum
+                
+                # concatenate batches along batch dimension
+                true_scaled_res_collection =\
+                        torch.concat(true_scaled_res_collection, dim=0)
+                noneiv_residual_collection.append(true_scaled_res_collection)
+
+
+                # EiV
+                init_std_y = eiv_conf_dict["init_std_y_list"][0]
+                unscaled_reg = eiv_conf_dict["unscaled_reg"]
+                p = eiv_conf_dict["p"]
+                hidden_layers = eiv_conf_dict["hidden_layers"]
+                fixed_std_x = eiv_conf_dict["fixed_std_x"]
+                saved_file = os.path.join('saved_networks',
+                        f'eiv_{short_dataname}'\
+                                f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
+                                f'_p_{p:.2f}_fixed_std_x_{fixed_std_x:.3f}'\
+                                f'_seed_{seed}.pkl')
+                net = Networks.FNNEIV(p=p, init_std_y=init_std_y,
+                        h=[input_dim, *hidden_layers, output_dim],
+                        fixed_std_x=fixed_std_x).to(device)
+                # load network
+                train_and_store.open_stored_training(saved_file=saved_file,
+                        net=net, device=device)
+
+                # reinitialize dataloader to get the same true_x
+                true_test_dataloader = DataLoader(true_test_data,
+                    batch_size=int(np.min((len(test_data), test_batch_size))),
+                    shuffle=False)
+                true_scaled_res_collection = []
+                # variable to be used for checking
+                # that we loop over the same true_x for each seed
+                eiv_true_x_sum = 0
+                for j, (true_x, true_y, noisy_x, _) in\
+                        enumerate(true_test_dataloader):
+                    if j >= test_samples:
+                        break
+                    # store the sum of the true_x
+                    eiv_true_x_sum += true_x.abs().sum().item()
+                    true_x, true_y, noisy_x =\
+                            true_x.to(device), true_y.to(device),\
+                            noisy_x.to(device)
+                    # Residuals
+                    training_state = net.training
+                    noise_state = net.noise_is_on
+                    net.train()
+                    net.noise_on()
+                    not_averaged_predictions = net.predict(noisy_x,\
+                            number_of_draws=eiv_number_of_draws, 
+                            take_average_of_prediction=False)
+                    eiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
+                    if len(true_y.shape) <= 1:
+                        true_y = true_y.view((-1,1))
+                    assert true_y.shape == eiv_mean.shape
+                    true_res = true_y - eiv_mean
+                    if scale_outputs:
+                        scale = train_data.dataset.std_labels.to(device)
+                        true_scaled_res = true_res * scale.view((1,-1))
+                    else:
+                        true_scaled_res = true_res
+                    # append residuals
+                    true_scaled_res_collection.append(true_scaled_res)
+                    # restore net
+                    if training_state:
+                        net.train()
+                    else:
+                        net.eval()
+                    if noise_state:
+                        net.noise_on()
+                    else:
+                        net.noise_off()
+                # check whether EiV and non-EiV used the same true_x for each
+                # seed by comparing their sum
+                assert eiv_true_x_sum == noneiv_true_x_sum
+                if i>0:
+                    assert eiv_true_x_sum == old_eiv_true_x_sum
+                old_eiv_true_x_sum = eiv_true_x_sum
+                # concatenate batches along batch dimension
+                true_scaled_res_collection =\
+                        torch.concat(true_scaled_res_collection, dim=0)
+                eiv_residual_collection.append(true_scaled_res_collection)
+
+
+    ## Store quantities
+
+    # Compute and store (averaged) x-dependent bias
+    if type(load_data) == repeated_sampling and\
+            len(noneiv_residual_collection) > 0 and\
+            len(eiv_residual_collection) > 0:
+        noneiv_residual_collection = torch.stack(\
+                tuple(noneiv_residual_collection), dim=-1)
+        bias_per_x = torch.mean(noneiv_residual_collection, dim=-1)
+        avg_bias = torch.mean(torch.abs(bias_per_x))
+        noneiv_metrics['avg_bias'] = avg_bias
+
+        eiv_residual_collection = torch.stack(tuple(eiv_residual_collection),\
+                dim=-1)
+        bias_per_x = torch.mean(eiv_residual_collection, dim=-1)
+        avg_bias = torch.mean(torch.abs(bias_per_x))
+        eiv_metrics['avg_bias'] = avg_bias
+    return noneiv_metrics, eiv_metrics
+
+# single seed metrics
 noneiv_metrics_collection = {}
 eiv_metrics_collection = {}
 collection_keys = []
@@ -242,6 +452,15 @@ num_test_epochs = 10
 assert noneiv_conf_dict["seed_range"] == eiv_conf_dict["seed_range"]
 seed_list = range(noneiv_conf_dict["seed_range"][0],
        noneiv_conf_dict["seed_range"][1])
+
+
+
+
+
+
+
+
+
 max_batch_number = 2
 for seed in tqdm(seed_list):
    try:
@@ -280,22 +499,51 @@ for seed in tqdm(seed_list):
                noneiv_metrics_collection[key].append(noneiv_metrics[key])
                eiv_metrics_collection[key].append(eiv_metrics[key])

+# full seed range metrics
+print('Computing metrics that use all seeds at once...')
+noneiv_full_seed_range_metrics, eiv_full_seed_range_metrics =\
+        collect_full_seed_range_metrics(load_data=load_data,\
+                seed_range=seed_list)
+# add keys to collection_keys
+assert noneiv_full_seed_range_metrics.keys() ==\
+        eiv_full_seed_range_metrics.keys()
+full_seed_range_collection_keys = list(noneiv_full_seed_range_metrics.keys())
+collection_keys += full_seed_range_collection_keys
+
+
 results_dict = {}
 print('Non-EiV:\n-----')
 results_dict['noneiv'] = {}
 for key in collection_keys:
-    metric_mean = float(np.mean(noneiv_metrics_collection[key]))
-    metric_std  = float(np.std(noneiv_metrics_collection[key])/np.sqrt(num_test_epochs*len(seed_list)))
-    results_dict['noneiv'][key] = (metric_mean, metric_std)
-    print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
+    if key not in full_seed_range_collection_keys:
+        # per seed metrics
+        metric_mean = float(np.mean(noneiv_metrics_collection[key]))
+        metric_std  = float(np.std(noneiv_metrics_collection[key])/\
+                np.sqrt(num_test_epochs*len(seed_list)))
+        results_dict['noneiv'][key] = (metric_mean, metric_std)
+        print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
+    else:
+        # full seed range metrics (without a std)
+        metric = float(noneiv_full_seed_range_metrics[key])
+        results_dict['noneiv'][key] = metric
+        print(f'{key}: {metric:.5f} (NaN)')
+
 print('\n')
 print('EiV:\n-----')
 results_dict['eiv'] = {}
 for key in collection_keys:
-    metric_mean = float(np.mean(eiv_metrics_collection[key]))
-    metric_std  = float(np.std(eiv_metrics_collection[key])/np.sqrt(num_test_epochs*len(seed_list)))
-    print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
-    results_dict['eiv'][key] = (metric_mean, metric_std)
+    if key not in full_seed_range_collection_keys:
+        # per seed metrics
+        metric_mean = float(np.mean(eiv_metrics_collection[key]))
+        metric_std  = float(np.std(eiv_metrics_collection[key])/\
+                np.sqrt(num_test_epochs*len(seed_list)))
+        print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
+        results_dict['eiv'][key] = (metric_mean, metric_std)
+    else:
+        # full seed range metrics (without a std)
+        metric = float(eiv_full_seed_range_metrics[key])
+        results_dict['eiv'][key] = metric
+        print(f'{key}: {metric:.5f} (NaN)')

 # write results to a JSON file in the results folder
 with open(os.path.join('results',f'metrics_{short_dataname}.json'), 'w') as f: