diff --git a/Experiments/evaluate_metrics.py b/Experiments/evaluate_metrics.py
index 2f8f6c849c90ea279fd0e90984dea3188715e1f9..22d4e66495a8e87cbc6230c825d728629cb53ae2 100644
--- a/Experiments/evaluate_metrics.py
+++ b/Experiments/evaluate_metrics.py
@@ -16,10 +16,11 @@
 from tqdm import tqdm
 from EIVArchitectures import Networks
 from EIVTrainingRoutines import train_and_store
 from EIVGeneral.coverage_metrics import epistemic_coverage, normalized_std
+from EIVData.repeated_sampling import repeated_sampling
 
 # read in data via --data option
 parser = argparse.ArgumentParser()
-parser.add_argument("--data", help="Loads data", default='linear')
+parser.add_argument("--data", help="Loads data", default='replin')
 parser.add_argument("--no-autoindent", help="", action="store_true") # to avoid conflics in IPython
 args = parser.parse_args()
@@ -64,8 +65,8 @@ def collect_metrics(x_y_pairs, seed=0,
         decouple_dimensions=False, device=device,
         scale_outputs=scale_outputs):
     """
-    Compute various metrics for EiV and non-EiV. Will be returned as
-    dictionaries.
+    Compute various metrics for EiV and non-EiV for single seeds. Will be
+    returned as dictionaries.
     :param x_y_pairs: A tuple of either the shape (None,None,x,y) or
     (x_true,y_true,x,y) containing torch.tensor or None. x and y are
     considered as input and corresponding label. If the first two components
@@ -235,6 +236,215 @@ def collect_metrics(x_y_pairs, seed=0,
     return noneiv_metrics, eiv_metrics
 
 
+
+def collect_full_seed_range_metrics(load_data,
+        seed_range, test_batch_size=100, test_samples=10,
+        noneiv_number_of_draws=100, eiv_number_of_draws=[100,5], device=device,
+        scale_outputs=scale_outputs):
+    """
+    Collect metrics that need all seeds for their computation.
+    :param load_data: load_data map, should take `seed` as an argument and,
+    optionally, `return_ground_truth`.
+    :param seed_range: iterator for seeds.
+    :param test_batch_size: An integer, used for drawing samples from the test
+    data.
+    :param test_samples: Number of test samples with batch size
+    `test_batch_size` to take.
+    :param noneiv_number_of_draws: Number of samples to take for the prediction
+    of the non-EiV model. Defaults to 100.
+    :param eiv_number_of_draws: Number of samples to take for the prediction
+    of the EiV model. Defaults to [100,5].
+    :param device: The torch.device to use.
+    :param scale_outputs: Boolean, scale the outputs for some metrics. Defaults
+    to False.
+    :returns: Dictionaries noneiv_metrics, eiv_metrics
+    """
+    noneiv_metrics = {}
+    eiv_metrics = {}
+    noneiv_residual_collection = []
+    eiv_residual_collection = []
+    for i, seed in enumerate(seed_range):
+        # load data according to seed
+        try:
+            train_data, test_data, true_train_data, true_test_data \
+                    = load_data(seed=seed, return_ground_truth=True)
+        except TypeError:
+            train_data, test_data = load_data(seed=seed)
+            true_train_data, true_test_data = None, None
+
+        ## Compute x-dependent bias
+
+        # only for repeated_sampling datasets
+        if type(load_data) == repeated_sampling:
+            # only if there is a ground truth
+            if true_test_data is not None:
+                # non-EiV
+                init_std_y = noneiv_conf_dict["init_std_y_list"][0]
+                unscaled_reg = noneiv_conf_dict["unscaled_reg"]
+                p = noneiv_conf_dict["p"]
+                hidden_layers = noneiv_conf_dict["hidden_layers"]
+                saved_file = os.path.join('saved_networks',
+                        f'noneiv_{short_dataname}'\
+                        f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
+                        f'_p_{p:.2f}_seed_{seed}.pkl')
+                net = Networks.FNNBer(p=p, init_std_y=init_std_y,
+                        h=[input_dim, *hidden_layers, output_dim]).to(device)
+                # load network
+                train_and_store.open_stored_training(saved_file=saved_file,
+                        net=net, device=device)
+
+                true_test_dataloader = DataLoader(true_test_data,
+                        batch_size=int(np.min((len(test_data), test_batch_size))),
+                        shuffle=False)
+                # to collect x-dependent residuals
+                true_scaled_res_collection = []
+                # variable to be used for checking
+                # that we loop over the same true_x for each seed
+                noneiv_true_x_sum = 0
+                for j, (true_x, true_y, noisy_x, _) in\
+                        enumerate(true_test_dataloader):
+                    if j >= test_samples:
+                        break
+                    # store the sum of the true_x
+                    noneiv_true_x_sum += true_x.abs().sum().item()
+
+                    true_x, true_y, noisy_x =\
+                            true_x.to(device), true_y.to(device),\
+                            noisy_x.to(device)
+
+                    # Residuals
+                    training_state = net.training
+                    net.train()
+                    not_averaged_predictions = net.predict(noisy_x,\
+                            number_of_draws=noneiv_number_of_draws,
+                            take_average_of_prediction=False)
+                    noneiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
+                    if len(true_y.shape) <= 1:
+                        true_y = true_y.view((-1,1))
+                    assert true_y.shape == noneiv_mean.shape
+                    true_res = true_y - noneiv_mean
+                    if scale_outputs:
+                        scale = train_data.dataset.std_labels.to(device)
+                        true_scaled_res = true_res * scale.view((1,-1))
+                    else:
+                        true_scaled_res = true_res
+
+                    # append residual
+                    true_scaled_res_collection.append(true_scaled_res)
+
+                # restore net
+                if training_state:
+                    net.train()
+                else:
+                    net.eval()
+                if i > 0:
+                    # check that the used true_x are the same for each
+                    # seed, by comparing their sum
+                    assert noneiv_true_x_sum == old_noneiv_true_x_sum
+                old_noneiv_true_x_sum = noneiv_true_x_sum
+
+                # concatenate batches along batch dimension
+                true_scaled_res_collection =\
+                        torch.concat(true_scaled_res_collection, dim=0)
+                noneiv_residual_collection.append(true_scaled_res_collection)
+
+
+                # EiV
+                init_std_y = eiv_conf_dict["init_std_y_list"][0]
+                unscaled_reg = eiv_conf_dict["unscaled_reg"]
+                p = eiv_conf_dict["p"]
+                hidden_layers = eiv_conf_dict["hidden_layers"]
+                fixed_std_x = eiv_conf_dict["fixed_std_x"]
+                saved_file = os.path.join('saved_networks',
+                        f'eiv_{short_dataname}'\
+                        f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
+                        f'_p_{p:.2f}_fixed_std_x_{fixed_std_x:.3f}'\
+                        f'_seed_{seed}.pkl')
+                net = Networks.FNNEIV(p=p, init_std_y=init_std_y,
+                        h=[input_dim, *hidden_layers, output_dim],
+                        fixed_std_x=fixed_std_x).to(device)
+                # load network
+                train_and_store.open_stored_training(saved_file=saved_file,
+                        net=net, device=device)
+
+                # reinitialize dataloader to get the same true_x
+                true_test_dataloader = DataLoader(true_test_data,
+                        batch_size=int(np.min((len(test_data), test_batch_size))),
+                        shuffle=False)
+                true_scaled_res_collection = []
+                # variable to be used for checking
+                # that we loop over the same true_x for each seed
+                eiv_true_x_sum = 0
+                for j, (true_x, true_y, noisy_x, _) in\
+                        enumerate(true_test_dataloader):
+                    if j >= test_samples:
+                        break
+                    # store the sum of the true_x
+                    eiv_true_x_sum += true_x.abs().sum().item()
+                    true_x, true_y, noisy_x =\
+                            true_x.to(device), true_y.to(device),\
+                            noisy_x.to(device)
+                    # Residuals
+                    training_state = net.training
+                    noise_state = net.noise_is_on
+                    net.train()
+                    net.noise_on()
+                    not_averaged_predictions = net.predict(noisy_x,\
+                            number_of_draws=eiv_number_of_draws,
+                            take_average_of_prediction=False)
+                    eiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
+                    if len(true_y.shape) <= 1:
+                        true_y = true_y.view((-1,1))
+                    assert true_y.shape == eiv_mean.shape
+                    true_res = true_y - eiv_mean
+                    if scale_outputs:
+                        scale = train_data.dataset.std_labels.to(device)
+                        true_scaled_res = true_res * scale.view((1,-1))
+                    else:
+                        true_scaled_res = true_res
+                    # append residuals
+                    true_scaled_res_collection.append(true_scaled_res)
+                # restore net
+                if training_state:
+                    net.train()
+                else:
+                    net.eval()
+                if noise_state:
+                    net.noise_on()
+                else:
+                    net.noise_off()
+                # check whether EiV and non-EiV used the same true_x for each
+                # seed by comparing their sum
+                assert eiv_true_x_sum == noneiv_true_x_sum
+                if i > 0:
+                    assert eiv_true_x_sum == old_eiv_true_x_sum
+                old_eiv_true_x_sum = eiv_true_x_sum
+                # concatenate batches along batch dimension
+                true_scaled_res_collection =\
+                        torch.concat(true_scaled_res_collection, dim=0)
+                eiv_residual_collection.append(true_scaled_res_collection)
+
+
+    ## Store quantities
+
+    # Compute and store (averaged) x-dependent bias
+    if type(load_data) == repeated_sampling and\
+            len(noneiv_residual_collection) > 0 and\
+            len(eiv_residual_collection) > 0:
+        noneiv_residual_collection = torch.stack(\
+                tuple(noneiv_residual_collection), dim=-1)
+        bias_per_x = torch.mean(noneiv_residual_collection, dim=-1)
+        avg_bias = torch.mean(torch.abs(bias_per_x))
+        noneiv_metrics['avg_bias'] = avg_bias
+
+        eiv_residual_collection = torch.stack(tuple(eiv_residual_collection),\
+                dim=-1)
+        bias_per_x = torch.mean(eiv_residual_collection, dim=-1)
+        avg_bias = torch.mean(torch.abs(bias_per_x))
+        eiv_metrics['avg_bias'] = avg_bias
+    return noneiv_metrics, eiv_metrics
+
+# single seed metrics
 noneiv_metrics_collection = {}
 eiv_metrics_collection = {}
 collection_keys = []
@@ -242,6 +452,15 @@ num_test_epochs = 10
 assert noneiv_conf_dict["seed_range"] == eiv_conf_dict["seed_range"]
 seed_list = range(noneiv_conf_dict["seed_range"][0],
         noneiv_conf_dict["seed_range"][1])
+
+
+
+
+
+
+
+
+
 max_batch_number = 2
 for seed in tqdm(seed_list):
     try:
@@ -280,22 +499,51 @@ for seed in tqdm(seed_list):
             noneiv_metrics_collection[key].append(noneiv_metrics[key])
             eiv_metrics_collection[key].append(eiv_metrics[key])
 
+# full seed range metrics
+print('Computing metrics that use all seeds at once...')
+noneiv_full_seed_range_metrics, eiv_full_seed_range_metrics =\
+        collect_full_seed_range_metrics(load_data=load_data,\
+        seed_range=seed_list)
+# add keys to collection_keys
+assert noneiv_full_seed_range_metrics.keys() ==\
+        eiv_full_seed_range_metrics.keys()
+full_seed_range_collection_keys =\
+        list(noneiv_full_seed_range_metrics.keys())
+collection_keys += full_seed_range_collection_keys
+
 
 results_dict = {}
 print('Non-EiV:\n-----')
 results_dict['noneiv'] = {}
 for key in collection_keys:
-    metric_mean = float(np.mean(noneiv_metrics_collection[key]))
-    metric_std = float(np.std(noneiv_metrics_collection[key])/np.sqrt(num_test_epochs*len(seed_list)))
-    results_dict['noneiv'][key] = (metric_mean, metric_std)
-    print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
+    if key not in full_seed_range_collection_keys:
+        # per seed metrics
+        metric_mean = float(np.mean(noneiv_metrics_collection[key]))
+        metric_std = float(np.std(noneiv_metrics_collection[key])/\
+                np.sqrt(num_test_epochs*len(seed_list)))
+        results_dict['noneiv'][key] = (metric_mean, metric_std)
+        print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
+    else:
+        # full seed range metrics (without a std)
+        metric = float(noneiv_full_seed_range_metrics[key])
+        results_dict['noneiv'][key] = metric
+        print(f'{key}: {metric:.5f} (NaN)')
+
 print('\n')
 print('EiV:\n-----')
 results_dict['eiv'] = {}
 for key in collection_keys:
-    metric_mean = float(np.mean(eiv_metrics_collection[key]))
-    metric_std = float(np.std(eiv_metrics_collection[key])/np.sqrt(num_test_epochs*len(seed_list)))
-    print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
-    results_dict['eiv'][key] = (metric_mean, metric_std)
+    if key not in full_seed_range_collection_keys:
+        # per seed metrics
+        metric_mean = float(np.mean(eiv_metrics_collection[key]))
+        metric_std = float(np.std(eiv_metrics_collection[key])/\
+                np.sqrt(num_test_epochs*len(seed_list)))
+        print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
+        results_dict['eiv'][key] = (metric_mean, metric_std)
+    else:
+        # full seed range metrics (without a std)
+        metric = float(eiv_full_seed_range_metrics[key])
+        results_dict['eiv'][key] = metric
+        print(f'{key}: {metric:.5f} (NaN)')
 
 # write results to a JSON file in the results folder
 with open(os.path.join('results',f'metrics_{short_dataname}.json'), 'w') as f:
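The core of the new avg_bias metric is the aggregation at the end of collect_full_seed_range_metrics: the per-seed residual tensors are stacked along a trailing seed dimension, averaged over seeds to obtain an x-dependent bias, and the mean absolute value of that bias is the reported scalar. Below is a minimal, self-contained sketch of this aggregation; the random tensors and the sizes are toy stand-ins for the collected (true_y - prediction) residuals, not values from the script.

import torch

# toy stand-ins: one residual tensor of shape (n_test, output_dim) per seed,
# all evaluated on the same true_x (hypothetical sizes)
n_seeds, n_test, output_dim = 5, 200, 1
residual_collection = [torch.randn(n_test, output_dim) for _ in range(n_seeds)]

# stack along a trailing seed dimension: shape (n_test, output_dim, n_seeds)
residuals = torch.stack(tuple(residual_collection), dim=-1)
# averaging over seeds gives the x-dependent bias ...
bias_per_x = torch.mean(residuals, dim=-1)
# ... and its mean absolute value is the scalar stored as 'avg_bias'
avg_bias = torch.mean(torch.abs(bias_per_x))
print(float(avg_bias))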
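The reporting loops treat the two kinds of metrics differently: per-seed metrics are averaged over seeds and printed together with a standard error of the mean, while full-seed-range metrics such as avg_bias are single numbers and are printed with a NaN placeholder in place of a std. A small sketch of that distinction; the metric name and all numbers are made up for illustration.

import numpy as np

num_test_epochs, seed_list = 10, range(5)
per_seed_values = [0.12, 0.11, 0.13, 0.12, 0.10]   # made-up per-seed metric values
metric_mean = float(np.mean(per_seed_values))
metric_std = float(np.std(per_seed_values)/np.sqrt(num_test_epochs*len(seed_list)))
print(f'some_metric: {metric_mean:.5f} ({metric_std:.5f})')

avg_bias = 0.02                                    # made-up full-seed-range value
print(f'avg_bias: {avg_bias:.5f} (NaN)')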