""" Compute metrics for datasets for which there is not necessarily a ground truth. Results will be stored in the results folder """ import importlib import os import argparse import json import numpy as np import torch import torch.backends.cudnn from torch.utils.data import DataLoader from tqdm import tqdm from EIVArchitectures import Networks from EIVTrainingRoutines import train_and_store from EIVGeneral.coverage_metrics import epistemic_coverage, normalized_std from EIVData.repeated_sampling import repeated_sampling # read in data via --data option parser = argparse.ArgumentParser() parser.add_argument("--data", help="Loads data", default='replin') parser.add_argument("--no-autoindent", help="", action="store_true") # to avoid conflics in IPython args = parser.parse_args() data = args.data # load hyperparameters from JSON file with open(os.path.join('configurations',f'eiv_{data}.json'),'r') as conf_file: eiv_conf_dict = json.load(conf_file) with open(os.path.join('configurations',f'noneiv_{data}.json'),'r') as conf_file: noneiv_conf_dict = json.load(conf_file) long_dataname = eiv_conf_dict["long_dataname"] short_dataname = eiv_conf_dict["short_dataname"] print(f"Evaluating {long_dataname}") scale_outputs = False load_data = importlib.import_module(f'EIVData.{long_dataname}').load_data train_data, test_data = load_data() input_dim = train_data[0][0].numel() output_dim = train_data[0][1].numel() try: gpu_number = eiv_conf_dict["gpu_number"] device = torch.device(f'cuda:{gpu_number}') try: torch.tensor([0.0]).to(device) except RuntimeError: if torch.cuda.is_available(): print('Switched to GPU 0') device = torch.device('cuda:0') else: print('No cuda available, using CPU') device = torch.device('cpu') except KeyError: device = torch.device('cpu') def collect_metrics(x_y_pairs, seed=0, noneiv_number_of_draws=100, eiv_number_of_draws=[100,5], decouple_dimensions=False, device=device, scale_outputs=scale_outputs): """ Compute various metrics for EiV and non-EiV for single seeds. Will be returned as dictionaries. :param x_y_pairs: A tuple of either the shape (None,None,x,y) or (x_true,y_true,x,y) containing torch.tensor or None. x and y are considered as input and corresponding label. If the first two components are not None, they are considered to be the unnoisy counterparts. :param seed: Integer. The seed used for loading, defaults to 0. :param noneiv_number_of_draws: Number of draws for non-EiV model for sampling from the posterior predictive. Defaults to 100. :param noneiv_number_of_draws: Number of draws for EiV model for sampling from the posterior predictive. Defaults to 100. :param decouple_dimensions: Boolean. If True, the unsual convention of Gal et al. is followed where, in the evaluation of the log-posterior-predictive, each dimension is treated independently and then averaged. If False (default), a multivariate distribution is used. :param device: The device to use. :param scale_output: Boolean, scale the outputs for the RMSE, the bias and the log-dens to make them comparable with the literature. :returns: Dictionaries noneiv_metrics, eiv_metrics """ true_x, true_y, x, y = x_y_pairs x,y = x.to(device), y.to(device) if true_x is not None: assert true_y is not None true_x,true_y = true_x.to(device), true_y.to(device) else: assert true_y is None # non-EiV noneiv_metrics = {} init_std_y = noneiv_conf_dict["init_std_y_list"][0] unscaled_reg = noneiv_conf_dict["unscaled_reg"] p = noneiv_conf_dict["p"] hidden_layers = noneiv_conf_dict["hidden_layers"] saved_file = os.path.join('saved_networks', f'noneiv_{short_dataname}'\ f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\ f'_p_{p:.2f}_seed_{seed}.pkl') net = Networks.FNNBer(p=p, init_std_y=init_std_y, h=[input_dim, *hidden_layers, output_dim]).to(device) train_and_store.open_stored_training(saved_file=saved_file, net=net, device=device) # RMSE training_state = net.training net.train() not_averaged_predictions = net.predict(x,\ number_of_draws=noneiv_number_of_draws, take_average_of_prediction=False) noneiv_mean = torch.mean(not_averaged_predictions[0], dim=1) if len(y.shape) <= 1: y = y.view((-1,1)) assert y.shape == noneiv_mean.shape res = y-noneiv_mean if scale_outputs: scale = train_data.dataset.std_labels.to(device) scaled_res = res * scale.view((1,-1)) else: scaled_res = res scaled_res = scaled_res.detach().cpu().numpy().flatten() noneiv_metrics['rmse'] = np.sqrt(np.mean(scaled_res**2)) noneiv_metrics['bias'] = np.mean(scaled_res) noneiv_metrics['coverage_numerical'], noneiv_metrics['coverage_theory'] =\ epistemic_coverage(not_averaged_predictions, y,\ normalize_errors=False) noneiv_metrics['coverage_normalized'],_ =\ epistemic_coverage(not_averaged_predictions, y,\ normalize_errors=True) noneiv_metrics['res_std'] = normalized_std(not_averaged_predictions, y) # metrics that need a ground truth if true_x is not None: noneiv_metrics['true_coverage_numerical'],\ noneiv_metrics['true_coverage_theory'] =\ epistemic_coverage(not_averaged_predictions, true_y, normalize_errors=False, noisy_y=False) true_res = (true_y - noneiv_mean).detach().cpu().numpy().flatten() noneiv_metrics['true_rmse'] = np.sqrt(np.mean(true_res**2)) # NLL if scale_outputs: scale_labels = train_data.dataset.std_labels.view((-1,)).to(device) else: scale_labels = None noneiv_metrics['logdens' ]= net.predictive_logdensity( not_averaged_predictions, y, number_of_draws=100, decouple_dimensions=decouple_dimensions, scale_labels=scale_labels).mean().detach().cpu().numpy() if training_state: net.train() else: net.eval() # EiV eiv_metrics = {} init_std_y = eiv_conf_dict["init_std_y_list"][0] unscaled_reg = eiv_conf_dict["unscaled_reg"] p = eiv_conf_dict["p"] hidden_layers = eiv_conf_dict["hidden_layers"] fixed_std_x = eiv_conf_dict["fixed_std_x"] saved_file = os.path.join('saved_networks', f'eiv_{short_dataname}'\ f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\ f'_p_{p:.2f}_fixed_std_x_{fixed_std_x:.3f}'\ f'_seed_{seed}.pkl') net = Networks.FNNEIV(p=p, init_std_y=init_std_y, h=[input_dim, *hidden_layers, output_dim], fixed_std_x=fixed_std_x).to(device) train_and_store.open_stored_training(saved_file=saved_file, net=net) # RMSE training_state = net.training noise_state = net.noise_is_on net.train() net.noise_on() not_averaged_predictions = net.predict(x, number_of_draws=eiv_number_of_draws, take_average_of_prediction=False) eiv_mean = torch.mean(not_averaged_predictions[0], dim=1) if len(y.shape) <= 1: y = y.view((-1,1)) assert y.shape == eiv_mean.shape res = y-eiv_mean if scale_outputs: scale = train_data.dataset.std_labels.to(device) scaled_res = res * scale.view((1,-1)) else: scaled_res = res scaled_res = scaled_res.detach().cpu().numpy().flatten() eiv_metrics['rmse' ]= np.sqrt(np.mean(scaled_res**2)) eiv_metrics['bias' ]= np.mean(scaled_res) eiv_metrics['coverage_numerical'], eiv_metrics['coverage_theory'] =\ epistemic_coverage(not_averaged_predictions, y, normalize_errors=False) eiv_metrics['coverage_normalized'],_ =\ epistemic_coverage(not_averaged_predictions, y, normalize_errors=True) eiv_metrics['res_std' ]= normalized_std(not_averaged_predictions, y) # metrics that need a ground truth if true_x is not None: eiv_metrics['true_coverage_numerical'],\ eiv_metrics['true_coverage_theory'] =\ epistemic_coverage(not_averaged_predictions, true_y, normalize_errors=False, noisy_y=False) true_res = (true_y - eiv_mean).detach().cpu().numpy().flatten() eiv_metrics['true_rmse'] = np.sqrt(np.mean(true_res**2)) # NLL if scale_outputs: scale_labels = train_data.dataset.std_labels.view((-1,)).to(device) else: scale_labels = None eiv_metrics['logdens' ]= net.predictive_logdensity( not_averaged_predictions, y, number_of_draws=eiv_number_of_draws, decouple_dimensions=decouple_dimensions, scale_labels=scale_labels).mean().detach().cpu().numpy() if training_state: net.train() else: net.eval() if noise_state: net.noise_on() else: net.noise_off() return noneiv_metrics, eiv_metrics def collect_full_seed_range_metrics(load_data, seed_range,test_batch_size = 100, test_samples = 10, noneiv_number_of_draws=100, eiv_number_of_draws=[100,5], device=device, scale_outputs=scale_outputs): """ Collect metrics that need all seeds for their computation. :param load_data: load_data map should take seed as an argument and, optionally, `return_ground_truth`. :param seed_range: iterator for seeds. :param test_batch_size: An integer, used for drawing samples from the test data. :param test_samples: Number of test samples with batch size `test_batch_size` to take. :param noneiv_number_of_draws: Number of samples to take for the prediction of the non-EiV model. Defaults to 100. :param eiv_number_of_draws:Number of samples to take for the prediction of the model. Defaults to [100,5]. :param device: The torch.device to use :param scale_output: Boolean, scale the outputs for some metrics. Defaults to False. :returns: Dictionaries noneiv_metrics, eiv_metrics """ noneiv_metrics = {} eiv_metrics = {} noneiv_residual_collection = [] eiv_residual_collection = [] for i, seed in enumerate(seed_range): # load data according toseed try: train_data, test_data, true_train_data, true_test_data \ = load_data(seed=seed, return_ground_truth=True) except TypeError: train_data, test_data = load_data(seed=seed) true_train_data, true_test_data = None, None ## Compute x-dependant bias # only for repeated_sampling datasets if type(load_data) == repeated_sampling: # only if there is a ground truth if true_test_data is not None: # non-EiV init_std_y = noneiv_conf_dict["init_std_y_list"][0] unscaled_reg = noneiv_conf_dict["unscaled_reg"] p = noneiv_conf_dict["p"] hidden_layers = noneiv_conf_dict["hidden_layers"] saved_file = os.path.join('saved_networks', f'noneiv_{short_dataname}'\ f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\ f'_p_{p:.2f}_seed_{seed}.pkl') net = Networks.FNNBer(p=p, init_std_y=init_std_y, h=[input_dim, *hidden_layers, output_dim]).to(device) # load network train_and_store.open_stored_training(saved_file=saved_file, net=net, device=device) true_test_dataloader = DataLoader(true_test_data, batch_size=int(np.min((len(test_data), test_batch_size))), shuffle=False) # to collect x-dependant residuals true_scaled_res_collection = [] # variable to be used for checking # that we loop over the same true_x for each seed noneiv_true_x_sum = 0 for j, (true_x, true_y, noisy_x, _) in\ enumerate(true_test_dataloader): if j >= test_samples: break # store the sum of the true_x noneiv_true_x_sum += true_x.abs().sum().item() true_x, true_y, noisy_x =\ true_x.to(device), true_y.to(device),\ noisy_x.to(device) # Residuals training_state = net.training net.train() not_averaged_predictions = net.predict(noisy_x,\ number_of_draws=noneiv_number_of_draws, take_average_of_prediction=False) noneiv_mean = torch.mean(not_averaged_predictions[0], dim=1) if len(true_y.shape) <= 1: true_y = true_y.view((-1,1)) assert true_y.shape == noneiv_mean.shape true_res = true_y - noneiv_mean if scale_outputs: scale = train_data.dataset.std_labels.to(device) true_scaled_res = true_res * scale.view((1,-1)) else: true_scaled_res = true_res # append residual true_scaled_res_collection.append(true_scaled_res) # restore net if training_state: net.train() else: net.eval() if i>0: # check that the used true x are the same for each # seed, by comparing their sum assert noneiv_true_x_sum == old_noneiv_true_x_sum old_noneiv_true_x_sum = noneiv_true_x_sum # concatenate batches along batch dimension true_scaled_res_collection =\ torch.concat(true_scaled_res_collection, dim=0) noneiv_residual_collection.append(true_scaled_res_collection) # EiV init_std_y = eiv_conf_dict["init_std_y_list"][0] unscaled_reg = eiv_conf_dict["unscaled_reg"] p = eiv_conf_dict["p"] hidden_layers = eiv_conf_dict["hidden_layers"] fixed_std_x = eiv_conf_dict["fixed_std_x"] saved_file = os.path.join('saved_networks', f'eiv_{short_dataname}'\ f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\ f'_p_{p:.2f}_fixed_std_x_{fixed_std_x:.3f}'\ f'_seed_{seed}.pkl') net = Networks.FNNEIV(p=p, init_std_y=init_std_y, h=[input_dim, *hidden_layers, output_dim], fixed_std_x=fixed_std_x).to(device) # load network train_and_store.open_stored_training(saved_file=saved_file, net=net, device=device) # reinitialize dataloader to get the same true_x true_test_dataloader = DataLoader(true_test_data, batch_size=int(np.min((len(test_data), test_batch_size))), shuffle=False) true_scaled_res_collection = [] # variable to be used for checking # that we loop over the same true_x for each seed eiv_true_x_sum = 0 for j, (true_x, true_y, noisy_x, _) in\ enumerate(true_test_dataloader): if j >= test_samples: break # store the sum of the true_x eiv_true_x_sum += true_x.abs().sum().item() true_x, true_y, noisy_x =\ true_x.to(device), true_y.to(device),\ noisy_x.to(device) # Residuals training_state = net.training noise_state = net.noise_is_on net.train() net.noise_on() not_averaged_predictions = net.predict(noisy_x,\ number_of_draws=eiv_number_of_draws, take_average_of_prediction=False) eiv_mean = torch.mean(not_averaged_predictions[0], dim=1) if len(true_y.shape) <= 1: true_y = true_y.view((-1,1)) assert true_y.shape == eiv_mean.shape true_res = true_y - eiv_mean if scale_outputs: scale = train_data.dataset.std_labels.to(device) true_scaled_res = true_res * scale.view((1,-1)) else: true_scaled_res = true_res # append residuals true_scaled_res_collection.append(true_scaled_res) # restore net if training_state: net.train() else: net.eval() if noise_state: net.noise_on() else: net.noise_off() # check whether EiV and non-EiV used the same true_x for each # seed by comparing their sum assert eiv_true_x_sum == noneiv_true_x_sum if i>0: assert eiv_true_x_sum == old_eiv_true_x_sum old_eiv_true_x_sum = eiv_true_x_sum # concate batches along batch dimension true_scaled_res_collection =\ torch.concat(true_scaled_res_collection, dim=0) eiv_residual_collection.append(true_scaled_res_collection) ## Store quantities # Compute and store (averaged) x-dependant bias if type(load_data) == repeated_sampling and\ len(noneiv_residual_collection) > 0 and\ len(eiv_residual_collection) > 0: noneiv_residual_collection = torch.stack(\ tuple(noneiv_residual_collection), dim=-1) bias_per_x = torch.mean(noneiv_residual_collection, dim=-1) avg_bias = torch.mean(torch.abs(bias_per_x)) noneiv_metrics['avg_bias'] = avg_bias eiv_residual_collection = torch.stack(tuple(eiv_residual_collection),\ dim=-1) bias_per_x = torch.mean(eiv_residual_collection, dim=-1) avg_bias = torch.mean(torch.abs(bias_per_x)) eiv_metrics['avg_bias'] = avg_bias return noneiv_metrics, eiv_metrics # single seed metrics noneiv_metrics_collection = {} eiv_metrics_collection = {} collection_keys = [] num_test_epochs = 10 assert noneiv_conf_dict["seed_range"] == eiv_conf_dict["seed_range"] seed_list = range(noneiv_conf_dict["seed_range"][0], noneiv_conf_dict["seed_range"][1]) max_batch_number = 2 for seed in tqdm(seed_list): try: train_data, test_data, true_train_data, true_test_data \ = load_data(seed=seed, return_ground_truth=True) except TypeError: train_data, test_data = load_data(seed=seed) true_train_data, true_test_data = None, None if true_test_data is None: test_dataloader = DataLoader(test_data, batch_size=int(np.min((len(test_data), 800))), shuffle=True) else: test_dataloader = DataLoader(true_test_data, batch_size=int(np.min((len(true_test_data), 800))), shuffle=True) for i in tqdm(range(num_test_epochs)): for j, x_y_pairs in enumerate(test_dataloader): if j > max_batch_number: break # fill in ground truth with None, if not existent if true_test_data is None: x_y_pairs = (None, None, *x_y_pairs) # should contain (true_x,true_y,x,y) or (None,None,x,y) assert len(x_y_pairs) == 4 noneiv_metrics, eiv_metrics = collect_metrics(x_y_pairs, seed=seed) if i==0 and j==0: # fill collection keys assert eiv_metrics.keys() == noneiv_metrics.keys() collection_keys = list(eiv_metrics.keys()) for key in collection_keys: noneiv_metrics_collection[key] = [] eiv_metrics_collection[key] = [] # collect results for key in collection_keys: noneiv_metrics_collection[key].append(noneiv_metrics[key]) eiv_metrics_collection[key].append(eiv_metrics[key]) # full seed range metrics print('Computing metrics that use all seeds at once...') noneiv_full_seed_range_metrics, eiv_full_seed_range_metrics =\ collect_full_seed_range_metrics(load_data=load_data,\ seed_range=seed_list) # add keys to collection_keys assert noneiv_full_seed_range_metrics.keys() ==\ eiv_full_seed_range_metrics.keys() full_seed_range_collection_keys = list(noneiv_full_seed_range_metrics.keys()) collection_keys += full_seed_range_collection_keys results_dict = {} print('Non-EiV:\n-----') results_dict['noneiv'] = {} for key in collection_keys: if key not in full_seed_range_collection_keys: # per seed metrics metric_mean = float(np.mean(noneiv_metrics_collection[key])) metric_std = float(np.std(noneiv_metrics_collection[key])/\ np.sqrt(num_test_epochs*len(seed_list))) results_dict['noneiv'][key] = (metric_mean, metric_std) print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})') else: # full seed range metrics (without a std) metric = float(noneiv_full_seed_range_metrics[key]) results_dict['noneiv'][key] = metric print(f'{key}: {metric:.5f} (NaN)') print('\n') print('EiV:\n-----') results_dict['eiv'] = {} for key in collection_keys: if key not in full_seed_range_collection_keys: # per seed metrics metric_mean = float(np.mean(eiv_metrics_collection[key])) metric_std = float(np.std(eiv_metrics_collection[key])/\ np.sqrt(num_test_epochs*len(seed_list))) print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})') results_dict['eiv'][key] = (metric_mean, metric_std) else: # full seed range metrics (without a std) metric = float(eiv_full_seed_range_metrics[key]) results_dict['eiv'][key] = metric print(f'{key}: {metric:.5f} (NaN)') # write results to a JSON file in the results folder with open(os.path.join('results',f'metrics_{short_dataname}.json'), 'w') as f: json.dump(results_dict, f)