"""
Compute metrics on the energy efficiency dataset, comparing a stored
non-EiV model with a stored EiV model.
"""
import importlib
import os

import numpy as np
import torch
import torch.backends.cudnn
from torch.utils.data import DataLoader
from tqdm import tqdm

from EIVArchitectures import Networks
from EIVTrainingRoutines import train_and_store
from EIVGeneral.coverage_metrics import epistemic_coverage, normalized_std

long_dataname = 'energy_efficiency'
short_dataname = 'energy'
scale_outputs = False

load_data = importlib.import_module(f'EIVData.{long_dataname}').load_data
train_noneiv = importlib.import_module(f'train_noneiv_{short_dataname}')
train_eiv = importlib.import_module(f'train_eiv_{short_dataname}')

train_data, test_data = load_data()
input_dim = train_data[0][0].numel()
output_dim = train_data[0][1].numel()


def collect_metrics(x, y, seed=0,
        noneiv_number_of_draws=100, eiv_number_of_draws=[100, 5],
        decouple_dimensions=False, device=torch.device('cuda:1'),
        scale_outputs=scale_outputs):
    """
    Compute various metrics for EiV and non-EiV models and return them as
    two dictionaries.
    :param x: A torch.tensor, taken as input
    :param y: A torch.tensor, taken as output
    :param seed: Integer. The seed used for loading, defaults to 0.
    :param noneiv_number_of_draws: Number of draws for the non-EiV model
    for sampling from the posterior predictive. Defaults to 100.
    :param eiv_number_of_draws: Number of draws for the EiV model
    for sampling from the posterior predictive. Defaults to [100, 5].
    :param decouple_dimensions: Boolean. If True, the unusual convention
    of Gal et al. is followed where, in the evaluation of the
    log-posterior-predictive, each dimension is treated independently and
    then averaged. If False (default), a multivariate distribution is used.
    :param device: The torch.device to compute on. The default 'cuda:1'
    assumes a second GPU is available.
    :param scale_outputs: Boolean, scale the outputs for the RMSE, the bias
    and the log-density to make them comparable with the literature.
    :returns: Dictionaries noneiv_metrics, eiv_metrics
    """
    x, y = x.to(device), y.to(device)

    # non-EiV
    noneiv_metrics = {}
    init_std_y = train_noneiv.init_std_y_list[0]
    unscaled_reg = train_noneiv.unscaled_reg
    p = train_noneiv.p
    hidden_layers = train_noneiv.hidden_layers
    saved_file = os.path.join('saved_networks',
            f'noneiv_{short_dataname}'
            f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'
            f'_p_{p:.2f}_seed_{seed}.pkl')
    net = Networks.FNNBer(p=p, init_std_y=init_std_y,
            h=[input_dim, *hidden_layers, output_dim]).to(device)
    train_and_store.open_stored_training(saved_file=saved_file,
            net=net, device=device)

    # RMSE
    training_state = net.training
    net.train()
    not_averaged_predictions = net.predict(x,
            number_of_draws=noneiv_number_of_draws,
            take_average_of_prediction=False)
    noneiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
    if len(y.shape) <= 1:
        y = y.view((-1, 1))
    assert y.shape == noneiv_mean.shape
    res = y - noneiv_mean
    if scale_outputs:
        scale = train_data.dataset.std_labels.to(device)
        scaled_res = res * scale.view((1, -1))
    else:
        scaled_res = res
    scaled_res = scaled_res.detach().cpu().numpy().flatten()
    noneiv_metrics['rmse'] = np.sqrt(np.mean(scaled_res**2))
    noneiv_metrics['bias'] = np.mean(scaled_res)
    noneiv_metrics['coverage_numerical'], noneiv_metrics['coverage_theory'] =\
            epistemic_coverage(not_averaged_predictions, y,
                    normalize_errors=False)
    noneiv_metrics['coverage_normalized'], _ =\
            epistemic_coverage(not_averaged_predictions, y,
                    normalize_errors=True)
    noneiv_metrics['res_std'] = normalized_std(not_averaged_predictions, y)

    # NLL
    if scale_outputs:
        scale_labels = train_data.dataset.std_labels.view((-1,)).to(device)
    else:
        scale_labels = None
    noneiv_metrics['logdens'] = net.predictive_logdensity(
            not_averaged_predictions, y,
            number_of_draws=noneiv_number_of_draws,
            decouple_dimensions=decouple_dimensions,
            scale_labels=scale_labels).mean().detach().cpu().numpy()
    # restore the training state found on entry
    if training_state:
        net.train()
    else:
        net.eval()

    # EiV
    eiv_metrics = {}
    init_std_y = train_eiv.init_std_y_list[0]
    unscaled_reg = train_eiv.unscaled_reg
    p = train_eiv.p
    hidden_layers = train_eiv.hidden_layers
    fixed_std_x = train_eiv.fixed_std_x
    saved_file = os.path.join('saved_networks',
            f'eiv_{short_dataname}'
            f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'
            f'_p_{p:.2f}_fixed_std_x_{fixed_std_x:.3f}'
            f'_seed_{seed}.pkl')
    net = Networks.FNNEIV(p=p, init_std_y=init_std_y,
            h=[input_dim, *hidden_layers, output_dim],
            fixed_std_x=fixed_std_x).to(device)
    train_and_store.open_stored_training(saved_file=saved_file,
            net=net, device=device)

    # RMSE
    training_state = net.training
    noise_state = net.noise_is_on
    net.train()
    net.noise_on()
    not_averaged_predictions = net.predict(x,
            number_of_draws=eiv_number_of_draws,
            take_average_of_prediction=False)
    eiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
    if len(y.shape) <= 1:
        y = y.view((-1, 1))
    assert y.shape == eiv_mean.shape
    res = y - eiv_mean
    if scale_outputs:
        scale = train_data.dataset.std_labels.to(device)
        scaled_res = res * scale.view((1, -1))
    else:
        scaled_res = res
    scaled_res = scaled_res.detach().cpu().numpy().flatten()
    eiv_metrics['rmse'] = np.sqrt(np.mean(scaled_res**2))
    eiv_metrics['bias'] = np.mean(scaled_res)
    eiv_metrics['coverage_numerical'], eiv_metrics['coverage_theory'] =\
            epistemic_coverage(not_averaged_predictions, y,
                    normalize_errors=False)
    eiv_metrics['coverage_normalized'], _ =\
            epistemic_coverage(not_averaged_predictions, y,
                    normalize_errors=True)
    eiv_metrics['res_std'] = normalized_std(not_averaged_predictions, y)

    # NLL
    if scale_outputs:
        scale_labels = train_data.dataset.std_labels.view((-1,)).to(device)
    else:
        scale_labels = None
    eiv_metrics['logdens'] = net.predictive_logdensity(
            not_averaged_predictions, y,
            number_of_draws=eiv_number_of_draws,
            decouple_dimensions=decouple_dimensions,
            scale_labels=scale_labels).mean().detach().cpu().numpy()
    # restore the training and noise state found on entry
    if training_state:
        net.train()
    else:
        net.eval()
    if noise_state:
        net.noise_on()
    else:
        net.noise_off()
    return noneiv_metrics, eiv_metrics


collection_keys = ['rmse', 'logdens', 'bias', 'coverage_numerical',
        'coverage_theory', 'coverage_normalized', 'res_std']
noneiv_metrics_collection = {}
eiv_metrics_collection = {}
for key in collection_keys:
    noneiv_metrics_collection[key] = []
    eiv_metrics_collection[key] = []

num_test_epochs = 10
assert train_noneiv.seed_list == train_eiv.seed_list
seed_list = train_noneiv.seed_list
max_batch_number = 2
for seed in tqdm(seed_list):
    # reload the data with the same seed that was used for training
    train_data, test_data = load_data(seed=seed)
    test_dataloader = DataLoader(test_data,
            batch_size=int(np.min((len(test_data), 800))), shuffle=True)
    for i in tqdm(range(num_test_epochs)):
        for j, (x, y) in enumerate(test_dataloader):
            if j > max_batch_number:
                break
            noneiv_metrics, eiv_metrics = collect_metrics(x, y, seed=seed)
            for key in collection_keys:
                noneiv_metrics_collection[key].append(noneiv_metrics[key])
                eiv_metrics_collection[key].append(eiv_metrics[key])

# report the mean of each metric and its standard error over all
# seeds and test epochs
print('Non-EiV\n-----')
for key in collection_keys:
    print(f'{key} {np.mean(noneiv_metrics_collection[key]):.5f} '
          f'({np.std(noneiv_metrics_collection[key])/np.sqrt(num_test_epochs*len(seed_list)):.5f})')
print('EiV\n-----')
for key in collection_keys:
    print(f'{key} {np.mean(eiv_metrics_collection[key]):.5f} '
          f'({np.std(eiv_metrics_collection[key])/np.sqrt(num_test_epochs*len(seed_list)):.5f})')
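
# Usage sketch: running this file evaluates all stored networks and prints
# the metric summaries above. It assumes the training scripts
# train_noneiv_energy.py and train_eiv_energy.py were run beforehand for
# every seed in their shared seed_list, so that the pickled networks exist
# under saved_networks/, and that a second GPU ('cuda:1') is available.
# For a quick check on a single batch, collect_metrics can also be called
# directly; the device override here is an assumption for illustration,
# not part of the stored configuration:
#
#     x, y = next(iter(test_dataloader))
#     noneiv_metrics, eiv_metrics = collect_metrics(
#             x, y, seed=seed_list[-1], device=torch.device('cuda:0'))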