diff --git a/EIVPackage/EIVArchitectures/Networks.py b/EIVPackage/EIVArchitectures/Networks.py index 56c76336dacc479e25a4915e941fc336f77282ee..ccae84842e6a29979c0a72156606ddcfaf8ca3e0 100644 --- a/EIVPackage/EIVArchitectures/Networks.py +++ b/EIVPackage/EIVArchitectures/Networks.py @@ -244,6 +244,11 @@ class FNNEIV(nn.Module): :param average_batch_dimension: Boolean. If True (default) the values will be averaged over the batch dimension. If False, the batch dimension will be left untouched and all values will be returned. +:param scale_labels: If not None (the default), scale labels in evaluation to + make results comparable with the literature. +:param decouple_dimensions: If True, treat dimensions separately and finally + average, to make results comparable with the literature. Defaults to + False. """ out, sigmas = self.predict(x, number_of_draws=number_of_draws, number_of_parameter_chunks=number_of_parameter_chunks, @@ -437,6 +442,11 @@ class FNNBer(nn.Module): :param average_batch_dimension: Boolean. If True (default) the values will be averaged over the batch dimension. If False, the batch dimension will be left untouched and all values will be returned. +:param scale_labels: If not None (the default), scale labels in evaluation to + make results comparable with the literature. +:param decouple_dimensions: If True, treat dimensions separately and finally + average, to make results comparable with the literature. Defaults to + False. 
""" out, sigmas = self.predict(x, number_of_draws=number_of_draws, take_average_of_prediction=False, remove_graph=remove_graph) diff --git a/Experiments/evaluate_tabular.py b/Experiments/evaluate_tabular.py index cfa6664eed259441c865c36612d00063f9d0098f..6a738bdaae782541924d8c474efb90c7eac6b8b0 100644 --- a/Experiments/evaluate_tabular.py +++ b/Experiments/evaluate_tabular.py @@ -14,6 +14,7 @@ from EIVGeneral.coverage_metrices import epistemic_coverage, normalized_std long_dataname = 'energy_efficiency' short_dataname = 'energy' +scale_outputs = False load_data = importlib.import_module(f'EIVData.{long_dataname}').load_data train_noneiv = importlib.import_module(f'train_noneiv_{short_dataname}') train_eiv = importlib.import_module(f'train_eiv_{short_dataname}') @@ -24,7 +25,8 @@ output_dim = train_data[0][1].numel() def collect_metrics(x,y, seed=0, noneiv_number_of_draws=100, eiv_number_of_draws=[100,5], - decouple_dimensions=False, device=torch.device('cuda:1')): + decouple_dimensions=False, device=torch.device('cuda:1'), + scale_outputs=scale_outputs): """ Compute various metrics for EiV and non-EiV. Will be returned as dictionaries. @@ -39,6 +41,8 @@ def collect_metrics(x,y, seed=0, of Gal et al. is followed where, in the evaluation of the log-posterior-predictive, each dimension is treated independently and then averaged. If False (default), a multivariate distribution is used. +:param scale_outputs: Boolean, scale the outputs for the RMSE, the bias and + the log-dens to make them comparable with the literature. 
:returns: Dictionaries noneiv_metrics, eiv_metrics """ x,y = x.to(device), y.to(device) @@ -70,8 +74,11 @@ def collect_metrics(x,y, seed=0, y = y.view((-1,1)) assert y.shape == prediction_triple[0].shape res = y-prediction_triple[0] - scale = train_data.dataset.std_labels.to(device) - scaled_res = res * scale.view((1,-1)) + if scale_outputs: + scale = train_data.dataset.std_labels.to(device) + scaled_res = res * scale.view((1,-1)) + else: + scaled_res = res scaled_res = scaled_res.detach().cpu().numpy().flatten() noneiv_metrics['rmse'] = np.sqrt(np.mean(scaled_res**2)) noneiv_metrics['bias'] = np.mean(scaled_res) @@ -84,11 +91,14 @@ def collect_metrics(x,y, seed=0, # NLL - noneiv_metrics['logdens' ]= net.predictive_logdensity(x, y, number_of_draws=100, + if scale_outputs: + scale_labels = train_data.dataset.std_labels.view((-1,)).to(device) + else: + scale_labels = None + noneiv_metrics['logdens' ]= net.predictive_logdensity(x, y, + number_of_draws=100, decouple_dimensions=decouple_dimensions, - scale_labels=\ - train_data.dataset.std_labels.view((-1,)).to(device)\ - ).mean().detach().cpu().numpy() + scale_labels=scale_labels).mean().detach().cpu().numpy() if training_state: net.train() else: @@ -124,7 +134,11 @@ def collect_metrics(x,y, seed=0, assert y.shape == prediction_triple[0].shape res = y-prediction_triple[0] scale = train_data.dataset.std_labels.to(device) - scaled_res = res * scale.view((1,-1)) + if scale_outputs: + # scale is already computed above; do not recompute it here + scaled_res = res * scale.view((1,-1)) + else: + scaled_res = res scaled_res = scaled_res.detach().cpu().numpy().flatten() eiv_metrics['rmse' ]= np.sqrt(np.mean(scaled_res**2)) eiv_metrics['bias' ]= np.mean(scaled_res) @@ -136,12 +150,14 @@ def collect_metrics(x,y, seed=0, # NLL + if scale_outputs: + scale_labels = train_data.dataset.std_labels.view((-1,)).to(device) + else: + scale_labels = None eiv_metrics['logdens' ]= net.predictive_logdensity(x, y, number_of_draws=eiv_number_of_draws, 
decouple_dimensions=decouple_dimensions, - scale_labels=\ - train_data.dataset.std_labels.view((-1,)).to(device)\ - ).mean().detach().cpu().numpy() + scale_labels=scale_labels).mean().detach().cpu().numpy() if training_state: net.train() else: @@ -153,7 +169,8 @@ def collect_metrics(x,y, seed=0, return noneiv_metrics, eiv_metrics -collection_keys = ['rmse','logdens','bias','coverage_numerical','coverage_theory','coverage_normalized','res_std'] +collection_keys = ['rmse','logdens','bias','coverage_numerical', + 'coverage_theory','coverage_normalized','res_std'] noneiv_metrics_collection = {} eiv_metrics_collection = {} for key in collection_keys: