# evaluate_tabular.py
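"""
Evaluate trained non-EiV and EiV networks on a tabular dataset (selected via
the --data option). For every seed in the configured seed range, the script
computes RMSE, bias, coverage metrics, the normalized residual standard
deviation and the predictive log-density on shuffled test batches and prints
the mean of each metric together with a spread estimate in parentheses.
"""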
import importlib
import os
import argparse
import json
import numpy as np
import torch
import torch.backends.cudnn
from torch.utils.data import DataLoader
from tqdm import tqdm
from EIVArchitectures import Networks
from EIVTrainingRoutines import train_and_store
from EIVGeneral.coverage_metrics import epistemic_coverage, normalized_std
# read in data via --data option
parser = argparse.ArgumentParser()
parser.add_argument("--data", help="Loads data", default='california')
parser.add_argument("--no-autoindent", help="",
action="store_true") # to avoid conflics in IPython
args = parser.parse_args()
data = args.data
# load hyperparameters from JSON file
with open(os.path.join('configurations',f'eiv_{data}.json'),'r') as conf_file:
eiv_conf_dict = json.load(conf_file)
with open(os.path.join('configurations',f'noneiv_{data}.json'),'r') as conf_file:
noneiv_conf_dict = json.load(conf_file)
long_dataname = eiv_conf_dict["long_dataname"]
short_dataname = eiv_conf_dict["short_dataname"]
print(f"Evaluating {long_dataname}")
scale_outputs = False
load_data = importlib.import_module(f'EIVData.{long_dataname}').load_data
train_data, test_data = load_data()
input_dim = train_data[0][0].numel()
output_dim = train_data[0][1].numel()
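# infer input and output dimensionality from a single training sample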
def collect_metrics(x,y, seed=0,
noneiv_number_of_draws=100, eiv_number_of_draws=[100,5],
decouple_dimensions=False, device=torch.device('cuda:1'),
scale_outputs=scale_outputs):
"""
Compute various metrics for EiV and non-EiV. Will be returned as
dictionaries.
:param x: A torch.tensor, taken as input
:param y: A torch.tensor, taken as output
:param seed: Integer. The seed used for loading, defaults to 0.
    :param noneiv_number_of_draws: Number of draws for the non-EiV model
    when sampling from the posterior predictive. Defaults to 100.
    :param eiv_number_of_draws: Number of draws for the EiV model
    when sampling from the posterior predictive. Defaults to [100, 5].
    :param decouple_dimensions: Boolean. If True, the unusual convention
    of Gal et al. is followed where, in the evaluation of the
    log-posterior-predictive, each dimension is treated independently and then
    averaged. If False (default), a multivariate distribution is used.
    :param device: torch.device to use for evaluation. Defaults to
    torch.device('cuda:1').
    :param scale_outputs: Boolean. If True, scale the outputs for the RMSE, the
    bias and the log-density to make them comparable with the literature.
:returns: Dictionaries noneiv_metrics, eiv_metrics
"""
x,y = x.to(device), y.to(device)
# non-EiV
noneiv_metrics = {}
init_std_y = noneiv_conf_dict["init_std_y_list"][0]
unscaled_reg = noneiv_conf_dict["unscaled_reg"]
p = noneiv_conf_dict["p"]
hidden_layers = noneiv_conf_dict["hidden_layers"]
saved_file = os.path.join('saved_networks',
f'noneiv_{short_dataname}'\
f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
f'_p_{p:.2f}_seed_{seed}.pkl')
net = Networks.FNNBer(p=p, init_std_y=init_std_y,
h=[input_dim, *hidden_layers, output_dim]).to(device)
train_and_store.open_stored_training(saved_file=saved_file,
net=net, device=device)
# RMSE
training_state = net.training
net.train()
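    # dropout stays active here so that predict() samples from the posterior predictive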
not_averaged_predictions = net.predict(x, number_of_draws=noneiv_number_of_draws,
take_average_of_prediction=False)
noneiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
if len(y.shape) <= 1:
y = y.view((-1,1))
assert y.shape == noneiv_mean.shape
res = y-noneiv_mean
if scale_outputs:
scale = train_data.dataset.std_labels.to(device)
scaled_res = res * scale.view((1,-1))
else:
scaled_res = res
scaled_res = scaled_res.detach().cpu().numpy().flatten()
noneiv_metrics['rmse'] = np.sqrt(np.mean(scaled_res**2))
noneiv_metrics['bias'] = np.mean(scaled_res)
noneiv_metrics['coverage_numerical'], noneiv_metrics['coverage_theory'] =\
epistemic_coverage(not_averaged_predictions, y, normalize_errors=False)
noneiv_metrics['coverage_normalized'],_ =\
epistemic_coverage(not_averaged_predictions, y, normalize_errors=True)
noneiv_metrics['res_std'] = normalized_std(not_averaged_predictions, y)
# NLL
if scale_outputs:
scale_labels = train_data.dataset.std_labels.view((-1,)).to(device)
else:
scale_labels = None
    noneiv_metrics['logdens'] = net.predictive_logdensity(
            not_averaged_predictions, y,
            number_of_draws=noneiv_number_of_draws,
            decouple_dimensions=decouple_dimensions,
            scale_labels=scale_labels).mean().detach().cpu().numpy()
if training_state:
net.train()
else:
net.eval()
# EiV
eiv_metrics = {}
init_std_y = eiv_conf_dict["init_std_y_list"][0]
unscaled_reg = eiv_conf_dict["unscaled_reg"]
p = eiv_conf_dict["p"]
hidden_layers = eiv_conf_dict["hidden_layers"]
fixed_std_x = eiv_conf_dict["fixed_std_x"]
saved_file = os.path.join('saved_networks',
f'eiv_{short_dataname}'\
f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
f'_p_{p:.2f}_fixed_std_x_{fixed_std_x:.3f}'\
f'_seed_{seed}.pkl')
net = Networks.FNNEIV(p=p, init_std_y=init_std_y,
h=[input_dim, *hidden_layers, output_dim],
fixed_std_x=fixed_std_x).to(device)
    train_and_store.open_stored_training(saved_file=saved_file,
            net=net, device=device)
# RMSE
training_state = net.training
noise_state = net.noise_is_on
net.train()
net.noise_on()
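    # dropout and the EiV input noise stay active while sampling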
    not_averaged_predictions = net.predict(x, number_of_draws=eiv_number_of_draws,
            take_average_of_prediction=False)
eiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
if len(y.shape) <= 1:
y = y.view((-1,1))
assert y.shape == eiv_mean.shape
res = y-eiv_mean
if scale_outputs:
scale = train_data.dataset.std_labels.to(device)
scaled_res = res * scale.view((1,-1))
else:
scaled_res = res
scaled_res = scaled_res.detach().cpu().numpy().flatten()
    eiv_metrics['rmse'] = np.sqrt(np.mean(scaled_res**2))
    eiv_metrics['bias'] = np.mean(scaled_res)
eiv_metrics['coverage_numerical'], eiv_metrics['coverage_theory'] =\
epistemic_coverage(not_averaged_predictions, y, normalize_errors=False)
eiv_metrics['coverage_normalized'],_ =\
epistemic_coverage(not_averaged_predictions, y, normalize_errors=True)
    eiv_metrics['res_std'] = normalized_std(not_averaged_predictions, y)
# NLL
if scale_outputs:
scale_labels = train_data.dataset.std_labels.view((-1,)).to(device)
else:
scale_labels = None
    eiv_metrics['logdens'] = net.predictive_logdensity(
not_averaged_predictions, y,
number_of_draws=eiv_number_of_draws,
decouple_dimensions=decouple_dimensions,
scale_labels=scale_labels).mean().detach().cpu().numpy()
if training_state:
net.train()
else:
net.eval()
if noise_state:
net.noise_on()
else:
net.noise_off()
return noneiv_metrics, eiv_metrics
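# Collect the metrics over all seeds: for each seed the test data is reloaded
# and evaluated batch-wise, repeated num_test_epochs times.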
collection_keys = ['rmse','logdens','bias','coverage_numerical',
'coverage_theory','coverage_normalized','res_std']
noneiv_metrics_collection = {}
eiv_metrics_collection = {}
for key in collection_keys:
noneiv_metrics_collection[key] = []
eiv_metrics_collection[key] = []
num_test_epochs = 10
assert noneiv_conf_dict["seed_range"] == eiv_conf_dict["seed_range"]
seed_list = range(noneiv_conf_dict["seed_range"][0],
noneiv_conf_dict["seed_range"][1])
max_batch_number = 2
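# per test epoch, at most (max_batch_number + 1) shuffled test batches are evaluated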
for seed in tqdm(seed_list):
train_data, test_data = load_data(seed=seed)
    test_dataloader = DataLoader(test_data,
            batch_size=min(len(test_data), 800), shuffle=True)
for i in tqdm(range(num_test_epochs)):
for j, (x,y) in enumerate(test_dataloader):
if j > max_batch_number:
break
noneiv_metrics, eiv_metrics = collect_metrics(x,y, seed=seed)
for key in collection_keys:
noneiv_metrics_collection[key].append(noneiv_metrics[key])
eiv_metrics_collection[key].append(eiv_metrics[key])
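# summary: mean of each metric with a spread estimate
# (std / sqrt(num_test_epochs * number of seeds)) in parentheses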
print('Non-EiV\n-----')
for key in collection_keys:
    print(f'{key} {np.mean(noneiv_metrics_collection[key]):.5f} '\
            f'({np.std(noneiv_metrics_collection[key])/np.sqrt(num_test_epochs*len(seed_list)):.5f})')
print('EiV\n-----')
for key in collection_keys:
    print(f'{key} {np.mean(eiv_metrics_collection[key]):.5f} '\
            f'({np.std(eiv_metrics_collection[key])/np.sqrt(num_test_epochs*len(seed_list)):.5f})')
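# Optional sketch (not part of the original evaluation): persist the collected
# metrics for later comparison. The output filename and the layout of the
# results dict below are assumptions, not conventions of this repository.
results = {
        'noneiv': {key: [float(value) for value in values]
            for key, values in noneiv_metrics_collection.items()},
        'eiv': {key: [float(value) for value in values]
            for key, values in eiv_metrics_collection.items()}}
with open(f'metrics_{data}.json', 'w') as result_file:
    json.dump(results, result_file, indent=2)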