Newer
Older
"""
Compute metrics for datasets for which there is not necessarily a ground truth.
Results will be stored in the results folder
"""
import argparse
import importlib
import json
import os

import numpy as np
import torch
import torch.backends.cudnn
from torch.utils.data import DataLoader
from tqdm import tqdm

from EIVArchitectures import Networks
from EIVTrainingRoutines import train_and_store
from EIVGeneral.coverage_metrics import epistemic_coverage, normalized_std
from EIVData.repeated_sampling import repeated_sampling
# read in data via --data option
parser = argparse.ArgumentParser()
parser.add_argument("--data", help="Loads data", default='replin')
parser.add_argument("--no-autoindent", help="",
        action="store_true") # to avoid conflicts in IPython
args = parser.parse_args()
data = args.data

# load hyperparameters from JSON file
with open(os.path.join('configurations',f'eiv_{data}.json'),'r') as conf_file:
    eiv_conf_dict = json.load(conf_file)
with open(os.path.join('configurations',f'noneiv_{data}.json'),'r') as conf_file:
    noneiv_conf_dict = json.load(conf_file)

long_dataname = eiv_conf_dict["long_dataname"]
short_dataname = eiv_conf_dict["short_dataname"]
print(f"Evaluating {long_dataname}")

# the dataset module is chosen at runtime from the configuration
load_data = importlib.import_module(f'EIVData.{long_dataname}').load_data
train_data, test_data = load_data()
# network input/output sizes are read off the first training sample
input_dim = train_data[0][0].numel()
output_dim = train_data[0][1].numel()

# Select the device. Only the lookup of "gpu_number" is guarded against a
# KeyError, so that a missing configuration entry falls back to the CPU
# without masking unrelated errors.
try:
    gpu_number = eiv_conf_dict["gpu_number"]
except KeyError:
    device = torch.device('cpu')
else:
    device = torch.device(f'cuda:{gpu_number}')
    try:
        # probe whether the requested GPU can actually be used
        torch.tensor([0.0]).to(device)
    except RuntimeError:
        if torch.cuda.is_available():
            print('Switched to GPU 0')
            device = torch.device('cuda:0')
        else:
            print('No cuda available, using CPU')
            device = torch.device('cpu')
noneiv_number_of_draws=100, eiv_number_of_draws=[100,5],
decouple_dimensions=False, device=device,
scale_outputs=scale_outputs):
Compute various metrics for EiV and non-EiV for single seeds. Will be
returned as dictionaries.
:param x_y_pairs: A tuple of either the shape (None,None,x,y) or
(x_true,y_true,x,y) containing torch.tensor or None. x and y are
considered as input and corresponding label. If the first two components
are not None, they are considered to be the unnoisy counterparts.
:param seed: Integer. The seed used for loading, defaults to 0.
:param noneiv_number_of_draws: Number of draws for non-EiV model
for sampling from the posterior predictive. Defaults to 100.
:param eiv_number_of_draws: Number of draws for EiV model
for sampling from the posterior predictive. Defaults to [100,5].
:param decouple_dimensions: Boolean. If True, the unusual convention
of Gal et al. is followed where, in the evaluation of the
log-posterior-predictive, each dimension is treated independently and then
averaged. If False (default), a multivariate distribution is used.
:param scale_outputs: Boolean, scale the outputs for the RMSE, the bias and
the log-dens to make them comparable with the literature.
:returns: Dictionaries noneiv_metrics, eiv_metrics
if true_x is not None:
assert true_y is not None
true_x,true_y = true_x.to(device), true_y.to(device)
else:
assert true_y is None
# non-EiV
noneiv_metrics = {}
init_std_y = noneiv_conf_dict["init_std_y_list"][0]
unscaled_reg = noneiv_conf_dict["unscaled_reg"]
p = noneiv_conf_dict["p"]
hidden_layers = noneiv_conf_dict["hidden_layers"]
saved_file = os.path.join('saved_networks',
f'noneiv_{short_dataname}'\
f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
f'_p_{p:.2f}_seed_{seed}.pkl')
net = Networks.FNNBer(p=p, init_std_y=init_std_y,
train_and_store.open_stored_training(saved_file=saved_file,
# RMSE
training_state = net.training
net.train()
not_averaged_predictions = net.predict(x,\
number_of_draws=noneiv_number_of_draws,
take_average_of_prediction=False)
noneiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
if len(y.shape) <= 1:
y = y.view((-1,1))
assert y.shape == noneiv_mean.shape
res = y-noneiv_mean
if scale_outputs:
scale = train_data.dataset.std_labels.to(device)
scaled_res = res * scale.view((1,-1))
else:
scaled_res = res
scaled_res = scaled_res.detach().cpu().numpy().flatten()
noneiv_metrics['rmse'] = np.sqrt(np.mean(scaled_res**2))
noneiv_metrics['bias'] = np.mean(scaled_res)
noneiv_metrics['coverage_numerical'], noneiv_metrics['coverage_theory'] =\
epistemic_coverage(not_averaged_predictions, y,\
normalize_errors=False)
noneiv_metrics['coverage_normalized'],_ =\
epistemic_coverage(not_averaged_predictions, y,\
normalize_errors=True)
noneiv_metrics['res_std'] = normalized_std(not_averaged_predictions, y)
# metrics that need a ground truth
if true_x is not None:
noneiv_metrics['true_coverage_numerical'],\
noneiv_metrics['true_coverage_theory'] =\
epistemic_coverage(not_averaged_predictions, true_y,
normalize_errors=False, noisy_y=False)
true_res = (true_y - noneiv_mean).detach().cpu().numpy().flatten()
noneiv_metrics['true_rmse'] = np.sqrt(np.mean(true_res**2))
if scale_outputs:
scale_labels = train_data.dataset.std_labels.view((-1,)).to(device)
else:
scale_labels = None
noneiv_metrics['logdens' ]= net.predictive_logdensity(
not_averaged_predictions, y,
scale_labels=scale_labels).mean().detach().cpu().numpy()
if training_state:
net.train()
else:
net.eval()
# EiV
init_std_y = eiv_conf_dict["init_std_y_list"][0]
unscaled_reg = eiv_conf_dict["unscaled_reg"]
p = eiv_conf_dict["p"]
hidden_layers = eiv_conf_dict["hidden_layers"]
fixed_std_x = eiv_conf_dict["fixed_std_x"]
f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
f'_p_{p:.2f}_fixed_std_x_{fixed_std_x:.3f}'\
f'_seed_{seed}.pkl')
net = Networks.FNNEIV(p=p, init_std_y=init_std_y,
h=[input_dim, *hidden_layers, output_dim],
fixed_std_x=fixed_std_x).to(device)
train_and_store.open_stored_training(saved_file=saved_file,
net=net)
# RMSE
training_state = net.training
noise_state = net.noise_is_on
net.train()
net.noise_on()
not_averaged_predictions = net.predict(x, number_of_draws=eiv_number_of_draws,
take_average_of_prediction=False)
eiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
assert y.shape == eiv_mean.shape
res = y-eiv_mean
if scale_outputs:
scale = train_data.dataset.std_labels.to(device)
scaled_res = res * scale.view((1,-1))
else:
scaled_res = res
scaled_res = scaled_res.detach().cpu().numpy().flatten()
eiv_metrics['rmse' ]= np.sqrt(np.mean(scaled_res**2))
eiv_metrics['bias' ]= np.mean(scaled_res)
eiv_metrics['coverage_numerical'], eiv_metrics['coverage_theory'] =\
epistemic_coverage(not_averaged_predictions, y, normalize_errors=False)
eiv_metrics['coverage_normalized'],_ =\
epistemic_coverage(not_averaged_predictions, y, normalize_errors=True)
eiv_metrics['res_std' ]= normalized_std(not_averaged_predictions, y)
# metrics that need a ground truth
if true_x is not None:
eiv_metrics['true_coverage_numerical'],\
eiv_metrics['true_coverage_theory'] =\
epistemic_coverage(not_averaged_predictions, true_y,
normalize_errors=False, noisy_y=False)
true_res = (true_y - eiv_mean).detach().cpu().numpy().flatten()
eiv_metrics['true_rmse'] = np.sqrt(np.mean(true_res**2))
if scale_outputs:
scale_labels = train_data.dataset.std_labels.view((-1,)).to(device)
else:
scale_labels = None
eiv_metrics['logdens' ]= net.predictive_logdensity(
not_averaged_predictions, y,
number_of_draws=eiv_number_of_draws,
scale_labels=scale_labels).mean().detach().cpu().numpy()
if training_state:
net.train()
else:
net.eval()
if noise_state:
net.noise_on()
else:
net.noise_off()
return noneiv_metrics, eiv_metrics
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
def collect_full_seed_range_metrics(load_data,
        seed_range,test_batch_size = 100, test_samples = 10,
        noneiv_number_of_draws=100, eiv_number_of_draws=[100,5], device=device,
        scale_outputs=scale_outputs):
    """
    Collect metrics that need all seeds for their computation.

    Currently this is the averaged x-dependent bias (`avg_bias`): for each
    seed the residuals between the ground truth labels and the mean
    prediction are collected on the same test inputs, averaged over the
    seeds and the mean of the absolute value is stored.

    :param load_data: load_data map should take seed as an argument and,
    optionally, `return_ground_truth`. Metrics are only computed if
    `load_data` is an instance of `repeated_sampling`.
    :param seed_range: iterator for seeds.
    :param test_batch_size: An integer, used for drawing samples from the test
    data.
    :param test_samples: Number of test samples with batch size
    `test_batch_size` to take.
    :param noneiv_number_of_draws: Number of samples to take for the prediction
    of the non-EiV model. Defaults to 100.
    :param eiv_number_of_draws: Number of samples to take for the prediction
    of the EiV model. Defaults to [100,5]. The default list is only read,
    never modified.
    :param device: The torch.device to use. Defaults to the module-level
    `device`.
    :param scale_outputs: Boolean, scale the residuals with the standard
    deviation of the labels (`train_data.dataset.std_labels`). Defaults to
    the module-level `scale_outputs`.
    :returns: Dictionaries noneiv_metrics, eiv_metrics. Both contain the key
    'avg_bias' if `load_data` is a `repeated_sampling` instance and a ground
    truth was available for at least one seed; otherwise both are empty.
    """
    noneiv_metrics = {}
    eiv_metrics = {}
    # the x-dependent bias is only defined for repeated_sampling datasets,
    # for all other datasets there is nothing to compute
    if not isinstance(load_data, repeated_sampling):
        return noneiv_metrics, eiv_metrics
    noneiv_residual_collection = []
    eiv_residual_collection = []
    # sum of |true_x| of the previously evaluated seed, used to check that
    # every seed is evaluated on the same true_x
    previous_true_x_sum = None
    for seed in seed_range:
        # load data according to seed
        try:
            train_data, test_data, _, true_test_data \
                    = load_data(seed=seed, return_ground_truth=True)
        except TypeError:
            # presumably load_data does not accept `return_ground_truth`;
            # without a ground truth there is nothing to compute
            continue
        if true_test_data is None:
            continue
        batch_size = int(min(len(test_data), test_batch_size))
        if scale_outputs:
            scale = train_data.dataset.std_labels.to(device)
        else:
            scale = None
        # non-EiV
        noneiv_residuals, noneiv_true_x_sum = _collect_true_residuals(
                net=_load_noneiv_net(seed, device),
                true_test_data=true_test_data,
                batch_size=batch_size, test_samples=test_samples,
                number_of_draws=noneiv_number_of_draws,
                device=device, scale=scale, toggle_noise=False)
        # EiV, a fresh dataloader is used to get the same true_x
        eiv_residuals, eiv_true_x_sum = _collect_true_residuals(
                net=_load_eiv_net(seed, device),
                true_test_data=true_test_data,
                batch_size=batch_size, test_samples=test_samples,
                number_of_draws=eiv_number_of_draws,
                device=device, scale=scale, toggle_noise=True)
        # check whether EiV and non-EiV used the same true_x, and whether
        # these agree with the previous seed, by comparing their sum
        assert eiv_true_x_sum == noneiv_true_x_sum
        if previous_true_x_sum is not None:
            assert noneiv_true_x_sum == previous_true_x_sum
        previous_true_x_sum = noneiv_true_x_sum
        noneiv_residual_collection.append(noneiv_residuals)
        eiv_residual_collection.append(eiv_residuals)
    ## Store quantities
    # Compute and store (averaged) x-dependent bias
    if noneiv_residual_collection and eiv_residual_collection:
        noneiv_metrics['avg_bias'] = _average_absolute_bias(
                noneiv_residual_collection)
        eiv_metrics['avg_bias'] = _average_absolute_bias(
                eiv_residual_collection)
    return noneiv_metrics, eiv_metrics


def _load_noneiv_net(seed, device):
    """
    Build the non-EiV network from `noneiv_conf_dict` and pass it, together
    with the file name belonging to `seed`, to
    `train_and_store.open_stored_training`.
    """
    init_std_y = noneiv_conf_dict["init_std_y_list"][0]
    unscaled_reg = noneiv_conf_dict["unscaled_reg"]
    p = noneiv_conf_dict["p"]
    hidden_layers = noneiv_conf_dict["hidden_layers"]
    saved_file = os.path.join('saved_networks',
            f'noneiv_{short_dataname}'\
            f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
            f'_p_{p:.2f}_seed_{seed}.pkl')
    net = Networks.FNNBer(p=p, init_std_y=init_std_y,
            h=[input_dim, *hidden_layers, output_dim]).to(device)
    # load network
    train_and_store.open_stored_training(saved_file=saved_file,
            net=net, device=device)
    return net


def _load_eiv_net(seed, device):
    """
    Build the EiV network from `eiv_conf_dict` and pass it, together with
    the file name belonging to `seed`, to
    `train_and_store.open_stored_training`.
    """
    init_std_y = eiv_conf_dict["init_std_y_list"][0]
    unscaled_reg = eiv_conf_dict["unscaled_reg"]
    p = eiv_conf_dict["p"]
    hidden_layers = eiv_conf_dict["hidden_layers"]
    fixed_std_x = eiv_conf_dict["fixed_std_x"]
    saved_file = os.path.join('saved_networks',
            f'eiv_{short_dataname}'\
            f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
            f'_p_{p:.2f}_fixed_std_x_{fixed_std_x:.3f}'\
            f'_seed_{seed}.pkl')
    net = Networks.FNNEIV(p=p, init_std_y=init_std_y,
            h=[input_dim, *hidden_layers, output_dim],
            fixed_std_x=fixed_std_x).to(device)
    # load network
    train_and_store.open_stored_training(saved_file=saved_file,
            net=net, device=device)
    return net


def _collect_true_residuals(net, true_test_data, batch_size, test_samples,
        number_of_draws, device, scale, toggle_noise):
    """
    Collect the residuals `true_y - mean prediction` of `net` on the first
    `test_samples` batches of `true_test_data` (taken without shuffling).

    :param net: The network used for prediction. It is put in training mode
    for the prediction and its previous mode is restored afterwards.
    :param true_test_data: Dataset yielding tuples of four elements, read
    here as (true_x, true_y, noisy_x, _).
    :param batch_size: Batch size of the dataloader.
    :param test_samples: Maximal number of batches to evaluate.
    :param number_of_draws: Handed to `net.predict`.
    :param device: The torch.device to use.
    :param scale: None or a tensor the residuals are multiplied with (after
    reshaping it to a single row).
    :param toggle_noise: If True, `net.noise_on()` is called for the
    prediction and the previous `net.noise_is_on` state is restored
    afterwards.
    :returns: residuals, true_x_sum. The residuals of all batches
    concatenated along the batch dimension and the sum of the absolute
    values of all used true_x.
    """
    true_test_dataloader = DataLoader(true_test_data,
            batch_size=batch_size, shuffle=False)
    residual_batches = []
    true_x_sum = 0
    # remember the state of the net to restore it afterwards
    training_state = net.training
    noise_state = net.noise_is_on if toggle_noise else None
    net.train()
    if toggle_noise:
        net.noise_on()
    try:
        for j, (true_x, true_y, noisy_x, _) in\
                enumerate(true_test_dataloader):
            if j >= test_samples:
                break
            # store the sum of the true_x
            true_x_sum += true_x.abs().sum().item()
            true_y, noisy_x = true_y.to(device), noisy_x.to(device)
            not_averaged_predictions = net.predict(noisy_x,
                    number_of_draws=number_of_draws,
                    take_average_of_prediction=False)
            # presumably dim=1 runs over the draws - TODO confirm
            mean_prediction = torch.mean(not_averaged_predictions[0], dim=1)
            if len(true_y.shape) <= 1:
                true_y = true_y.view((-1,1))
            assert true_y.shape == mean_prediction.shape
            true_res = true_y - mean_prediction
            if scale is not None:
                true_res = true_res * scale.view((1,-1))
            # detach, the residuals are only used for evaluation and should
            # not keep the computational graph alive across seeds
            residual_batches.append(true_res.detach())
    finally:
        # restore net
        if training_state:
            net.train()
        else:
            net.eval()
        if toggle_noise:
            if noise_state:
                net.noise_on()
            else:
                net.noise_off()
    # concatenate batches along batch dimension
    return torch.concat(residual_batches, dim=0), true_x_sum


def _average_absolute_bias(residual_collection):
    """
    Stack the per-seed residuals along a new last dimension, average over
    this dimension and return the mean of the absolute values.
    """
    stacked_residuals = torch.stack(tuple(residual_collection), dim=-1)
    bias_per_x = torch.mean(stacked_residuals, dim=-1)
    return torch.mean(torch.abs(bias_per_x))
# single seed metrics
noneiv_metrics_collection = {}
eiv_metrics_collection = {}
assert noneiv_conf_dict["seed_range"] == eiv_conf_dict["seed_range"]
seed_list = range(noneiv_conf_dict["seed_range"][0],
noneiv_conf_dict["seed_range"][1])
train_data, test_data, true_train_data, true_test_data \
= load_data(seed=seed, return_ground_truth=True)
except TypeError:
train_data, test_data = load_data(seed=seed)
true_train_data, true_test_data = None, None
if true_test_data is None:
test_dataloader = DataLoader(test_data,
batch_size=int(np.min((len(test_data),
else:
test_dataloader = DataLoader(true_test_data,
batch_size=int(np.min((len(true_test_data), 800))), shuffle=True)
for j, x_y_pairs in enumerate(test_dataloader):
if j > max_batch_number:
break
# fill in ground truth with None, if not existent
if true_test_data is None:
x_y_pairs = (None, None, *x_y_pairs)
# should contain (true_x,true_y,x,y) or (None,None,x,y)
assert len(x_y_pairs) == 4
noneiv_metrics, eiv_metrics = collect_metrics(x_y_pairs,
seed=seed)
if i==0 and j==0:
# fill collection keys
assert eiv_metrics.keys() == noneiv_metrics.keys()
collection_keys = list(eiv_metrics.keys())
for key in collection_keys:
noneiv_metrics_collection[key] = []
eiv_metrics_collection[key] = []
# collect results
for key in collection_keys:
noneiv_metrics_collection[key].append(noneiv_metrics[key])
eiv_metrics_collection[key].append(eiv_metrics[key])
# full seed range metrics
print('Computing metrics that use all seeds at once...')
noneiv_full_seed_range_metrics, eiv_full_seed_range_metrics =\
collect_full_seed_range_metrics(load_data=load_data,\
seed_range=seed_list)
# add keys to collection_keys
assert noneiv_full_seed_range_metrics.keys() ==\
eiv_full_seed_range_metrics.keys()
full_seed_range_collection_keys = list(noneiv_full_seed_range_metrics.keys())
collection_keys += full_seed_range_collection_keys
results_dict = {}
print('Non-EiV:\n-----')
results_dict['noneiv'] = {}
if key not in full_seed_range_collection_keys:
# per seed metrics
metric_mean = float(np.mean(noneiv_metrics_collection[key]))
metric_std = float(np.std(noneiv_metrics_collection[key])/\
np.sqrt(num_test_epochs*len(seed_list)))
results_dict['noneiv'][key] = (metric_mean, metric_std)
print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
else:
# full seed range metrics (without a std)
metric = float(noneiv_full_seed_range_metrics[key])
results_dict['noneiv'][key] = metric
print(f'{key}: {metric:.5f} (NaN)')
print('\n')
print('EiV:\n-----')
results_dict['eiv'] = {}
if key not in full_seed_range_collection_keys:
# per seed metrics
metric_mean = float(np.mean(eiv_metrics_collection[key]))
metric_std = float(np.std(eiv_metrics_collection[key])/\
np.sqrt(num_test_epochs*len(seed_list)))
print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
results_dict['eiv'][key] = (metric_mean, metric_std)
else:
# full seed range metrics (without a std)
metric = float(eiv_full_seed_range_metrics[key])
results_dict['eiv'][key] = metric
print(f'{key}: {metric:.5f} (NaN)')
# write results to a JSON file in the results folder
with open(os.path.join('results',f'metrics_{short_dataname}.json'), 'w') as f:
json.dump(results_dict, f)