"""
Compute metrics for datasets for which there is not necessarily a ground truth.
Results will be stored in the results folder
"""
import importlib
import os
import argparse
import json

import numpy as np
import torch
import torch.backends.cudnn
from torch.utils.data import DataLoader
from tqdm import tqdm

from EIVArchitectures import Networks
from EIVTrainingRoutines import train_and_store
from EIVGeneral.coverage_metrics import epistemic_coverage, normalized_std
# read in data via --data option
parser = argparse.ArgumentParser()
parser.add_argument("--data", help="Loads data", default='linear')
parser.add_argument("--no-autoindent", help="",
        action="store_true") # to avoid conflicts in IPython
args = parser.parse_args()
data = args.data

# load hyperparameters from the JSON configuration files for both the
# EiV and the non-EiV variant of the chosen dataset
with open(os.path.join('configurations',f'eiv_{data}.json'),'r') as conf_file:
    eiv_conf_dict = json.load(conf_file)
with open(os.path.join('configurations',f'noneiv_{data}.json'),'r') as conf_file:
    noneiv_conf_dict = json.load(conf_file)

long_dataname = eiv_conf_dict["long_dataname"]
short_dataname = eiv_conf_dict["short_dataname"]

print(f"Evaluating {long_dataname}")
scale_outputs = False
load_data = importlib.import_module(f'EIVData.{long_dataname}').load_data

# load once to determine the input/output dimensionality of the network
train_data, test_data = load_data()
input_dim = train_data[0][0].numel()
output_dim = train_data[0][1].numel()

# Device selection: use the GPU requested in the configuration if possible,
# fall back to GPU 0 if that device is unusable, and to CPU otherwise.
try:
    gpu_number = eiv_conf_dict["gpu_number"]
    device = torch.device(f'cuda:{gpu_number}')
    try:
        # probe the device; raises RuntimeError if it is not usable
        torch.tensor([0.0]).to(device)
    except RuntimeError:
        if torch.cuda.is_available():
            print('Switched to GPU 0')
            device = torch.device('cuda:0')
        else:
            print('No cuda available, using CPU')
            device = torch.device('cpu')
except KeyError:
    # no "gpu_number" key in the configuration -> run on CPU
    device = torch.device('cpu')

def collect_metrics(x,y, seed=0,
    noneiv_number_of_draws=100, eiv_number_of_draws=[100,5],
    decouple_dimensions=False, device=device,
    scale_outputs=scale_outputs):
    """
    Compute various metrics for EiV and non-EiV. Will be returned as
    dictionaries.
    :param x: A torch.tensor, taken as input
    :param y: A torch.tensor, taken as output
    :param seed: Integer. The seed used for loading, defaults to 0.
    :param noneiv_number_of_draws: Number of draws for non-EiV model
    for sampling from the posterior predictive. Defaults to 100.
    :param eiv_number_of_draws: Number of draws for the EiV model
    for sampling from the posterior predictive. Defaults to [100,5].
    :param decouple_dimensions: Boolean. If True, the unusual convention
    of Gal et al. is followed where, in the evaluation of the
    log-posterior-predictive, each dimension is treated independently and then
    averaged. If False (default), a multivariate distribution is used.
    :param device: The device to use.
    :param scale_outputs: Boolean, scale the outputs for the RMSE, the bias and
    the log-dens to make them comparable with the literature.
    :returns: Dictionaries noneiv_metrics, eiv_metrics
    """
    x,y = x.to(device), y.to(device)

    # non-EiV
    noneiv_metrics = {}
    init_std_y = noneiv_conf_dict["init_std_y_list"][0]
    unscaled_reg = noneiv_conf_dict["unscaled_reg"]
    p = noneiv_conf_dict["p"]
    hidden_layers = noneiv_conf_dict["hidden_layers"]
    # the filename encodes the hyperparameters of the stored training run
    saved_file = os.path.join('saved_networks',
                f'noneiv_{short_dataname}'\
                        f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
                        f'_p_{p:.2f}_seed_{seed}.pkl')
    net = Networks.FNNBer(p=p, init_std_y=init_std_y,
            h=[input_dim, *hidden_layers, output_dim]).to(device)
    train_and_store.open_stored_training(saved_file=saved_file,
            net=net, device=device)

    # RMSE
    # remember the mode so it can be restored after sampling; net.train()
    # is needed to activate the stochastic (dropout) forward passes
    training_state = net.training
    net.train()
    not_averaged_predictions = net.predict(x, number_of_draws=noneiv_number_of_draws,
            take_average_of_prediction=False)
    noneiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
    if len(y.shape) <= 1:
        y = y.view((-1,1))
    assert y.shape == noneiv_mean.shape
    res = y-noneiv_mean
    if scale_outputs:
        # undo the label normalization to make metrics comparable
        # with the literature
        scale = train_data.dataset.std_labels.to(device)
        scaled_res = res * scale.view((1,-1))
    else:
        scaled_res = res
    scaled_res = scaled_res.detach().cpu().numpy().flatten()
    noneiv_metrics['rmse'] = np.sqrt(np.mean(scaled_res**2))
    noneiv_metrics['bias'] = np.mean(scaled_res)
    noneiv_metrics['coverage_numerical'], noneiv_metrics['coverage_theory'] =\
            epistemic_coverage(not_averaged_predictions, y, normalize_errors=False)
    noneiv_metrics['coverage_normalized'],_ =\
            epistemic_coverage(not_averaged_predictions, y, normalize_errors=True)
    noneiv_metrics['res_std'] = normalized_std(not_averaged_predictions, y)
    if scale_outputs:
        scale_labels = train_data.dataset.std_labels.view((-1,)).to(device)
    else:
        scale_labels = None
    noneiv_metrics['logdens'] = net.predictive_logdensity(
            not_averaged_predictions, y,
            # was hard-coded to 100; use the parameter so non-default
            # draw counts are respected
            number_of_draws=noneiv_number_of_draws,
            decouple_dimensions=decouple_dimensions,
            scale_labels=scale_labels).mean().detach().cpu().numpy()
    # restore the train/eval mode found on entry
    if training_state:
        net.train()
    else:
        net.eval()

    # EiV
    eiv_metrics = {}
    init_std_y = eiv_conf_dict["init_std_y_list"][0]
    unscaled_reg = eiv_conf_dict["unscaled_reg"]
    p = eiv_conf_dict["p"]
    hidden_layers = eiv_conf_dict["hidden_layers"]
    fixed_std_x = eiv_conf_dict["fixed_std_x"]
    saved_file = os.path.join('saved_networks',
            f'eiv_{short_dataname}'\
                    f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
                    f'_p_{p:.2f}_fixed_std_x_{fixed_std_x:.3f}'\
                    f'_seed_{seed}.pkl')
    net = Networks.FNNEIV(p=p, init_std_y=init_std_y,
            h=[input_dim, *hidden_layers, output_dim],
            fixed_std_x=fixed_std_x).to(device)
    # pass device explicitly, consistent with the non-EiV branch above
    train_and_store.open_stored_training(saved_file=saved_file,
            net=net, device=device)
    # RMSE
    # remember train/eval mode and input-noise state to restore them later
    training_state = net.training
    noise_state = net.noise_is_on
    net.train()
    net.noise_on()
    not_averaged_predictions = net.predict(x, number_of_draws=eiv_number_of_draws,
            take_average_of_prediction=False)
    eiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
    if len(y.shape) <= 1:
        y = y.view((-1,1))
    assert y.shape == eiv_mean.shape
    res = y-eiv_mean
    if scale_outputs:
        scale = train_data.dataset.std_labels.to(device)
        scaled_res = res * scale.view((1,-1))
    else:
        scaled_res = res
    scaled_res = scaled_res.detach().cpu().numpy().flatten()
    eiv_metrics['rmse'] = np.sqrt(np.mean(scaled_res**2))
    eiv_metrics['bias'] = np.mean(scaled_res)
    eiv_metrics['coverage_numerical'], eiv_metrics['coverage_theory'] =\
            epistemic_coverage(not_averaged_predictions, y, normalize_errors=False)
    eiv_metrics['coverage_normalized'],_ =\
            epistemic_coverage(not_averaged_predictions, y, normalize_errors=True)
    eiv_metrics['res_std'] = normalized_std(not_averaged_predictions, y)
    if scale_outputs:
        scale_labels = train_data.dataset.std_labels.view((-1,)).to(device)
    else:
        scale_labels = None
    eiv_metrics['logdens'] = net.predictive_logdensity(
            not_averaged_predictions, y,
            number_of_draws=eiv_number_of_draws,
            decouple_dimensions=decouple_dimensions,
            scale_labels=scale_labels).mean().detach().cpu().numpy()
    # restore train/eval mode and noise state found on entry
    if training_state:
        net.train()
    else:
        net.eval()
    if noise_state:
        net.noise_on()
    else:
        net.noise_off()
    return noneiv_metrics, eiv_metrics


# metrics to aggregate; one list of per-batch values per key and per model
collection_keys = ['rmse','logdens','bias','coverage_numerical',
        'coverage_theory','coverage_normalized','res_std']
noneiv_metrics_collection = {key: [] for key in collection_keys}
eiv_metrics_collection = {key: [] for key in collection_keys}
num_test_epochs = 10
# both models must have been trained on the same seed range to be comparable
assert noneiv_conf_dict["seed_range"] == eiv_conf_dict["seed_range"]
seed_list = range(noneiv_conf_dict["seed_range"][0],
        noneiv_conf_dict["seed_range"][1])
# only the first few (shuffled) batches per epoch are evaluated
max_batch_number = 2
for seed in tqdm(seed_list):
    train_data, test_data = load_data(seed=seed)
    test_dataloader = DataLoader(test_data,
            batch_size=int(min(len(test_data), 800)), shuffle=True)
    for i in tqdm(range(num_test_epochs)):
        for j, (x,y) in enumerate(test_dataloader):
            if j > max_batch_number:
                break

            noneiv_metrics, eiv_metrics = collect_metrics(x,y, seed=seed)
            for key in collection_keys:
                noneiv_metrics_collection[key].append(noneiv_metrics[key])
                eiv_metrics_collection[key].append(eiv_metrics[key])
# aggregate the collected per-batch metrics: mean and standard error,
# printed to stdout and stored in results_dict for the JSON dump below
results_dict = {}
print('Non-EiV:\n-----')
results_dict['noneiv'] = {}
for key in collection_keys:
    metric_mean = float(np.mean(noneiv_metrics_collection[key]))
    # NOTE(review): the standard error divides by the number of
    # (seed, epoch) pairs, not by the number of collected batch values --
    # presumably deliberate (batches within an epoch are correlated); confirm.
    metric_std = float(np.std(noneiv_metrics_collection[key])
            / np.sqrt(num_test_epochs * len(seed_list)))
    results_dict['noneiv'][key] = (metric_mean, metric_std)
    print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
print('\n')
print('EiV:\n-----')
results_dict['eiv'] = {}
for key in collection_keys:
    metric_mean = float(np.mean(eiv_metrics_collection[key]))
    metric_std = float(np.std(eiv_metrics_collection[key])
            / np.sqrt(num_test_epochs * len(seed_list)))
    print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
    results_dict['eiv'][key] = (metric_mean, metric_std)

# write results to a JSON file in the results folder; create the folder
# first so a fresh checkout does not crash after the full evaluation ran
os.makedirs('results', exist_ok=True)
with open(os.path.join('results',f'metrics_{short_dataname}.json'), 'w') as f:
    json.dump(results_dict, f)