Skip to content
Snippets Groups Projects
evaluate_metrics.py 22.6 KiB
Newer Older
Jörg Martin's avatar
Jörg Martin committed
"""
Compute metrics for datasets for which there is not necessarily a ground truth.
Results will be stored in the results folder
"""
import importlib
import os
import argparse
import json

import numpy as np
import torch
import torch.backends.cudnn
from torch.utils.data import DataLoader
from tqdm import tqdm

from EIVArchitectures import Networks
from EIVTrainingRoutines import train_and_store
Jörg Martin's avatar
Jörg Martin committed
from EIVGeneral.coverage_metrics import epistemic_coverage, normalized_std
from EIVData.repeated_sampling import repeated_sampling
# read in data via --data option
parser = argparse.ArgumentParser()
parser.add_argument("--data", help="Loads data", default='replin')
parser.add_argument("--no-autoindent", help="",
        action="store_true") # to avoid conflics in IPython
args = parser.parse_args()
data = args.data

# load hyperparameters from JSON file
with open(os.path.join('configurations',f'eiv_{data}.json'),'r') as conf_file:
    eiv_conf_dict = json.load(conf_file)
with open(os.path.join('configurations',f'noneiv_{data}.json'),'r') as conf_file:
    noneiv_conf_dict = json.load(conf_file)

long_dataname = eiv_conf_dict["long_dataname"]
short_dataname = eiv_conf_dict["short_dataname"]

print(f"Evaluating {long_dataname}")
scale_outputs = False 
load_data = importlib.import_module(f'EIVData.{long_dataname}').load_data

train_data, test_data = load_data()
input_dim = train_data[0][0].numel()
output_dim = train_data[0][1].numel()

try:
    gpu_number = eiv_conf_dict["gpu_number"]
    device = torch.device(f'cuda:{gpu_number}')
    try:
        torch.tensor([0.0]).to(device)
    except RuntimeError:
        if torch.cuda.is_available():
            print('Switched to GPU 0')
            device = torch.device('cuda:0')
        else:
            print('No cuda available, using CPU')
            device = torch.device('cpu')
except KeyError:
    device = torch.device('cpu')


def collect_metrics(x_y_pairs, seed=0,
    noneiv_number_of_draws=100, eiv_number_of_draws=[100,5],
    decouple_dimensions=False, device=device,
    scale_outputs=scale_outputs):
    Compute various metrics for EiV and non-EiV for single seeds. Will be
    returned as dictionaries.
    :param x_y_pairs: A tuple of either the shape (None,None,x,y) or 
    (x_true,y_true,x,y) containing torch.tensor or None. x and y are
    considered as input and corresponding label. If the first two components
    are not None, they are considered to be the unnoisy counterparts.
    :param seed: Integer. The seed used for loading, defaults to 0.
    :param noneiv_number_of_draws: Number of draws for non-EiV model
    for sampling from the posterior predictive. Defaults to 100.
    :param noneiv_number_of_draws: Number of draws for EiV model
Jörg Martin's avatar
Jörg Martin committed
    for sampling from the posterior predictive. Defaults to 100.
    :param decouple_dimensions: Boolean. If True, the unsual convention
    of Gal et al. is followed where, in the evaluation of the
    log-posterior-predictive, each dimension is treated independently and then
    averaged. If False (default), a multivariate distribution is used.
Jörg Martin's avatar
Jörg Martin committed
    :param device: The device to use.
    :param scale_output: Boolean, scale the outputs for the RMSE, the bias and
    the log-dens to make them comparable with the literature.
    :returns: Dictionaries noneiv_metrics, eiv_metrics
    true_x, true_y, x, y = x_y_pairs
Jörg Martin's avatar
Jörg Martin committed
    x,y = x.to(device), y.to(device)
    if true_x is not None:
        assert true_y is not None
        true_x,true_y = true_x.to(device), true_y.to(device)
    else:
        assert true_y is None

    # non-EiV
    noneiv_metrics = {}
    init_std_y = noneiv_conf_dict["init_std_y_list"][0]
    unscaled_reg = noneiv_conf_dict["unscaled_reg"]
    p = noneiv_conf_dict["p"]
    hidden_layers = noneiv_conf_dict["hidden_layers"]
    saved_file = os.path.join('saved_networks',
                f'noneiv_{short_dataname}'\
                        f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
                        f'_p_{p:.2f}_seed_{seed}.pkl')
    net = Networks.FNNBer(p=p, init_std_y=init_std_y,
Jörg Martin's avatar
Jörg Martin committed
            h=[input_dim, *hidden_layers, output_dim]).to(device)
    train_and_store.open_stored_training(saved_file=saved_file,
Jörg Martin's avatar
Jörg Martin committed
            net=net, device=device)


    # RMSE
    training_state = net.training
    net.train()
Jörg Martin's avatar
Jörg Martin committed
    not_averaged_predictions = net.predict(x,\
            number_of_draws=noneiv_number_of_draws, 
            take_average_of_prediction=False)
    noneiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
    if len(y.shape) <= 1:
        y = y.view((-1,1))
    assert y.shape == noneiv_mean.shape
    res = y-noneiv_mean
    if scale_outputs:
        scale = train_data.dataset.std_labels.to(device)
        scaled_res = res * scale.view((1,-1))
    else:
        scaled_res = res
    scaled_res = scaled_res.detach().cpu().numpy().flatten()
    noneiv_metrics['rmse'] = np.sqrt(np.mean(scaled_res**2))
    noneiv_metrics['bias'] = np.mean(scaled_res)
    noneiv_metrics['coverage_numerical'], noneiv_metrics['coverage_theory'] =\
Jörg Martin's avatar
Jörg Martin committed
            epistemic_coverage(not_averaged_predictions, y,\
            normalize_errors=False)
    noneiv_metrics['coverage_normalized'],_ =\
Jörg Martin's avatar
Jörg Martin committed
            epistemic_coverage(not_averaged_predictions, y,\
            normalize_errors=True)
    noneiv_metrics['res_std'] = normalized_std(not_averaged_predictions, y)

    # metrics that need a ground truth
    if true_x is not None:
        noneiv_metrics['true_coverage_numerical'],\
                noneiv_metrics['true_coverage_theory'] =\
                epistemic_coverage(not_averaged_predictions, true_y,
                        normalize_errors=False, noisy_y=False)
Jörg Martin's avatar
Jörg Martin committed
        true_res = (true_y - noneiv_mean).detach().cpu().numpy().flatten()
        noneiv_metrics['true_rmse'] = np.sqrt(np.mean(true_res**2))
    if scale_outputs:
        scale_labels = train_data.dataset.std_labels.view((-1,)).to(device)
    else:
        scale_labels = None
    noneiv_metrics['logdens' ]= net.predictive_logdensity(
            not_averaged_predictions, y,
            number_of_draws=100,
            decouple_dimensions=decouple_dimensions,
            scale_labels=scale_labels).mean().detach().cpu().numpy()
    if training_state:
        net.train()
    else:
        net.eval()

    # EiV
    eiv_metrics = {}
    init_std_y = eiv_conf_dict["init_std_y_list"][0]
    unscaled_reg = eiv_conf_dict["unscaled_reg"]
    p = eiv_conf_dict["p"]
    hidden_layers = eiv_conf_dict["hidden_layers"]
    fixed_std_x = eiv_conf_dict["fixed_std_x"]
    saved_file = os.path.join('saved_networks',
            f'eiv_{short_dataname}'\
Jörg Martin's avatar
Jörg Martin committed
                    f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
                    f'_p_{p:.2f}_fixed_std_x_{fixed_std_x:.3f}'\
                    f'_seed_{seed}.pkl')
    net = Networks.FNNEIV(p=p, init_std_y=init_std_y,
Jörg Martin's avatar
Jörg Martin committed
            h=[input_dim, *hidden_layers, output_dim],
            fixed_std_x=fixed_std_x).to(device)
    train_and_store.open_stored_training(saved_file=saved_file,
            net=net)
    # RMSE
    training_state = net.training
    noise_state = net.noise_is_on
    net.train()
    net.noise_on()
Jörg Martin's avatar
Jörg Martin committed
    not_averaged_predictions = net.predict(x, number_of_draws=eiv_number_of_draws, 
            take_average_of_prediction=False)
    eiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
    if len(y.shape) <= 1:
        y = y.view((-1,1))
    assert y.shape == eiv_mean.shape
    res = y-eiv_mean
    if scale_outputs:
        scale = train_data.dataset.std_labels.to(device)
        scaled_res = res * scale.view((1,-1))
    else:
        scaled_res = res
    scaled_res = scaled_res.detach().cpu().numpy().flatten()
    eiv_metrics['rmse' ]= np.sqrt(np.mean(scaled_res**2))
    eiv_metrics['bias' ]= np.mean(scaled_res)
    eiv_metrics['coverage_numerical'], eiv_metrics['coverage_theory'] =\
            epistemic_coverage(not_averaged_predictions, y, normalize_errors=False)
    eiv_metrics['coverage_normalized'],_ =\
            epistemic_coverage(not_averaged_predictions, y, normalize_errors=True)
    eiv_metrics['res_std' ]= normalized_std(not_averaged_predictions, y)
    # metrics that need a ground truth
    if true_x is not None:
        eiv_metrics['true_coverage_numerical'],\
                eiv_metrics['true_coverage_theory'] =\
                epistemic_coverage(not_averaged_predictions, true_y,
                        normalize_errors=False, noisy_y=False)

Jörg Martin's avatar
Jörg Martin committed
        true_res = (true_y - eiv_mean).detach().cpu().numpy().flatten()
        eiv_metrics['true_rmse'] = np.sqrt(np.mean(true_res**2))
    if scale_outputs:
        scale_labels = train_data.dataset.std_labels.view((-1,)).to(device)
    else:
        scale_labels = None
    eiv_metrics['logdens' ]= net.predictive_logdensity(
            not_averaged_predictions, y,
            number_of_draws=eiv_number_of_draws,
            decouple_dimensions=decouple_dimensions,
            scale_labels=scale_labels).mean().detach().cpu().numpy()
    if training_state:
        net.train()
    else:
        net.eval()
    if noise_state:
        net.noise_on()
    else:
        net.noise_off()
    return noneiv_metrics, eiv_metrics



def collect_full_seed_range_metrics(load_data,
        seed_range,test_batch_size = 100, test_samples = 10,
        noneiv_number_of_draws=100, eiv_number_of_draws=[100,5], device=device,
        scale_outputs=scale_outputs):
    """
    Collect metrics that need all seeds for their computation.
    :param load_data: load_data map should take seed as an argument and,
    optionally, `return_ground_truth`.
    :param seed_range: iterator for seeds.
    :param test_batch_size: An integer, used for drawing samples from the test
    data.
    :param test_samples: Number of test samples with batch size
    `test_batch_size` to take.
    :param noneiv_number_of_draws: Number of samples to take for the prediction
    of the non-EiV model. Defaults to 100.
    :param eiv_number_of_draws:Number of samples to take for the prediction
    of the model. Defaults to [100,5].
    :param device: The torch.device to use
    :param scale_output: Boolean, scale the outputs for some metrics. Defaults
    to False.
    :returns: Dictionaries noneiv_metrics, eiv_metrics
    """
    noneiv_metrics = {}
    eiv_metrics = {}
    noneiv_residual_collection = []
    eiv_residual_collection = []
    for i, seed in enumerate(seed_range):
        # load data according toseed
        try:
            train_data, test_data, true_train_data, true_test_data \
                    = load_data(seed=seed, return_ground_truth=True)
        except TypeError:
            train_data, test_data = load_data(seed=seed)
            true_train_data, true_test_data = None, None

        ## Compute x-dependant bias

        # only for repeated_sampling datasets
        if type(load_data) == repeated_sampling:
            # only if there is a ground truth
            if true_test_data is not None:
                # non-EiV
                init_std_y = noneiv_conf_dict["init_std_y_list"][0]
                unscaled_reg = noneiv_conf_dict["unscaled_reg"]
                p = noneiv_conf_dict["p"]
                hidden_layers = noneiv_conf_dict["hidden_layers"]
                saved_file = os.path.join('saved_networks',
                            f'noneiv_{short_dataname}'\
                                    f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
                                    f'_p_{p:.2f}_seed_{seed}.pkl')
                net = Networks.FNNBer(p=p, init_std_y=init_std_y,
                        h=[input_dim, *hidden_layers, output_dim]).to(device)
                # load network
                train_and_store.open_stored_training(saved_file=saved_file,
                        net=net, device=device)

                true_test_dataloader = DataLoader(true_test_data,
                    batch_size=int(np.min((len(test_data), test_batch_size))),
                    shuffle=False)
                # to collect x-dependant residuals
                true_scaled_res_collection = []
                # variable to be used for checking
                # that we loop over the same true_x for each seed
                noneiv_true_x_sum = 0
                for j, (true_x, true_y, noisy_x, _) in\
                        enumerate(true_test_dataloader):
                    if j >= test_samples:
                        break
                    # store the sum of the true_x
                    noneiv_true_x_sum += true_x.abs().sum().item()
                    
                    true_x, true_y, noisy_x =\
                            true_x.to(device), true_y.to(device),\
                            noisy_x.to(device)
                
                    # Residuals
                    training_state = net.training
                    net.train()
                    not_averaged_predictions = net.predict(noisy_x,\
                            number_of_draws=noneiv_number_of_draws, 
                            take_average_of_prediction=False)
                    noneiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
                    if len(true_y.shape) <= 1:
                        true_y = true_y.view((-1,1))
                    assert true_y.shape == noneiv_mean.shape
                    true_res = true_y - noneiv_mean
                    if scale_outputs:
                        scale = train_data.dataset.std_labels.to(device)
                        true_scaled_res = true_res * scale.view((1,-1))
                    else:
                        true_scaled_res = true_res

                    # append residual
                    true_scaled_res_collection.append(true_scaled_res)

                    # restore net
                    if training_state:
                        net.train()
                    else:
                        net.eval()
                if i>0:
                    # check that the used true x are the same for each
                    # seed, by comparing their sum
                    assert noneiv_true_x_sum == old_noneiv_true_x_sum
                old_noneiv_true_x_sum = noneiv_true_x_sum
                
                # concatenate batches along batch dimension
                true_scaled_res_collection =\
                        torch.concat(true_scaled_res_collection, dim=0)
                noneiv_residual_collection.append(true_scaled_res_collection)


                # EiV
                init_std_y = eiv_conf_dict["init_std_y_list"][0]
                unscaled_reg = eiv_conf_dict["unscaled_reg"]
                p = eiv_conf_dict["p"]
                hidden_layers = eiv_conf_dict["hidden_layers"]
                fixed_std_x = eiv_conf_dict["fixed_std_x"]
                saved_file = os.path.join('saved_networks',
                        f'eiv_{short_dataname}'\
                                f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
                                f'_p_{p:.2f}_fixed_std_x_{fixed_std_x:.3f}'\
                                f'_seed_{seed}.pkl')
                net = Networks.FNNEIV(p=p, init_std_y=init_std_y,
                        h=[input_dim, *hidden_layers, output_dim],
                        fixed_std_x=fixed_std_x).to(device)
                # load network
                train_and_store.open_stored_training(saved_file=saved_file,
                        net=net, device=device)

                # reinitialize dataloader to get the same true_x
                true_test_dataloader = DataLoader(true_test_data,
                    batch_size=int(np.min((len(test_data), test_batch_size))),
                    shuffle=False)
                true_scaled_res_collection = []
                # variable to be used for checking
                # that we loop over the same true_x for each seed
                eiv_true_x_sum = 0
                for j, (true_x, true_y, noisy_x, _) in\
                        enumerate(true_test_dataloader):
                    if j >= test_samples:
                        break
                    # store the sum of the true_x
                    eiv_true_x_sum += true_x.abs().sum().item()
                    true_x, true_y, noisy_x =\
                            true_x.to(device), true_y.to(device),\
                            noisy_x.to(device)
                    # Residuals
                    training_state = net.training
                    noise_state = net.noise_is_on
                    net.train()
                    net.noise_on()
                    not_averaged_predictions = net.predict(noisy_x,\
                            number_of_draws=eiv_number_of_draws, 
                            take_average_of_prediction=False)
                    eiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
                    if len(true_y.shape) <= 1:
                        true_y = true_y.view((-1,1))
                    assert true_y.shape == eiv_mean.shape
                    true_res = true_y - eiv_mean
                    if scale_outputs:
                        scale = train_data.dataset.std_labels.to(device)
                        true_scaled_res = true_res * scale.view((1,-1))
                    else:
                        true_scaled_res = true_res
                    # append residuals
                    true_scaled_res_collection.append(true_scaled_res)
                    # restore net
                    if training_state:
                        net.train()
                    else:
                        net.eval()
                    if noise_state:
                        net.noise_on()
                    else:
                        net.noise_off()
                # check whether EiV and non-EiV used the same true_x for each
                # seed by comparing their sum
                assert eiv_true_x_sum == noneiv_true_x_sum
                if i>0:
                    assert eiv_true_x_sum == old_eiv_true_x_sum
                old_eiv_true_x_sum = eiv_true_x_sum
                # concate batches along batch dimension
                true_scaled_res_collection =\
                        torch.concat(true_scaled_res_collection, dim=0)
                eiv_residual_collection.append(true_scaled_res_collection)


    ## Store quantities

    # Compute and store (averaged) x-dependant bias
    if type(load_data) == repeated_sampling and\
            len(noneiv_residual_collection) > 0 and\
            len(eiv_residual_collection) > 0:
        noneiv_residual_collection = torch.stack(\
                tuple(noneiv_residual_collection), dim=-1)
        bias_per_x = torch.mean(noneiv_residual_collection, dim=-1)
        avg_bias = torch.mean(torch.abs(bias_per_x))
        noneiv_metrics['avg_bias'] = avg_bias

        eiv_residual_collection = torch.stack(tuple(eiv_residual_collection),\
                dim=-1)
        bias_per_x = torch.mean(eiv_residual_collection, dim=-1)
        avg_bias = torch.mean(torch.abs(bias_per_x))
        eiv_metrics['avg_bias'] = avg_bias
    return noneiv_metrics, eiv_metrics

# single seed metrics
noneiv_metrics_collection = {}
eiv_metrics_collection = {}
collection_keys = []
num_test_epochs = 10
assert noneiv_conf_dict["seed_range"] == eiv_conf_dict["seed_range"]
seed_list = range(noneiv_conf_dict["seed_range"][0],
        noneiv_conf_dict["seed_range"][1])
max_batch_number = 2
Jörg Martin's avatar
Jörg Martin committed
for seed in tqdm(seed_list):
        train_data, test_data, true_train_data, true_test_data \
                = load_data(seed=seed, return_ground_truth=True)
    except TypeError:
        train_data, test_data = load_data(seed=seed)
        true_train_data, true_test_data = None, None
    if true_test_data is None:
        test_dataloader = DataLoader(test_data,
            batch_size=int(np.min((len(test_data),
Jörg Martin's avatar
Jörg Martin committed
        800))), shuffle=True)
    else:
        test_dataloader = DataLoader(true_test_data,
                batch_size=int(np.min((len(true_test_data), 800))), shuffle=True)
Jörg Martin's avatar
Jörg Martin committed
    for i in tqdm(range(num_test_epochs)):
        for j, x_y_pairs in enumerate(test_dataloader):
            if j > max_batch_number:
                break
            # fill in ground truth with None, if not existent
            if true_test_data is None:
                x_y_pairs = (None, None, *x_y_pairs)
            # should contain (true_x,true_y,x,y) or (None,None,x,y)
            assert len(x_y_pairs) == 4
            noneiv_metrics, eiv_metrics = collect_metrics(x_y_pairs,
                    seed=seed)
            if i==0 and j==0:
                # fill collection keys
                assert eiv_metrics.keys() == noneiv_metrics.keys()
                collection_keys = list(eiv_metrics.keys())
                for key in collection_keys:
                    noneiv_metrics_collection[key] = []
                    eiv_metrics_collection[key] = []
            # collect results
            for key in collection_keys:
                noneiv_metrics_collection[key].append(noneiv_metrics[key])
                eiv_metrics_collection[key].append(eiv_metrics[key])

# full seed range metrics
print('Computing metrics that use all seeds at once...')
noneiv_full_seed_range_metrics, eiv_full_seed_range_metrics =\
        collect_full_seed_range_metrics(load_data=load_data,\
                seed_range=seed_list)
# add keys to collection_keys
assert noneiv_full_seed_range_metrics.keys() ==\
        eiv_full_seed_range_metrics.keys()
full_seed_range_collection_keys = list(noneiv_full_seed_range_metrics.keys())
collection_keys += full_seed_range_collection_keys


results_dict = {}
print('Non-EiV:\n-----')
results_dict['noneiv'] = {}
for key in collection_keys:
    if key not in full_seed_range_collection_keys:
        # per seed metrics
        metric_mean = float(np.mean(noneiv_metrics_collection[key]))
        metric_std  = float(np.std(noneiv_metrics_collection[key])/\
                np.sqrt(num_test_epochs*len(seed_list)))
        results_dict['noneiv'][key] = (metric_mean, metric_std)
        print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
    else:
        # full seed range metrics (without a std)
        metric = float(noneiv_full_seed_range_metrics[key])
        results_dict['noneiv'][key] = metric
        print(f'{key}: {metric:.5f} (NaN)')

print('\n')
print('EiV:\n-----')
results_dict['eiv'] = {}
for key in collection_keys:
    if key not in full_seed_range_collection_keys:
        # per seed metrics
        metric_mean = float(np.mean(eiv_metrics_collection[key]))
        metric_std  = float(np.std(eiv_metrics_collection[key])/\
                np.sqrt(num_test_epochs*len(seed_list)))
        print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
        results_dict['eiv'][key] = (metric_mean, metric_std)
    else:
        # full seed range metrics (without a std)
        metric = float(eiv_full_seed_range_metrics[key])
        results_dict['eiv'][key] = metric
        print(f'{key}: {metric:.5f} (NaN)')

# write results to a JSON file in the results folder
with open(os.path.join('results',f'metrics_{short_dataname}.json'), 'w') as f:
    json.dump(results_dict, f)