Commit 01529051 authored by Jörg Martin
Included computation of full_seed_range metrics

By this we mean metrics that can only be computed once all seeds have been
evaluated, so that, in particular, no std can be reported for them. The only
metric of this kind at this point is the average of the absolute values of
the x-dependent biases ('avg_bias'). These metrics will only be computed
for datasets with a dataloader of the type
EIVData.repeated_sampling.repeated_sampling.
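
For illustration, a minimal sketch of the aggregation this metric performs
(hypothetical tensor shapes and names; the actual computation is in
collect_full_seed_range_metrics in the diff below):

    import torch

    # one tensor of scaled residuals per seed, each of shape
    # (number_of_test_points, output_dim)
    residuals_per_seed = [torch.randn(500, 1) for _ in range(10)]
    # stack along a new (last) seed dimension and average over seeds,
    # giving the x-dependent bias
    bias_per_x = torch.mean(
        torch.stack(tuple(residuals_per_seed), dim=-1), dim=-1)
    # 'avg_bias' is the mean absolute value of that bias
    avg_bias = torch.mean(torch.abs(bias_per_x))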
parents 947c258d 3493ff60
@@ -3,7 +3,7 @@ import glob
import argparse
import json
metrics_to_display = ['rmse','logdens','bias','true_coverage_numerical']
metrics_to_display = ['rmse','logdens','bias','true_coverage_numerical','avg_bias']
show_incomplete = True
list_of_result_files = glob.glob(os.path.join('results','*.json'))
@@ -19,7 +19,13 @@ def save_readout(dictionary, key):
the latter doesn't exist, in which case (None,None) is returned.
"""
try:
return dictionary[key]
readout = dictionary[key]
if type(readout) is list:
assert len(readout) == 2
return readout
else:
readout = float(readout)
return (readout, None)
except KeyError:
return (None,None)
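As a hedged sketch of the new behaviour (key names and values are only
placeholders): two-element lists stored for per-seed metrics pass through
unchanged, scalar full-seed-range metrics are wrapped with a missing std,
and absent keys still yield (None, None).

    # per-seed metric stored as [mean, std] -> returned as is
    save_readout({'rmse': [0.123, 0.004]}, 'rmse')   # [0.123, 0.004]
    # full-seed-range metric stored as a plain float -> std is None
    save_readout({'avg_bias': 0.05}, 'avg_bias')     # (0.05, None)
    # missing key -> (None, None)
    save_readout({}, 'logdens')                      # (None, None)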
@@ -40,7 +46,10 @@ for data in results.keys():
if metric_mean is None:
noneiv_results_string += ' None (None)'
else:
noneiv_results_string += f' {metric_mean:.3f} ({metric_std:.3f})'
if metric_std is not None:
noneiv_results_string += f' {metric_mean:.3f} ({metric_std:.3f})'
else:
noneiv_results_string += f' {metric_mean:.3f} (NaN)'
print(noneiv_results_string)
eiv_results = [save_readout(results[data]['eiv'],metric)
for metric in metrics_to_display]
@@ -50,7 +59,10 @@ for data in results.keys():
if metric_mean is None:
eiv_results_string += ' None (None)'
else:
eiv_results_string += f' {metric_mean:.3f} ({metric_std:.3f})'
if metric_std is not None:
eiv_results_string += f' {metric_mean:.3f} ({metric_std:.3f})'
else:
eiv_results_string += f' {metric_mean:.3f} (NaN)'
print(eiv_results_string)
print(offset * '_' + 70 * '_')
@@ -16,10 +16,11 @@ from tqdm import tqdm
from EIVArchitectures import Networks
from EIVTrainingRoutines import train_and_store
from EIVGeneral.coverage_metrics import epistemic_coverage, normalized_std
from EIVData.repeated_sampling import repeated_sampling
# read in data via --data option
parser = argparse.ArgumentParser()
parser.add_argument("--data", help="Loads data", default='linear')
parser.add_argument("--data", help="Loads data", default='replin')
parser.add_argument("--no-autoindent", help="",
action="store_true") # to avoid conflicts in IPython
args = parser.parse_args()
@@ -64,8 +65,8 @@ def collect_metrics(x_y_pairs, seed=0,
decouple_dimensions=False, device=device,
scale_outputs=scale_outputs):
"""
Compute various metrics for EiV and non-EiV. Will be returned as
dictionaries.
Compute various metrics for EiV and non-EiV for single seeds. Will be
returned as dictionaries.
:param x_y_pairs: A tuple of either the shape (None,None,x,y) or
(x_true,y_true,x,y) containing torch.tensor or None. x and y are
considered as input and corresponding label. If the first two components
@@ -235,6 +236,215 @@
return noneiv_metrics, eiv_metrics
def collect_full_seed_range_metrics(load_data,
seed_range,test_batch_size = 100, test_samples = 10,
noneiv_number_of_draws=100, eiv_number_of_draws=[100,5], device=device,
scale_outputs=scale_outputs):
"""
Collect metrics that need all seeds for their computation.
:param load_data: The load_data callable; it should take `seed` as an
argument and, optionally, `return_ground_truth`.
:param seed_range: An iterable of seeds.
:param test_batch_size: An integer, used for drawing samples from the test
data.
:param test_samples: Number of test samples with batch size
`test_batch_size` to take.
:param noneiv_number_of_draws: Number of samples to take for the prediction
of the non-EiV model. Defaults to 100.
:param eiv_number_of_draws: Number of samples to take for the prediction
of the EiV model. Defaults to [100,5].
:param device: The torch.device to use
:param scale_outputs: Boolean, scale the outputs for some metrics. Defaults
to False.
:returns: Dictionaries noneiv_metrics, eiv_metrics
"""
noneiv_metrics = {}
eiv_metrics = {}
noneiv_residual_collection = []
eiv_residual_collection = []
for i, seed in enumerate(seed_range):
# load data according to seed
try:
train_data, test_data, true_train_data, true_test_data \
= load_data(seed=seed, return_ground_truth=True)
except TypeError:
train_data, test_data = load_data(seed=seed)
true_train_data, true_test_data = None, None
## Compute x-dependent bias
# only for repeated_sampling datasets
if type(load_data) == repeated_sampling:
# only if there is a ground truth
if true_test_data is not None:
# non-EiV
init_std_y = noneiv_conf_dict["init_std_y_list"][0]
unscaled_reg = noneiv_conf_dict["unscaled_reg"]
p = noneiv_conf_dict["p"]
hidden_layers = noneiv_conf_dict["hidden_layers"]
saved_file = os.path.join('saved_networks',
f'noneiv_{short_dataname}'\
f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
f'_p_{p:.2f}_seed_{seed}.pkl')
net = Networks.FNNBer(p=p, init_std_y=init_std_y,
h=[input_dim, *hidden_layers, output_dim]).to(device)
# load network
train_and_store.open_stored_training(saved_file=saved_file,
net=net, device=device)
true_test_dataloader = DataLoader(true_test_data,
batch_size=int(np.min((len(test_data), test_batch_size))),
shuffle=False)
# to collect x-dependent residuals
true_scaled_res_collection = []
# variable to be used for checking
# that we loop over the same true_x for each seed
noneiv_true_x_sum = 0
for j, (true_x, true_y, noisy_x, _) in\
enumerate(true_test_dataloader):
if j >= test_samples:
break
# store the sum of the true_x
noneiv_true_x_sum += true_x.abs().sum().item()
true_x, true_y, noisy_x =\
true_x.to(device), true_y.to(device),\
noisy_x.to(device)
# Residuals
training_state = net.training
net.train()
not_averaged_predictions = net.predict(noisy_x,\
number_of_draws=noneiv_number_of_draws,
take_average_of_prediction=False)
noneiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
if len(true_y.shape) <= 1:
true_y = true_y.view((-1,1))
assert true_y.shape == noneiv_mean.shape
true_res = true_y - noneiv_mean
if scale_outputs:
scale = train_data.dataset.std_labels.to(device)
true_scaled_res = true_res * scale.view((1,-1))
else:
true_scaled_res = true_res
# append residual
true_scaled_res_collection.append(true_scaled_res)
# restore net
if training_state:
net.train()
else:
net.eval()
if i>0:
# check that the used true x are the same for each
# seed, by comparing their sum
assert noneiv_true_x_sum == old_noneiv_true_x_sum
old_noneiv_true_x_sum = noneiv_true_x_sum
# concatenate batches along batch dimension
true_scaled_res_collection =\
torch.concat(true_scaled_res_collection, dim=0)
noneiv_residual_collection.append(true_scaled_res_collection)
# EiV
init_std_y = eiv_conf_dict["init_std_y_list"][0]
unscaled_reg = eiv_conf_dict["unscaled_reg"]
p = eiv_conf_dict["p"]
hidden_layers = eiv_conf_dict["hidden_layers"]
fixed_std_x = eiv_conf_dict["fixed_std_x"]
saved_file = os.path.join('saved_networks',
f'eiv_{short_dataname}'\
f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
f'_p_{p:.2f}_fixed_std_x_{fixed_std_x:.3f}'\
f'_seed_{seed}.pkl')
net = Networks.FNNEIV(p=p, init_std_y=init_std_y,
h=[input_dim, *hidden_layers, output_dim],
fixed_std_x=fixed_std_x).to(device)
# load network
train_and_store.open_stored_training(saved_file=saved_file,
net=net, device=device)
# reinitialize dataloader to get the same true_x
true_test_dataloader = DataLoader(true_test_data,
batch_size=int(np.min((len(test_data), test_batch_size))),
shuffle=False)
true_scaled_res_collection = []
# variable to be used for checking
# that we loop over the same true_x for each seed
eiv_true_x_sum = 0
for j, (true_x, true_y, noisy_x, _) in\
enumerate(true_test_dataloader):
if j >= test_samples:
break
# store the sum of the true_x
eiv_true_x_sum += true_x.abs().sum().item()
true_x, true_y, noisy_x =\
true_x.to(device), true_y.to(device),\
noisy_x.to(device)
# Residuals
training_state = net.training
noise_state = net.noise_is_on
net.train()
net.noise_on()
not_averaged_predictions = net.predict(noisy_x,\
number_of_draws=eiv_number_of_draws,
take_average_of_prediction=False)
eiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
if len(true_y.shape) <= 1:
true_y = true_y.view((-1,1))
assert true_y.shape == eiv_mean.shape
true_res = true_y - eiv_mean
if scale_outputs:
scale = train_data.dataset.std_labels.to(device)
true_scaled_res = true_res * scale.view((1,-1))
else:
true_scaled_res = true_res
# append residuals
true_scaled_res_collection.append(true_scaled_res)
# restore net
if training_state:
net.train()
else:
net.eval()
if noise_state:
net.noise_on()
else:
net.noise_off()
# check whether EiV and non-EiV used the same true_x for each
# seed by comparing their sum
assert eiv_true_x_sum == noneiv_true_x_sum
if i>0:
assert eiv_true_x_sum == old_eiv_true_x_sum
old_eiv_true_x_sum = eiv_true_x_sum
# concatenate batches along batch dimension
true_scaled_res_collection =\
torch.concat(true_scaled_res_collection, dim=0)
eiv_residual_collection.append(true_scaled_res_collection)
## Store quantities
# Compute and store (averaged) x-dependent bias
if type(load_data) == repeated_sampling and\
len(noneiv_residual_collection) > 0 and\
len(eiv_residual_collection) > 0:
noneiv_residual_collection = torch.stack(\
tuple(noneiv_residual_collection), dim=-1)
bias_per_x = torch.mean(noneiv_residual_collection, dim=-1)
avg_bias = torch.mean(torch.abs(bias_per_x))
noneiv_metrics['avg_bias'] = avg_bias
eiv_residual_collection = torch.stack(tuple(eiv_residual_collection),\
dim=-1)
bias_per_x = torch.mean(eiv_residual_collection, dim=-1)
avg_bias = torch.mean(torch.abs(bias_per_x))
eiv_metrics['avg_bias'] = avg_bias
return noneiv_metrics, eiv_metrics
# single seed metrics
noneiv_metrics_collection = {}
eiv_metrics_collection = {}
collection_keys = []
@@ -242,6 +452,15 @@ num_test_epochs = 10
assert noneiv_conf_dict["seed_range"] == eiv_conf_dict["seed_range"]
seed_list = range(noneiv_conf_dict["seed_range"][0],
noneiv_conf_dict["seed_range"][1])
max_batch_number = 2
for seed in tqdm(seed_list):
try:
@@ -280,22 +499,51 @@ for seed in tqdm(seed_list):
noneiv_metrics_collection[key].append(noneiv_metrics[key])
eiv_metrics_collection[key].append(eiv_metrics[key])
# full seed range metrics
print('Computing metrics that use all seeds at once...')
noneiv_full_seed_range_metrics, eiv_full_seed_range_metrics =\
collect_full_seed_range_metrics(load_data=load_data,\
seed_range=seed_list)
# add keys to collection_keys
assert noneiv_full_seed_range_metrics.keys() ==\
eiv_full_seed_range_metrics.keys()
full_seed_range_collection_keys = list(noneiv_full_seed_range_metrics.keys())
collection_keys += full_seed_range_collection_keys
results_dict = {}
print('Non-EiV:\n-----')
results_dict['noneiv'] = {}
for key in collection_keys:
metric_mean = float(np.mean(noneiv_metrics_collection[key]))
metric_std = float(np.std(noneiv_metrics_collection[key])/np.sqrt(num_test_epochs*len(seed_list)))
results_dict['noneiv'][key] = (metric_mean, metric_std)
print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
if key not in full_seed_range_collection_keys:
# per seed metrics
metric_mean = float(np.mean(noneiv_metrics_collection[key]))
metric_std = float(np.std(noneiv_metrics_collection[key])/\
np.sqrt(num_test_epochs*len(seed_list)))
results_dict['noneiv'][key] = (metric_mean, metric_std)
print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
else:
# full seed range metrics (without a std)
metric = float(noneiv_full_seed_range_metrics[key])
results_dict['noneiv'][key] = metric
print(f'{key}: {metric:.5f} (NaN)')
print('\n')
print('EiV:\n-----')
results_dict['eiv'] = {}
for key in collection_keys:
metric_mean = float(np.mean(eiv_metrics_collection[key]))
metric_std = float(np.std(eiv_metrics_collection[key])/np.sqrt(num_test_epochs*len(seed_list)))
print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
results_dict['eiv'][key] = (metric_mean, metric_std)
if key not in full_seed_range_collection_keys:
# per seed metrics
metric_mean = float(np.mean(eiv_metrics_collection[key]))
metric_std = float(np.std(eiv_metrics_collection[key])/\
np.sqrt(num_test_epochs*len(seed_list)))
print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
results_dict['eiv'][key] = (metric_mean, metric_std)
else:
# full seed range metrics (without a std)
metric = float(eiv_full_seed_range_metrics[key])
results_dict['eiv'][key] = metric
print(f'{key}: {metric:.5f} (NaN)')
# write results to a JSON file in the results folder
with open(os.path.join('results',f'metrics_{short_dataname}.json'), 'w') as f:
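After these changes the file results/metrics_<short_dataname>.json holds, per
model variant, (mean, std) pairs for the per-seed metrics and plain floats for
the full-seed-range ones; a hedged sketch with placeholder values (tuples are
serialized as JSON lists, which is the format save_readout in the first file
reads back):

    results_dict = {
        'noneiv': {'rmse': (0.123, 0.004), 'avg_bias': 0.051},
        'eiv': {'rmse': (0.119, 0.003), 'avg_bias': 0.032},
    }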