Commit c44c4524 authored by Jörg Martin

Included full_seed range quantities in evaluate_metrics.

This now makes it possible to compute (the average of) an x-dependent bias.
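
The new average is computed roughly as in the sketch below (a minimal, standalone illustration, not the committed code; the residual shapes and random values are placeholders):

import torch

# one residual tensor per seed, each of (assumed) shape (n_test, out_dim),
# with residual = true_y - mean_prediction
residual_collection = [torch.randn(50, 1) for _ in range(5)]

# stack along a trailing seed dimension -> shape (n_test, out_dim, n_seeds)
stacked_residuals = torch.stack(tuple(residual_collection), dim=-1)

# the mean over seeds is the x-dependent bias; its mean absolute value
# summarizes it as a single number
bias_per_x = torch.mean(stacked_residuals, dim=-1)
avg_bias = torch.mean(torch.abs(bias_per_x))
print(float(avg_bias))
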
parent 947c258d
@@ -16,10 +16,11 @@ from tqdm import tqdm
from EIVArchitectures import Networks
from EIVTrainingRoutines import train_and_store
from EIVGeneral.coverage_metrics import epistemic_coverage, normalized_std
from EIVData.repeated_sampling import repeated_sampling
# read in data via --data option
parser = argparse.ArgumentParser()
parser.add_argument("--data", help="Loads data", default='linear')
parser.add_argument("--data", help="Loads data", default='replin')
parser.add_argument("--no-autoindent", help="",
action="store_true") # to avoid conflicts in IPython
args = parser.parse_args()
@@ -64,8 +65,8 @@ def collect_metrics(x_y_pairs, seed=0,
decouple_dimensions=False, device=device,
scale_outputs=scale_outputs):
"""
Compute various metrics for EiV and non-EiV. Will be returned as
dictionaries.
Compute various metrics for EiV and non-EiV for single seeds. Will be
returned as dictionaries.
:param x_y_pairs: A tuple of either the shape (None,None,x,y) or
(x_true,y_true,x,y) containing torch.tensor or None. x and y are
considered as input and corresponding label. If the first two components
@@ -235,6 +236,215 @@ def collect_metrics(x_y_pairs, seed=0,
return noneiv_metrics, eiv_metrics
def collect_full_seed_range_metrics(load_data,
seed_range,test_batch_size = 100, test_samples = 10,
noneiv_number_of_draws=100, eiv_number_of_draws=[100,5], device=device,
scale_outputs=scale_outputs):
"""
Collect metrics that need all seeds for their computation.
:param load_data: The load_data function; it should take `seed` as an
argument and, optionally, `return_ground_truth`.
:param seed_range: iterator for seeds.
:param test_batch_size: An integer, used for drawing samples from the test
data.
:param test_samples: Number of test batches of size `test_batch_size` to
draw from the test data.
:param noneiv_number_of_draws: Number of samples to take for the prediction
of the non-EiV model. Defaults to 100.
:param eiv_number_of_draws: Number of samples to take for the prediction
of the EiV model. Defaults to [100,5].
:param device: The torch.device to use.
:param scale_outputs: Boolean, scale the outputs for some metrics. Defaults
to False.
:returns: Dictionaries noneiv_metrics, eiv_metrics
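Example (a sketch; assumes a `load_data` with the signature described
above and seeds 0 to 9, both placeholders)::

    noneiv_metrics, eiv_metrics = collect_full_seed_range_metrics(
            load_data=load_data, seed_range=range(0, 10))
    print(noneiv_metrics.get('avg_bias'))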
"""
noneiv_metrics = {}
eiv_metrics = {}
noneiv_residual_collection = []
eiv_residual_collection = []
for i, seed in enumerate(seed_range):
# load data according to seed
try:
train_data, test_data, true_train_data, true_test_data \
= load_data(seed=seed, return_ground_truth=True)
except TypeError:
train_data, test_data = load_data(seed=seed)
true_train_data, true_test_data = None, None
## Compute x-dependent bias
# only for repeated_sampling datasets
if type(load_data) == repeated_sampling:
# only if there is a ground truth
if true_test_data is not None:
# non-EiV
init_std_y = noneiv_conf_dict["init_std_y_list"][0]
unscaled_reg = noneiv_conf_dict["unscaled_reg"]
p = noneiv_conf_dict["p"]
hidden_layers = noneiv_conf_dict["hidden_layers"]
saved_file = os.path.join('saved_networks',
f'noneiv_{short_dataname}'\
f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
f'_p_{p:.2f}_seed_{seed}.pkl')
net = Networks.FNNBer(p=p, init_std_y=init_std_y,
h=[input_dim, *hidden_layers, output_dim]).to(device)
# load network
train_and_store.open_stored_training(saved_file=saved_file,
net=net, device=device)
true_test_dataloader = DataLoader(true_test_data,
batch_size=int(np.min((len(test_data), test_batch_size))),
shuffle=False)
# to collect x-dependent residuals
true_scaled_res_collection = []
# variable to be used for checking
# that we loop over the same true_x for each seed
noneiv_true_x_sum = 0
for j, (true_x, true_y, noisy_x, _) in\
enumerate(true_test_dataloader):
if j >= test_samples:
break
# store the sum of the true_x
noneiv_true_x_sum += true_x.abs().sum().item()
true_x, true_y, noisy_x =\
true_x.to(device), true_y.to(device),\
noisy_x.to(device)
# Residuals
training_state = net.training
net.train()
not_averaged_predictions = net.predict(noisy_x,\
number_of_draws=noneiv_number_of_draws,
take_average_of_prediction=False)
noneiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
if len(true_y.shape) <= 1:
true_y = true_y.view((-1,1))
assert true_y.shape == noneiv_mean.shape
true_res = true_y - noneiv_mean
if scale_outputs:
scale = train_data.dataset.std_labels.to(device)
true_scaled_res = true_res * scale.view((1,-1))
else:
true_scaled_res = true_res
# append residual
true_scaled_res_collection.append(true_scaled_res)
# restore net
if training_state:
net.train()
else:
net.eval()
if i>0:
# check that the true_x used are the same for each
# seed by comparing their sums
assert noneiv_true_x_sum == old_noneiv_true_x_sum
old_noneiv_true_x_sum = noneiv_true_x_sum
# concatenate batches along batch dimension
true_scaled_res_collection =\
torch.concat(true_scaled_res_collection, dim=0)
noneiv_residual_collection.append(true_scaled_res_collection)
# EiV
init_std_y = eiv_conf_dict["init_std_y_list"][0]
unscaled_reg = eiv_conf_dict["unscaled_reg"]
p = eiv_conf_dict["p"]
hidden_layers = eiv_conf_dict["hidden_layers"]
fixed_std_x = eiv_conf_dict["fixed_std_x"]
saved_file = os.path.join('saved_networks',
f'eiv_{short_dataname}'\
f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
f'_p_{p:.2f}_fixed_std_x_{fixed_std_x:.3f}'\
f'_seed_{seed}.pkl')
net = Networks.FNNEIV(p=p, init_std_y=init_std_y,
h=[input_dim, *hidden_layers, output_dim],
fixed_std_x=fixed_std_x).to(device)
# load network
train_and_store.open_stored_training(saved_file=saved_file,
net=net, device=device)
# reinitialize dataloader to get the same true_x
true_test_dataloader = DataLoader(true_test_data,
batch_size=int(np.min((len(test_data), test_batch_size))),
shuffle=False)
true_scaled_res_collection = []
# variable to be used for checking
# that we loop over the same true_x for each seed
eiv_true_x_sum = 0
for j, (true_x, true_y, noisy_x, _) in\
enumerate(true_test_dataloader):
if j >= test_samples:
break
# store the sum of the true_x
eiv_true_x_sum += true_x.abs().sum().item()
true_x, true_y, noisy_x =\
true_x.to(device), true_y.to(device),\
noisy_x.to(device)
# Residuals
training_state = net.training
noise_state = net.noise_is_on
net.train()
net.noise_on()
not_averaged_predictions = net.predict(noisy_x,\
number_of_draws=eiv_number_of_draws,
take_average_of_prediction=False)
eiv_mean = torch.mean(not_averaged_predictions[0], dim=1)
if len(true_y.shape) <= 1:
true_y = true_y.view((-1,1))
assert true_y.shape == eiv_mean.shape
true_res = true_y - eiv_mean
if scale_outputs:
scale = train_data.dataset.std_labels.to(device)
true_scaled_res = true_res * scale.view((1,-1))
else:
true_scaled_res = true_res
# append residuals
true_scaled_res_collection.append(true_scaled_res)
# restore net
if training_state:
net.train()
else:
net.eval()
if noise_state:
net.noise_on()
else:
net.noise_off()
# check whether EiV and non-EiV used the same true_x for each
# seed by comparing their sum
assert eiv_true_x_sum == noneiv_true_x_sum
if i>0:
assert eiv_true_x_sum == old_eiv_true_x_sum
old_eiv_true_x_sum = eiv_true_x_sum
# concatenate batches along the batch dimension
true_scaled_res_collection =\
torch.concat(true_scaled_res_collection, dim=0)
eiv_residual_collection.append(true_scaled_res_collection)
## Store quantities
# Compute and store (averaged) x-dependent bias
if type(load_data) == repeated_sampling and\
len(noneiv_residual_collection) > 0 and\
len(eiv_residual_collection) > 0:
noneiv_residual_collection = torch.stack(\
tuple(noneiv_residual_collection), dim=-1)
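# after stacking the shape is (n_test_points, out_dim, n_seeds) (assuming the
# per-seed residuals collected above); averaging over the seed dimension gives
# the x-dependent bias, and its mean absolute value the reported average bias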
bias_per_x = torch.mean(noneiv_residual_collection, dim=-1)
avg_bias = torch.mean(torch.abs(bias_per_x))
noneiv_metrics['avg_bias'] = avg_bias
eiv_residual_collection = torch.stack(tuple(eiv_residual_collection),\
dim=-1)
bias_per_x = torch.mean(eiv_residual_collection, dim=-1)
avg_bias = torch.mean(torch.abs(bias_per_x))
eiv_metrics['avg_bias'] = avg_bias
return noneiv_metrics, eiv_metrics
# single seed metrics
noneiv_metrics_collection = {}
eiv_metrics_collection = {}
collection_keys = []
@@ -242,6 +452,15 @@ num_test_epochs = 10
assert noneiv_conf_dict["seed_range"] == eiv_conf_dict["seed_range"]
seed_list = range(noneiv_conf_dict["seed_range"][0],
noneiv_conf_dict["seed_range"][1])
max_batch_number = 2
for seed in tqdm(seed_list):
try:
@@ -280,22 +499,51 @@ for seed in tqdm(seed_list):
noneiv_metrics_collection[key].append(noneiv_metrics[key])
eiv_metrics_collection[key].append(eiv_metrics[key])
# full seed range metrics
print('Computing metrics that use all seeds at once...')
noneiv_full_seed_range_metrics, eiv_full_seed_range_metrics =\
collect_full_seed_range_metrics(load_data=load_data,\
seed_range=seed_list)
# add keys to collection_keys
assert noneiv_full_seed_range_metrics.keys() ==\
eiv_full_seed_range_metrics.keys()
full_seed_range_collection_keys = list(noneiv_full_seed_range_metrics.keys())
collection_keys += full_seed_range_collection_keys
results_dict = {}
print('Non-EiV:\n-----')
results_dict['noneiv'] = {}
for key in collection_keys:
metric_mean = float(np.mean(noneiv_metrics_collection[key]))
metric_std = float(np.std(noneiv_metrics_collection[key])/np.sqrt(num_test_epochs*len(seed_list)))
results_dict['noneiv'][key] = (metric_mean, metric_std)
print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
if key not in full_seed_range_collection_keys:
# per seed metrics
metric_mean = float(np.mean(noneiv_metrics_collection[key]))
metric_std = float(np.std(noneiv_metrics_collection[key])/\
np.sqrt(num_test_epochs*len(seed_list)))
results_dict['noneiv'][key] = (metric_mean, metric_std)
print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
else:
# full seed range metrics (without a std)
metric = float(noneiv_full_seed_range_metrics[key])
results_dict['noneiv'][key] = metric
print(f'{key}: {metric:.5f} (NaN)')
print('\n')
print('EiV:\n-----')
results_dict['eiv'] = {}
for key in collection_keys:
metric_mean = float(np.mean(eiv_metrics_collection[key]))
metric_std = float(np.std(eiv_metrics_collection[key])/np.sqrt(num_test_epochs*len(seed_list)))
print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
results_dict['eiv'][key] = (metric_mean, metric_std)
if key not in full_seed_range_collection_keys:
# per seed metrics
metric_mean = float(np.mean(eiv_metrics_collection[key]))
metric_std = float(np.std(eiv_metrics_collection[key])/\
np.sqrt(num_test_epochs*len(seed_list)))
print(f'{key}: {metric_mean:.5f} ({metric_std:.5f})')
results_dict['eiv'][key] = (metric_mean, metric_std)
else:
# full seed range metrics (without a std)
metric = float(eiv_full_seed_range_metrics[key])
results_dict['eiv'][key] = metric
print(f'{key}: {metric:.5f} (NaN)')
# write results to a JSON file in the results folder
with open(os.path.join('results',f'metrics_{short_dataname}.json'), 'w') as f:
......