Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
J
journal_eiv
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Jörg Martin
journal_eiv
Commits
c44c4524
Commit
c44c4524
authored
3 years ago
by
Jörg Martin
Browse files
Options
Downloads
Patches
Plain Diff
Included full_seed range quantities in evaluate_metrics.
This now allows computing (the average of) an x-dependent bias.
parent
947c258d
Branches
Branches containing commit
Tags
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Experiments/evaluate_metrics.py
+259
-11
259 additions, 11 deletions
Experiments/evaluate_metrics.py
with
259 additions
and
11 deletions
Experiments/evaluate_metrics.py
+
259
−
11
View file @
c44c4524
...
...
@@ -16,10 +16,11 @@ from tqdm import tqdm
from
EIVArchitectures
import
Networks
from
EIVTrainingRoutines
import
train_and_store
from
EIVGeneral.coverage_metrics
import
epistemic_coverage
,
normalized_std
from
EIVData.repeated_sampling
import
repeated_sampling
# read in data via --data option
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"
--data
"
,
help
=
"
Loads data
"
,
default
=
'
lin
ear
'
)
parser
.
add_argument
(
"
--data
"
,
help
=
"
Loads data
"
,
default
=
'
rep
lin
'
)
parser
.
add_argument
(
"
--no-autoindent
"
,
help
=
""
,
action
=
"
store_true
"
)
# to avoid conflicts in IPython
args
=
parser
.
parse_args
()
...
...
@@ -64,8 +65,8 @@ def collect_metrics(x_y_pairs, seed=0,
decouple_dimensions
=
False
,
device
=
device
,
scale_outputs
=
scale_outputs
):
"""
Compute various metrics for EiV and non-EiV
. Will be returned as
dictionaries.
Compute various metrics for EiV and non-EiV
for single seeds. Will be
returned as
dictionaries.
:param x_y_pairs: A tuple of either the shape (None,None,x,y) or
(x_true,y_true,x,y) containing torch.tensor or None. x and y are
considered as input and corresponding label. If the first two components
...
...
@@ -235,6 +236,215 @@ def collect_metrics(x_y_pairs, seed=0,
return
noneiv_metrics
,
eiv_metrics
def _load_data_for_seed(load_data, seed):
    """
    Load the datasets belonging to `seed`, with ground truth if available.
    :param load_data: Callable taking `seed` and, optionally,
    `return_ground_truth`.
    :param seed: The seed handed to `load_data`.
    :returns: train_data, test_data, true_train_data, true_test_data. The
    last two are None if `load_data` raised a TypeError when called with
    `return_ground_truth=True`.
    """
    try:
        train_data, test_data, true_train_data, true_test_data \
                = load_data(seed=seed, return_ground_truth=True)
    except TypeError:
        # best effort: treated as "load_data has no `return_ground_truth`"
        train_data, test_data = load_data(seed=seed)
        true_train_data, true_test_data = None, None
    return train_data, test_data, true_train_data, true_test_data


def _load_stored_noneiv_net(seed, device):
    """
    Rebuild the non-EiV network from `noneiv_conf_dict` and load the
    training stored for `seed` from the `saved_networks` folder.
    """
    init_std_y = noneiv_conf_dict["init_std_y_list"][0]
    unscaled_reg = noneiv_conf_dict["unscaled_reg"]
    p = noneiv_conf_dict["p"]
    hidden_layers = noneiv_conf_dict["hidden_layers"]
    saved_file = os.path.join('saved_networks',
            f'noneiv_{short_dataname}'\
            f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
            f'_p_{p:.2f}_seed_{seed}.pkl')
    net = Networks.FNNBer(p=p, init_std_y=init_std_y,
            h=[input_dim, *hidden_layers, output_dim]).to(device)
    train_and_store.open_stored_training(saved_file=saved_file,
            net=net, device=device)
    return net


def _load_stored_eiv_net(seed, device):
    """
    Rebuild the EiV network from `eiv_conf_dict` and load the training
    stored for `seed` from the `saved_networks` folder.
    """
    init_std_y = eiv_conf_dict["init_std_y_list"][0]
    unscaled_reg = eiv_conf_dict["unscaled_reg"]
    p = eiv_conf_dict["p"]
    hidden_layers = eiv_conf_dict["hidden_layers"]
    fixed_std_x = eiv_conf_dict["fixed_std_x"]
    saved_file = os.path.join('saved_networks',
            f'eiv_{short_dataname}'\
            f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
            f'_p_{p:.2f}_fixed_std_x_{fixed_std_x:.3f}'\
            f'_seed_{seed}.pkl')
    net = Networks.FNNEIV(p=p, init_std_y=init_std_y,
            h=[input_dim, *hidden_layers, output_dim],
            fixed_std_x=fixed_std_x).to(device)
    train_and_store.open_stored_training(saved_file=saved_file,
            net=net, device=device)
    return net


def _collect_true_residuals(net, true_test_data, batch_size, test_samples,
        number_of_draws, device, scale=None, with_input_noise=False):
    """
    Compute the residuals `true_y - mean prediction` of `net` on the first
    `test_samples` batches of `true_test_data`.
    :param net: The network, already loaded and on `device`.
    :param true_test_data: Dataset yielding 4-tuples, unpacked here as
    (true_x, true_y, noisy_x, _).
    :param batch_size: Batch size of the (unshuffled) dataloader.
    :param test_samples: Maximal number of batches to use.
    :param number_of_draws: Handed to `net.predict`.
    :param device: The torch.device to use.
    :param scale: None or a tensor the residuals get multiplied with (after
    being viewed as a row).
    :param with_input_noise: If True, `net.noise_on()` is called before
    predicting and the previous noise state is restored afterwards.
    :returns: residuals (batches concatenated along dimension 0) and the sum
    of the absolute values of all true_x used. The latter is used by the
    caller to check that the same true_x were looped over.
    """
    # a fresh, unshuffled dataloader gives the same true_x on every call
    true_test_dataloader = DataLoader(true_test_data,
            batch_size=batch_size, shuffle=False)
    residuals = []
    true_x_sum = 0
    # remember the state of the net to restore it afterwards
    training_state = net.training
    if with_input_noise:
        noise_state = net.noise_is_on
    try:
        net.train()
        if with_input_noise:
            net.noise_on()
        for j, (true_x, true_y, noisy_x, _) in \
                enumerate(true_test_dataloader):
            if j >= test_samples:
                break
            # store the sum of the true_x
            true_x_sum += true_x.abs().sum().item()
            true_y, noisy_x = true_y.to(device), noisy_x.to(device)
            not_averaged_predictions = net.predict(noisy_x,
                    number_of_draws=number_of_draws,
                    take_average_of_prediction=False)
            # presumably dimension 1 enumerates the draws - TODO confirm
            mean_prediction = torch.mean(not_averaged_predictions[0], dim=1)
            if len(true_y.shape) <= 1:
                true_y = true_y.view((-1,1))
            assert true_y.shape == mean_prediction.shape
            true_res = true_y - mean_prediction
            if scale is not None:
                true_res = true_res * scale.view((1,-1))
            residuals.append(true_res)
    finally:
        # restore net, also if the prediction failed
        if training_state:
            net.train()
        else:
            net.eval()
        if with_input_noise:
            if noise_state:
                net.noise_on()
            else:
                net.noise_off()
    # concatenate batches along batch dimension
    return torch.concat(residuals, dim=0), true_x_sum


def _average_absolute_bias(residual_collection):
    """
    Stack the per-seed residuals along a new last dimension, average over
    this dimension (the bias per x) and return the mean of its absolute
    value.
    """
    stacked_residuals = torch.stack(tuple(residual_collection), dim=-1)
    bias_per_x = torch.mean(stacked_residuals, dim=-1)
    return torch.mean(torch.abs(bias_per_x))


def collect_full_seed_range_metrics(load_data,
        seed_range, test_batch_size = 100, test_samples = 10,
        noneiv_number_of_draws = 100, eiv_number_of_draws = [100,5],
        device = device,
        scale_outputs = scale_outputs):
    """
    Collect metrics that need all seeds for their computation. Currently
    this is the (averaged) x-dependent bias, stored under the key
    'avg_bias'. It is only computed if `load_data` is a `repeated_sampling`
    and a ground truth is available; otherwise empty dictionaries are
    returned.
    :param load_data: load_data map should take seed as an argument and,
    optionally, `return_ground_truth`.
    :param seed_range: iterator for seeds.
    :param test_batch_size: An integer, used for drawing samples from the test
    data.
    :param test_samples: Number of test samples with batch size
    `test_batch_size` to take.
    :param noneiv_number_of_draws: Number of samples to take for the prediction
    of the non-EiV model. Defaults to 100.
    :param eiv_number_of_draws: Number of samples to take for the prediction
    of the EiV model. Defaults to [100,5]. The list is only passed on to
    `net.predict` and not modified in here.
    :param device: The torch.device to use. Defaults to the module level
    `device`.
    :param scale_outputs: Boolean, if True the residuals are multiplied by
    `train_data.dataset.std_labels`. Defaults to the module level
    `scale_outputs`.
    :returns: Dictionaries noneiv_metrics, eiv_metrics
    """
    noneiv_metrics = {}
    eiv_metrics = {}
    ## Compute x-dependent bias
    # only for repeated_sampling datasets; for all others there is nothing
    # to collect, so the data is not loaded at all
    # NOTE(review): assumes `repeated_sampling` is a class - confirm
    if not isinstance(load_data, repeated_sampling):
        return noneiv_metrics, eiv_metrics
    noneiv_residual_collection = []
    eiv_residual_collection = []
    # used for checking that we loop over the same true_x for each seed,
    # None as long as no seed with a ground truth was processed
    previous_true_x_sum = None
    for seed in seed_range:
        # load data according to seed
        train_data, test_data, _, true_test_data = \
                _load_data_for_seed(load_data, seed)
        # only if there is a ground truth
        if true_test_data is None:
            continue
        batch_size = int(np.min((len(test_data), test_batch_size)))
        if scale_outputs:
            scale = train_data.dataset.std_labels.to(device)
        else:
            scale = None
        # non-EiV
        net = _load_stored_noneiv_net(seed, device)
        noneiv_residuals, noneiv_true_x_sum = _collect_true_residuals(
                net, true_test_data, batch_size, test_samples,
                noneiv_number_of_draws, device, scale=scale,
                with_input_noise=False)
        # check that the used true x are the same for each
        # seed, by comparing their sum
        if previous_true_x_sum is not None:
            assert noneiv_true_x_sum == previous_true_x_sum
        # EiV
        net = _load_stored_eiv_net(seed, device)
        eiv_residuals, eiv_true_x_sum = _collect_true_residuals(
                net, true_test_data, batch_size, test_samples,
                eiv_number_of_draws, device, scale=scale,
                with_input_noise=True)
        # check whether EiV and non-EiV used the same true_x for each
        # seed by comparing their sum (together with the check above this
        # also compares the EiV sum with the one of the previous seed)
        assert eiv_true_x_sum == noneiv_true_x_sum
        previous_true_x_sum = noneiv_true_x_sum
        noneiv_residual_collection.append(noneiv_residuals)
        eiv_residual_collection.append(eiv_residuals)
    ## Store quantities
    # Compute and store (averaged) x-dependent bias
    if len(noneiv_residual_collection) > 0 and \
            len(eiv_residual_collection) > 0:
        noneiv_metrics['avg_bias'] = \
                _average_absolute_bias(noneiv_residual_collection)
        eiv_metrics['avg_bias'] = \
                _average_absolute_bias(eiv_residual_collection)
    return noneiv_metrics, eiv_metrics
# single seed metrics
noneiv_metrics_collection
=
{}
eiv_metrics_collection
=
{}
collection_keys
=
[]
...
...
@@ -242,6 +452,15 @@ num_test_epochs = 10
assert
noneiv_conf_dict
[
"
seed_range
"
]
==
eiv_conf_dict
[
"
seed_range
"
]
seed_list
=
range
(
noneiv_conf_dict
[
"
seed_range
"
][
0
],
noneiv_conf_dict
[
"
seed_range
"
][
1
])
max_batch_number
=
2
for
seed
in
tqdm
(
seed_list
):
try
:
...
...
@@ -280,22 +499,51 @@ for seed in tqdm(seed_list):
noneiv_metrics_collection
[
key
].
append
(
noneiv_metrics
[
key
])
eiv_metrics_collection
[
key
].
append
(
eiv_metrics
[
key
])
# full seed range metrics
print
(
'
Computing metrics that use all seeds at once...
'
)
noneiv_full_seed_range_metrics
,
eiv_full_seed_range_metrics
=
\
collect_full_seed_range_metrics
(
load_data
=
load_data
,
\
seed_range
=
seed_list
)
# add keys to collection_keys
assert
noneiv_full_seed_range_metrics
.
keys
()
==
\
eiv_full_seed_range_metrics
.
keys
()
full_seed_range_collection_keys
=
list
(
noneiv_full_seed_range_metrics
.
keys
())
collection_keys
+=
full_seed_range_collection_keys
results_dict
=
{}
print
(
'
Non-EiV:
\n
-----
'
)
results_dict
[
'
noneiv
'
]
=
{}
for
key
in
collection_keys
:
metric_mean
=
float
(
np
.
mean
(
noneiv_metrics_collection
[
key
]))
metric_std
=
float
(
np
.
std
(
noneiv_metrics_collection
[
key
])
/
np
.
sqrt
(
num_test_epochs
*
len
(
seed_list
)))
results_dict
[
'
noneiv
'
][
key
]
=
(
metric_mean
,
metric_std
)
print
(
f
'
{
key
}
:
{
metric_mean
:
.
5
f
}
(
{
metric_std
:
.
5
f
}
)
'
)
if
key
not
in
full_seed_range_collection_keys
:
# per seed metrics
metric_mean
=
float
(
np
.
mean
(
noneiv_metrics_collection
[
key
]))
metric_std
=
float
(
np
.
std
(
noneiv_metrics_collection
[
key
])
/
\
np
.
sqrt
(
num_test_epochs
*
len
(
seed_list
)))
results_dict
[
'
noneiv
'
][
key
]
=
(
metric_mean
,
metric_std
)
print
(
f
'
{
key
}
:
{
metric_mean
:
.
5
f
}
(
{
metric_std
:
.
5
f
}
)
'
)
else
:
# full seed range metrics (without a std)
metric
=
float
(
noneiv_full_seed_range_metrics
[
key
])
results_dict
[
'
noneiv
'
][
key
]
=
metric
print
(
f
'
{
key
}
:
{
metric
:
.
5
f
}
(NaN)
'
)
print
(
'
\n
'
)
print
(
'
EiV:
\n
-----
'
)
results_dict
[
'
eiv
'
]
=
{}
for
key
in
collection_keys
:
metric_mean
=
float
(
np
.
mean
(
eiv_metrics_collection
[
key
]))
metric_std
=
float
(
np
.
std
(
eiv_metrics_collection
[
key
])
/
np
.
sqrt
(
num_test_epochs
*
len
(
seed_list
)))
print
(
f
'
{
key
}
:
{
metric_mean
:
.
5
f
}
(
{
metric_std
:
.
5
f
}
)
'
)
results_dict
[
'
eiv
'
][
key
]
=
(
metric_mean
,
metric_std
)
if
key
not
in
full_seed_range_collection_keys
:
# per seed metrics
metric_mean
=
float
(
np
.
mean
(
eiv_metrics_collection
[
key
]))
metric_std
=
float
(
np
.
std
(
eiv_metrics_collection
[
key
])
/
\
np
.
sqrt
(
num_test_epochs
*
len
(
seed_list
)))
print
(
f
'
{
key
}
:
{
metric_mean
:
.
5
f
}
(
{
metric_std
:
.
5
f
}
)
'
)
results_dict
[
'
eiv
'
][
key
]
=
(
metric_mean
,
metric_std
)
else
:
# full seed range metrics (without a std)
metric
=
float
(
eiv_full_seed_range_metrics
[
key
])
results_dict
[
'
eiv
'
][
key
]
=
metric
print
(
f
'
{
key
}
:
{
metric
:
.
5
f
}
(NaN)
'
)
# write results to a JSON file in the results folder
with
open
(
os
.
path
.
join
(
'
results
'
,
f
'
metrics_
{
short_dataname
}
.json
'
),
'
w
'
)
as
f
:
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment