Skip to content
Snippets Groups Projects
Commit 6e80eaff authored by Jörg Martin
Browse files

Included repeated sampling for simulated data

parents 667997a2 bf35c5dc
Branches
No related tags found
No related merge requests found
......@@ -2,6 +2,8 @@ import torch
import sys
from torch.utils.data import TensorDataset
from EIVGeneral.manipulate_tensors import add_noise
# Number of (x, y) pairs that `load_data` generates before splitting them
# into a train and a test part.
total_number_of_datapoints = 2000
# Lower and upper end of the interval from which `load_data` draws the
# noise-free inputs uniformly.
input_range = [-1,1]
# Coefficient of the linear ground truth `true_y = slope * true_x + intercept`.
slope = 1.0
......@@ -9,17 +11,6 @@ intercept = 0.0
# Standard deviation of the Gaussian noise that is added to the inputs x.
x_noise_strength = 0.05
# Standard deviation of the Gaussian noise that is added to the outputs y.
y_noise_strength = 0.1
def get_normalization(*args):
    """
    Collect the normalization statistics of every tensor in *args.

    :param args: Tensors; mean and standard deviation are taken along
    dimension 0, which is kept with size 1.
    :returns: A tuple holding one pair `(mean, std)` per tensor, in the order
    in which the tensors were passed.
    """
    return tuple(
        (tensor.mean(dim=0, keepdim=True), tensor.std(dim=0, keepdim=True))
        for tensor in args
    )
def load_data(seed=0, splitting_part=0.8, normalize=True,
return_ground_truth=False):
"""
......@@ -37,26 +28,20 @@ def load_data(seed=0, splitting_part=0.8, normalize=True,
"""
random_generator = torch.Generator().manual_seed(seed)
# draw different seeds for noise and splitting
seeds = torch.randint(0,sys.maxsize,(3,), generator=random_generator)
seeds = [int(t) for t in torch.randint(0,sys.maxsize,(3,),\
generator=random_generator)]
# create new generators from tensor seeds
create_generator = lambda tensor_seed:\
torch.Generator().manual_seed(tensor_seed.item())
true_x = input_range[0] + (input_range[1]-input_range[0])\
* torch.rand((total_number_of_datapoints,1),
generator=create_generator(seeds[0]))
generator=torch.Generator().manual_seed(seeds[0]))
true_y = slope * true_x + intercept
noisy_x = true_x + x_noise_strength * \
torch.randn((total_number_of_datapoints,1),
generator=create_generator(seeds[1]))
noisy_y = true_y + y_noise_strength * \
torch.randn((total_number_of_datapoints,1),
generator=create_generator(seeds[2]))
if normalize:
normalization_x, normalization_y = get_normalization(noisy_x, noisy_y)
noisy_x = (noisy_x-normalization_x[0])/normalization_x[1]
true_x = (true_x-normalization_x[0])/normalization_x[1]
noisy_y = (noisy_y-normalization_y[0])/normalization_y[1]
true_y = (true_y-normalization_y[0])/normalization_y[1]
# add noise and normalize x and y
(noisy_x, noisy_y), (true_x, true_y) = add_noise(
tensor_list=(true_x, true_y),
noise_strength_list=(x_noise_strength, y_noise_strength),
seed_list=seeds[1:3],
normalize=normalize)
# create datasets
dataset_len = noisy_x.shape[0]
train_len = int(dataset_len*splitting_part)
test_len = dataset_len - train_len
......@@ -75,4 +60,3 @@ def load_data(seed=0, splitting_part=0.8, normalize=True,
else:
return linear_trainset, linear_testset, true_linear_trainset,\
true_linear_testset
......@@ -2,6 +2,8 @@ import torch
import sys
from torch.utils.data import TensorDataset
from EIVGeneral.manipulate_tensors import add_noise
# Number of (x, y) pairs that `load_data` generates before splitting them
# into a train and a test part.
total_number_of_datapoints = 2000
# Lower and upper end of the interval from which `load_data` draws the
# noise-free inputs uniformly.
input_range = [-1,1]
# Coefficient of the quadratic ground truth
# `true_y = slope * true_x**2 + intercept`.
slope = 1.0
......@@ -9,18 +11,6 @@ intercept = 0.0
# Standard deviation of the Gaussian noise that is added to the inputs x.
x_noise_strength = 0.05
# Standard deviation of the Gaussian noise that is added to the outputs y.
y_noise_strength = 0.1
def get_normalization(*args):
    """
    Collect the normalization statistics of every tensor in *args.

    :param args: Tensors; mean and standard deviation are taken along
    dimension 0, which is kept with size 1.
    :returns: A tuple holding one pair `(mean, std)` per tensor, in the order
    in which the tensors were passed.
    """
    return tuple(
        (tensor.mean(dim=0, keepdim=True), tensor.std(dim=0, keepdim=True))
        for tensor in args
    )
def load_data(seed=0, splitting_part=0.8, normalize=True,
return_ground_truth=False):
"""
......@@ -38,26 +28,20 @@ def load_data(seed=0, splitting_part=0.8, normalize=True,
"""
random_generator = torch.Generator().manual_seed(seed)
# draw different seeds for noise and splitting
seeds = torch.randint(0,sys.maxsize,(3,), generator=random_generator)
seeds = [int(t) for t in torch.randint(0,sys.maxsize,(3,),\
generator=random_generator)]
# create new generators from tensor seeds
create_generator = lambda tensor_seed:\
torch.Generator().manual_seed(tensor_seed.item())
true_x = input_range[0] + (input_range[1]-input_range[0])\
* torch.rand((total_number_of_datapoints,1),
generator=create_generator(seeds[0]))
generator=torch.Generator().manual_seed(seeds[0]))
true_y = slope * true_x**2 + intercept
noisy_x = true_x + x_noise_strength * \
torch.randn((total_number_of_datapoints,1),
generator=create_generator(seeds[1]))
noisy_y = true_y + y_noise_strength * \
torch.randn((total_number_of_datapoints,1),
generator=create_generator(seeds[2]))
if normalize:
normalization_x, normalization_y = get_normalization(noisy_x, noisy_y)
noisy_x = (noisy_x-normalization_x[0])/normalization_x[1]
true_x = (true_x-normalization_x[0])/normalization_x[1]
noisy_y = (noisy_y-normalization_y[0])/normalization_y[1]
true_y = (true_y-normalization_y[0])/normalization_y[1]
# add noise and normalize x and y
(noisy_x, noisy_y), (true_x, true_y) = add_noise(
tensor_list=(true_x, true_y),
noise_strength_list=(x_noise_strength, y_noise_strength),
seed_list=seeds[1:3],
normalize=normalize)
# create datasets
dataset_len = noisy_x.shape[0]
train_len = int(dataset_len*splitting_part)
test_len = dataset_len - train_len
......@@ -76,4 +60,3 @@ def load_data(seed=0, splitting_part=0.8, normalize=True,
else:
return quadratic_trainset, quadratic_testset, true_quadratic_trainset,\
true_quadratic_testset
"""
Repeated sampling from the linear dataset.
"""
from EIVData import linear
from EIVData.repeated_sampling import repeated_sampling
# Seed handed to `repeated_sampling` for loading the ground truth; it stays
# the same for every repetition.
fixed_seed = 0
# Callable with the keyword arguments `seed`, `splitting_part`, `normalize`
# and `return_ground_truth`; only the noise depends on `seed`.
load_data = repeated_sampling(dataclass=linear, fixed_seed=fixed_seed)
"""
Repeated sampling from the quadratic dataset.
"""
from EIVData import quadratic
from EIVData.repeated_sampling import repeated_sampling
# Seed handed to `repeated_sampling` for loading the ground truth; it stays
# the same for every repetition.
fixed_seed = 0
# Callable with the keyword arguments `seed`, `splitting_part`, `normalize`
# and `return_ground_truth`; only the noise depends on `seed`.
load_data = repeated_sampling(dataclass=quadratic, fixed_seed=fixed_seed)
"""
Contains the class `repeated_sampling` that can be used to generate
datasets for repeated sampling from datasets with a ground truth.
"""
import sys
import torch
from torch.utils.data import TensorDataset
from EIVGeneral.manipulate_tensors import add_noise
class repeated_sampling():
    """
    A class for repeated sampling from datasets with a known ground truth and
    known input and output noise. The ground truth is always loaded with the
    same seed (`fixed_seed`); only the noise that is added on top of it depends
    on the seed passed when calling the instance.

    :param dataclass: A module that contains a routine `load_data`, which
    accepts the keywords `seed`, `splitting_part`, `normalize` and
    `return_ground_truth` and returns the noisy and true train and test
    datasets, and two positive floats `x_noise_strength` and
    `y_noise_strength` that are used as the standard deviation of the input
    and output noise.
    :param fixed_seed: Integer. The seed to load the unnoisy ground truth,
    defaults to 0.
    """
    def __init__(self, dataclass, fixed_seed=0):
        self.dataclass = dataclass
        self.fixed_seed = fixed_seed
        self.x_noise_strength = dataclass.x_noise_strength
        self.y_noise_strength = dataclass.y_noise_strength

    def __call__(self, seed=0, splitting_part=0.8, normalize=True,
            return_ground_truth=False):
        """
        Draw a new noisy sample on top of the fixed ground truth.

        :param seed: Integer, seed for the noise that is added. Defaults to 0.
        :param splitting_part: Passed on to `dataclass.load_data`, defaults
        to 0.8.
        :param normalize: Boolean, defaults to True. If True, train and test
        data are both normalized with the statistics of the noisy train
        tensors returned by `dataclass.load_data`.
        :param return_ground_truth: Boolean, defaults to False. If True the
        datasets containing the unnoisy tensors are returned as well.
        :returns: trainset, testset, (if return_ground_truth) true_trainset,
        true_testset
        """
        # Load the ground truth without normalization: the noise strengths
        # refer to the original scale, and normalization (if requested) is
        # applied once by `add_noise` below. Relying on the default
        # `normalize=True` of `load_data` would add the noise on an already
        # rescaled ground truth and ignore `normalize=False`.
        _, _, true_trainset, true_testset = self.dataclass.load_data(
                seed=self.fixed_seed, splitting_part=splitting_part,
                normalize=False, return_ground_truth=True)
        # NOTE(review): assumes the true datasets hold the tensors
        # (true_x, true_y, noisy_x, noisy_y) -- confirm against `load_data`.
        true_train_x, true_train_y = true_trainset.tensors[:2]
        true_test_x, true_test_y = true_testset.tensors[:2]
        # normalize both datasets with the train set
        normalization_list = true_trainset.tensors[2:]
        random_generator = torch.Generator().manual_seed(seed)
        # Draw separate seeds for the train and the test noise, so that the
        # two noise realizations do not come from identically seeded
        # generators. The train seeds are drawn first and therefore coincide
        # with the seeds used before this distinction was made.
        train_seeds = [int(t) for t in torch.randint(0, sys.maxsize, (2,),
            generator=random_generator)]
        test_seeds = [int(t) for t in torch.randint(0, sys.maxsize, (2,),
            generator=random_generator)]
        noise_strengths = (self.x_noise_strength, self.y_noise_strength)
        (noisy_train_x, noisy_train_y), (true_train_x, true_train_y) =\
                add_noise((true_train_x, true_train_y),
                        noise_strengths, train_seeds,
                        normalize=normalize,
                        normalization_list=normalization_list)
        (noisy_test_x, noisy_test_y), (true_test_x, true_test_y) =\
                add_noise((true_test_x, true_test_y),
                        noise_strengths, test_seeds,
                        normalize=normalize,
                        normalization_list=normalization_list)
        trainset = TensorDataset(noisy_train_x, noisy_train_y)
        testset = TensorDataset(noisy_test_x, noisy_test_y)
        true_trainset = TensorDataset(true_train_x, true_train_y,
                noisy_train_x, noisy_train_y)
        true_testset = TensorDataset(true_test_x, true_test_y,
                noisy_test_x, noisy_test_y)
        if not return_ground_truth:
            return trainset, testset
        else:
            return trainset, testset, true_trainset, true_testset
"""
Collection of functions to manipulate tensors
"""
import torch
def get_normalization(t):
    """
    Compute the statistics used to normalize the tensor `t`.

    :param t: A torch.tensor; the statistics are taken along dimension 0,
    which is kept with size 1.
    :returns: The tuple `(mean, std)`
    """
    return t.mean(dim=0, keepdim=True), t.std(dim=0, keepdim=True)
def normalize_tensor(t, mean_std):
    """
    Shift and rescale the tensor `t` with the statistics in `mean_std`.

    :param t: The tensor to normalize.
    :param mean_std: A pair; `mean_std[0]` is the mean that is subtracted
    and `mean_std[1]` the standard deviation by which the result is divided.
    :returns: The normalized tensor
    """
    mean, std = mean_std[0], mean_std[1]
    centered = t - mean
    return centered / std
def add_noise(tensor_list, noise_strength_list, seed_list, normalize=True,
        normalization_list = None):
    """
    Takes the tensors in `tensor_list`, adds random noise using the standard
    deviations in `noise_strength_list` and the seeds in `seed_list`, then, if
    normalize is True (default), computes according normalization and returns
    the normalized noisy tensors and the normalized unnoisy tensors. If
    `normalize` is False, no normalization is performed and the second returned
    list will coincide with `tensor_list`.

    :param tensor_list: A list of torch.tensors
    :param noise_strength_list: A list of positive floats, one per tensor.
    :param seed_list: A list of integers, one per tensor.
    :param normalize: A Boolean, defaults to True.
    :param normalization_list: Either None (default) or a list of tensors,
    one per tensor in `tensor_list`. If the latter and `normalize` is True,
    mean and standard deviation of these tensors are used for normalization
    instead of those of the noisy tensors. Ignored if `normalize` is False.
    :returns: noisy_tensor_list, unnoisy_tensor_list, both normalized if
    `normalize` is True
    :raises ValueError: If the supplied lists differ in length.
    """
    # Explicit checks instead of `assert` (which is stripped under -O) and
    # instead of letting `zip` silently drop surplus entries.
    if not (len(tensor_list) == len(noise_strength_list) == len(seed_list)):
        raise ValueError('tensor_list, noise_strength_list and seed_list '
                'must have the same length')
    if normalization_list is not None\
            and len(normalization_list) != len(tensor_list):
        raise ValueError('normalization_list must have the same length as '
                'tensor_list')
    noisy_t_list = []
    unnoisy_t_list = []
    for i, (t, noise, seed) in enumerate(zip(tensor_list,
            noise_strength_list, seed_list)):
        # a fresh generator per tensor makes each noise draw depend on its
        # own seed only
        generator = torch.Generator().manual_seed(seed)
        noisy_t = t + noise * torch.randn(t.shape, generator=generator)
        if normalize:
            if normalization_list is not None:
                reference = normalization_list[i]
            else:
                reference = noisy_t
            noisy_t_normalization = get_normalization(reference)
            # noisy and unnoisy tensor share the same statistics
            noisy_t = normalize_tensor(noisy_t, noisy_t_normalization)
            t = normalize_tensor(t, noisy_t_normalization)
        noisy_t_list.append(noisy_t)
        unnoisy_t_list.append(t)
    return noisy_t_list, unnoisy_t_list
{
"long_dataname": "repeated_linear",
"short_dataname": "replin",
"lr": 1e-3,
"batch_size": 64,
"test_batch_size": 800,
"number_of_epochs": 100,
"unscaled_reg": 10,
"report_point": 5,
"p": 0.1,
"lr_update": 20,
"std_y_update_points": [1,40],
"eiv_prediction_number_of_draws": [100,5],
"eiv_prediction_number_of_batches": 10,
"init_std_y_list": [0.5],
"gamma": 0.5,
"hidden_layers": [128, 128, 128, 128],
"fixed_std_x": 0.05,
"seed_range": [0,10],
"gpu_number": 1
}
{
"long_dataname": "repeated_quadratic",
"short_dataname": "repquad",
"lr": 1e-3,
"batch_size": 64,
"test_batch_size": 800,
"number_of_epochs": 100,
"unscaled_reg": 10,
"report_point": 5,
"p": 0.1,
"lr_update": 20,
"std_y_update_points": [1,40],
"eiv_prediction_number_of_draws": [100,5],
"eiv_prediction_number_of_batches": 10,
"init_std_y_list": [0.5],
"gamma": 0.5,
"hidden_layers": [128, 128, 128, 128],
"fixed_std_x": 0.05,
"seed_range": [0,10],
"gpu_number": 1
}
{
"long_dataname": "repeated_linear",
"short_dataname": "replin",
"lr": 1e-3,
"batch_size": 64,
"test_batch_size": 800,
"number_of_epochs": 100,
"unscaled_reg": 10,
"report_point": 5,
"p": 0.1,
"lr_update": 20,
"std_y_update_points": [1,40] ,
"noneiv_prediction_number_of_draws": 100,
"noneiv_prediction_number_of_batches": 10,
"init_std_y_list": [0.5],
"gamma": 0.5,
"hidden_layers": [128, 128, 128, 128],
"seed_range": [0,10],
"gpu_number": 1
}
{
"long_dataname": "repeated_quadratic",
"short_dataname": "repquad",
"lr": 1e-3,
"batch_size": 64,
"test_batch_size": 800,
"number_of_epochs": 100,
"unscaled_reg": 10,
"report_point": 5,
"p": 0.1,
"lr_update": 20,
"std_y_update_points": [1,40] ,
"noneiv_prediction_number_of_draws": 100,
"noneiv_prediction_number_of_batches": 10,
"init_std_y_list": [0.5],
"gamma": 0.5,
"hidden_layers": [128, 128, 128, 128],
"seed_range": [0,10],
"gpu_number": 1
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment