diff --git a/EIVPackage/EIVData/linear.py b/EIVPackage/EIVData/linear.py index 9b69b6b310cf61382e8d9f5204f6ae18a2d1c2fc..75c7e400e558b3825b700ce9d4108786e4a9bc62 100644 --- a/EIVPackage/EIVData/linear.py +++ b/EIVPackage/EIVData/linear.py @@ -2,6 +2,8 @@ import torch import sys from torch.utils.data import TensorDataset +from EIVGeneral.manipulate_tensors import add_noise + total_number_of_datapoints = 2000 input_range = [-1,1] slope = 1.0 @@ -9,17 +11,6 @@ intercept = 0.0 x_noise_strength = 0.05 y_noise_strength = 0.1 -def get_normalization(*args): - """ - Returns the mean and standard deviations (in tuples) of the tensors in *args. - """ - normalization_collection = [] - for t in args: - t_mean = torch.mean(t, dim=0, keepdim=True) - t_std = torch.std(t, dim=0, keepdim=True) - normalization_collection.append((t_mean, t_std)) - return tuple(normalization_collection) - def load_data(seed=0, splitting_part=0.8, normalize=True, return_ground_truth=False): """ @@ -37,26 +28,20 @@ def load_data(seed=0, splitting_part=0.8, normalize=True, """ random_generator = torch.Generator().manual_seed(seed) # draw different seeds for noise and splitting - seeds = torch.randint(0,sys.maxsize,(3,), generator=random_generator) + seeds = [int(t) for t in torch.randint(0,sys.maxsize,(3,),\ + generator=random_generator)] # create new generators from tensor seeds - create_generator = lambda tensor_seed:\ - torch.Generator().manual_seed(tensor_seed.item()) true_x = input_range[0] + (input_range[1]-input_range[0])\ * torch.rand((total_number_of_datapoints,1), - generator=create_generator(seeds[0])) + generator=torch.Generator().manual_seed(seeds[0])) true_y = slope * true_x + intercept - noisy_x = true_x + x_noise_strength * \ - torch.randn((total_number_of_datapoints,1), - generator=create_generator(seeds[1])) - noisy_y = true_y + y_noise_strength * \ - torch.randn((total_number_of_datapoints,1), - generator=create_generator(seeds[2])) - if normalize: - normalization_x, normalization_y = 
get_normalization(noisy_x, noisy_y) - noisy_x = (noisy_x-normalization_x[0])/normalization_x[1] - true_x = (true_x-normalization_x[0])/normalization_x[1] - noisy_y = (noisy_y-normalization_y[0])/normalization_y[1] - true_y = (true_y-normalization_y[0])/normalization_y[1] + # add noise and normalize x and y + (noisy_x, noisy_y), (true_x, true_y) = add_noise( + tensor_list=(true_x, true_y), + noise_strength_list=(x_noise_strength, y_noise_strength), + seed_list=seeds[1:3], + normalize=normalize) + # create datasets dataset_len = noisy_x.shape[0] train_len = int(dataset_len*splitting_part) test_len = dataset_len - train_len @@ -75,4 +60,3 @@ def load_data(seed=0, splitting_part=0.8, normalize=True, else: return linear_trainset, linear_testset, true_linear_trainset,\ true_linear_testset - diff --git a/EIVPackage/EIVData/quadratic.py b/EIVPackage/EIVData/quadratic.py index aa4f4605094822116d36e12f64e854c38849a406..a42148205f481665dc420172e5797b84ffaab4ad 100644 --- a/EIVPackage/EIVData/quadratic.py +++ b/EIVPackage/EIVData/quadratic.py @@ -2,6 +2,8 @@ import torch import sys from torch.utils.data import TensorDataset +from EIVGeneral.manipulate_tensors import add_noise + total_number_of_datapoints = 2000 input_range = [-1,1] slope = 1.0 @@ -9,18 +11,6 @@ intercept = 0.0 x_noise_strength = 0.05 y_noise_strength = 0.1 -def get_normalization(*args): - """ - Returns the mean and standard deviations (in tuples) of the tensors in - *args. 
- """ - normalization_collection = [] - for t in args: - t_mean = torch.mean(t, dim=0, keepdim=True) - t_std = torch.std(t, dim=0, keepdim=True) - normalization_collection.append((t_mean, t_std)) - return tuple(normalization_collection) - def load_data(seed=0, splitting_part=0.8, normalize=True, return_ground_truth=False): """ @@ -38,26 +28,20 @@ def load_data(seed=0, splitting_part=0.8, normalize=True, """ random_generator = torch.Generator().manual_seed(seed) # draw different seeds for noise and splitting - seeds = torch.randint(0,sys.maxsize,(3,), generator=random_generator) + seeds = [int(t) for t in torch.randint(0,sys.maxsize,(3,),\ + generator=random_generator)] # create new generators from tensor seeds - create_generator = lambda tensor_seed:\ - torch.Generator().manual_seed(tensor_seed.item()) true_x = input_range[0] + (input_range[1]-input_range[0])\ * torch.rand((total_number_of_datapoints,1), - generator=create_generator(seeds[0])) + generator=torch.Generator().manual_seed(seeds[0])) true_y = slope * true_x**2 + intercept - noisy_x = true_x + x_noise_strength * \ - torch.randn((total_number_of_datapoints,1), - generator=create_generator(seeds[1])) - noisy_y = true_y + y_noise_strength * \ - torch.randn((total_number_of_datapoints,1), - generator=create_generator(seeds[2])) - if normalize: - normalization_x, normalization_y = get_normalization(noisy_x, noisy_y) - noisy_x = (noisy_x-normalization_x[0])/normalization_x[1] - true_x = (true_x-normalization_x[0])/normalization_x[1] - noisy_y = (noisy_y-normalization_y[0])/normalization_y[1] - true_y = (true_y-normalization_y[0])/normalization_y[1] + # add noise and normalize x and y + (noisy_x, noisy_y), (true_x, true_y) = add_noise( + tensor_list=(true_x, true_y), + noise_strength_list=(x_noise_strength, y_noise_strength), + seed_list=seeds[1:3], + normalize=normalize) + # create datasets dataset_len = noisy_x.shape[0] train_len = int(dataset_len*splitting_part) test_len = dataset_len - train_len @@ -76,4 
+60,3 @@ def load_data(seed=0, splitting_part=0.8, normalize=True, else: return quadratic_trainset, quadratic_testset, true_quadratic_trainset,\ true_quadratic_testset - diff --git a/EIVPackage/EIVData/repeated_linear.py b/EIVPackage/EIVData/repeated_linear.py new file mode 100644 index 0000000000000000000000000000000000000000..6e0bd5376b8bac3631ee65590851a8da1480db5d --- /dev/null +++ b/EIVPackage/EIVData/repeated_linear.py @@ -0,0 +1,11 @@ +""" +Repeated sampling from the linear dataset. +""" +from EIVData import linear + +from EIVData.repeated_sampling import repeated_sampling + +fixed_seed = 0 + +load_data = repeated_sampling(dataclass=linear, + fixed_seed=fixed_seed) diff --git a/EIVPackage/EIVData/repeated_quadratic.py b/EIVPackage/EIVData/repeated_quadratic.py new file mode 100644 index 0000000000000000000000000000000000000000..4c409829be7ef006deff7812d79cb35e2717b6ae --- /dev/null +++ b/EIVPackage/EIVData/repeated_quadratic.py @@ -0,0 +1,11 @@ +""" +Repeated sampling from the quadratic dataset. +""" +from EIVData import quadratic + +from EIVData.repeated_sampling import repeated_sampling + +fixed_seed = 0 + +load_data = repeated_sampling(dataclass=quadratic, + fixed_seed=fixed_seed) diff --git a/EIVPackage/EIVData/repeated_sampling.py b/EIVPackage/EIVData/repeated_sampling.py new file mode 100644 index 0000000000000000000000000000000000000000..9d87db6185eacc8fd2506aaac710bfcce0dd82fe --- /dev/null +++ b/EIVPackage/EIVData/repeated_sampling.py @@ -0,0 +1,66 @@ +""" +Contains the class `repeated_sampling` that can be used to generate +datasets for repeated sampling from datasets with a ground truth. +""" +import sys + +import torch +from torch.utils.data import TensorDataset + +from EIVGeneral.manipulate_tensors import add_noise + +class repeated_sampling(): + """ + A class for repeated sampling from datasets with a known ground truth and + known input and output noise. 
The module `dataclass` should contain a + `load_data` routine that returns a ground truth, as well as two positive floats + `x_noise_strength` and `y_noise_strength` that will be used as the standard + deviation of input and output noise. + :param dataclass: A module that contains a routine `load_data`, which + accepts the keyword `return_ground_truth` and returns the noisy and true + train and test datasets, and two positive floats `x_noise_strength` and + `y_noise_strength`. + :param fixed_seed: Integer. The seed to load the unnoisy ground truth, + defaults to 0. + """ + def __init__(self, dataclass, fixed_seed=0): + self.dataclass = dataclass + self.fixed_seed = fixed_seed + self.x_noise_strength = dataclass.x_noise_strength + self.y_noise_strength = dataclass.y_noise_strength + + def __call__(self,seed=0, splitting_part=0.8, normalize=True, + return_ground_truth=False): + _, _, true_trainset, true_testset\ + = self.dataclass.load_data( + seed=self.fixed_seed, splitting_part=splitting_part, + return_ground_truth=True) + true_train_x, true_train_y = true_trainset.tensors[:2] + true_test_x, true_test_y = true_testset.tensors[:2] + random_generator = torch.Generator().manual_seed(seed) + # draw different seeds for input and output noise + seeds = [int(t) for t in torch.randint(0,sys.maxsize,(2,),\ + generator=random_generator)] + (noisy_train_x, noisy_train_y), (true_train_x, true_train_y) =\ + add_noise((true_train_x, true_train_y), + (self.x_noise_strength, self.y_noise_strength), seeds, + normalize=normalize, + normalization_list=true_trainset.tensors[2:]) + (noisy_test_x, noisy_test_y), (true_test_x, true_test_y) =\ + add_noise((true_test_x, true_test_y), + (self.x_noise_strength, self.y_noise_strength), seeds, + normalize=normalize, + # normalize both datasets with train set + normalization_list=true_trainset.tensors[2:]) + trainset = TensorDataset(noisy_train_x, noisy_train_y) + testset = TensorDataset(noisy_test_x, noisy_test_y) + true_trainset = 
TensorDataset(true_train_x, true_train_y, + noisy_train_x, noisy_train_y) + true_testset = TensorDataset(true_test_x, true_test_y, + noisy_test_x, noisy_test_y) + if not return_ground_truth: + return trainset, testset + else: + return trainset, testset, true_trainset, true_testset + + diff --git a/EIVPackage/EIVGeneral/manipulate_tensors.py b/EIVPackage/EIVGeneral/manipulate_tensors.py new file mode 100644 index 0000000000000000000000000000000000000000..20c37a63ed2be6cc6a618b0d9c7a712ba8c81df7 --- /dev/null +++ b/EIVPackage/EIVGeneral/manipulate_tensors.py @@ -0,0 +1,59 @@ +""" +Collection of functions to manipulate tensors +""" +import torch + +def get_normalization(t): + """ + Returns the mean and standard deviation (as a tuple) of the tensor `t` + """ + t_mean = torch.mean(t, dim=0, keepdim=True) + t_std = torch.std(t, dim=0, keepdim=True) + return (t_mean, t_std) + +def normalize_tensor(t, mean_std): + """ + Normalize the tensor `t` by the mean `mean_std[0]` and the standard + deviation `mean_std[1]` + """ + return (t-mean_std[0])/mean_std[1] + + +def add_noise(tensor_list, noise_strength_list, seed_list, normalize=True, + normalization_list = None): + """ + Takes the tensors in `tensor_list`, adds random noise using the standard + deviations in `noise_strength_list` and the seeds in `seed_list`, then, if + `normalize` is True (default), computes the corresponding normalization and returns + the normalized noisy tensors and the normalized unnoisy tensors. If + `normalize` is False, no normalization is performed and the second returned + list will coincide with `tensor_list`. + :param tensor_list: A list of torch.tensors + :param noise_strength_list: A list of positive floats + :param seed_list: A list of integers. + :param normalize: A Boolean, defaults to True. + :param normalization_list: Either None (default) or a list of tensors. + If the latter, the mean and standard deviation of these tensors are used + for normalization; this only takes effect if `normalize` is True. 
+ :returns: noisy_tensor_list, unnoisy_tensor_list, both normalized + """ + noisy_t_list = [] + unnoisy_t_list = [] + if normalization_list is not None: + assert len(normalization_list) == len(tensor_list) + for i, (t,noise,seed) in enumerate(zip(tensor_list, noise_strength_list,\ + seed_list)): + noisy_t = t + noise * torch.randn(t.shape, + generator=torch.Generator().manual_seed(seed)) + if normalize: + if normalization_list is not None: + noisy_t_normalization =\ + get_normalization(normalization_list[i]) + else: + noisy_t_normalization = get_normalization(noisy_t) + noisy_t = normalize_tensor(noisy_t, noisy_t_normalization) + t = normalize_tensor(t, noisy_t_normalization) + noisy_t_list.append(noisy_t) + unnoisy_t_list.append(t) + return noisy_t_list, unnoisy_t_list + diff --git a/Experiments/configurations/eiv_replin.json b/Experiments/configurations/eiv_replin.json new file mode 100644 index 0000000000000000000000000000000000000000..2aca08363f423cea6994f3654bf5c672f7b7014e --- /dev/null +++ b/Experiments/configurations/eiv_replin.json @@ -0,0 +1,21 @@ +{ + "long_dataname": "repeated_linear", + "short_dataname": "replin", + "lr": 1e-3, + "batch_size": 64, + "test_batch_size": 800, + "number_of_epochs": 100, + "unscaled_reg": 10, + "report_point": 5, + "p": 0.1, + "lr_update": 20, + "std_y_update_points": [1,40], + "eiv_prediction_number_of_draws": [100,5], + "eiv_prediction_number_of_batches": 10, + "init_std_y_list": [0.5], + "gamma": 0.5, + "hidden_layers": [128, 128, 128, 128], + "fixed_std_x": 0.05, + "seed_range": [0,10], + "gpu_number": 1 +} diff --git a/Experiments/configurations/eiv_repquad.json b/Experiments/configurations/eiv_repquad.json new file mode 100644 index 0000000000000000000000000000000000000000..0e4943220009e3c5d552e4684d3f2bdddadab777 --- /dev/null +++ b/Experiments/configurations/eiv_repquad.json @@ -0,0 +1,21 @@ +{ + "long_dataname": "repeated_quadratic", + "short_dataname": "repquad", + "lr": 1e-3, + "batch_size": 64, + 
"test_batch_size": 800, + "number_of_epochs": 100, + "unscaled_reg": 10, + "report_point": 5, + "p": 0.1, + "lr_update": 20, + "std_y_update_points": [1,40], + "eiv_prediction_number_of_draws": [100,5], + "eiv_prediction_number_of_batches": 10, + "init_std_y_list": [0.5], + "gamma": 0.5, + "hidden_layers": [128, 128, 128, 128], + "fixed_std_x": 0.05, + "seed_range": [0,10], + "gpu_number": 1 +} diff --git a/Experiments/configurations/noneiv_replin.json b/Experiments/configurations/noneiv_replin.json new file mode 100644 index 0000000000000000000000000000000000000000..5f4956feb3fc8f22e93baf628c5ac5f816a85e3e --- /dev/null +++ b/Experiments/configurations/noneiv_replin.json @@ -0,0 +1,20 @@ +{ + "long_dataname": "repeated_linear", + "short_dataname": "replin", + "lr": 1e-3, + "batch_size": 64, + "test_batch_size": 800, + "number_of_epochs": 100, + "unscaled_reg": 10, + "report_point": 5, + "p": 0.1, + "lr_update": 20, + "std_y_update_points": [1,40] , + "noneiv_prediction_number_of_draws": 100, + "noneiv_prediction_number_of_batches": 10, + "init_std_y_list": [0.5], + "gamma": 0.5, + "hidden_layers": [128, 128, 128, 128], + "seed_range": [0,10], + "gpu_number": 1 +} diff --git a/Experiments/configurations/noneiv_repquad.json b/Experiments/configurations/noneiv_repquad.json new file mode 100644 index 0000000000000000000000000000000000000000..28b8759c2f0c032b9d5e7eed3a5902f8040fbb6f --- /dev/null +++ b/Experiments/configurations/noneiv_repquad.json @@ -0,0 +1,20 @@ +{ + "long_dataname": "repeated_quadratic", + "short_dataname": "repquad", + "lr": 1e-3, + "batch_size": 64, + "test_batch_size": 800, + "number_of_epochs": 100, + "unscaled_reg": 10, + "report_point": 5, + "p": 0.1, + "lr_update": 20, + "std_y_update_points": [1,40] , + "noneiv_prediction_number_of_draws": 100, + "noneiv_prediction_number_of_batches": 10, + "init_std_y_list": [0.5], + "gamma": 0.5, + "hidden_layers": [128, 128, 128, 128], + "seed_range": [0,10], + "gpu_number": 1 +}