From c8b6f1d41e6445fc309ca444db63901c68c29688 Mon Sep 17 00:00:00 2001
From: Joerg Martin <joerg.martin@ptb.de>
Date: Thu, 2 Dec 2021 13:33:23 +0100
Subject: [PATCH] EiV predict fixed

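The EiV `predict` method now draws along two axes: samples of the network
parameters (Bernoulli dropout) and samples of the noisy input, passed as
`number_of_draws=[parameter_draws, noise_draws]`; a plain integer is still
accepted and treated as `[number_of_draws, 1]`. Parameter samples can in
addition be collected in chunks via `number_of_parameter_chunks` to reduce
memory usage.

A minimal usage sketch (the import path, tensor shapes and constructor
arguments below are illustrative assumptions, not part of this patch):

    import torch
    from EIVArchitectures import Networks

    net = Networks.FNNEIV(p=0.2, init_std_y=1.0,
            h=[10, 1024, 1024, 1024, 1024, 1], fixed_std_x=0.05)
    x = torch.randn(32, 10)  # assumed batch of 32 inputs with 10 features
    # 100 parameter draws with 5 input noise draws each, collected in 10 chunks
    pred, sigma = net.predict(x, number_of_draws=[100, 5],
            number_of_parameter_chunks=10,
            take_average_of_prediction=False)
    # with take_average_of_prediction=False, pred keeps the draws along dim 1;
    # sigma is averaged over the draws either way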
---
 EIVPackage/EIVArchitectures/Networks.py | 77 +++++++++++++++++++-----
 Experiments/evaluate_tabular.py         | 80 +++++++++++++++----------
 Experiments/train_eiv_california.py     |  2 +-
 Experiments/train_eiv_energy.py         |  2 +-
 4 files changed, 114 insertions(+), 47 deletions(-)
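
Note on the evaluation script: `predictive_logdensity` gains the same
`number_of_draws` / `number_of_parameter_chunks` interface as `predict`, and
existing calls with a plain integer keep working unchanged. A small sketch,
assuming a loaded `FNNEIV` instance `net` and tensors `x`, `y` as in
`Experiments/evaluate_tabular.py`:

    # integer argument, internally equivalent to number_of_draws=[100, 1]
    logdens = net.predictive_logdensity(x, y, number_of_draws=100)
    # explicit parameter/noise draws, parameter samples collected in 4 chunks
    logdens = net.predictive_logdensity(x, y, number_of_draws=[100, 5],
            number_of_parameter_chunks=4)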

diff --git a/EIVPackage/EIVArchitectures/Networks.py b/EIVPackage/EIVArchitectures/Networks.py
index 1435f48..3a26b02 100644
--- a/EIVPackage/EIVArchitectures/Networks.py
+++ b/EIVPackage/EIVArchitectures/Networks.py
@@ -24,6 +24,9 @@ class FNNEIV(nn.Module):
     :param h: A list specifying the number of neurons in each layer.
     :param fixed_std_x: If given, this value will be the output of the method
     `get_std_x()`, no matter what the deming factor.
+    :param repetition: Positive integer, the default value for the repetition
+    of the input, defaults to 1. For a single call this can also be specified
+    via the `repetition` argument of the `forward` method.
     **Note**: 
     - To change the deming factor afterwards, use the method `change_deming`
     - To change fixed_std_x afterwards, use the method `change_fixed_std_x`
@@ -31,14 +34,14 @@ class FNNEIV(nn.Module):
     LeakyReLUSlope = 1e-2
     def __init__(self, p = 0.2, init_std_y=1.0, precision_prior_zeta=0.0, 
             deming=1.0, h=[10, 1024,1024,1024,1024, 1], 
-            fixed_std_x = None):
+            fixed_std_x = None, repetition = 1):
         super().__init__()
         # part before Bernoulli dropout
         self.init_std_y = init_std_y
         InverseSoftplus = lambda sigma: torch.log(torch.exp(sigma) - 1 )
         self.std_y_par = nn.parameter.Parameter(
                 InverseSoftplus(torch.tensor([init_std_y])))
-        self._repetition = 1
+        self._repetition = repetition
         self.main = nn.Sequential(
                 EIVInput(precision_prior_zeta=precision_prior_zeta, 
                     external_std_x=self.get_std_x),
@@ -110,10 +113,11 @@ class FNNEIV(nn.Module):
         return self._repetition
 
     def forward(self, x, repetition=1):
+        old_repetition = self._repetition
         self._repetition = repetition
         mu = self.main(x)
         sigma = self.sigma(mu)
-        self._repetition = 1
+        self._repetition = old_repetition
         return mu, sigma
 
     def regularizer(self, x, lamb):
@@ -154,8 +158,10 @@ class FNNEIV(nn.Module):
                 regularization += self.main[0].neg_x_evidence(x)
         return regularization 
 
-
-    def predict(self, x, number_of_draws=100, remove_graph=True
+    # TODO: repetition
+    # + Test via breakpoints
+    def predict(self, x, number_of_draws=[100, 5],
+            number_of_parameter_chunks=None, remove_graph=True
             , take_average_of_prediction=True):
         """
         Average over `number_of_draws` forward passes. If 
@@ -164,19 +170,46 @@ class FNNEIV(nn.Module):
         **Note**: This method does neither touch the input noise nor Dropout.
         The corresponding setting is left to the user!
         :param x: A torch.tensor, the input
-        :param number_of_draws: Number of draws to obtain from x
+        :param number_of_draws: An integer or a list of two integers. An
+        integer `number_of_draws` is converted internally to
+        `[number_of_draws, 1]`. The first element is the number of draws
+        obtained from x via parameter sampling, the second the number of
+        draws obtained via input noise sampling.
+        :param number_of_parameter_chunks: An integer or None (default). If
+        None, the second element of `number_of_draws` is used (and is thus
+        equal to 1 if `number_of_draws` is an integer, see above). The
+        parameter samples are collected in `number_of_parameter_chunks`
+        chunks, which can be used to reduce the memory usage.
         :param remove_graph: If True (default) the output will 
         be detached to save memory
         :param take_average_of_prediction: If False, no averaging will be 
         applied to the prediction and the second dimension of the first output
         will count the number_of_draws.
         """
-        x, = repeat_tensors(x, number_of_draws=number_of_draws)
-        pred, sigma = self.forward(x)
-        if remove_graph:
-            pred, sigma = pred.detach(), sigma.detach()
-        pred, sigma = reshape_to_chunks(pred, sigma, 
-                number_of_draws=number_of_draws)
+        # collect the parameter draws in chunks to limit memory usage
+        if type(number_of_draws) is int:
+            number_of_draws = [number_of_draws, 1]
+        if number_of_parameter_chunks is None:
+            number_of_parameter_chunks = number_of_draws[1]
+        chunk_size = number_of_draws[0] // number_of_parameter_chunks
+        pred_collection, sigma_collection = [], []
+        remaining_draws = number_of_draws[0]
+        while remaining_draws > 0:
+            if remaining_draws < chunk_size:
+                parameter_sample_size = remaining_draws
+            else:
+                parameter_sample_size = chunk_size
+            repeated_x, = repeat_tensors(x, number_of_draws=parameter_sample_size * number_of_draws[1])
+            pred, sigma = self.forward(repeated_x, repetition=number_of_draws[1])
+            if remove_graph:
+                pred, sigma = pred.detach(), sigma.detach()
+            pred, sigma = reshape_to_chunks(pred, sigma, 
+                    number_of_draws=parameter_sample_size * number_of_draws[1])
+            pred_collection.append(pred)
+            sigma_collection.append(sigma)
+            remaining_draws -= parameter_sample_size
+        pred = torch.cat(pred_collection, dim=1)
+        sigma = torch.cat(sigma_collection, dim=1)
         # reduce along the draws (actually redundant for sigma)
         if take_average_of_prediction:
             pred, sigma = torch.mean(pred, dim=1), torch.mean(sigma, dim=1)
@@ -184,7 +217,10 @@ class FNNEIV(nn.Module):
             sigma = torch.mean(sigma, dim=1)
         return pred, sigma
 
-    def predictive_logdensity(self, x, y, number_of_draws=100, remove_graph=True,
+    # TODO: repetition: still to be implemented here and tested
+    # (e.g. via breakpoints)
+    def predictive_logdensity(self, x, y, number_of_draws=[100, 5],
+            number_of_parameter_chunks=None, remove_graph=True,
             average_batch_dimension=True, scale_labels=None,
             decouple_dimensions=False):
         """
@@ -193,7 +229,16 @@ class FNNEIV(nn.Module):
         the batch dimension.
         :param x: A torch.tensor, the input
         :param y: A torch.tensor, labels on which to evaluate the density
-        :param number_of_draws: Number of draws to obtain from x
+        :param number_of_draws: An integer or a list of two integers. An
+        integer `number_of_draws` is converted internally to
+        `[number_of_draws, 1]`. The first element is the number of draws
+        obtained from x via parameter sampling, the second the number of
+        draws obtained via input noise sampling.
+        :param number_of_parameter_chunks: An integer or None (default). If
+        None, the second element of `number_of_draws` is used (and is thus
+        equal to 1 if `number_of_draws` is an integer, see above). The
+        parameter samples are collected in `number_of_parameter_chunks`
+        chunks, which can be used to reduce the memory usage.
         :param remove_graph: If True (default) the output will 
         be detached to save memory
         :param average_batch_dimension: Boolean. If True (default) the values
@@ -201,7 +246,9 @@ class FNNEIV(nn.Module):
         dimension will be left untouched and all values will be returned.
         """
         out, sigmas = self.predict(x, number_of_draws=number_of_draws,
-                take_average_of_prediction=False, remove_graph=remove_graph)
+                number_of_parameter_chunks=number_of_parameter_chunks,
+                remove_graph=remove_graph,
+                take_average_of_prediction=False)
         # Add "repetition" dimension to y and out
         y = y[:,None,...]
         sigmas = sigmas[:,None,...]
diff --git a/Experiments/evaluate_tabular.py b/Experiments/evaluate_tabular.py
index d97622d..90b972f 100644
--- a/Experiments/evaluate_tabular.py
+++ b/Experiments/evaluate_tabular.py
@@ -20,13 +20,13 @@ train_eiv = importlib.import_module(f'train_eiv_{short_dataname}')
 
 train_data, test_data = load_data()
 test_dataloader = DataLoader(test_data, batch_size=int(np.max((len(test_data),
-    800))))
+    64))), shuffle=True)
 input_dim = train_data[0][0].numel()
 output_dim = train_data[0][1].numel()
 
 def collect_metrics(x,y, seed=0,
-        noneiv_number_of_draws=500, eiv_number_of_draws=500,
-        decouple_dimensions=False):
+        noneiv_number_of_draws=100, eiv_number_of_draws=100,
+        decouple_dimensions=False, device=torch.device('cuda:1')):
     """
     :param x: A torch.tensor, taken as input
     :param y: A torch.tensor, taken as output
@@ -34,13 +34,14 @@ def collect_metrics(x,y, seed=0,
     :param noneiv_number_of_draws: Number of draws for non-EiV model
     for sampling from the posterior predictive. Defaults to 100.
     :param noneiv_number_of_draws: Number of draws for EiV model
-    for sampling from the posterior predictive. Defaults to 500.
+    for sampling from the posterior predictive. Defaults to 100.
     :param decouple_dimensions: Boolean. If True, the unsual convention
     of Gal et al. is followed where, in the evaluation of the
     log-posterior-predictive, each dimension is treated independently and then
     averaged. If False (default), a multivariate distribution is used.
     :returns: noneiv_rmse, noneiv_logdens,eiv_rmse, eiv_logdens
     """
+    x,y = x.to(device), y.to(device)
     init_std_y = train_noneiv.init_std_y_list[0]
     unscaled_reg = train_noneiv.unscaled_reg
     p = train_noneiv.p
@@ -50,9 +51,9 @@ def collect_metrics(x,y, seed=0,
                         f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
                         f'_p_{p:.2f}_seed_{seed}.pkl')
     net = Networks.FNNBer(p=p, init_std_y=init_std_y,
-            h=[input_dim, *hidden_layers, output_dim])
+            h=[input_dim, *hidden_layers, output_dim]).to(device)
     train_and_store.open_stored_training(saved_file=saved_file,
-            net=net)
+            net=net, device=device)
 
 
     # RMSE
@@ -64,10 +65,10 @@ def collect_metrics(x,y, seed=0,
         y = y.view((-1,1))
     assert y.shape == out.shape
     res = y-out
-    scale = train_data.dataset.std_labels
+    scale = train_data.dataset.std_labels.to(device)
     scaled_res = res * scale.view((1,-1))
     scaled_res = scaled_res.detach().cpu().numpy().flatten()
-    noneiv_rmse = np.sqrt(np.mean(scaled_res**2)) 
+    noneiv_rmse = np.sqrt(np.mean(scaled_res**2))
 
 
     # NLL
@@ -75,7 +76,9 @@ def collect_metrics(x,y, seed=0,
     net.train()
     noneiv_logdens = net.predictive_logdensity(x, y, number_of_draws=100,
             decouple_dimensions=decouple_dimensions,
-            scale_labels=train_data.dataset.std_labels.view((-1,))).mean()
+            scale_labels=\
+                   train_data.dataset.std_labels.view((-1,)).to(device)\
+                   ).mean().detach().cpu().numpy()
     if training_state:
         net.train()
     else:
@@ -88,11 +91,13 @@ def collect_metrics(x,y, seed=0,
     hidden_layers = train_eiv.hidden_layers
     fixed_std_x = train_eiv.fixed_std_x
     saved_file = os.path.join('saved_networks',
-                f'eiv_{short_dataname}'\
-                        f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
-                        f'_p_{p:.2f}_seed_{seed}.pkl')
+            f'eiv_{short_dataname}'\
+                    f'_init_std_y_{init_std_y:.3f}_ureg_{unscaled_reg:.1f}'\
+                    f'_p_{p:.2f}_fixed_std_x_{fixed_std_x:.3f}'\
+                    f'_seed_{seed}.pkl')
     net = Networks.FNNEIV(p=p, init_std_y=init_std_y,
-            h=[input_dim, *hidden_layers, output_dim], fixed_std_x=fixed_std_x)
+            h=[input_dim, *hidden_layers, output_dim],
+            fixed_std_x=fixed_std_x).to(device)
     train_and_store.open_stored_training(saved_file=saved_file,
-            net=net)
+            net=net, device=device)
     # RMSE
@@ -106,10 +111,10 @@ def collect_metrics(x,y, seed=0,
         y = y.view((-1,1))
     assert y.shape == out.shape
     res = y-out
-    scale = train_data.dataset.std_labels
+    scale = train_data.dataset.std_labels.to(device)
     scaled_res = res * scale.view((1,-1))
     scaled_res = scaled_res.detach().cpu().numpy().flatten()
-    eiv_rmse = np.sqrt(np.mean(scaled_res**2)) 
+    eiv_rmse = np.sqrt(np.mean(scaled_res**2))
     if training_state:
         net.train()
     else:
@@ -125,7 +130,9 @@ def collect_metrics(x,y, seed=0,
     net.train()
     eiv_logdens = net.predictive_logdensity(x, y, number_of_draws=100,
             decouple_dimensions=decouple_dimensions,
-            scale_labels=train_data.dataset.std_labels.view((-1,))).mean()
+            scale_labels=\
+            train_data.dataset.std_labels.view((-1,)).to(device)\
+            ).mean().detach().cpu().numpy()
     if training_state:
         net.train()
     else:
@@ -136,19 +143,32 @@ noneiv_rmse_collection = []
 noneiv_logdens_collection = []
 eiv_rmse_collection = []
 eiv_logdens_collection = []
-number_of_samples = 20
-for _ in tqdm(range(number_of_samples)):
-    x,y = next(iter(test_dataloader))
-    noneiv_rmse, noneiv_logdens, eiv_rmse, eiv_logdens = collect_metrics(x,y)
-    noneiv_rmse_collection.append(noneiv_rmse)
-    noneiv_logdens_collection.append(noneiv_logdens)
-    eiv_rmse_collection.append(eiv_rmse)
-    eiv_logdens_collection.append(eiv_logdens)
-
-
+num_test_epochs = 20
+assert train_noneiv.seed_list == train_eiv.seed_list
+seed_list = train_noneiv.seed_list
+for seed in tqdm(seed_list):
+    train_data, test_data = load_data(seed=seed)
+    test_dataloader = DataLoader(test_data,
+            batch_size=int(np.max((len(test_data), 800))),
+            shuffle=True)
+    for i in tqdm(range(num_test_epochs)):
+        for x,y in test_dataloader:
+            noneiv_rmse, noneiv_logdens, eiv_rmse, eiv_logdens =\
+                    collect_metrics(x,y, seed=seed)
+            noneiv_rmse_collection.append(noneiv_rmse)
+            noneiv_logdens_collection.append(noneiv_logdens)
+            eiv_rmse_collection.append(eiv_rmse)
+            eiv_logdens_collection.append(eiv_logdens)
+
+
+# TODO: despite the averaging over seeds and test epochs, the fluctuations seem large
 print('Non-EiV')
-print(f'RMSE {np.mean(noneiv_rmse_collection):.3f} ({np.std(noneiv_rmse_collection)/np.sqrt(number_of_samples):.3f})')
-print(f'LogDens {np.mean(noneiv_logdens_collection):.3f} ({np.std(noneiv_logdens_collection)/np.sqrt(number_of_samples):.3f})')
+print(f'RMSE {np.mean(noneiv_rmse_collection):.3f} '\
+        f'({np.std(noneiv_rmse_collection)/np.sqrt(len(noneiv_rmse_collection)):.3f})')
+print(f'LogDens {np.mean(noneiv_logdens_collection):.3f} '\
+        f'({np.std(noneiv_logdens_collection)/np.sqrt(len(noneiv_logdens_collection)):.3f})')
 print('EiV')
-print(f'RMSE {np.mean(eiv_rmse_collection):.3f} ({np.std(eiv_rmse_collection)/np.sqrt(number_of_samples):.3f})')
-print(f'LogDens {np.mean(eiv_logdens_collection):.3f} ({np.std(eiv_logdens_collection)/np.sqrt(number_of_samples):.3f})')
+print(f'RMSE {np.mean(eiv_rmse_collection):.3f} '\
+        f'({np.std(eiv_rmse_collection)/np.sqrt(len(eiv_rmse_collection)):.3f})')
+print(f'LogDens {np.mean(eiv_logdens_collection):.3f} '\
+        f'({np.std(eiv_logdens_collection)/np.sqrt(len(eiv_logdens_collection)):.3f})')
diff --git a/Experiments/train_eiv_california.py b/Experiments/train_eiv_california.py
index 40a1c60..39c95fb 100644
--- a/Experiments/train_eiv_california.py
+++ b/Experiments/train_eiv_california.py
@@ -118,7 +118,7 @@ def train_on_data(init_std_y, seed):
     net.apply(initialize_weights.glorot_init)
     net = net.to(device)
     net.std_y_par.requires_grad = False
-    std_x_map = lambda: 0.0
+    std_x_map = lambda: net.get_std_x().detach().cpu().item()
     std_y_map = lambda: net.get_std_y().detach().cpu().item()
     # regularization
     reg = unscaled_reg/len(train_data)
diff --git a/Experiments/train_eiv_energy.py b/Experiments/train_eiv_energy.py
index a29a4ed..40be4b2 100644
--- a/Experiments/train_eiv_energy.py
+++ b/Experiments/train_eiv_energy.py
@@ -118,7 +118,7 @@ def train_on_data(init_std_y, seed):
     net.apply(initialize_weights.glorot_init)
     net = net.to(device)
     net.std_y_par.requires_grad = False
-    std_x_map = lambda: 0.0
+    std_x_map = lambda: net.get_std_x().detach().cpu().item()
     std_y_map = lambda: net.get_std_y().detach().cpu().item()
     # regularization
     reg = unscaled_reg/len(train_data)
-- 
GitLab