Commit 43360eb4 (Verified) in M4D / zema_emc_annotated
Authored 2 years ago by Björn Ludwig
Parent: b90af831

feat(dataset): reintroduce strict hash checking, which can optionally be skipped
Showing 1 changed file: src/zema_emc_annotated/dataset.py (+31 additions, −24 deletions)
@@ -23,7 +23,12 @@ from h5py import Dataset
 from numpy._typing import NDArray
 from pooch import os_cache, retrieve
-from zema_emc_annotated.data_types import RealMatrix, RealVector, UncertainArray
+from zema_emc_annotated.data_types import (
+    RealMatrix,
+    RealVector,
+    SampleSize,
+    UncertainArray,
+)

 ZEMA_DATASET_HASH = (
     "sha256:fb0e80de4e8928ae8b859ad9668a1b6ea6310028a6690bb8d4c1abee31cb8833"
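The hunk above pins the dataset's expected checksum in pooch's "algorithm:digest" notation, which retrieve() later compares against the downloaded file. As a rough illustration of what such a verification amounts to, here is a minimal standard-library sketch; the helper name and the file path passed to it are hypothetical and not part of the package:

import hashlib

ZEMA_DATASET_HASH = (
    "sha256:fb0e80de4e8928ae8b859ad9668a1b6ea6310028a6690bb8d4c1abee31cb8833"
)

def matches_known_hash(path: str, known_hash: str = ZEMA_DATASET_HASH) -> bool:
    """Compare a local file's digest against an 'algorithm:digest' string."""
    algorithm, _, expected = known_hash.partition(":")
    digest = hashlib.new(algorithm)
    with open(path, "rb") as file:
        # Hash in chunks so the large HDF5 file never has to sit in memory at once.
        for chunk in iter(lambda: file.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected

Recomputing such a digest over a large file takes noticeable time, which is exactly why the commit makes the check skippable for concurrent calls, as the docstring change below states.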
@@ -63,54 +68,56 @@ class ZeMASamples:
     Parameters
     ----------
-    n_samples : int, optional
-        number of samples each containing the first ``size_scaler`` readings from each
-        of the eleven sensors for one of the cycles with associated uncertainties,
-        defaults to 1 and must be between 1 and 4766 - idx_start
-    size_scaler : int, optional
-        number of sensor readings from each of the individual sensors per sample/cycle,
-        defaults to 1 and should be between 1 and 2000, as there are only 2000
-        readings per cycle, higher values will be clipped to 2000
+    sample_size : SampleSize, optional
+        tuple containing information about which samples to extract, defaults to
+        default of :class:`~zema_emc_annotated.data_types.SampleSize`
     normalize : bool, optional
         if ``True``, then values are centered around zero and values and
         uncertainties are scaled to values' unit std, defaults to ``False``
-    idx_start : int, optional
-        index of first sample to be extracted, defaults to 0 and must be between 0
-        and 4765
+    skip_hash_check : bool, optional
+        allow to circumvent strict hash checking during the retrieve of dataset file,
+        to speed up concurrent calls as each check for the large file might take
+        several seconds, defaults to ``False``

     Attributes
     ----------
     uncertain_values : UncertainArray
         The collection of samples of values with associated uncertainties,
-        will be of shape (n_samples, 11 x size_scaler)
+        will be of shape (``sample_size.n_cycles``, 11 x
+        ``sample_size.datapoints_per_cycle``)
     """

     uncertain_values: UncertainArray

     def __init__(
         self,
-        n_samples: int = 1,
-        size_scaler: int = 1,
+        sample_size: SampleSize = SampleSize(),
         normalize: bool = False,
-        idx_start: int = 0,
+        skip_hash_check: bool = False,
     ):
-        self.samples_slice: slice = np.s_[idx_start : idx_start + n_samples]
-        self.size_scaler = size_scaler
+        self.samples_slice: slice = np.s_[
+            sample_size.idx_first_cycle : sample_size.idx_first_cycle
+            + sample_size.n_cycles
+        ]
+        self.size_scaler = sample_size.datapoints_per_cycle
         if cached_data := self._check_and_load_cache(normalize):
             self.uncertain_values = cached_data
         else:
-            self._uncertainties = np.empty((n_samples, 0))
-            self._values = np.empty((n_samples, 0))
-            self.uncertain_values = self._extract_data(normalize)
+            self._uncertainties = np.empty((sample_size.n_cycles, 0))
+            self._values = np.empty((sample_size.n_cycles, 0))
+            self.uncertain_values = self._extract_data(normalize, skip_hash_check)
             self._store_cache(normalize)
             del self._uncertainties
             del self._values

-    def _extract_data(self, normalize: bool) -> UncertainArray:
+    def _extract_data(
+        self, normalize: bool, skip_hash_check: bool = True
+    ) -> UncertainArray:
         """Extract the data as specified"""
         dataset_full_path = retrieve(
             url=ZEMA_DATASET_URL,
-            known_hash=ZEMA_DATASET_HASH,
+            known_hash=None if skip_hash_check else ZEMA_DATASET_HASH,
             progressbar=True,
         )
         assert exists(dataset_full_path)
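With this change, callers decide whether pooch verifies the download: retrieve() receives known_hash=None when skip_hash_check is set, which makes pooch skip digest verification, and the recorded SHA-256 otherwise. A possible caller-side sketch, assuming SampleSize accepts the keyword fields referenced in the hunk above (idx_first_cycle, n_cycles, datapoints_per_cycle); the chosen values are illustrative only:

from zema_emc_annotated.data_types import SampleSize
from zema_emc_annotated.dataset import ZeMASamples

# Default behaviour: the downloaded HDF5 file is checked against ZEMA_DATASET_HASH.
checked = ZeMASamples(sample_size=SampleSize(n_cycles=10, datapoints_per_cycle=100))

# Opt out of the check, e.g. for many concurrent workers that would otherwise
# each spend several seconds re-hashing the same large, already cached file.
unchecked = ZeMASamples(
    sample_size=SampleSize(idx_first_cycle=100, n_cycles=10, datapoints_per_cycle=100),
    normalize=True,
    skip_hash_check=True,
)

# Per the docstring, shape is (n_cycles, 11 x datapoints_per_cycle).
print(unchecked.uncertain_values.uncertainties.shape)

Note that _extract_data itself defaults skip_hash_check to ``True`` while the public constructor defaults it to ``False``, so strict checking remains the default for callers.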
@@ -235,7 +242,7 @@ class ZeMASamples:
         return self.uncertain_values.uncertainties

     def _check_and_load_cache(self, normalize: bool) -> UncertainArray | None:
-        """Checks if corresponding file for n_samples exists and loads it with pickle"""
+        """Checks if corresponding file for n_cycles exists and loads it with pickle"""
         if os.path.exists(cache_path := self._cache_path(normalize)):
             with open(cache_path, "rb") as cache_file:
                 return cast(UncertainArray, pickle.load(cache_file))
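The caching logic around this docstring rename is a plain pickle round trip on a path derived from the extraction settings by the private _cache_path(normalize) helper, which is not part of this diff. A minimal sketch of that pattern, with a hypothetical cache path standing in for that helper:

import os
import pickle

# Hypothetical stand-in for self._cache_path(normalize).
CACHE_PATH = "zema_samples_cache.pickle"

def load_cached(path: str = CACHE_PATH):
    """Return the previously pickled object if a cache file exists, else None."""
    if os.path.exists(path):
        with open(path, "rb") as cache_file:
            return pickle.load(cache_file)
    return None

def store_cache(obj, path: str = CACHE_PATH) -> None:
    """Persist the extracted samples so later runs can skip download and extraction."""
    with open(path, "wb") as cache_file:
        pickle.dump(obj, cache_file)

Returning None when no cache file exists is what lets the walrus-guarded ``if cached_data := self._check_and_load_cache(normalize):`` in __init__ fall through to the download and extraction path.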