Skip to content
Snippets Groups Projects
Commit cdab26b2 authored by Daniele Nicolodi's avatar Daniele Nicolodi
Browse files

datasync: Optimize

parent 051099de
No related branches found
No related tags found
No related merge requests found
...@@ -36,7 +36,7 @@ class Stats: ...@@ -36,7 +36,7 @@ class Stats:
f'{time.time() - self.start_time:.3f} s' f'{time.time() - self.start_time:.3f} s'
def _sync(logger, src, dst, recursive, include, exclude, dry_run, stats, prefix=None): def _sync(logger, src, dst, recursive, include, exclude, dry_run, stats, prefix=''):
copy2 = shutil.copy2 if not dry_run else lambda x, y: None copy2 = shutil.copy2 if not dry_run else lambda x, y: None
replace = os.replace if not dry_run else lambda x, y: None replace = os.replace if not dry_run else lambda x, y: None
...@@ -47,10 +47,11 @@ def _sync(logger, src, dst, recursive, include, exclude, dry_run, stats, prefix= ...@@ -47,10 +47,11 @@ def _sync(logger, src, dst, recursive, include, exclude, dry_run, stats, prefix=
n_copied = 0 n_copied = 0
size_copied = 0 size_copied = 0
# ``os.scandir()`` or ``os.walk()`` are not used here. These functions # ``os.scandir()`` or ``os.walk()`` are not used to list the content of
# use the ``FindFirstFileW()`` and ``FindNextFileW()`` Windows APIs to # the ``src`` directory. These functions use the ``FindFirstFileW()``
# enumerate the directory contents. On NTFS filesystems, these functions # and ``FindNextFileW()`` Windows APIs to enumerate the directory
# report cached file metadata stored in the directory inode. # contents. On NTFS filesystems, these functions report cached file
# metadata stored in the directory inode.
# #
# The cached file metadata is updated when a file handle is closed. However, # The cached file metadata is updated when a file handle is closed. However,
# most data acquisition programs append lines to data files and keep the # most data acquisition programs append lines to data files and keep the
...@@ -59,15 +60,22 @@ def _sync(logger, src, dst, recursive, include, exclude, dry_run, stats, prefix= ...@@ -59,15 +60,22 @@ def _sync(logger, src, dst, recursive, include, exclude, dry_run, stats, prefix=
# #
# https://learn.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-findfirstfilea # https://learn.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-findfirstfilea
# https://devblogs.microsoft.com/oldnewthing/20111226-00/?p=8813 # https://devblogs.microsoft.com/oldnewthing/20111226-00/?p=8813
#
# However, the result of ``os.scandir()`` can be used as a cache to
# avoid an expensive ``GetFileInformationByHandle()`` call for every
# path in the ``dst`` directory: call ``os.stat()`` on the ``dst`` path
# entry only if the modification time returned by ``os.scandir()`` is
# older than the modification time of the ``src`` path entry.
dst_mtime_cache = {entry.name: entry.stat().st_mtime_ns for entry in os.scandir(dst)}
for entry in os.listdir(src): for name in os.listdir(src):
src_path = os.path.join(src, entry) src_path = os.path.join(src, name)
dst_path = os.path.join(dst, entry) dst_path = os.path.join(dst, name)
# Use the path relative to src for matching the include and exclude patterns. # Use the path relative to src for matching the include and exclude patterns.
if prefix: entry = os.path.join(prefix, name)
entry = os.path.join(prefix, entry)
s = os.stat(src_path) s = os.stat(src_path)
src_mtime = s.st_mtime_ns src_mtime = s.st_mtime_ns
...@@ -93,6 +101,9 @@ def _sync(logger, src, dst, recursive, include, exclude, dry_run, stats, prefix= ...@@ -93,6 +101,9 @@ def _sync(logger, src, dst, recursive, include, exclude, dry_run, stats, prefix=
logger.debug(f'{entry} matches exclude pattern') logger.debug(f'{entry} matches exclude pattern')
continue continue
if src_mtime <= dst_mtime_cache.get(name, -1):
continue
try: try:
s = os.stat(dst_path) s = os.stat(dst_path)
dst_mtime = s.st_mtime_ns dst_mtime = s.st_mtime_ns
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment