# Source code for pydoit_nb.checklist

"""
Checklist file generation
"""
from __future__ import annotations

from collections.abc import Callable, Iterable
from pathlib import Path

from doit.dependency import get_file_md5

# Used as the default filename argument of `get_checklist_file` and
# `is_checklist_file`.
CHECKLIST_FNAME_DEFAULT: str = "checklist.chk"
"""Default name used for checklist files"""


def get_checklist_file(
    directory: Path, filename: str = CHECKLIST_FNAME_DEFAULT
) -> Path:
    """
    Build the path of the checklist file belonging to a directory

    No file system access happens here; the path is only constructed.

    Parameters
    ----------
    directory
        Directory the checklist file belongs to

    filename
        Name of the checklist file within ``directory``

    Returns
    -------
        ``directory`` joined with ``filename``
    """
    checklist_path = directory / filename
    return checklist_path
def is_checklist_file(
    fp: Path, checklist_filename: str = CHECKLIST_FNAME_DEFAULT
) -> bool:
    """
    Decide whether a path refers to a checklist file

    Only the final path component is inspected; the file's parent
    directories and its contents play no role.

    Parameters
    ----------
    fp
        The file to check

    checklist_filename
        File name that identifies a checklist file. ``fp`` is considered a
        checklist file if its name is equal to this value.

    Returns
    -------
        ``True`` if ``fp`` is a checklist file, otherwise ``False``
    """
    candidate_name = fp.name
    return candidate_name == checklist_filename
def create_md5_dict(
    files: Iterable[Path],
    exclusions: Iterable[Callable[[Path], bool]] | None = None,
) -> dict[Path, str]:
    """
    Create dictionary of MD5 hashes for files

    Parameters
    ----------
    files
        Files to create hashes for

    exclusions
        An iterable of callables. These are applied to each file. If any of
        the results is ``True`` then the file is skipped and will not be
        included in the dictionary of calculated hashes. One-shot iterables
        (e.g. generators) are supported: the callables are collected once
        up front and then applied to every file.

    Returns
    -------
        Dictionary of MD5 hashes for each file which was not excluded. The
        keys are the file paths (as they appear in ``files``) and the values
        are the calculated MD5 hashes.
    """
    # Materialise once. ``exclusions`` is only promised to be an Iterable, so
    # it may be a one-shot iterator; re-iterating it for every file would
    # exhaust it on the first file and silently stop excluding anything after.
    exclusion_checks = () if exclusions is None else tuple(exclusions)

    out: dict[Path, str] = {}
    for fp in files:
        if any(excl(fp) for excl in exclusion_checks):
            # Don't include this file
            continue

        out[fp] = get_file_md5(fp)

    return out
def generate_directory_checklist(
    directory: Path,
    checklist_file: Path | None = None,
    exclusions: Iterable[Callable[[Path], bool]] = (is_checklist_file,),
) -> Path:
    """
    Create a checklist file with checksums for all files in a directory

    Running this command multiple times should result in the same result.
    This enables the checklist to be used as a target for doit tasks.

    The resulting checklist file can also be used to verify the contents of a
    folder using the program `md5sum` so can be included in any distributed
    results.

    .. code:: bash

        md5sum -c checklist.chk

    Parameters
    ----------
    directory
        Directory containing arbitrary files (we haven't tested this on any
        file but any directory containing hashable data files is the intended
        target)

    checklist_file
        Where to write the checklist file. If not supplied, the result of
        `get_checklist_file(directory)` is used. The checklist file itself is
        never included in the checklist, whatever its name and whatever
        ``exclusions`` contains.

    exclusions
        Functions used to check if a file should be excluded or not.

    Returns
    -------
        Path of the generated checklist file

    Raises
    ------
    NotADirectoryError
        If ``directory`` doesn't exist or isn't a directory
    """
    if not directory.is_dir():
        raise NotADirectoryError(directory)

    if checklist_file is None:
        checklist_file = get_checklist_file(directory)

    # The output must never hash itself: its content is about to be
    # overwritten, so any recorded hash would be stale, and a second run would
    # produce a different checklist than the first. Name-based ``exclusions``
    # do not cover a custom ``checklist_file`` inside ``directory``.
    checklist_resolved = checklist_file.resolve()

    # sort to ensure same result for same set of files
    files = sorted(
        f
        for f in directory.rglob("*")
        if f.is_file() and f.resolve() != checklist_resolved
    )
    md5s = create_md5_dict(files, exclusions=exclusions)

    # Explicit encoding keeps the bytes written independent of the locale.
    with open(checklist_file, "w", encoding="utf-8") as fh:
        for fp, md5 in md5s.items():
            # md5sum line format: hash, a space, a one-character mode flag
            # (" " for text, "*" for binary), then the file name. With only
            # one space, a name starting with "*" or " " is misparsed.
            fh.write(f"{md5}  {fp.relative_to(directory)}\n")

    return checklist_file