-
Notifications
You must be signed in to change notification settings - Fork 2
/
report_check_sums.py
executable file
·127 lines (99 loc) · 3.72 KB
/
report_check_sums.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python
import hashlib
import math
from pathlib import Path
from typing import Union
def file_size_formatter(i: int, binary: bool = True, precision: int = 1) -> str:
    """Format a byte count as a human-readable size string.

    Parameters
    ----------
    i : int
        Size in bytes. Must not be negative.
    binary : bool
        If True (default), scale by powers of 1024 and use "i"-infixed
        suffixes (kiB, MiB, ...). If False, scale by powers of 1000 and use
        plain suffixes (kB, MB, ...).
    precision : int
        Number of digits printed after the decimal point.

    Returns
    -------
    str
        The scaled value followed by its unit, e.g. ``"1.5 kiB"``.
        A size of zero is rendered as ``"0 B"``.

    Raises
    ------
    ValueError
        If `i` is negative.

    Notes
    -----
    Adapted from https://github.com/Ouranosinc/miranda/blob/main/miranda/storage.py
    """
    _CONVERSIONS = [
        "B",
        "k{}B",
        "M{}B",
        "G{}B",
        "T{}B",
        "P{}B",
        "E{}B",
        "Z{}B",
        "Y{}B",
    ]

    if i < 0:
        raise ValueError(f"File size must be non-negative, got {i}.")
    if i == 0:
        return "0 B"

    # Determine the appropriate conversion factor
    base = 1024 if binary else 1000

    # Find the unit with integer arithmetic: a ratio of floating-point
    # logarithms can land just below a whole number at exact powers of the
    # base. The exponent is clamped so sizes beyond the largest known unit
    # are still reported (in that unit) instead of indexing past the table.
    multiple = 0
    threshold = base
    while i >= threshold and multiple < len(_CONVERSIONS) - 1:
        multiple += 1
        threshold *= base

    value = i / base**multiple
    suffix = _CONVERSIONS[multiple].format("i" if binary else "")
    return f"{value:.{precision}f} {suffix}"
def file_sha256_checksum(filename: Path) -> str:
    """Return the sha256 checksum of a file as a hexadecimal string.

    Parameters
    ----------
    filename : Path
        The file to hash. It is opened in binary mode.

    Returns
    -------
    str
        The hexadecimal sha256 digest of the file contents.
    """
    chunk_size = 1024 * 1024
    hash_sha256 = hashlib.sha256()
    with filename.open("rb") as f:
        # Feed the hash in fixed-size chunks so that large data files are
        # never held in memory in full; the digest is the same as hashing
        # the whole content at once.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_sha256.update(chunk)
    return hash_sha256.hexdigest()
def valid(path: Path) -> bool:
    """Return True if path should be considered for the creation of sha256 checksum.

    Parameters
    ----------
    path : Path
        The path to the file.

    Returns
    -------
    bool
        True only for an existing regular file that has a parent component,
        has no component starting with a dot, and is not named
        ``registry.txt``. False in every other case, including directories
        and paths that do not exist.
    """
    # Exclude top-level files (paths made of a single component).
    # NOTE(review): paths found by globbing inside a data folder always have
    # at least two components, so this only rejects bare names - confirm
    # that this is the intended meaning of "top-level".
    if len(path.parts) == 1:
        return False

    # Exclude hidden files, and anything located under a hidden directory
    if any(p.startswith(".") for p in path.parts):
        return False

    # Exclude the registry
    if path.name == "registry.txt":
        return False

    # Always answer with a real bool rather than falling through to None.
    return path.is_file()
def main(dry_run: bool = False, readme: Union[str, Path] = "README.md"):
    """Create checksum files.

    Computes the sha256 checksum of every valid file found under ``data``,
    then rewrites the "### Files" table of the README and the
    ``data/registry.txt`` registry from those checksums.

    Parameters
    ----------
    dry_run : bool
        If True, compute the checksums and build the new README content, but
        do not write anything to disk; only report what would be written.
    readme : str or Path
        The README file holding the checksum table. It must contain a line
        starting with "## Available datasets"; the table is placed right
        after that line.

    Raises
    ------
    ValueError
        If no line of the README starts with "## Available datasets".
    """
    data_folder = Path(".").joinpath("data")

    # Filter each path through `valid` once and sort by path, so the README
    # table and the registry both list files in the same, stable order.
    files = sorted(filter(valid, data_folder.rglob("*")))
    file_checksums = {file: file_sha256_checksum(file) for file in files}

    # Write the checksums dictionary to the bottom of the README.md file, replacing the existing table
    readme = Path(readme)
    # Read with the same encoding that is used for writing below.
    with readme.open(encoding="utf-8") as f:
        lines = f.readlines()

    # Find the index of the existing checksum table (the last "### Files" heading)
    start_index = None
    for index, line in enumerate(lines):
        if line.startswith("### Files"):
            start_index = index

    # Remove existing checksum table, along with everything that follows it
    if start_index is not None:
        del lines[start_index:]

    # Locate the first anchor heading. The for/else form reports a missing
    # heading reliably, including when the heading is the very first line.
    for anchor, line in enumerate(lines):
        if line.startswith("## Available datasets"):
            break
    else:
        raise ValueError("Could not find '## Available datasets' in README.md")

    # Build the new checksum table
    table = [
        "\n",
        "### Files\n",
        "\n",
        "| File | Size | Checksum |\n",
        "| ---- | ---- | -------- |\n",
    ]
    table.extend(
        f"| {file.relative_to(data_folder).as_posix()} "
        f"| {file_size_formatter(file.stat().st_size)} "
        f"| sha256:{checksum} |\n"
        for file, checksum in file_checksums.items()
    )

    # Splice the whole table in at once: inserting rows one by one at a
    # fixed index would leave them in reverse order.
    lines[anchor + 1 : anchor + 1] = table

    # Remove trailing newline
    if lines[-1].startswith("\n"):
        del lines[-1]

    registry = data_folder.joinpath("registry.txt")

    if dry_run:
        print(
            f"Dry run: would write {len(file_checksums)} checksums "
            f"to {readme} and {registry}."
        )
        return

    with readme.open("w", encoding="utf-8") as r:
        r.writelines(lines)
    print(f"Successfully wrote {len(file_checksums)} checksums to {readme}.")

    # Update the data registry file
    with registry.open("w", encoding="utf-8") as out:
        out.writelines(
            f"{file.relative_to(data_folder).as_posix()} sha256:{checksum}\n"
            for file, checksum in file_checksums.items()
        )
    print(f"Successfully wrote {len(file_checksums)} checksums to {registry}.")
# When executed as a script, regenerate the checksum table and the registry
# using the default arguments of main().
if __name__ == "__main__":
    main()