diff --git a/src/reuse/__init__.py b/src/reuse/__init__.py index 416f7d944..720c52710 100644 --- a/src/reuse/__init__.py +++ b/src/reuse/__init__.py @@ -22,13 +22,13 @@ import gettext import logging import os -import re from dataclasses import dataclass, field from enum import Enum from importlib.metadata import PackageNotFoundError, version from typing import Any, Optional from boolean.boolean import Expression +from license_expression import Licensing try: __version__ = version("reuse") @@ -43,6 +43,8 @@ _LOGGER = logging.getLogger(__name__) +_LICENSING = Licensing() + _PACKAGE_PATH = os.path.dirname(__file__) _LOCALE_DIR = os.path.join(_PACKAGE_PATH, "locale") @@ -54,43 +56,6 @@ _LOGGER.debug("no translations found at %s", _LOCALE_DIR) -_IGNORE_DIR_PATTERNS = [ - re.compile(r"^\.git$"), - re.compile(r"^\.hg$"), - re.compile(r"^\.sl$"), # Used by Sapling SCM - re.compile(r"^LICENSES$"), - re.compile(r"^\.reuse$"), -] - -_IGNORE_MESON_PARENT_DIR_PATTERNS = [ - re.compile(r"^subprojects$"), -] - -_IGNORE_FILE_PATTERNS = [ - # LICENSE, LICENSE-MIT, LICENSE.txt - re.compile(r"^LICEN[CS]E([-\.].*)?$"), - re.compile(r"^COPYING([-\.].*)?$"), - # ".git" as file happens in submodules - re.compile(r"^\.git$"), - re.compile(r"^\.hgtags$"), - re.compile(r".*\.license$"), - re.compile(r"^REUSE\.toml$"), - # Workaround for https://github.com/fsfe/reuse-tool/issues/229 - re.compile(r"^CAL-1.0(-Combined-Work-Exception)?(\..+)?$"), - re.compile(r"^SHL-2.1(\..+)?$"), -] - -_IGNORE_SPDX_PATTERNS = [ - # SPDX files from - # https://spdx.github.io/spdx-spec/conformance/#44-standard-data-format-requirements - re.compile(r".*\.spdx$"), - re.compile(r".*\.spdx.(rdf|json|xml|ya?ml)$"), -] - -# Combine SPDX patterns into file patterns to ease default ignore usage -_IGNORE_FILE_PATTERNS.extend(_IGNORE_SPDX_PATTERNS) - - class SourceType(Enum): """ An enumeration representing the types of sources for license information. 
diff --git a/src/reuse/_annotate.py b/src/reuse/_annotate.py index 564c0341b..2c49d6ac2 100644 --- a/src/reuse/_annotate.py +++ b/src/reuse/_annotate.py @@ -23,11 +23,7 @@ from jinja2.exceptions import TemplateNotFound from . import ReuseInfo -from ._util import ( - _determine_license_suffix_path, - contains_reuse_info, - detect_line_endings, -) +from ._util import _determine_license_suffix_path from .comment import ( NAME_STYLE_MAP, CommentStyle, @@ -35,6 +31,7 @@ get_comment_style, ) from .exceptions import CommentCreateError, MissingReuseInfoError +from .extract import contains_reuse_info, detect_line_endings from .header import add_new_header, find_and_replace_header from .i18n import _ from .project import Project diff --git a/src/reuse/_util.py b/src/reuse/_util.py index 04af137d5..af226cf1d 100644 --- a/src/reuse/_util.py +++ b/src/reuse/_util.py @@ -16,116 +16,16 @@ import logging import os -import re -import shutil import subprocess -from collections import Counter from hashlib import sha1 from inspect import cleandoc -from itertools import chain from pathlib import Path -from typing import IO, Any, BinaryIO, Iterator, Optional, Union +from typing import IO, Any, Optional, Union -from boolean.boolean import ParseError -from license_expression import ExpressionError, Licensing - -from . import ReuseInfo, SourceType -from .comment import _all_style_classes # TODO: This import is not ideal here. 
-from .i18n import _ from .types import StrPath -GIT_EXE = shutil.which("git") -HG_EXE = shutil.which("hg") -JUJUTSU_EXE = shutil.which("jj") -PIJUL_EXE = shutil.which("pijul") - -REUSE_IGNORE_START = "REUSE-IgnoreStart" -REUSE_IGNORE_END = "REUSE-IgnoreEnd" - -SPDX_SNIPPET_INDICATOR = b"SPDX-SnippetBegin" - -_LOGGER = logging.getLogger(__name__) -_LICENSING = Licensing() - # REUSE-IgnoreStart -_END_PATTERN = r"{}$".format( - "".join( - { - r"(?:{})*".format(item) # pylint: disable=consider-using-f-string - for item in chain( - ( - re.escape(style.MULTI_LINE.end) - for style in _all_style_classes() - if style.MULTI_LINE.end - ), - # These are special endings which do not belong to specific - # comment styles, but which we want to nonetheless strip away - # while parsing. - ( - ending - for ending in [ - # ex: - r'"\s*/*>', - r"'\s*/*>", - # ex: [SPDX-License-Identifier: GPL-3.0-or-later] :: - r"\]\s*::", - ] - ), - ) - } - ) -) -_LICENSE_IDENTIFIER_PATTERN = re.compile( - r"^(.*?)SPDX-License-Identifier:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE -) -_CONTRIBUTOR_PATTERN = re.compile( - r"^(.*?)SPDX-FileContributor:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE -) -# The keys match the relevant attributes of ReuseInfo. -_SPDX_TAGS: dict[str, re.Pattern] = { - "spdx_expressions": _LICENSE_IDENTIFIER_PATTERN, - "contributor_lines": _CONTRIBUTOR_PATTERN, -} - -_COPYRIGHT_PATTERNS = [ - re.compile( - r"(?P(?PSPDX-(File|Snippet)CopyrightText:" - r"(\s(\([Cc]\)|©|Copyright(\s(©|\([Cc]\)))?))?)\s+" - r"((?P\d{4} ?- ?\d{4}|\d{4}),?\s+)?" - r"(?P.*?))" + _END_PATTERN - ), - re.compile( - r"(?P(?PCopyright(\s(\([Cc]\)|©))?)\s+" - r"((?P\d{4} ?- ?\d{4}|\d{4}),?\s+)?" - r"(?P.*?))" + _END_PATTERN - ), - re.compile( - r"(?P(?P©)\s+" - r"((?P\d{4} ?- ?\d{4}|\d{4}),?\s+)?" 
- r"(?P.*?))" + _END_PATTERN - ), -] -_COPYRIGHT_PREFIXES = { - "spdx": "SPDX-FileCopyrightText:", - "spdx-c": "SPDX-FileCopyrightText: (C)", - "spdx-string-c": "SPDX-FileCopyrightText: Copyright (C)", - "spdx-string": "SPDX-FileCopyrightText: Copyright", - "spdx-string-symbol": "SPDX-FileCopyrightText: Copyright ©", - "spdx-symbol": "SPDX-FileCopyrightText: ©", - "string": "Copyright", - "string-c": "Copyright (C)", - "string-symbol": "Copyright ©", - "symbol": "©", -} - -_LICENSEREF_PATTERN = re.compile("LicenseRef-[a-zA-Z0-9-.]+$") - -# Amount of bytes that we assume will be big enough to contain the entire -# comment header (including SPDX tags), so that we don't need to read the -# entire file. -_HEADER_BYTES = 4096 - def setup_logging(level: int = logging.WARNING) -> None: """Configure logging for reuse. @@ -191,22 +91,6 @@ def find_licenses_directory(root: Optional[StrPath] = None) -> Path: return licenses_path -def decoded_text_from_binary( - binary_file: BinaryIO, size: Optional[int] = None -) -> str: - """Given a binary file object, detect its encoding and return its contents - as a decoded string. Do not throw any errors if the encoding contains - errors: Just replace the false characters. - - If *size* is specified, only read so many bytes. - """ - if size is None: - size = -1 - rawdata = binary_file.read(size) - result = rawdata.decode("utf-8", errors="replace") - return result.replace("\r\n", "\n") - - def _determine_license_path(path: StrPath) -> Path: """Given a path FILE, return FILE.license if it exists, otherwise return FILE. 
@@ -225,169 +109,6 @@ def _determine_license_suffix_path(path: StrPath) -> Path: return Path(f"{path}.license") -def _parse_copyright_year(year: Optional[str]) -> list[str]: - """Parse copyright years and return list.""" - ret: list[str] = [] - if not year: - return ret - if re.match(r"\d{4}$", year): - ret = [year] - elif re.match(r"\d{4} ?- ?\d{4}$", year): - ret = [year[:4], year[-4:]] - return ret - - -def _contains_snippet(binary_file: BinaryIO) -> bool: - """Check if a file seems to contain a SPDX snippet""" - # Assumes that if SPDX_SNIPPET_INDICATOR (SPDX-SnippetBegin) is found in a - # file, the file contains a snippet - content = binary_file.read() - if SPDX_SNIPPET_INDICATOR in content: - return True - return False - - -def merge_copyright_lines(copyright_lines: set[str]) -> set[str]: - """Parse all copyright lines and merge identical statements making years - into a range. - - If a same statement uses multiple prefixes, use only the most frequent one. - """ - # pylint: disable=too-many-locals - # TODO: Rewrite this function. It's a bit of a mess. - copyright_in = [] - for line in copyright_lines: - for pattern in _COPYRIGHT_PATTERNS: - match = pattern.search(line) - if match is not None: - copyright_in.append( - { - "statement": match.groupdict()["statement"], - "year": _parse_copyright_year( - match.groupdict()["year"] - ), - "prefix": match.groupdict()["prefix"], - } - ) - break - - copyright_out = set() - for line_info in copyright_in: - statement = str(line_info["statement"]) - copyright_list = [ - item for item in copyright_in if item["statement"] == statement - ] - - # Get the most common prefix. 
- most_common = str( - Counter([item["prefix"] for item in copyright_list]).most_common(1)[ - 0 - ][0] - ) - prefix = "spdx" - for key, value in _COPYRIGHT_PREFIXES.items(): - if most_common == value: - prefix = key - break - - # get year range if any - years: list[str] = [] - for copy in copyright_list: - years += copy["year"] - - year: Optional[str] = None - if years: - if min(years) == max(years): - year = min(years) - else: - year = f"{min(years)} - {max(years)}" - - copyright_out.add(make_copyright_line(statement, year, prefix)) - return copyright_out - - -def extract_reuse_info(text: str) -> ReuseInfo: - """Extract REUSE information from comments in a string. - - Raises: - ExpressionError: if an SPDX expression could not be parsed. - ParseError: if an SPDX expression could not be parsed. - """ - text = filter_ignore_block(text) - spdx_tags: dict[str, set[str]] = {} - for tag, pattern in _SPDX_TAGS.items(): - spdx_tags[tag] = set(find_spdx_tag(text, pattern)) - # License expressions and copyright matches are special cases. - expressions = set() - copyright_matches = set() - for expression in spdx_tags.pop("spdx_expressions"): - try: - expressions.add(_LICENSING.parse(expression)) - except (ExpressionError, ParseError): - _LOGGER.error( - _("Could not parse '{expression}'").format( - expression=expression - ) - ) - raise - for line in text.splitlines(): - for pattern in _COPYRIGHT_PATTERNS: - match = pattern.search(line) - if match is not None: - copyright_matches.add(match.groupdict()["copyright"].strip()) - break - - return ReuseInfo( - spdx_expressions=expressions, - copyright_lines=copyright_matches, - **spdx_tags, # type: ignore - ) - - -def reuse_info_of_file( - path: StrPath, original_path: StrPath, root: StrPath -) -> ReuseInfo: - """Open *path* and return its :class:`ReuseInfo`. - - Normally only the first few :const:`_HEADER_BYTES` are read. But if a - snippet was detected, the entire file is read. 
- """ - path = Path(path) - with path.open("rb") as fp: - try: - read_limit: Optional[int] = _HEADER_BYTES - # Completely read the file once - # to search for possible snippets - if _contains_snippet(fp): - _LOGGER.debug(f"'{path}' seems to contain an SPDX Snippet") - read_limit = None - # Reset read position - fp.seek(0) - # Scan the file for REUSE info, possibly limiting the read - # length - file_result = extract_reuse_info( - decoded_text_from_binary(fp, size=read_limit) - ) - if file_result.contains_copyright_or_licensing(): - source_type = SourceType.FILE_HEADER - if path.suffix == ".license": - source_type = SourceType.DOT_LICENSE - return file_result.copy( - path=relative_from_root(original_path, root).as_posix(), - source_path=relative_from_root(path, root).as_posix(), - source_type=source_type, - ) - - except (ExpressionError, ParseError): - _LOGGER.error( - _( - "'{path}' holds an SPDX expression that cannot be" - " parsed, skipping the file" - ).format(path=path) - ) - return ReuseInfo() - - def relative_from_root(path: StrPath, root: StrPath) -> Path: """A helper function to get *path* relative to *root*.""" path = Path(path) @@ -397,88 +118,6 @@ def relative_from_root(path: StrPath, root: StrPath) -> Path: return Path(os.path.relpath(path, start=root)) -def find_spdx_tag(text: str, pattern: re.Pattern) -> Iterator[str]: - """Extract all the values in *text* matching *pattern*'s regex, taking care - of stripping extraneous whitespace of formatting. - """ - for prefix, value in pattern.findall(text): - prefix, value = prefix.strip(), value.strip() - - # Some comment headers have ASCII art to "frame" the comment, like this: - # - # /***********************\ - # |* This is a comment *| - # \***********************/ - # - # To ensure we parse them correctly, if the line ends with the inverse - # of the comment prefix, we strip that suffix. See #343 for a real - # world example of a project doing this (LLVM). 
- suffix = prefix[::-1] - if suffix and value.endswith(suffix): - value = value[: -len(suffix)] - - yield value.strip() - - -def filter_ignore_block(text: str) -> str: - """Filter out blocks beginning with REUSE_IGNORE_START and ending with - REUSE_IGNORE_END to remove lines that should not be treated as copyright and - licensing information. - """ - ignore_start = None - ignore_end = None - if REUSE_IGNORE_START in text: - ignore_start = text.index(REUSE_IGNORE_START) - if REUSE_IGNORE_END in text: - ignore_end = text.index(REUSE_IGNORE_END) + len(REUSE_IGNORE_END) - if not ignore_start: - return text - if not ignore_end: - return text[:ignore_start] - if ignore_end > ignore_start: - return text[:ignore_start] + filter_ignore_block(text[ignore_end:]) - rest = text[ignore_start + len(REUSE_IGNORE_START) :] - if REUSE_IGNORE_END in rest: - ignore_end = rest.index(REUSE_IGNORE_END) + len(REUSE_IGNORE_END) - return text[:ignore_start] + filter_ignore_block(rest[ignore_end:]) - return text[:ignore_start] - - -def contains_reuse_info(text: str) -> bool: - """The text contains REUSE info.""" - try: - return bool(extract_reuse_info(text)) - except (ExpressionError, ParseError): - return False - - -def make_copyright_line( - statement: str, year: Optional[str] = None, copyright_prefix: str = "spdx" -) -> str: - """Given a statement, prefix it with ``SPDX-FileCopyrightText:`` if it is - not already prefixed with some manner of copyright tag. - """ - if "\n" in statement: - raise RuntimeError(f"Unexpected newline in '{statement}'") - - prefix = _COPYRIGHT_PREFIXES.get(copyright_prefix) - if prefix is None: - # TODO: Maybe translate this. Also maybe reduce DRY here. 
- raise RuntimeError( - "Unexpected copyright prefix: Need 'spdx', 'spdx-c', " - "'spdx-symbol', 'string', 'string-c', " - "'string-symbol', or 'symbol'" - ) - - for pattern in _COPYRIGHT_PATTERNS: - match = pattern.search(statement) - if match is not None: - return statement - if year is not None: - return f"{prefix} {year} {statement}" - return f"{prefix} {statement}" - - def _checksum(path: StrPath) -> str: path = Path(path) @@ -490,17 +129,6 @@ def _checksum(path: StrPath) -> str: return file_sha1.hexdigest() -def detect_line_endings(text: str) -> str: - """Return one of '\n', '\r' or '\r\n' depending on the line endings used in - *text*. Return os.linesep if there are no line endings. - """ - line_endings = ["\r\n", "\r", "\n"] - for line_ending in line_endings: - if line_ending in text: - return line_ending - return os.linesep - - def cleandoc_nl(text: str) -> str: """Like :func:`inspect.cleandoc`, but with a newline at the end.""" return cleandoc(text) + "\n" diff --git a/src/reuse/cli/annotate.py b/src/reuse/cli/annotate.py index 4d3dd3904..ff40bfd01 100644 --- a/src/reuse/cli/annotate.py +++ b/src/reuse/cli/annotate.py @@ -29,12 +29,7 @@ from .. 
import ReuseInfo from .._annotate import add_header_to_file -from .._util import ( - _COPYRIGHT_PREFIXES, - _determine_license_path, - _determine_license_suffix_path, - make_copyright_line, -) +from .._util import _determine_license_path, _determine_license_suffix_path from ..comment import ( NAME_STYLE_MAP, CommentStyle, @@ -42,6 +37,7 @@ has_style, is_uncommentable, ) +from ..copyright import _COPYRIGHT_PREFIXES, make_copyright_line from ..i18n import _ from ..project import Project from .common import ClickObj, MutexOption, spdx_identifier diff --git a/src/reuse/cli/common.py b/src/reuse/cli/common.py index c2ce22132..0345cf24f 100644 --- a/src/reuse/cli/common.py +++ b/src/reuse/cli/common.py @@ -12,7 +12,7 @@ from boolean.boolean import Expression, ParseError from license_expression import ExpressionError -from .._util import _LICENSING +from .. import _LICENSING from ..exceptions import GlobalLicensingConflictError, GlobalLicensingParseError from ..i18n import _ from ..project import Project diff --git a/src/reuse/cli/spdx.py b/src/reuse/cli/spdx.py index 11a9933b5..1f29e4e0a 100644 --- a/src/reuse/cli/spdx.py +++ b/src/reuse/cli/spdx.py @@ -12,7 +12,7 @@ import click -from .. import _IGNORE_SPDX_PATTERNS +from ..covered_files import _IGNORE_SPDX_PATTERNS from ..i18n import _ from ..report import ProjectReport from .common import ClickObj diff --git a/src/reuse/copyright.py b/src/reuse/copyright.py new file mode 100644 index 000000000..4f4df9ff2 --- /dev/null +++ b/src/reuse/copyright.py @@ -0,0 +1,122 @@ +# SPDX-FileCopyrightText: 2024 Free Software Foundation Europe e.V. +# +# SPDX-License-Identifier: GPL-3.0-or-later + +"""Utilities related to the parsing and storing of copyright notices.""" + +import re +from collections import Counter +from typing import Optional + +from .extract import _COPYRIGHT_PATTERNS # TODO: Get rid of this import. 
+ +_COPYRIGHT_PREFIXES = { + "spdx": "SPDX-FileCopyrightText:", + "spdx-c": "SPDX-FileCopyrightText: (C)", + "spdx-string-c": "SPDX-FileCopyrightText: Copyright (C)", + "spdx-string": "SPDX-FileCopyrightText: Copyright", + "spdx-string-symbol": "SPDX-FileCopyrightText: Copyright ©", + "spdx-symbol": "SPDX-FileCopyrightText: ©", + "string": "Copyright", + "string-c": "Copyright (C)", + "string-symbol": "Copyright ©", + "symbol": "©", +} + + +def merge_copyright_lines(copyright_lines: set[str]) -> set[str]: + """Parse all copyright lines and merge identical statements making years + into a range. + + If a same statement uses multiple prefixes, use only the most frequent one. + """ + # pylint: disable=too-many-locals + # TODO: Rewrite this function. It's a bit of a mess. + copyright_in = [] + for line in copyright_lines: + for pattern in _COPYRIGHT_PATTERNS: + match = pattern.search(line) + if match is not None: + copyright_in.append( + { + "statement": match.groupdict()["statement"], + "year": _parse_copyright_year( + match.groupdict()["year"] + ), + "prefix": match.groupdict()["prefix"], + } + ) + break + + copyright_out = set() + for line_info in copyright_in: + statement = str(line_info["statement"]) + copyright_list = [ + item for item in copyright_in if item["statement"] == statement + ] + + # Get the most common prefix. 
+ most_common = str( + Counter([item["prefix"] for item in copyright_list]).most_common(1)[ + 0 + ][0] + ) + prefix = "spdx" + for key, value in _COPYRIGHT_PREFIXES.items(): + if most_common == value: + prefix = key + break + + # get year range if any + years: list[str] = [] + for copy in copyright_list: + years += copy["year"] + + year: Optional[str] = None + if years: + if min(years) == max(years): + year = min(years) + else: + year = f"{min(years)} - {max(years)}" + + copyright_out.add(make_copyright_line(statement, year, prefix)) + return copyright_out + + +def make_copyright_line( + statement: str, year: Optional[str] = None, copyright_prefix: str = "spdx" +) -> str: + """Given a statement, prefix it with ``SPDX-FileCopyrightText:`` if it is + not already prefixed with some manner of copyright tag. + """ + if "\n" in statement: + raise RuntimeError(f"Unexpected newline in '{statement}'") + + prefix = _COPYRIGHT_PREFIXES.get(copyright_prefix) + if prefix is None: + # TODO: Maybe translate this. Also maybe reduce DRY here. 
+ raise RuntimeError( + "Unexpected copyright prefix: Need 'spdx', 'spdx-c', " + "'spdx-symbol', 'string', 'string-c', " + "'string-symbol', or 'symbol'" + ) + + for pattern in _COPYRIGHT_PATTERNS: + match = pattern.search(statement) + if match is not None: + return statement + if year is not None: + return f"{prefix} {year} {statement}" + return f"{prefix} {statement}" + + +def _parse_copyright_year(year: Optional[str]) -> list[str]: + """Parse copyright years and return list.""" + ret: list[str] = [] + if not year: + return ret + if re.match(r"\d{4}$", year): + ret = [year] + elif re.match(r"\d{4} ?- ?\d{4}$", year): + ret = [year[:4], year[-4:]] + return ret diff --git a/src/reuse/covered_files.py b/src/reuse/covered_files.py index ad4e019b6..cbe143550 100644 --- a/src/reuse/covered_files.py +++ b/src/reuse/covered_files.py @@ -11,19 +11,51 @@ import contextlib import logging import os +import re from pathlib import Path from typing import Collection, Generator, Optional, cast -from . 
import ( - _IGNORE_DIR_PATTERNS, - _IGNORE_FILE_PATTERNS, - _IGNORE_MESON_PARENT_DIR_PATTERNS, -) from .types import StrPath from .vcs import VCSStrategy _LOGGER = logging.getLogger(__name__) +_IGNORE_DIR_PATTERNS = [ + re.compile(r"^\.git$"), + re.compile(r"^\.hg$"), + re.compile(r"^\.sl$"), # Used by Sapling SCM + re.compile(r"^LICENSES$"), + re.compile(r"^\.reuse$"), +] + +_IGNORE_MESON_PARENT_DIR_PATTERNS = [ + re.compile(r"^subprojects$"), +] + +_IGNORE_FILE_PATTERNS = [ + # LICENSE, LICENSE-MIT, LICENSE.txt + re.compile(r"^LICEN[CS]E([-\.].*)?$"), + re.compile(r"^COPYING([-\.].*)?$"), + # ".git" as file happens in submodules + re.compile(r"^\.git$"), + re.compile(r"^\.hgtags$"), + re.compile(r".*\.license$"), + re.compile(r"^REUSE\.toml$"), + # Workaround for https://github.com/fsfe/reuse-tool/issues/229 + re.compile(r"^CAL-1.0(-Combined-Work-Exception)?(\..+)?$"), + re.compile(r"^SHL-2.1(\..+)?$"), +] + +_IGNORE_SPDX_PATTERNS = [ + # SPDX files from + # https://spdx.github.io/spdx-spec/conformance/#44-standard-data-format-requirements + re.compile(r".*\.spdx$"), + re.compile(r".*\.spdx.(rdf|json|xml|ya?ml)$"), +] + +# Combine SPDX patterns into file patterns to ease default ignore usage +_IGNORE_FILE_PATTERNS.extend(_IGNORE_SPDX_PATTERNS) + def is_path_ignored( path: Path, diff --git a/src/reuse/download.py b/src/reuse/download.py index 06ef1c944..677a0d094 100644 --- a/src/reuse/download.py +++ b/src/reuse/download.py @@ -15,7 +15,8 @@ from urllib.error import URLError from urllib.parse import urljoin -from ._util import _LICENSEREF_PATTERN, find_licenses_directory +from ._util import find_licenses_directory +from .extract import _LICENSEREF_PATTERN from .project import Project from .types import StrPath from .vcs import VCSStrategyNone diff --git a/src/reuse/extract.py b/src/reuse/extract.py new file mode 100644 index 000000000..34bcd0faf --- /dev/null +++ b/src/reuse/extract.py @@ -0,0 +1,282 @@ +# SPDX-FileCopyrightText: 2017 Free Software Foundation 
Europe e.V. +# SPDX-FileCopyrightText: 2020 Tuomas Siipola +# SPDX-FileCopyrightText: 2022 Carmen Bianca Bakker +# SPDX-FileCopyrightText: 2022 Florian Snow +# SPDX-FileCopyrightText: 2022 Nico Rikken +# SPDX-FileCopyrightText: 2022 Pietro Albini +# SPDX-FileCopyrightText: 2023 DB Systel GmbH +# SPDX-FileCopyrightText: 2023 Johannes Zarl-Zierl +# SPDX-FileCopyrightText: 2024 Rivos Inc. +# SPDX-FileCopyrightText: 2024 Skyler Grey +# SPDX-FileCopyrightText: © 2020 Liferay, Inc. +# +# SPDX-License-Identifier: GPL-3.0-or-later + +"""Utilities related to the extraction of REUSE information out of files.""" + +import logging +import os +import re +from itertools import chain +from pathlib import Path +from typing import BinaryIO, Iterator, Optional + +from boolean.boolean import ParseError +from license_expression import ExpressionError + +from . import _LICENSING, ReuseInfo, SourceType +from ._util import relative_from_root +from .comment import _all_style_classes +from .i18n import _ +from .types import StrPath + +REUSE_IGNORE_START = "REUSE-IgnoreStart" +REUSE_IGNORE_END = "REUSE-IgnoreEnd" + +# REUSE-IgnoreStart + +SPDX_SNIPPET_INDICATOR = b"SPDX-SnippetBegin" + +_LOGGER = logging.getLogger(__name__) + +_END_PATTERN = r"{}$".format( + "".join( + { + r"(?:{})*".format(item) # pylint: disable=consider-using-f-string + for item in chain( + ( + re.escape(style.MULTI_LINE.end) + for style in _all_style_classes() + if style.MULTI_LINE.end + ), + # These are special endings which do not belong to specific + # comment styles, but which we want to nonetheless strip away + # while parsing. 
+ ( + ending + for ending in [ + # ex: + r'"\s*/*>', + r"'\s*/*>", + # ex: [SPDX-License-Identifier: GPL-3.0-or-later] :: + r"\]\s*::", + ] + ), + ) + } + ) +) +_LICENSE_IDENTIFIER_PATTERN = re.compile( + r"^(.*?)SPDX-License-Identifier:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE +) +_CONTRIBUTOR_PATTERN = re.compile( + r"^(.*?)SPDX-FileContributor:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE +) +# The keys match the relevant attributes of ReuseInfo. +_SPDX_TAGS: dict[str, re.Pattern] = { + "spdx_expressions": _LICENSE_IDENTIFIER_PATTERN, + "contributor_lines": _CONTRIBUTOR_PATTERN, +} + +_COPYRIGHT_PATTERNS = [ + re.compile( + r"(?P(?PSPDX-(File|Snippet)CopyrightText:" + r"(\s(\([Cc]\)|©|Copyright(\s(©|\([Cc]\)))?))?)\s+" + r"((?P\d{4} ?- ?\d{4}|\d{4}),?\s+)?" + r"(?P.*?))" + _END_PATTERN + ), + re.compile( + r"(?P(?PCopyright(\s(\([Cc]\)|©))?)\s+" + r"((?P\d{4} ?- ?\d{4}|\d{4}),?\s+)?" + r"(?P.*?))" + _END_PATTERN + ), + re.compile( + r"(?P(?P©)\s+" + r"((?P\d{4} ?- ?\d{4}|\d{4}),?\s+)?" + r"(?P.*?))" + _END_PATTERN + ), +] + +_LICENSEREF_PATTERN = re.compile("LicenseRef-[a-zA-Z0-9-.]+$") + +# Amount of bytes that we assume will be big enough to contain the entire +# comment header (including SPDX tags), so that we don't need to read the +# entire file. +_HEADER_BYTES = 4096 + + +def decoded_text_from_binary( + binary_file: BinaryIO, size: Optional[int] = None +) -> str: + """Given a binary file object, detect its encoding and return its contents + as a decoded string. Do not throw any errors if the encoding contains + errors: Just replace the false characters. + + If *size* is specified, only read so many bytes. 
+ """ + if size is None: + size = -1 + rawdata = binary_file.read(size) + result = rawdata.decode("utf-8", errors="replace") + return result.replace("\r\n", "\n") + + +def _contains_snippet(binary_file: BinaryIO) -> bool: + """Check if a file seems to contain a SPDX snippet""" + # Assumes that if SPDX_SNIPPET_INDICATOR (SPDX-SnippetBegin) is found in a + # file, the file contains a snippet + content = binary_file.read() + if SPDX_SNIPPET_INDICATOR in content: + return True + return False + + +def extract_reuse_info(text: str) -> ReuseInfo: + """Extract REUSE information from comments in a string. + + Raises: + ExpressionError: if an SPDX expression could not be parsed. + ParseError: if an SPDX expression could not be parsed. + """ + text = filter_ignore_block(text) + spdx_tags: dict[str, set[str]] = {} + for tag, pattern in _SPDX_TAGS.items(): + spdx_tags[tag] = set(find_spdx_tag(text, pattern)) + # License expressions and copyright matches are special cases. + expressions = set() + copyright_matches = set() + for expression in spdx_tags.pop("spdx_expressions"): + try: + expressions.add(_LICENSING.parse(expression)) + except (ExpressionError, ParseError): + _LOGGER.error( + _("Could not parse '{expression}'").format( + expression=expression + ) + ) + raise + for line in text.splitlines(): + for pattern in _COPYRIGHT_PATTERNS: + match = pattern.search(line) + if match is not None: + copyright_matches.add(match.groupdict()["copyright"].strip()) + break + + return ReuseInfo( + spdx_expressions=expressions, + copyright_lines=copyright_matches, + **spdx_tags, # type: ignore + ) + + +def reuse_info_of_file( + path: StrPath, original_path: StrPath, root: StrPath +) -> ReuseInfo: + """Open *path* and return its :class:`ReuseInfo`. + + Normally only the first few :const:`_HEADER_BYTES` are read. But if a + snippet was detected, the entire file is read. 
+ """ + path = Path(path) + with path.open("rb") as fp: + try: + read_limit: Optional[int] = _HEADER_BYTES + # Completely read the file once + # to search for possible snippets + if _contains_snippet(fp): + _LOGGER.debug(f"'{path}' seems to contain an SPDX Snippet") + read_limit = None + # Reset read position + fp.seek(0) + # Scan the file for REUSE info, possibly limiting the read + # length + file_result = extract_reuse_info( + decoded_text_from_binary(fp, size=read_limit) + ) + if file_result.contains_copyright_or_licensing(): + source_type = SourceType.FILE_HEADER + if path.suffix == ".license": + source_type = SourceType.DOT_LICENSE + return file_result.copy( + path=relative_from_root(original_path, root).as_posix(), + source_path=relative_from_root(path, root).as_posix(), + source_type=source_type, + ) + + except (ExpressionError, ParseError): + _LOGGER.error( + _( + "'{path}' holds an SPDX expression that cannot be" + " parsed, skipping the file" + ).format(path=path) + ) + return ReuseInfo() + + +def find_spdx_tag(text: str, pattern: re.Pattern) -> Iterator[str]: + """Extract all the values in *text* matching *pattern*'s regex, taking care + of stripping extraneous whitespace of formatting. + """ + for prefix, value in pattern.findall(text): + prefix, value = prefix.strip(), value.strip() + + # Some comment headers have ASCII art to "frame" the comment, like this: + # + # /***********************\ + # |* This is a comment *| + # \***********************/ + # + # To ensure we parse them correctly, if the line ends with the inverse + # of the comment prefix, we strip that suffix. See #343 for a real + # world example of a project doing this (LLVM). 
+ suffix = prefix[::-1] + if suffix and value.endswith(suffix): + value = value[: -len(suffix)] + + yield value.strip() + + +def filter_ignore_block(text: str) -> str: + """Filter out blocks beginning with REUSE_IGNORE_START and ending with + REUSE_IGNORE_END to remove lines that should not be treated as copyright and + licensing information. + """ + ignore_start = None + ignore_end = None + if REUSE_IGNORE_START in text: + ignore_start = text.index(REUSE_IGNORE_START) + if REUSE_IGNORE_END in text: + ignore_end = text.index(REUSE_IGNORE_END) + len(REUSE_IGNORE_END) + if not ignore_start: + return text + if not ignore_end: + return text[:ignore_start] + if ignore_end > ignore_start: + return text[:ignore_start] + filter_ignore_block(text[ignore_end:]) + rest = text[ignore_start + len(REUSE_IGNORE_START) :] + if REUSE_IGNORE_END in rest: + ignore_end = rest.index(REUSE_IGNORE_END) + len(REUSE_IGNORE_END) + return text[:ignore_start] + filter_ignore_block(rest[ignore_end:]) + return text[:ignore_start] + + +def contains_reuse_info(text: str) -> bool: + """The text contains REUSE info.""" + try: + return bool(extract_reuse_info(text)) + except (ExpressionError, ParseError): + return False + + +def detect_line_endings(text: str) -> str: + """Return one of '\n', '\r' or '\r\n' depending on the line endings used in + *text*. Return os.linesep if there are no line endings. + """ + line_endings = ["\r\n", "\r", "\n"] + for line_ending in line_endings: + if line_ending in text: + return line_ending + return os.linesep + + +# REUSE-IgnoreEnd diff --git a/src/reuse/global_licensing.py b/src/reuse/global_licensing.py index bacc8f92a..c8ad9cf1e 100644 --- a/src/reuse/global_licensing.py +++ b/src/reuse/global_licensing.py @@ -33,8 +33,7 @@ from debian.copyright import Error as DebianError from license_expression import ExpressionError -from . import ReuseInfo, SourceType -from ._util import _LICENSING +from . 
import _LICENSING, ReuseInfo, SourceType from .covered_files import iter_files from .exceptions import ( GlobalLicensingParseError, diff --git a/src/reuse/header.py b/src/reuse/header.py index 6000d9a84..0538155a3 100644 --- a/src/reuse/header.py +++ b/src/reuse/header.py @@ -23,17 +23,14 @@ from license_expression import ExpressionError from . import ReuseInfo -from ._util import ( - contains_reuse_info, - extract_reuse_info, - merge_copyright_lines, -) from .comment import CommentStyle, EmptyCommentStyle, PythonCommentStyle +from .copyright import merge_copyright_lines from .exceptions import ( CommentCreateError, CommentParseError, MissingReuseInfoError, ) +from .extract import contains_reuse_info, extract_reuse_info from .i18n import _ _LOGGER = logging.getLogger(__name__) diff --git a/src/reuse/project.py b/src/reuse/project.py index cf6ca24d5..4b7425f5b 100644 --- a/src/reuse/project.py +++ b/src/reuse/project.py @@ -23,17 +23,13 @@ from . import ReuseInfo from ._licenses import EXCEPTION_MAP, LICENSE_MAP -from ._util import ( - _LICENSEREF_PATTERN, - _determine_license_path, - relative_from_root, - reuse_info_of_file, -) +from ._util import _determine_license_path, relative_from_root from .covered_files import iter_files from .exceptions import ( GlobalLicensingConflictError, SpdxIdentifierNotFoundError, ) +from .extract import _LICENSEREF_PATTERN, reuse_info_of_file from .global_licensing import ( GlobalLicensing, NestedReuseTOML, diff --git a/src/reuse/report.py b/src/reuse/report.py index dd2a0355a..4277a9cec 100644 --- a/src/reuse/report.py +++ b/src/reuse/report.py @@ -31,8 +31,9 @@ ) from uuid import uuid4 -from . import __REUSE_version__, __version__ -from ._util import _LICENSEREF_PATTERN, _LICENSING, _checksum +from . 
import _LICENSING, __REUSE_version__, __version__ +from ._util import _checksum +from .extract import _LICENSEREF_PATTERN from .global_licensing import ReuseDep5 from .i18n import _ from .project import Project, ReuseInfo diff --git a/src/reuse/vcs.py b/src/reuse/vcs.py index 135c91393..8c1cb5c8e 100644 --- a/src/reuse/vcs.py +++ b/src/reuse/vcs.py @@ -12,19 +12,13 @@ import logging import os +import shutil from abc import ABC, abstractmethod from inspect import isclass from pathlib import Path from typing import TYPE_CHECKING, Generator, Optional, Type -from ._util import ( - GIT_EXE, - HG_EXE, - JUJUTSU_EXE, - PIJUL_EXE, - execute_command, - relative_from_root, -) +from ._util import execute_command, relative_from_root from .types import StrPath if TYPE_CHECKING: @@ -32,6 +26,11 @@ _LOGGER = logging.getLogger(__name__) +GIT_EXE = shutil.which("git") +HG_EXE = shutil.which("hg") +JUJUTSU_EXE = shutil.which("jj") +PIJUL_EXE = shutil.which("pijul") + class VCSStrategy(ABC): """Strategy pattern for version control systems.""" diff --git a/tests/conftest.py b/tests/conftest.py index 075a5fe4d..929533075 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -39,14 +39,9 @@ except ImportError: sys.path.append(os.path.join(Path(__file__).parent.parent, "src")) finally: - from reuse._util import ( - GIT_EXE, - HG_EXE, - JUJUTSU_EXE, - PIJUL_EXE, - setup_logging, - ) + from reuse._util import setup_logging from reuse.global_licensing import ReuseDep5 + from reuse.vcs import GIT_EXE, HG_EXE, JUJUTSU_EXE, PIJUL_EXE CWD = Path.cwd() @@ -117,7 +112,6 @@ def optional_git_exe( """Run the test with or without git.""" exe = GIT_EXE if request.param else "" monkeypatch.setattr("reuse.vcs.GIT_EXE", exe) - monkeypatch.setattr("reuse._util.GIT_EXE", exe) yield exe @@ -136,7 +130,6 @@ def optional_hg_exe( """Run the test with or without mercurial.""" exe = HG_EXE if request.param else "" monkeypatch.setattr("reuse.vcs.HG_EXE", exe) - monkeypatch.setattr("reuse._util.HG_EXE", exe) 
yield exe @@ -155,7 +148,6 @@ def optional_jujutsu_exe( """Run the test with or without Jujutsu.""" exe = JUJUTSU_EXE if request.param else "" monkeypatch.setattr("reuse.vcs.JUJUTSU_EXE", exe) - monkeypatch.setattr("reuse._util.JUJUTSU_EXE", exe) yield exe @@ -174,7 +166,6 @@ def optional_pijul_exe( """Run the test with or without Pijul.""" exe = PIJUL_EXE if request.param else "" monkeypatch.setattr("reuse.vcs.PIJUL_EXE", exe) - monkeypatch.setattr("reuse._util.PIJUL_EXE", exe) yield exe diff --git a/tests/test_cli_annotate.py b/tests/test_cli_annotate.py index 4aecfd85e..0326af280 100644 --- a/tests/test_cli_annotate.py +++ b/tests/test_cli_annotate.py @@ -17,8 +17,8 @@ import pytest from click.testing import CliRunner -from reuse._util import _COPYRIGHT_PREFIXES from reuse.cli.main import main +from reuse.copyright import _COPYRIGHT_PREFIXES # pylint: disable=too-many-public-methods,too-many-lines,unused-argument diff --git a/tests/test_copyright.py b/tests/test_copyright.py new file mode 100644 index 000000000..662b5ecb5 --- /dev/null +++ b/tests/test_copyright.py @@ -0,0 +1,135 @@ +# SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. +# SPDX-FileCopyrightText: 2022 Carmen Bianca Bakker +# SPDX-FileCopyrightText: 2022 Florian Snow +# SPDX-FileCopyrightText: 2022 Nico Rikken +# SPDX-FileCopyrightText: 2022 Pietro Albini +# SPDX-FileCopyrightText: 2024 Rivos Inc. +# SPDX-FileCopyrightText: © 2020 Liferay, Inc. 
+# +# SPDX-License-Identifier: GPL-3.0-or-later + +"""Tests for reuse.copyright""" + +import pytest + +from reuse.copyright import make_copyright_line + +# REUSE-IgnoreStart + + +def test_make_copyright_line_simple(): + """Given a simple statement, make it a copyright line.""" + assert make_copyright_line("hello") == "SPDX-FileCopyrightText: hello" + + +def test_make_copyright_line_year(): + """Given a simple statement and a year, make it a copyright line.""" + assert ( + make_copyright_line("hello", year="2019") + == "SPDX-FileCopyrightText: 2019 hello" + ) + + +def test_make_copyright_line_prefix_spdx(): + """Given a simple statement and prefix, make it a copyright line.""" + statement = make_copyright_line("hello", copyright_prefix="spdx") + assert statement == "SPDX-FileCopyrightText: hello" + + +def test_make_copyright_line_prefix_spdx_year(): + """Given a simple statement, prefix and a year, make it a copyright line.""" + statement = make_copyright_line("hello", year=2019, copyright_prefix="spdx") + assert statement == "SPDX-FileCopyrightText: 2019 hello" + + +def test_make_copyright_line_prefix_spdx_c_year(): + """Given a simple statement, prefix and a year, make it a copyright line.""" + statement = make_copyright_line( + "hello", year=2019, copyright_prefix="spdx-c" + ) + assert statement == "SPDX-FileCopyrightText: (C) 2019 hello" + + +def test_make_copyright_line_prefix_spdx_symbol_year(): + """Given a simple statement, prefix and a year, make it a copyright line.""" + statement = make_copyright_line( + "hello", year=2019, copyright_prefix="spdx-symbol" + ) + assert statement == "SPDX-FileCopyrightText: © 2019 hello" + + +def test_make_copyright_line_prefix_string_year(): + """Given a simple statement, prefix and a year, make it a copyright line.""" + statement = make_copyright_line( + "hello", year=2019, copyright_prefix="string" + ) + assert statement == "Copyright 2019 hello" + + +def test_make_copyright_line_prefix_string_c_year(): + """Given a 
simple statement, prefix and a year, make it a copyright line.""" + statement = make_copyright_line( + "hello", year=2019, copyright_prefix="string-c" + ) + assert statement == "Copyright (C) 2019 hello" + + +def test_make_copyright_line_prefix_spdx_string_c_year(): + """Given a simple statement, prefix and a year, make it a copyright line.""" + statement = make_copyright_line( + "hello", year=2019, copyright_prefix="spdx-string-c" + ) + assert statement == "SPDX-FileCopyrightText: Copyright (C) 2019 hello" + + +def test_make_copyright_line_prefix_spdx_string_year(): + """Given a simple statement, prefix and a year, make it a copyright line.""" + statement = make_copyright_line( + "hello", year=2019, copyright_prefix="spdx-string" + ) + assert statement == "SPDX-FileCopyrightText: Copyright 2019 hello" + + +def test_make_copyright_line_prefix_spdx_string_symbol_year(): + """Given a simple statement, prefix and a year, make it a copyright line.""" + statement = make_copyright_line( + "hello", year=2019, copyright_prefix="spdx-string-symbol" + ) + assert statement == "SPDX-FileCopyrightText: Copyright © 2019 hello" + + +def test_make_copyright_line_prefix_string_symbol_year(): + """Given a simple statement, prefix and a year, make it a copyright line.""" + statement = make_copyright_line( + "hello", year=2019, copyright_prefix="string-symbol" + ) + assert statement == "Copyright © 2019 hello" + + +def test_make_copyright_line_prefix_symbol_year(): + """Given a simple statement, prefix and a year, make it a copyright line.""" + statement = make_copyright_line( + "hello", year=2019, copyright_prefix="symbol" + ) + assert statement == "© 2019 hello" + + +def test_make_copyright_line_existing_spdx_copyright(): + """Given a copyright line, do nothing.""" + value = "SPDX-FileCopyrightText: hello" + assert make_copyright_line(value) == value + + +def test_make_copyright_line_existing_other_copyright(): + """Given a non-SPDX copyright line, do nothing.""" + value = "© hello" 
+ assert make_copyright_line(value) == value + + +def test_make_copyright_line_multine_error(): + """Given a multiline argument, expect an error.""" + with pytest.raises(RuntimeError): + make_copyright_line("hello\nworld") + + +# REUSE-IgnoreEnd diff --git a/tests/test_util.py b/tests/test_extract.py similarity index 62% rename from tests/test_util.py rename to tests/test_extract.py index e26445fe8..46bcad68e 100644 --- a/tests/test_util.py +++ b/tests/test_extract.py @@ -8,7 +8,7 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -"""Tests for reuse._util""" +"""Tests for reuse.extract""" import os from inspect import cleandoc @@ -17,8 +17,13 @@ import pytest from boolean.boolean import ParseError -from reuse import _util -from reuse._util import _LICENSING +from reuse import _LICENSING, ReuseInfo +from reuse.extract import ( + decoded_text_from_binary, + detect_line_endings, + extract_reuse_info, + filter_ignore_block, +) # REUSE-IgnoreStart @@ -27,15 +32,13 @@ def test_extract_expression(): """Parse various expressions.""" expressions = ["GPL-3.0+", "GPL-3.0 AND CC0-1.0", "nonsense"] for expression in expressions: - result = _util.extract_reuse_info( - f"SPDX-License-Identifier: {expression}" - ) + result = extract_reuse_info(f"SPDX-License-Identifier: {expression}") assert result.spdx_expressions == {_LICENSING.parse(expression)} def test_extract_expression_from_ascii_art_frame(): """Parse an expression from an ASCII art frame""" - result = _util.extract_reuse_info( + result = extract_reuse_info( cleandoc( """ /**********************************\\ @@ -51,20 +54,20 @@ def test_extract_erroneous_expression(): """Parse an incorrect expression.""" expression = "SPDX-License-Identifier: GPL-3.0-or-later AND (MIT OR)" with pytest.raises(ParseError): - _util.extract_reuse_info(expression) + extract_reuse_info(expression) def test_extract_no_info(): """Given a string without REUSE information, return an empty ReuseInfo object. 
""" - result = _util.extract_reuse_info("") - assert result == _util.ReuseInfo() + result = extract_reuse_info("") + assert result == ReuseInfo() def test_extract_tab(): """A tag followed by a tab is also valid.""" - result = _util.extract_reuse_info("SPDX-License-Identifier:\tMIT") + result = extract_reuse_info("SPDX-License-Identifier:\tMIT") assert result.spdx_expressions == {_LICENSING.parse("MIT")} @@ -72,14 +75,14 @@ def test_extract_many_whitespace(): """When a tag is followed by a lot of whitespace, the whitespace should be filtered out. """ - result = _util.extract_reuse_info("SPDX-License-Identifier: MIT") + result = extract_reuse_info("SPDX-License-Identifier: MIT") assert result.spdx_expressions == {_LICENSING.parse("MIT")} def test_extract_bibtex_comment(): """A special case for BibTex comments.""" expression = "@Comment{SPDX-License-Identifier: GPL-3.0-or-later}" - result = _util.extract_reuse_info(expression) + result = extract_reuse_info(expression) assert str(list(result.spdx_expressions)[0]) == "GPL-3.0-or-later" @@ -88,23 +91,21 @@ def test_extract_copyright(): information. 
""" copyright_line = "SPDX-FileCopyrightText: 2019 Jane Doe" - result = _util.extract_reuse_info(copyright_line) + result = extract_reuse_info(copyright_line) assert result.copyright_lines == {copyright_line} def test_extract_copyright_duplicate(): """When a copyright line is duplicated, only yield one.""" copyright_line = "SPDX-FileCopyrightText: 2019 Jane Doe" - result = _util.extract_reuse_info( - "\n".join((copyright_line, copyright_line)) - ) + result = extract_reuse_info("\n".join((copyright_line, copyright_line))) assert result.copyright_lines == {copyright_line} def test_extract_copyright_tab(): """A tag followed by a tab is also valid.""" copyright_line = "SPDX-FileCopyrightText:\t2019 Jane Doe" - result = _util.extract_reuse_info(copyright_line) + result = extract_reuse_info(copyright_line) assert result.copyright_lines == {copyright_line} @@ -113,7 +114,7 @@ def test_extract_copyright_many_whitespace(): whitespace is not filtered out. """ copyright_line = "SPDX-FileCopyrightText: 2019 Jane Doe" - result = _util.extract_reuse_info(copyright_line) + result = extract_reuse_info(copyright_line) assert result.copyright_lines == {copyright_line} @@ -133,7 +134,7 @@ def test_extract_copyright_variations(): """ ) - result = _util.extract_reuse_info(text) + result = extract_reuse_info(text) lines = text.splitlines() for line in lines: assert line in result.copyright_lines @@ -155,7 +156,7 @@ def test_extract_with_ignore_block(): SPDX-FileCopyrightText: 2019 Eve """ ) - result = _util.extract_reuse_info(text) + result = extract_reuse_info(text) assert len(result.copyright_lines) == 2 assert len(result.spdx_expressions) == 1 @@ -165,7 +166,7 @@ def test_extract_sameline_multiline(): do not include the comment end pattern as part of the copyright. 
""" text = "" - result = _util.extract_reuse_info(text) + result = extract_reuse_info(text) assert len(result.copyright_lines) == 1 assert result.copyright_lines == {"SPDX-FileCopyrightText: Jane Doe"} @@ -185,7 +186,7 @@ def test_extract_special_endings(): [Copyright 2019 Ajnulo] :: """ ) - result = _util.extract_reuse_info(text) + result = extract_reuse_info(text) for item in result.copyright_lines: assert ">" not in item assert "] ::" not in item @@ -198,7 +199,7 @@ def test_extract_contributors(): # SPDX-FileContributor: Jane Doe """ ) - result = _util.extract_reuse_info(text) + result = extract_reuse_info(text) assert result.contributor_lines == {"Jane Doe"} @@ -217,7 +218,7 @@ def test_filter_ignore_block_with_comment_style(): ) expected = "Relevant text\n# \nOther relevant text" - result = _util.filter_ignore_block(text) + result = filter_ignore_block(text) assert result == expected @@ -242,7 +243,7 @@ def test_filter_ignore_block_non_comment_style(): """ ) - result = _util.filter_ignore_block(text) + result = filter_ignore_block(text) assert result == expected @@ -267,7 +268,7 @@ def test_filter_ignore_block_with_ignored_information_on_same_line(): """ ) - result = _util.filter_ignore_block(text) + result = filter_ignore_block(text) assert result == expected @@ -284,7 +285,7 @@ def test_filter_ignore_block_with_relevant_information_on_same_line(): ) expected = "Relevant textOther relevant text" - result = _util.filter_ignore_block(text) + result = filter_ignore_block(text) assert result == expected @@ -305,7 +306,7 @@ def test_filter_ignore_block_with_beginning_and_end_on_same_line_correct_order() """ ) - result = _util.filter_ignore_block(text) + result = filter_ignore_block(text) assert result == expected @@ -316,7 +317,7 @@ def test_filter_ignore_block_with_beginning_and_end_on_same_line_wrong_order(): text = "Relevant textREUSE-IgnoreEndOther relevant textREUSE-IgnoreStartIgnored text" # pylint: disable=line-too-long expected = "Relevant 
textREUSE-IgnoreEndOther relevant text" - result = _util.filter_ignore_block(text) + result = filter_ignore_block(text) assert result == expected @@ -334,7 +335,7 @@ def test_filter_ignore_block_without_end(): ) expected = "Relevant text\n" - result = _util.filter_ignore_block(text) + result = filter_ignore_block(text) assert result == expected @@ -365,166 +366,49 @@ def test_filter_ignore_block_with_multiple_ignore_blocks(): """ ) - result = _util.filter_ignore_block(text) + result = filter_ignore_block(text) assert result == expected -def test_make_copyright_line_simple(): - """Given a simple statement, make it a copyright line.""" - assert _util.make_copyright_line("hello") == "SPDX-FileCopyrightText: hello" - - -def test_make_copyright_line_year(): - """Given a simple statement and a year, make it a copyright line.""" - assert ( - _util.make_copyright_line("hello", year="2019") - == "SPDX-FileCopyrightText: 2019 hello" - ) - - -def test_make_copyright_line_prefix_spdx(): - """Given a simple statement and prefix, make it a copyright line.""" - statement = _util.make_copyright_line("hello", copyright_prefix="spdx") - assert statement == "SPDX-FileCopyrightText: hello" - - -def test_make_copyright_line_prefix_spdx_year(): - """Given a simple statement, prefix and a year, make it a copyright line.""" - statement = _util.make_copyright_line( - "hello", year=2019, copyright_prefix="spdx" - ) - assert statement == "SPDX-FileCopyrightText: 2019 hello" - - -def test_make_copyright_line_prefix_spdx_c_year(): - """Given a simple statement, prefix and a year, make it a copyright line.""" - statement = _util.make_copyright_line( - "hello", year=2019, copyright_prefix="spdx-c" - ) - assert statement == "SPDX-FileCopyrightText: (C) 2019 hello" - - -def test_make_copyright_line_prefix_spdx_symbol_year(): - """Given a simple statement, prefix and a year, make it a copyright line.""" - statement = _util.make_copyright_line( - "hello", year=2019, copyright_prefix="spdx-symbol" - 
) - assert statement == "SPDX-FileCopyrightText: © 2019 hello" - - -def test_make_copyright_line_prefix_string_year(): - """Given a simple statement, prefix and a year, make it a copyright line.""" - statement = _util.make_copyright_line( - "hello", year=2019, copyright_prefix="string" - ) - assert statement == "Copyright 2019 hello" - - -def test_make_copyright_line_prefix_string_c_year(): - """Given a simple statement, prefix and a year, make it a copyright line.""" - statement = _util.make_copyright_line( - "hello", year=2019, copyright_prefix="string-c" - ) - assert statement == "Copyright (C) 2019 hello" - - -def test_make_copyright_line_prefix_spdx_string_c_year(): - """Given a simple statement, prefix and a year, make it a copyright line.""" - statement = _util.make_copyright_line( - "hello", year=2019, copyright_prefix="spdx-string-c" - ) - assert statement == "SPDX-FileCopyrightText: Copyright (C) 2019 hello" - - -def test_make_copyright_line_prefix_spdx_string_year(): - """Given a simple statement, prefix and a year, make it a copyright line.""" - statement = _util.make_copyright_line( - "hello", year=2019, copyright_prefix="spdx-string" - ) - assert statement == "SPDX-FileCopyrightText: Copyright 2019 hello" - - -def test_make_copyright_line_prefix_spdx_string_symbol_year(): - """Given a simple statement, prefix and a year, make it a copyright line.""" - statement = _util.make_copyright_line( - "hello", year=2019, copyright_prefix="spdx-string-symbol" - ) - assert statement == "SPDX-FileCopyrightText: Copyright © 2019 hello" - - -def test_make_copyright_line_prefix_string_symbol_year(): - """Given a simple statement, prefix and a year, make it a copyright line.""" - statement = _util.make_copyright_line( - "hello", year=2019, copyright_prefix="string-symbol" - ) - assert statement == "Copyright © 2019 hello" - - -def test_make_copyright_line_prefix_symbol_year(): - """Given a simple statement, prefix and a year, make it a copyright line.""" - statement = 
_util.make_copyright_line( - "hello", year=2019, copyright_prefix="symbol" - ) - assert statement == "© 2019 hello" - - -def test_make_copyright_line_existing_spdx_copyright(): - """Given a copyright line, do nothing.""" - value = "SPDX-FileCopyrightText: hello" - assert _util.make_copyright_line(value) == value - - -def test_make_copyright_line_existing_other_copyright(): - """Given a non-SPDX copyright line, do nothing.""" - value = "© hello" - assert _util.make_copyright_line(value) == value - - -def test_make_copyright_line_multine_error(): - """Given a multiline argument, expect an error.""" - with pytest.raises(RuntimeError): - _util.make_copyright_line("hello\nworld") - - def test_decoded_text_from_binary_simple(): """A unicode string encoded as bytes object decodes back correctly.""" text = "Hello, world ☺" encoded = text.encode("utf-8") - assert _util.decoded_text_from_binary(BytesIO(encoded)) == text + assert decoded_text_from_binary(BytesIO(encoded)) == text def test_decoded_text_from_binary_size(): """Only a given amount of bytes is decoded.""" text = "Hello, world ☺" encoded = text.encode("utf-8") - assert _util.decoded_text_from_binary(BytesIO(encoded), size=5) == "Hello" + assert decoded_text_from_binary(BytesIO(encoded), size=5) == "Hello" def test_decoded_text_from_binary_crlf(): """Given CRLF line endings, convert to LF.""" text = "Hello\r\nworld" encoded = text.encode("utf-8") - assert _util.decoded_text_from_binary(BytesIO(encoded)) == "Hello\nworld" + assert decoded_text_from_binary(BytesIO(encoded)) == "Hello\nworld" def test_detect_line_endings_windows(): """Given a CRLF string, detect the line endings.""" - assert _util.detect_line_endings("hello\r\nworld") == "\r\n" + assert detect_line_endings("hello\r\nworld") == "\r\n" def test_detect_line_endings_mac(): """Given a CR string, detect the line endings.""" - assert _util.detect_line_endings("hello\rworld") == "\r" + assert detect_line_endings("hello\rworld") == "\r" def 
test_detect_line_endings_linux(): """Given a LF string, detect the line endings.""" - assert _util.detect_line_endings("hello\nworld") == "\n" + assert detect_line_endings("hello\nworld") == "\n" def test_detect_line_endings_no_newlines(): """Given a file without line endings, default to os.linesep.""" - assert _util.detect_line_endings("hello world") == os.linesep + assert detect_line_endings("hello world") == os.linesep -# REUSE-IgnoreEnd +# REUSE-IgnoreEnd diff --git a/tests/test_global_licensing.py b/tests/test_global_licensing.py index 62a69557a..1b7385381 100644 --- a/tests/test_global_licensing.py +++ b/tests/test_global_licensing.py @@ -13,8 +13,7 @@ from debian.copyright import Copyright from license_expression import LicenseSymbol -from reuse import ReuseInfo, SourceType -from reuse._util import _LICENSING +from reuse import _LICENSING, ReuseInfo, SourceType from reuse.exceptions import ( GlobalLicensingParseError, GlobalLicensingParseTypeError, diff --git a/tests/test_project.py b/tests/test_project.py index 448a2bf79..3606816d3 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -19,8 +19,7 @@ from conftest import RESOURCES_DIRECTORY from license_expression import LicenseSymbol -from reuse import ReuseInfo, SourceType -from reuse._util import _LICENSING +from reuse import _LICENSING, ReuseInfo, SourceType from reuse.covered_files import iter_files from reuse.exceptions import ( GlobalLicensingConflictError,