Merged
15 changes: 14 additions & 1 deletion docs/background.md
@@ -59,10 +59,23 @@ A file is considered to be binary when all the following
conditions match:

1. The file does not start with a BOM for UTF-8, UTF-16 or UTF-32 (which indicates text files).
2. The initial 8192 bytes contain at least one 0 byte.
2. The initial 8192 bytes contain at least one 0-byte.

In this case, pygount assigns it the pseudo language `__binary__` and performs no further analysis.
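The two conditions above amount to a small heuristic. The sketch below is an illustrative helper using only the standard library, not pygount's actual implementation:

```python
# Illustrative sketch of the binary-detection heuristic described above;
# pygount's real implementation lives in pygount/analysis.py.
import codecs

_TEXT_BOMS = (
    codecs.BOM_UTF32_BE,  # check 4-byte BOMs before their 2-byte prefixes
    codecs.BOM_UTF32_LE,
    codecs.BOM_UTF8,
    codecs.BOM_UTF16_BE,
    codecs.BOM_UTF16_LE,
)


def is_probably_binary(initial_bytes: bytes) -> bool:
    """True if the data has no text BOM and a 0-byte within the first 8192 bytes."""
    if any(initial_bytes.startswith(bom) for bom in _TEXT_BOMS):
        return False
    return b"\x00" in initial_bytes[:8192]
```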

## Generated files

Generated files are recognized either by their content (`--generated`) or name (`--generated-names`). Use `--help` to see the current default patterns.

In case you think the standard patterns should be extended, modify `pygount.analysis.DEFAULT_GENERATED_LINE|NAME_PATTERNS_TEXT` and [contribute a pull request](contributing.md).

For source code repositories, committing generated files should generally be avoided. Instead, make the generation part of the build process. However, there are valid reasons to include generated files:

1. Package managers generate "lock" files from the package specification to ensure builds use the exact same versions and hashes. For example, "pyproject.toml" and "uv.lock".
2. Generation takes too long, for example, in Flutter projects with many nested sub-packages.
3. Generated files cannot be bootstrapped from scratch because of interdependencies.
4. Cloud tools require certain generated files to be present in the repository. An example would be [ReadTheDocs.org](https://readthedocs.org), which as of May 2025 in combination with [MkDocs](https://www.mkdocs.org/) needs additional dependencies to be specified in a `requirements.txt`. Many Python projects specify their dependencies in `pyproject.toml`, which can be used to generate the `requirements.txt`. However, the ReadTheDocs build does not allow easily including such a step, so the path of least resistance is to just include the generated `requirements.txt` file in the repository.
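Name-based recognition as done by `--generated-names` boils down to matching glob patterns against the base file name. A minimal standard-library sketch (illustrative helper only; pygount compiles its own regexes internally):

```python
# Illustrative sketch: match base file names against glob patterns such as
# those accepted by --generated-names. Not pygount's actual code.
import fnmatch
from pathlib import Path


def is_generated_name(source_path: str, patterns_text: str = "*.lock, *.g.dart") -> bool:
    """True if the base name of source_path matches any comma separated glob pattern."""
    name = Path(source_path).name
    patterns = [pattern.strip() for pattern in patterns_text.split(",")]
    return any(fnmatch.fnmatch(name, pattern) for pattern in patterns)
```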

## Comparison with other tools

Pygount can analyze more languages than other common tools such as sloccount or cloc because it builds on `pygments`, which provides lexers for hundreds of languages. This also makes it easy to support another language: [Write your own lexer](http://pygments.org/docs/lexerdevelopment/).
4 changes: 3 additions & 1 deletion docs/changes.md
@@ -3,13 +3,15 @@
This chapter describes the changes coming with each new version of
pygount.

## Version 3.1.0, 2025-xx-xx
## Version 3.1.0, 2025-05-27

- Add command line option [`--generated-names`](usage.md#-generated-names) to specify which file names should be considered to be generated. The current patterns recognized are somewhat limited, so contributions are welcome. See the section on "[Generated files](background.md#generated-files)" for hints on how to do that (issue [#190](https://github.com/roskakori/pygount/issues/190)).
- Change documentation from Sphinx to MkDocs in the hope of avoiding regular breakage (issue [#191](https://github.com/roskakori/pygount/issues/191)).

Development:

- Replace `format()` with f-strings (contributed by Ben Allen, issue [#166](https://github.com/roskakori/pygount/issues/166)).
- Change sdist archive to include more than just the Python source code.

## Version 3.0.0, 2025-05-23

2 changes: 1 addition & 1 deletion docs/contributing.md
@@ -69,7 +69,7 @@ version = "3.x.x"
Next, build the project and run the tests to ensure everything works:

```sh
$ rm -rf build # Remove any files from previous builds.
$ rm -rf dist # Remove any files from previous builds.
$ uv build
$ uv run pytest
```
6 changes: 6 additions & 0 deletions docs/usage.md
@@ -120,6 +120,12 @@ If the first actual pattern is `[...]`, default patterns are included. Without i

So for example, to specify that generated code can also contain the German word "generiert" in a case-insensitive way use `--generated="[regex][...](?i).*generiert"`.
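The pattern in that example compiles to an ordinary case-insensitive regular expression. A sketch with plain `re` (illustrative only; pygount's option parsing adds the `[regex][...]` handling on top):

```python
# Sketch of the case-insensitive generated-code pattern from the example
# above, compiled directly with the re module.
import re

generated_regex = re.compile(r"(?i).*generiert")


def looks_generated(line: str) -> bool:
    """True if the line matches the case-insensitive 'generiert' pattern."""
    return generated_regex.match(line) is not None
```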

### `--generated-names`

In addition to the source code itself, the file name can indicate that a file is generated. For example, `--generated-names="*.lock,*.g.dart"`.

The default already recognizes several standard generated names.

## Counting duplicates

### `--duplicates`
40 changes: 34 additions & 6 deletions pygount/analysis.py
@@ -16,6 +16,7 @@
from dataclasses import dataclass
from enum import Enum
from io import SEEK_CUR, BufferedIOBase, IOBase, RawIOBase, TextIOBase
from pathlib import Path
from re import Pattern
from typing import Optional, Union

@@ -27,7 +28,7 @@
import pygount.common
import pygount.lexers
import pygount.xmldialect
from pygount.common import mapped_repr
from pygount.common import mapped_repr, matching_regex
from pygount.git_storage import GitStorage, git_remote_url_and_revision_if_any

HTTP_URL_REGEX = re.compile(r"^(https?://)")
@@ -49,7 +50,11 @@

#: Default glob patterns for folders not to analyze.
DEFAULT_FOLDER_PATTERNS_TO_SKIP_TEXT = ", ".join(
[".?*", "_svn", "__pycache__"] # Subversion hack for Windows # Python byte code
[
".?*",
"_svn", # Subversion hack for Windows
"__pycache__", # Python byte code
]
)

#: Pygments token type; we need to define our own type because pygments' ``_TokenType`` is internal.
@@ -85,8 +90,9 @@ class SourceState(Enum):

#: Default patterns for regular expressions to detect generated code.
#: The '(?i)' indicates that the patterns are case-insensitive.
DEFAULT_GENERATED_PATTERNS_TEXT = pygount.common.REGEX_PATTERN_PREFIX + ", ".join(
DEFAULT_GENERATED_LINE_PATTERNS_TEXT = pygount.common.REGEX_PATTERN_PREFIX + ", ".join(
[
r"(?i).*autogenerated",
r"(?i).*automatically generated",
r"(?i).*do not edit",
r"(?i).*generated with the .+ utility",
@@ -95,6 +101,18 @@ class SourceState(Enum):
]
)

#: Default patterns for file names that are considered to be generated.
DEFAULT_GENERATED_NAME_PATTERNS_TEXT = ", ".join(
[
"*.g.dart", # See, for example, <https://codewithandrea.com/articles/dart-flutter-code-generation/>
"*.lock", # For example, Cargo.lock, poetry.lock, uv.lock.
"npm-shrinkwrap.json", # See <https://docs.npmjs.com/cli/v11/configuring-npm/npm-shrinkwrap-json>.
"go.sum", # See <https://go.dev/ref/mod#go-sum-files>.
"package-lock.json", # See <https://docs.npmjs.com/cli/v11/configuring-npm/package-lock-json>.
"pnpm-lock.yaml", # See <https://pnpm.io/cli/install>.
]
)

#: Default glob patterns for file names not to analyze.
DEFAULT_NAME_PATTERNS_TO_SKIP_TEXT = ", ".join([".*", "*~"])

@@ -319,6 +337,8 @@ def from_file(
file_handle: Optional[IOBase] = None,
merge_embedded_language: bool = False,
tmp_dir: Optional[str] = None,
*,
generated_name_regexes: Optional[list[Pattern]] = None,
) -> "SourceAnalysis":
"""
Factory method to create a :py:class:`SourceAnalysis` by analyzing
@@ -332,13 +352,15 @@ def from_file(
:func:`encoding_for`
:param generated_regexes: list of regular expressions that, if found within the first few lines
of a source code, identify it as generated source code for which SLOC should not be counted
:param generated_name_regexes: list of regular expressions that, if the base file name matches,
mark the file as generated source code for which SLOC should not be counted
:param duplicate_pool: a :class:`DuplicatePool` where information about possible duplicates is
collected, or ``None`` if possible duplicates should be counted multiple times.
:param file_handle: a file-like object, or ``None`` to read and open the file from
``source_path``. If the file is open in text mode, it must be opened with the correct
encoding.
:param merge_embedded_language: If pygments detects a base and embedded language, the source
code counts towards the base language. For example: "JavaScript+Lasso" counts as
code counts towards the base language. For example, "JavaScript+Lasso" counts as
"JavaScript".
:param tmp_dir: If a temporary directory was created, strip it from the path name. This happens
right now only for git repositories.
@@ -348,7 +370,13 @@ def from_file(
result = None
lexer = None
source_code = None
if file_handle is None:
if generated_name_regexes is not None:
generated_name_regex = matching_regex(Path(source_path).name, generated_name_regexes)
if generated_name_regex is not None:
result = SourceAnalysis.from_state(
source_path, group, SourceState.generated, state_info=generated_name_regex.pattern
)
if result is None and file_handle is None:
source_size = os.path.getsize(source_path)
if source_size == 0:
_log.info("%s: is empty", source_path)
@@ -386,7 +414,7 @@ def from_file(
actual_generated_regexes = (
generated_regexes
if generated_regexes is not None
else pygount.common.regexes_from(DEFAULT_GENERATED_PATTERNS_TEXT)
else pygount.common.regexes_from(DEFAULT_GENERATED_LINE_PATTERNS_TEXT)
)
if (result is None) and (len(actual_generated_regexes) != 0):
number_line_and_regex = matching_number_line_and_regex(
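The hunk above makes `from_file` check the file name before any content is read: a matching generated-name regex short-circuits lexer-based analysis entirely. A hypothetical simplification of that control flow (not the actual `SourceAnalysis.from_file` code):

```python
# Simplified sketch of the decision order introduced above: a matching
# generated name means the file is never opened. Hypothetical helper.
import re
from pathlib import Path


def classify(source_path: str, generated_name_regexes: list) -> str:
    """Return how analysis would proceed for source_path."""
    name = Path(source_path).name
    regex = next((r for r in generated_name_regexes if r.match(name)), None)
    if regex is not None:
        # Name alone marks the file as generated; no content analysis needed.
        return f"generated ({regex.pattern})"
    return "analyze content"  # fall through to lexer-based analysis
```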
39 changes: 32 additions & 7 deletions pygount/command.py
Expand Up @@ -49,6 +49,10 @@
_HELP_GENERATED = """comma separated list of regular expressions to detect
generated code; default: %(default)s"""

_HELP_GENERATED_NAMES = """comma separated list of glob patterns for file names
to treat as generated. Use "..." as first entry to append patterns to the default
patterns; default: %(default)s"""

_HELP_MERGE_EMBEDDED_LANGUAGES = """merge counts for embedded languages into
their base language; for example, HTML+Jinja2 counts as HTML"""

@@ -104,7 +108,10 @@ class Command:
def __init__(self):
self.set_encodings(_DEFAULT_ENCODING)
self._folders_to_skip = pygount.common.regexes_from(pygount.analysis.DEFAULT_FOLDER_PATTERNS_TO_SKIP_TEXT)
self._generated_regexs = pygount.common.regexes_from(pygount.analysis.DEFAULT_GENERATED_PATTERNS_TEXT)
self._generated_line_regexs = pygount.common.regexes_from(pygount.analysis.DEFAULT_GENERATED_LINE_PATTERNS_TEXT)
self._generated_name_regexps = pygount.common.regexes_from(
pygount.analysis.DEFAULT_GENERATED_NAME_PATTERNS_TEXT
)
self._has_duplicates = False
self._has_summary = False
self._has_to_merge_embedded_languages = False
@@ -159,11 +166,20 @@ def set_folders_to_skip(self, regexes_or_patterns_text, source=None):

@property
def generated_regexps(self):
return self._generated_regexs
return self._generated_line_regexs

def set_generated_regexps(self, regexes_or_patterns_text, source=None):
self._generated_regexs = pygount.common.regexes_from(
regexes_or_patterns_text, pygount.analysis.DEFAULT_GENERATED_PATTERNS_TEXT, source
self._generated_line_regexs = pygount.common.regexes_from(
regexes_or_patterns_text, pygount.analysis.DEFAULT_GENERATED_LINE_PATTERNS_TEXT, source
)

@property
def generated_name_regexps(self):
return self._generated_name_regexps

def set_generated_name_regexps(self, regexes_or_pattern_text, source=None):
        self._generated_name_regexps = pygount.common.regexes_from(
            regexes_or_pattern_text, pygount.analysis.DEFAULT_GENERATED_NAME_PATTERNS_TEXT, source
        )

@property
@@ -256,9 +272,16 @@ def argument_parser(self):
"--generated",
"-g",
metavar="PATTERNS",
default=pygount.analysis.DEFAULT_GENERATED_PATTERNS_TEXT,
default=pygount.analysis.DEFAULT_GENERATED_LINE_PATTERNS_TEXT,
help=_HELP_GENERATED,
)
parser.add_argument(
"--generated-names",
"-G",
metavar="PATTERNS",
default=pygount.analysis.DEFAULT_GENERATED_NAME_PATTERNS_TEXT,
help=_HELP_GENERATED_NAMES,
)
parser.add_argument(
"--merge-embedded-languages",
"-m",
@@ -330,10 +353,11 @@ def apply_arguments(self, arguments=None):
self.set_fallback_encoding(fallback_encoding, "option --encoding")
self.set_folders_to_skip(args.folders_to_skip, "option --folders-to-skip")
self.set_generated_regexps(args.generated, "option --generated")
self.set_generated_name_regexps(args.generated_names, "option --generated-names")
self.set_has_duplicates(args.duplicates, "option --duplicates")
self.set_has_to_merge_embedded_languages(args.merge_embedded_languages, "option --merge-embedded-languages")
self.set_is_verbose(args.verbose, "option --verbose")
self.set_names_to_skip(args.names_to_skip, "option --folders-to-skip")
self.set_names_to_skip(args.names_to_skip, "option --names-to-skip")
self.set_output(args.out, "option --out")
self.set_output_format(args.format, "option --format")
self.set_source_patterns(args.source_patterns, "option PATTERNS")
@@ -366,7 +390,8 @@ def execute(self):
path_data.group,
self.default_encoding,
self.fallback_encoding,
generated_regexes=self._generated_regexs,
generated_regexes=self._generated_line_regexs,
generated_name_regexes=self._generated_name_regexps,
duplicate_pool=duplicate_pool,
merge_embedded_language=self.has_to_merge_embedded_languages,
tmp_dir=path_data.tmp_dir,
5 changes: 5 additions & 0 deletions pygount/common.py
@@ -8,6 +8,7 @@
import functools
import inspect
import re
import typing
import warnings
from collections.abc import Iterator, Sequence
from re import Pattern
@@ -100,6 +101,10 @@ def regexes_from(
return result


def matching_regex(text: str, regexes: list[typing.Pattern]) -> Optional[typing.Pattern]:
return next((regex for regex in regexes if regex.match(text)), None)


def lines(text: str) -> Iterator[str]:
"""
Generator function to yield lines (delimited with ``'\n'``) stored in
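The new `matching_regex` helper returns the first regex whose `match` succeeds, or `None` if nothing matches. A usage sketch (the body mirrors the implementation shown in the diff):

```python
# Usage sketch for the matching_regex helper added in pygount/common.py above.
import re
from re import Pattern
from typing import Optional


def matching_regex(text: str, regexes: list) -> Optional[Pattern]:
    # Returns the first regex matching text, or None.
    return next((regex for regex in regexes if regex.match(text)), None)


name_regexes = [re.compile(r"go\.sum"), re.compile(r".*\.lock")]
```

Note that `Pattern.match` anchors at the start of the string, so glob-derived patterns must spell out `.*` explicitly.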
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -56,6 +56,7 @@ testpaths = [
dev = [
"coveralls>=4,<5",
"coverage>=7,<8",
"hatchling>=1.27.0",
"mkdocs>=1,<2",
"mkdocs-material>=9,<10",
"pytest>=8,<9",
@@ -71,7 +72,7 @@ default-groups = [
]

[tool.hatch.build.targets.sdist]
include = ["pygount"]
exclude = [".idea", ".github", ".readthedocs.yaml"]

[tool.hatch.build.targets.wheel]
packages = ["pygount"]
11 changes: 10 additions & 1 deletion tests/test_analysis.py
@@ -286,6 +286,15 @@ def test_can_analyze_embedded_language(self):
assert source_analysis.language.lower() == "html+django/jinja"
assert source_analysis.code_count == 3

def test_can_analyze_generated_name(self):
test_uv_lock_path = self.create_temp_file("uv.lock", [])
source_analysis = analysis.SourceAnalysis.from_file(
test_uv_lock_path,
"test",
generated_name_regexes=pygount.common.regexes_from(pygount.analysis.DEFAULT_GENERATED_NAME_PATTERNS_TEXT),
)
assert source_analysis.state == analysis.SourceState.generated

def test_can_merge_embedded_language(self):
test_html_django_path = self.create_temp_file(
"some.html",
@@ -407,7 +416,7 @@ class GeneratedCodeTest(TempFolderTest):
)

def test_can_detect_non_generated_code(self):
default_generated_regexes = common.regexes_from(analysis.DEFAULT_GENERATED_PATTERNS_TEXT)
default_generated_regexes = common.regexes_from(analysis.DEFAULT_GENERATED_LINE_PATTERNS_TEXT)
with open(__file__, encoding="utf-8") as source_file:
matching_line_number_and_regex = analysis.matching_number_line_and_regex(
source_file, default_generated_regexes
21 changes: 21 additions & 0 deletions tests/test_common.py
@@ -6,7 +6,10 @@
# All rights reserved. Distributed under the BSD License.
import re

import pytest

import pygount.common
from pygount.common import matching_regex


def test_can_build_str():
@@ -82,6 +85,24 @@ def test_can_represent_iterable_as_list():
assert pygount.common.as_list(range(3)) == [0, 1, 2]


@pytest.mark.parametrize(
"text,patterns,expected_regex_index",
[
("some", [], -1),
("some", ["some"], 0),
("some", ["other"], -1),
("some", ["other", "some"], 1),
("some", ["s.+"], 0),
("some", [".*T.*"], -1),
],
)
def test_can_compute_matching_regex(text: str, patterns: list[str], expected_regex_index: int):
regexes = [re.compile(pattern) for pattern in patterns]
regex = matching_regex(text, regexes)
regex_index = regexes.index(regex) if regex is not None else -1
assert regex_index == expected_regex_index


def test_can_convert_empty_text_to_lines():
assert list(pygount.common.lines("")) == []
