# pyproject.toml — BioactivityDataAcquisition (SatoryKono/BioactivityDataAcquisition, branch main)
# Build configuration: the distribution is built with the setuptools backend.
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

# Core distribution metadata. Everything in `dependencies` is installed
# unconditionally; optional tooling lives in [project.optional-dependencies].
[project]
name = "bioetl"
version = "6.1.0"
description = "BioETL: Bioactivity data acquisition and processing pipeline"
readme = "README.md"
requires-python = ">=3.12"
# NOTE(review): an MIT license classifier is declared below, but this table has
# no `license` key — confirm whether one should be added.
classifiers = [
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]
dependencies = [
    "httpx>=0.27",
    "pydantic>=2.0",
    "pydantic-settings>=2.0",
    "pyyaml>=6.0",
    "polars>=1.0",
    # NOTE(review): the arro3-core pin below cites deltalake 1.4.x, while this
    # lower bound still admits 0.18 — confirm the intended deltalake range.
    "deltalake>=0.18",
    "arro3-core==0.6.5",  # Pin for Windows compatibility with deltalake 1.4.x (exposes arro3.core.types)
    "pyarrow>=15.0",
    "zstandard>=0.22",
    "pandas>=2.0,<2.3",  # Pin to 2.2.x series for reliable pre-built wheels on Windows (2.3.x requires Meson build)
    # Validation
    "pandera>=0.20",
    # Providers (RULES.md Appendix A)
    "pubchempy>=1.0.4",
    # Secure XML parsing for PubMed payloads (XXE/billion-laughs mitigation)
    "defusedxml>=0.7.1",
    # Metrics
    "prometheus-client>=0.20",
    # CLI (EXC-008: Click chosen over Typer)
    "click>=8.1",
    # Logging
    "structlog>=24.0",
    # High-performance JSON serialization (required by batch_writer, bronze_writer, delta_writer)
    "orjson>=3.11.0,<4.0",
]

# Console entry point: installs a `bioetl` command that calls `main` in the
# `bioetl.interfaces.cli` module.
[project.scripts]
bioetl = "bioetl.interfaces.cli:main"

# Optional extras, selected at install time as `bioetl[<extra>]`.
# The `dev` extra repeats every requirement of `tests`; keep the two lists in
# sync when changing a shared requirement.
[project.optional-dependencies]
# OpenTelemetry tracing stack (API, SDK and OTLP exporter).
tracing = [
    "opentelemetry-api>=1.20",
    "opentelemetry-sdk>=1.20",
    "opentelemetry-exporter-otlp>=1.20",
]
# NOTE(review): orjson is also a core dependency with the same constraint, so
# this extra currently adds nothing new — confirm it is kept for compatibility.
performance = [
    # Explicit performance extra to ensure optional installs include optimized JSON
    "orjson>=3.11.0,<4.0",
]
export = [
    # Excel export support (required for XLSX format)
    "openpyxl>=3.1",
]
# Everything needed to run the test suite.
tests = [
    # Testing frameworks
    "pytest>=9.0.3",
    "pytest-cov>=4.0",
    "pytest-asyncio>=0.24",
    "pytest-xdist>=3.5",
    "pytest-timeout>=2.3",  # Catch stuck tests (prevents CI hangs)
    "syrupy>=4.0",
    # Resource monitoring
    "psutil>=5.9",
    # Tracing stack for OpenTelemetry-backed tests
    "opentelemetry-api>=1.20",
    "opentelemetry-sdk>=1.20",
    "opentelemetry-exporter-otlp>=1.20",
    # HTTP mocking (required by adapter tests)
    "respx>=0.21",
    # Property-based testing (required by domain tests)
    "hypothesis>=6.100",
    # VCR.py for API testing (RULES.md Section 4.2)
    "vcrpy>=6.0",
    "pytest-vcr>=1.0",
    # Code formatting (required by architecture tests) - ruff replaces black+isort
    "ruff>=0.4",
    "detect-secrets>=1.4",
]
# Full developer toolchain: the test stack plus type checkers, linters,
# security scanners and code-quality tools.
dev = [
    "pytest>=9.0.3",
    "pytest-cov>=4.0",
    "pytest-asyncio>=0.24",
    "pytest-xdist>=3.5",
    "pytest-timeout>=2.3",  # Catch stuck tests (prevents CI hangs)
    "pytest-benchmark>=4.0",
    "hypothesis>=6.100",
    # Resource monitoring
    "psutil>=5.9",
    "respx>=0.21",
    # Type checkers and linters (ruff replaces black+isort for formatting)
    "mypy==1.20.2",
    "ruff>=0.4",
    "basedpyright>=1.0",
    "pyrefly>=0.18",
    "ty>=0.0.1",
    # Security tools (RULES.md - SAST)
    "pip-audit>=2.7",
    "bandit[toml]>=1.7",
    # Note: osv-scanner is the primary scanner (Go binary, not Python package)
    # safety removed: requires paid API key for scan command
    # VCR.py for API testing (RULES.md Section 4.2)
    "vcrpy>=6.0",
    "pytest-vcr>=1.0",
    # Import rule checking (presumably layer contracts — verify against the
    # import-linter configuration)
    "import-linter>=2.0",
    # Docstring style checking
    "pydocstyle>=6.3",
    # Tracing stack for OpenTelemetry-backed tests
    "opentelemetry-api>=1.20",
    "opentelemetry-sdk>=1.20",
    "opentelemetry-exporter-otlp>=1.20",
    # Architecture testing
    "pytest-archon>=0.0.6",
    # Snapshot testing for transformers
    "syrupy>=4.0",
    # JSON Schema validation (scripts/validate_pipeline_configs.py)
    "jsonschema>=4.20",
    # Code complexity analysis (RULES.md)
    "xenon>=0.9",
    "radon>=6.0",
    # Code duplication detection
    "pylint>=3.0",
    "detect-secrets>=1.4",
    # Dead code detection
    "vulture>=2.11",
    # Mutation testing
    "mutmut>=2.5",
    # Auto-cleanup
    "autoflake>=2.3.1",
    "pre-commit>=4.5.1",
]
# Documentation build (MkDocs with the Material theme and plugins).
docs = [
    "mkdocs>=1.6,<2.0",
    "mkdocs-material>=9.5",
    # Keep explicit because local strict builds import material.plugins.search.plugin,
    # which requires backrefs at runtime; making it direct avoids environment drift.
    "backrefs>=6.2",
    "mkdocs-mermaid2-plugin>=1.1",
    "pymdown-extensions>=10.8",
    "mkdocstrings[python]>=0.25",
]

# Package discovery: search for importable packages under src/ (src layout).
# NOTE(review): other sections of this file reference src/tools and src/memory;
# with no include/exclude filter those directories may be discovered as
# top-level packages alongside `bioetl` — confirm that is intended.
[tool.setuptools.packages.find]
where = ["src"]

# ============ PYTEST ============
# Pytest defaults for local and CI runs. Several keys below (timeout*,
# asyncio_*) belong to plugins listed in the `tests`/`dev` extras; because
# `--strict-config` is enabled, those plugins are presumably required for the
# configuration to load without errors.
[tool.pytest.ini_options]
testpaths = ["tests"]
# Directories never descended into during collection (virtualenvs, caches,
# worktrees).
norecursedirs = [
    ".venv",
    ".venv-win",
    ".venv-wsl",
    "venv",
    "env",
    "ENV",
    "node_modules",
    ".cache",
    ".pytest_cache",
    ".hypothesis",
    ".python-user",
    ".worktrees",
]
# Both the src layout and the repository root are put on sys.path.
pythonpath = ["src", "."]
python_files = ["test_*.py"]
python_functions = ["test_*"]
# Local default is serial for stability.
# CI commands explicitly enable xdist where needed.
# For benchmarks: pytest -p no:xdist -m benchmark
addopts = [
    "-ra",
    "-q",
    "--strict-markers",
    "--strict-config",
    # Default selection: benchmark and slow tests run only when requested.
    "-m", "not benchmark and not slow",
    "--import-mode=importlib",
]
# required_plugins check moved to CI for compatibility
# Every marker used in the suite must be registered here (--strict-markers).
markers = [
    "unit: Unit tests (fast, no I/O)",
    "integration: Integration tests (may use I/O)",
    "e2e: End-to-end tests (slow, requires Docker)",
    "asyncio: Async IO tests powered by pytest-asyncio",
    "slow: Slow tests (hypothesis, long-running)",
    "memory: Neo4j project-memory and MCP tests run in a dedicated non-coverage lane",
    "hypothesis: Property-based tests using Hypothesis",
    "security: Security tests",
    "vcr: VCR.py recorded HTTP interactions",
    "performance: Performance regression tests (optional CI stage)",
    "architecture: Architecture tests (layer boundaries, contracts)",
    "contracts: Contract tests (API contracts, schema stability)",
    "benchmark: Benchmark tests (excluded from standard runs, run with -m benchmark)",
    "smoke: Smoke tests (quick sanity checks for local development)",
    "serial: Tests that must run serially (not parallel-safe, e.g., shared state)",
    "no_api: Tests that do not require external API access",
    "uniprot: UniProt provider-specific tests",
    "crossref: CrossRef provider-specific tests",
    "openalex: OpenAlex provider-specific tests",
    "semanticscholar: Semantic Scholar provider-specific tests",
]
# Default timeout: 60 seconds per test (prevents CI hangs from stuck tests)
# Override per-test with @pytest.mark.timeout(N) or disable with @pytest.mark.timeout(0)
timeout = 60
# Timeout method: thread works on all platforms (signal only on Unix)
timeout_method = "thread"
asyncio_mode = "auto"
asyncio_default_fixture_loop_scope = "module"
# NOTE(review): absolute POSIX-style path; the Windows venv entries above
# suggest the suite also runs on Windows, where this presumably resolves
# against the current drive root — confirm that is acceptable.
cache_dir = "/tmp/bioetl-pytest-cache"
# Warnings are errors by default; the entries after "error" whitelist the
# known-benign ones.
filterwarnings = [
    "error",
    "ignore::DeprecationWarning",
    # NoOpMetrics warns when metrics are disabled (expected in tests)
    "ignore:NoOpMetrics is being used:UserWarning",
    # Explicit norecursedirs is intentional to avoid mixed Windows/WSL venv traversal.
    "ignore:Skipping collection of '.hypothesis' directory:UserWarning",
    # Pytest cleanup warnings (sockets/event loops not closed during teardown)
    "ignore::pytest.PytestUnraisableExceptionWarning",
    # pytest-benchmark warning when xdist is active (expected in parallel mode)
    "ignore:Benchmarks are automatically disabled",
]

# ============ COVERAGE ============
# Coverage measurement: branch coverage of the bioetl package only.
[tool.coverage.run]
source = ["src/bioetl"]
branch = true
# Record file paths relative to the working directory rather than absolute.
relative_files = true
omit = [
    "*/tests/*",
    "*/__pycache__/*",
    # Entry point modules - minimal code, tested via CLI tests
    "*/__main__.py",
]

[tool.coverage.report]
# Report formatting: list the uncovered lines and show percentages with two
# decimal places.
show_missing = true
precision = 2
# fail_under is intentionally absent from this table:
# 1. CI test-matrix runs parallel test groups that each cover only part of the codebase
# 2. The CI coverage-verify step passes --cov-fail-under=85 explicitly
# 3. A threshold here would make those partial test groups fail
# Regexes for lines left out of the report. Literal (single-quoted) strings
# keep the patterns free of doubled backslashes.
exclude_lines = [
    'pragma: no cover',
    'if TYPE_CHECKING:',
    'raise NotImplementedError',
    '@abstractmethod',
    '@overload',
    '^\s*pass\s*$',
    '^\s*\.\.\.\s*$',
]

# ============ MYPY ============
# Mypy strict mode validates async/await usage:
# - Ensures async def methods are awaited (error: "coroutine never awaited")
# - Validates AsyncIterator return types require "async for"
# - Checks Awaitable[T] type annotations are properly awaited
# Architecture tests in tests/test_architecture.py provide additional validation.
[tool.mypy]
python_version = "3.12"
# Strict mode is the baseline. Some flags below restate it explicitly, others
# deliberately relax it (see the inline reasons).
strict = true
incremental = false  # Work around unstable cache/build-graph crashes in compiled mypy on large graphs.
sqlite_cache = false  # Avoid sqlite metastore corruption on synced/multi-host worktrees.
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true
disallow_incomplete_defs = true
check_untyped_defs = true
no_implicit_optional = true
warn_redundant_casts = true
warn_unused_ignores = false  # Different envs have different type stubs; avoids CI/local mismatch
show_error_codes = true
# Additional strict checks
disallow_any_generics = false  # Allow generic types for external libraries
disallow_subclassing_any = false  # Allow subclassing pydantic BaseModel and pandera DataFrameModel
warn_unreachable = false  # Disable for unreachable code warnings in type narrowing
strict_equality = true
extra_checks = true
# Exclude tools directory (auxiliary scripts without strict type annotations)
exclude = ["src/tools/"]


# Third-party packages whose imports must not fail the type check when stubs
# or inline type information are unavailable.
# NOTE(review): boto3, botocore and s3transfer do not appear in any dependency
# list in this file — presumably transitive or optional; verify.
[[tool.mypy.overrides]]
module = [
    "deltalake.*",
    "polars.*",
    "pandera.*",
    "pubchempy.*",
    "prometheus_client.*",
    "vcr.*",
    "structlog.*",
    "pyarrow",
    "pyarrow.*",
    "httpx",
    "httpx.*",
    "pydantic",
    "pydantic.*",
    "pydantic_settings",
    "pydantic_settings.*",
    "zstandard",
    "zstandard.*",
    "yaml",
    "yaml.*",
    "pandas",
    "pandas.*",
    "opentelemetry.*",
    "orjson",
    "orjson.*",
    "psutil",
    "psutil.*",
    "click",
    "click.*",
    "boto3",
    "boto3.*",
    "botocore",
    "botocore.*",
    "s3transfer",
    "s3transfer.*",
]
ignore_missing_imports = true

# Pandera's @register_check_method decorator lacks type stubs, so untyped
# decorators are allowed in the one module that uses it.
[[tool.mypy.overrides]]
module = [
    "bioetl.domain.schemas.validators",
]
disallow_untyped_decorators = false

# Modules with type: ignore comments that may be unused in some environments
# but needed in others (different type stub versions). Per-module
# warn_unused_ignores=false overrides --strict to prevent [unused-ignore].
# NOTE(review): [tool.mypy] already sets warn_unused_ignores = false globally,
# so this override only matters if the global setting is re-enabled or
# `--strict` is passed on the command line — confirm it is still needed.
[[tool.mypy.overrides]]
module = [
    "bioetl.domain.serialization",
    "bioetl.domain.schemas.validators",
]
warn_unused_ignores = false

# Note: All layers pass strict mypy checks.
# RF-007 (2026-03-03): Reduced overrides from 69 to 12 bioetl modules.
# RF-002 (2026-03-10): Reduced from 11 to 2 bioetl modules (removed 9 with zero type:ignore).

# ============ RUFF ============
# Ruff base settings shared by the linter and the formatter.
[tool.ruff]
# Formatter width; E501 uses a separate, wider limit set in
# [tool.ruff.lint.pycodestyle].
line-length = 88
target-version = "py312"
src = ["src", "tests"]

# Enabled rule families and the project-wide ignores. Path-specific
# exceptions live in [tool.ruff.lint.per-file-ignores].
[tool.ruff.lint]
select = [
    "E", # pycodestyle errors
    "W", # pycodestyle warnings
    "F", # pyflakes
    "FA", # flake8-future-annotations (enforce `from __future__ import annotations`)
    "I", # isort
    "B", # flake8-bugbear
    "C4", # flake8-comprehensions
    "UP", # pyupgrade
    "ARG", # flake8-unused-arguments
    "SIM", # flake8-simplify
    # TCH disabled: types are used at runtime for isinstance checks with Protocols
    "PTH", # flake8-use-pathlib
    "RUF", # ruff-specific
    "T201", # flake8-print (no print statements in production code)
]
ignore = [
    "RUF001",  # Ambiguous unicode in strings - project uses Russian help text
    "RUF002",  # Ambiguous unicode - project uses Russian docstrings
    "RUF003",  # Ambiguous unicode in comments - project uses math symbols (×, ÷)
]

[tool.ruff.lint.pycodestyle]
# E501 is reported only beyond 120 columns, while `line-length = 88` in
# [tool.ruff] keeps the formatter width conservative; 120 is a migration
# target, not the desired final width.
max-line-length = 120

[tool.ruff.lint.isort]
# Imports from these packages are sorted into the first-party section.
known-first-party = ["bioetl", "tests"]

# Path-specific rule exceptions, keyed by glob pattern.
[tool.ruff.lint.per-file-ignores]
# Test code: unused arguments are tolerated and import order is not checked.
"tests/**/*.py" = [
    "ARG001",
    "ARG002",
    "ARG005",
    "PTH",
    "SIM",
    "B017",
    "I001",
    "C901",    # architecture tests perform complex static analysis checks
    "E402",    # imports may follow future imports and docstrings
    "RUF012",  # mutable class attributes hold parametrize test data on purpose
    "RUF059",  # unused unpacked variables are common in test fixtures
    "T201",
]

# `__all__` and imports are grouped semantically (with comments), not alphabetically.
"src/bioetl/domain/__init__.py" = ["RUF022", "I001"]
"src/bioetl/domain/schemas/crossref/__init__.py" = ["I001"]
"src/bioetl/composition/entrypoints.py" = ["RUF022"]

# Tool scripts are not production code: print() output and simpler patterns are fine.
"src/tools/**/*.py" = [
    "PTH",
    "SIM",
    "T201",
    "F401",
    "F541",
    "UP015",
    "RUF005",
    "B007",
    "F841",
]
# CLI utility scripts: print() is expected and pathlib is not required.
"scripts/**/*.py" = [
    "PTH",
    "SIM",
    "T201",
    "F401",
    "F541",
    "UP015",
    "RUF005",
    "B007",
    "F841",
]

# Adapters implement Protocol interfaces and may leave some arguments unused.
"src/bioetl/infrastructure/adapters/**/*.py" = [
    "ARG001",
    "ARG002",
    "SIM102",
    "SIM113",
]
# Infrastructure keeps unused arguments for interface consistency; plain open() is fine.
"src/bioetl/infrastructure/**/*.py" = ["ARG001", "ARG002", "PTH123"]
# config.py needs SecretStr at runtime for its Pydantic models.
"src/bioetl/infrastructure/config.py" = [
    "ARG001",
    "ARG002",
    "PTH123",
    "TC002",
]

# Application core may carry context arguments reserved for future use.
"src/bioetl/application/**/*.py" = ["ARG001", "ARG002", "SIM105"]
# Composition factories: unused (lambda) arguments kept for interface
# consistency; imports may follow future imports and docstrings (E402).
"src/bioetl/composition/**/*.py" = [
    "ARG001",
    "ARG002",
    "ARG004",
    "ARG005",
    "E402",
]
# Interfaces layer.
"src/bioetl/interfaces/**/*.py" = ["ARG001", "ARG002"]
# Memory graph module is a standalone tool: print() is expected and line lengths are relaxed.
"src/memory/**/*.py" = ["T201", "E501", "F821", "B008"]

# ============ MUTMUT (Mutation Testing) ============
[tool.mutmut]
# Only the domain and application layers are mutated; the infrastructure layer
# is skipped to keep mutation testing focused on business logic.
paths_to_mutate = "src/bioetl/domain/,src/bioetl/application/"
tests_dir = "tests/"
# Stop at the first failing test (-x) and keep tracebacks short.
runner = "python -m pytest -x --tb=short"
# NOTE(review): these comma-separated string values look like the mutmut 2.x
# option format, while the `dev` extra allows any mutmut>=2.5 — confirm that
# newer major versions still accept this configuration.
dict_synonyms = "Struct, NamedStruct"

# ============ XENON (Complexity Thresholds) ============
# Enforced in CI: xenon --max-absolute B --max-modules B --max-average A
# Factories: xenon --max-absolute B --max-modules A --max-average A
# B = max CC of 10 per function, A = average CC of 5

# ============ BANDIT (Security) ============
# Bandit static security analysis: directories left unscanned and checks
# disabled for the whole project.
[tool.bandit]
exclude_dirs = ["tests", "docs", ".venv", "build", "dist"]
skips = [
    "B101", # assert_used - acceptable in production for invariants
    "B104", # hardcoded_bind_all_interfaces - not applicable for ETL jobs
    "B311", # random - not used for security purposes in data processing
]
# Severity thresholds
# HIGH severity issues block CI merge
# MEDIUM severity issues are warnings only

[tool.bandit.assert_used]
# Allow assert in domain layer for invariants
# NOTE(review): B101 (assert_used) is already skipped project-wide in
# [tool.bandit] above, so this path-specific skip presumably only matters if
# the global skip is removed — confirm which of the two is intended.
skips = ["*/domain/*"]

# Dependency group `dev`: points at this project's own `dev` extra, presumably
# so that tools which install dependency groups get the same toolchain as
# `bioetl[dev]` without duplicating the list.
[dependency-groups]
dev = [
    "bioetl[dev]",
]