From 9917e2aa8710cfc1c2cbb0ea297ea9fa265ade94 Mon Sep 17 00:00:00 2001 From: Gyuheon Oh Date: Wed, 18 Feb 2026 23:19:35 +0000 Subject: [PATCH 1/9] Hook based 3.12+ --- .gitignore | 2 + .../dd_wrapper/include/ddup_interface.hpp | 4 + .../dd_wrapper/include/libdatadog_helpers.hpp | 1 + .../profiling/dd_wrapper/include/sample.hpp | 10 + .../dd_wrapper/src/ddup_interface.cpp | 13 + .../profiling/dd_wrapper/src/sample.cpp | 76 +++ .../internal/datadog/profiling/ddup/_ddup.pyi | 2 + .../internal/datadog/profiling/ddup/_ddup.pyx | 38 ++ ddtrace/internal/settings/profiling.py | 34 ++ ddtrace/internal/settings/profiling.pyi | 4 + ddtrace/profiling/collector/_exception.pyi | 19 + ddtrace/profiling/collector/_exception.pyx | 104 ++++ ddtrace/profiling/collector/_fast_poisson.pyi | 3 + ddtrace/profiling/collector/_fast_poisson.pyx | 85 +++ ddtrace/profiling/collector/exception.py | 4 + ddtrace/profiling/profiler.py | 15 + setup.py | 10 + .../crashtracker/test_crashtracker.py | 2 +- tests/profiling/collector/pprof_utils.py | 9 +- tests/profiling/collector/test_exception.py | 547 ++++++++++++++++++ tests/profiling/collector/test_stack.py | 88 --- tests/telemetry/test_writer.py | 3 + 22 files changed, 983 insertions(+), 90 deletions(-) create mode 100644 ddtrace/profiling/collector/_exception.pyi create mode 100644 ddtrace/profiling/collector/_exception.pyx create mode 100644 ddtrace/profiling/collector/_fast_poisson.pyi create mode 100644 ddtrace/profiling/collector/_fast_poisson.pyx create mode 100644 ddtrace/profiling/collector/exception.py create mode 100644 tests/profiling/collector/test_exception.py diff --git a/.gitignore b/.gitignore index 3c49b31c04a..05d5bc432aa 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,8 @@ ddtrace/appsec/_ddwaf.cpp ddtrace/appsec/include ddtrace/appsec/share ddtrace/profiling/collector/_task.c +ddtrace/profiling/collector/_fast_poisson.c +ddtrace/profiling/collector/_exception.c ddtrace/profiling/_threading.c 
ddtrace/profiling/collector/_traceback.c ddtrace/profiling/collector/stack.c diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/include/ddup_interface.hpp b/ddtrace/internal/datadog/profiling/dd_wrapper/include/ddup_interface.hpp index 0e90ca0ac98..645e2124cce 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/include/ddup_interface.hpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/include/ddup_interface.hpp @@ -17,6 +17,8 @@ class Sample; struct _frame; typedef struct _frame PyFrameObject; // NOLINTEND(bugprone-reserved-identifier) +struct _traceback; +typedef struct _traceback PyTracebackObject; #ifdef __cplusplus extern "C" @@ -80,6 +82,7 @@ extern "C" void ddup_push_local_root_span_id(Datadog::Sample* sample, uint64_t local_root_span_id); void ddup_push_trace_type(Datadog::Sample* sample, std::string_view trace_type); void ddup_push_exceptioninfo(Datadog::Sample* sample, std::string_view exception_type, int64_t count); + void ddup_push_exception_message(Datadog::Sample* sample, std::string_view exception_message); void ddup_push_class_name(Datadog::Sample* sample, std::string_view class_name); void ddup_push_gpu_device_name(Datadog::Sample*, std::string_view device_name); void ddup_push_frame(Datadog::Sample* sample, @@ -88,6 +91,7 @@ extern "C" uint64_t address, int64_t line); void ddup_push_pyframes(Datadog::Sample* sample, PyFrameObject* frame); + void ddup_push_pytraceback(Datadog::Sample* sample, PyTracebackObject* tb); void ddup_push_absolute_ns(Datadog::Sample* sample, int64_t timestamp_ns); void ddup_push_monotonic_ns(Datadog::Sample* sample, int64_t monotonic_ns); diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/include/libdatadog_helpers.hpp b/ddtrace/internal/datadog/profiling/dd_wrapper/include/libdatadog_helpers.hpp index f1057930953..65af41f1299 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/include/libdatadog_helpers.hpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/include/libdatadog_helpers.hpp 
@@ -48,6 +48,7 @@ intern_string(std::string_view s); // to have spaces in the names. #define EXPORTER_LABELS(X) \ X(exception_type, "exception type") \ + X(exception_message, "exception message") \ X(thread_id, "thread id") \ X(thread_native_id, "thread native id") \ X(thread_name, "thread name") \ diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/include/sample.hpp b/ddtrace/internal/datadog/profiling/dd_wrapper/include/sample.hpp index 27967c9811a..dc324df31a5 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/include/sample.hpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/include/sample.hpp @@ -18,6 +18,8 @@ struct _frame; typedef struct _frame PyFrameObject; // NOLINTEND(bugprone-reserved-identifier) +struct _traceback; +typedef struct _traceback PyTracebackObject; namespace Datadog { @@ -132,6 +134,7 @@ class Sample bool push_local_root_span_id(uint64_t local_root_span_id); bool push_trace_type(std::string_view trace_type); bool push_exceptioninfo(std::string_view exception_type, int64_t count); + bool push_exception_message(std::string_view exception_message); bool push_class_name(std::string_view class_name); bool push_monotonic_ns(int64_t monotonic_ns); bool push_absolute_ns(int64_t timestamp_ns); @@ -163,6 +166,13 @@ class Sample // released by this function. void push_pyframes(PyFrameObject* frame); + // Push frames from a Python traceback chain to the sample. + // Walks tb → tb_next (root→leaf) and pushes frames in leaf-to-root order, + // using tb_lineno for accurate exception site line numbers. + // Ownership: does not take ownership of `tb`; all code object references + // obtained via PyFrame_GetCode() are released internally. 
+ void push_pytraceback(PyTracebackObject* tb); + // Flushes the current buffer, clearing it bool flush_sample(); diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/ddup_interface.cpp b/ddtrace/internal/datadog/profiling/dd_wrapper/src/ddup_interface.cpp index 79a75023d46..5314f2a50e2 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/ddup_interface.cpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/ddup_interface.cpp @@ -254,6 +254,13 @@ ddup_push_exceptioninfo(Datadog::Sample* sample, // cppcheck-suppress unusedFunc sample->push_exceptioninfo(exception_type, count); } +void +ddup_push_exception_message(Datadog::Sample* sample, + std::string_view exception_message) // cppcheck-suppress unusedFunction +{ + sample->push_exception_message(exception_message); +} + void ddup_push_class_name(Datadog::Sample* sample, std::string_view class_name) // cppcheck-suppress unusedFunction { @@ -282,6 +289,12 @@ ddup_push_pyframes(Datadog::Sample* sample, PyFrameObject* frame) // cppcheck-su sample->push_pyframes(frame); } +void +ddup_push_pytraceback(Datadog::Sample* sample, PyTracebackObject* tb) // cppcheck-suppress unusedFunction +{ + sample->push_pytraceback(tb); +} + void ddup_push_absolute_ns(Datadog::Sample* sample, int64_t timestamp_ns) // cppcheck-suppress unusedFunction { diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp b/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp index 657f893b060..86bd3059256 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp @@ -257,6 +257,75 @@ Datadog::Sample::incr_dropped_frames(size_t count) dropped_frames += count; } +void +Datadog::Sample::push_pytraceback(PyTracebackObject* tb) +{ + /* Walk the Python traceback chain and push each frame to the sample. + * The chain goes from outermost (root) to innermost (leaf) via tb_next. 
+ * We collect frames first, then push in reverse (leaf-to-root) order to + * match the convention used by push_pyframes and the rest of the profiler. + * + * tb_lineno is used instead of PyFrame_GetLineNumber() because it records + * the exact line where the exception was raised/re-raised at each level, + * giving more accurate exception-site attribution than the frame's current + * execution line. + * + * Ownership: tb_frame is a borrowed reference owned by the traceback. + * PyFrame_GetCode() returns a new reference that we DECREF internally. */ + + PythonErrorRestorer error_restorer; + + struct TracebackFrameInfo + { + PyCodeObject* code; // new reference from PyFrame_GetCode; must be DECREF'd + int lineno; + }; + + // Collect frame info root→leaf by following tb_next. + std::vector frames; + for (; tb != nullptr; tb = reinterpret_cast(tb->tb_next)) { + int lineno = tb->tb_lineno < 0 ? 0 : tb->tb_lineno; + PyCodeObject* code = (tb->tb_frame != nullptr) ? PyFrame_GetCode(tb->tb_frame) : nullptr; + frames.push_back({ code, lineno }); + } + + // Push in leaf-to-root order (reverse of collected). + for (int i = static_cast(frames.size()) - 1; i >= 0; --i) { + // Early exit: once we've hit the frame limit, count all remaining + // frames as dropped and release their code refs without further + // expensive string extraction. + if (locations.size() > max_nframes) { + for (int j = i; j >= 0; --j) { + ++dropped_frames; + Py_XDECREF(frames[j].code); + } + break; + } + + PyCodeObject* code = frames[i].code; + int lineno = frames[i].lineno; + + std::string_view name_sv = ""; + std::string_view filename_sv = ""; + + if (code != nullptr) { + // Use co_qualname for Python 3.11+ for better context (e.g. Class.method) +#if defined(_PY311_AND_LATER) + PyObject* name_obj = code->co_qualname ? 
code->co_qualname : code->co_name; +#else + PyObject* name_obj = code->co_name; +#endif + name_sv = unicode_to_string_view(name_obj); + filename_sv = unicode_to_string_view(code->co_filename); + } + + // push_frame_impl copies strings immediately into the StringArena. + push_frame_impl(name_sv, filename_sv, 0, lineno); + Py_XDECREF(code); + } + // Error state is automatically restored by error_restorer destructor. +} + void Datadog::Sample::push_frame(function_id function_id, uint64_t address, int64_t line) { @@ -570,6 +639,13 @@ Datadog::Sample::push_gpu_flops(int64_t size, int64_t count) return false; } +bool +Datadog::Sample::push_exception_message(std::string_view exception_message) +{ + push_label(ExportLabelKey::exception_message, exception_message); + return true; +} + bool Datadog::Sample::push_lock_name(std::string_view lock_name) { diff --git a/ddtrace/internal/datadog/profiling/ddup/_ddup.pyi b/ddtrace/internal/datadog/profiling/ddup/_ddup.pyi index 6a84e23cb08..1349a762ba0 100644 --- a/ddtrace/internal/datadog/profiling/ddup/_ddup.pyi +++ b/ddtrace/internal/datadog/profiling/ddup/_ddup.pyi @@ -1,4 +1,5 @@ from types import FrameType +from types import TracebackType from typing import Mapping from typing import Optional from typing import Union @@ -46,6 +47,7 @@ class SampleHandle: def push_exceptioninfo(self, exc_type: Union[None, bytes, str, type], count: int) -> None: ... def push_frame(self, name: StringType, filename: StringType, address: int, line: int) -> None: ... def push_pyframes(self, frame: FrameType) -> None: ... + def push_pytraceback(self, tb: TracebackType) -> None: ... def push_gpu_device_name(self, device_name: StringType) -> None: ... def push_gpu_flops(self, value: int, count: int) -> None: ... def push_gpu_gputime(self, value: int, count: int) -> None: ... 
diff --git a/ddtrace/internal/datadog/profiling/ddup/_ddup.pyx b/ddtrace/internal/datadog/profiling/ddup/_ddup.pyx index 4b5c73f8c6f..1f182923cc6 100644 --- a/ddtrace/internal/datadog/profiling/ddup/_ddup.pyx +++ b/ddtrace/internal/datadog/profiling/ddup/_ddup.pyx @@ -42,6 +42,8 @@ cdef extern from "sample.hpp" namespace "Datadog": cdef extern from "ddup_interface.hpp": ctypedef struct PyFrameObject: pass + ctypedef struct PyTracebackObject: + pass void ddup_config_env(string_view env) void ddup_config_service(string_view service) @@ -85,10 +87,12 @@ cdef extern from "ddup_interface.hpp": void ddup_push_local_root_span_id(Sample *sample, uint64_t local_root_span_id) void ddup_push_trace_type(Sample *sample, string_view trace_type) void ddup_push_exceptioninfo(Sample *sample, string_view exception_type, int64_t count) + void ddup_push_exception_message(Sample *sample, string_view exception_message) void ddup_push_class_name(Sample *sample, string_view class_name) void ddup_push_gpu_device_name(Sample *sample, string_view device_name) void ddup_push_frame(Sample *sample, string_view _name, string_view _filename, uint64_t address, int64_t line) void ddup_push_pyframes(Sample *sample, PyFrameObject* frame) + void ddup_push_pytraceback(Sample *sample, PyTracebackObject* tb) void ddup_push_monotonic_ns(Sample *sample, int64_t monotonic_ns) void ddup_push_absolute_ns(Sample *sample, int64_t monotonic_ns) void ddup_flush_sample(Sample *sample) @@ -272,6 +276,18 @@ cdef call_ddup_push_exceptioninfo(Sample* sample, exception_name: StringType, ui if utf8_data != NULL: ddup_push_exceptioninfo(sample, string_view(utf8_data, utf8_size), count) +cdef call_ddup_push_exception_message(Sample* sample, exception_message: StringType): + if not exception_message: + return + if isinstance(exception_message, bytes): + ddup_push_exception_message(sample, string_view(exception_message, len(exception_message))) + return + cdef const char* utf8_data + cdef Py_ssize_t utf8_size + utf8_data = 
PyUnicode_AsUTF8AndSize(exception_message, &utf8_size) + if utf8_data != NULL: + ddup_push_exception_message(sample, string_view(utf8_data, utf8_size)) + cdef call_ddup_push_class_name(Sample* sample, class_name: StringType): if not class_name: return @@ -492,6 +508,21 @@ cdef class SampleHandle: frame_ptr = frame_obj ddup_push_pyframes(self.ptr, frame_ptr) + def push_pytraceback(self, object tb) -> None: + cdef PyObject* tb_obj + cdef PyTracebackObject* tb_ptr + + if self.ptr is not NULL and tb is not None: + # Validate that tb is actually a traceback object to avoid crashes + # from invalid casts (e.g., if tb contains a non-traceback object) + if not isinstance(tb, types.TracebackType): + return + # In Cython, 'tb' is already a PyObject*. Get the raw pointer. + tb_obj = tb + # Cast to PyTracebackObject* - both are just pointers to the same memory + tb_ptr = tb_obj + ddup_push_pytraceback(self.ptr, tb_ptr) + def push_threadinfo(self, thread_id: int, thread_native_id: int, thread_name: StringType) -> None: if self.ptr is not NULL: thread_id = thread_id if thread_id is not None else 0 @@ -522,6 +553,13 @@ cdef class SampleHandle: exc_name = exc_type call_ddup_push_exceptioninfo(self.ptr, exc_name, clamp_to_uint64_unsigned(count)) + def push_exception_message(self, exception_message: StringType) -> None: + if self.ptr is NULL: + return + if exception_message is None: + return + call_ddup_push_exception_message(self.ptr, exception_message) + def push_class_name(self, class_name: StringType) -> None: if self.ptr is not NULL: call_ddup_push_class_name(self.ptr, class_name) diff --git a/ddtrace/internal/settings/profiling.py b/ddtrace/internal/settings/profiling.py index f07ef57c952..27eb4c179fe 100644 --- a/ddtrace/internal/settings/profiling.py +++ b/ddtrace/internal/settings/profiling.py @@ -401,12 +401,44 @@ class ProfilingConfigPytorch(DDConfig): ) +class ProfilingConfigException(DDConfig): + __item__ = __prefix__ = "exception" + + enabled = DDConfig.v( + bool, + 
"enabled", + default=True, + help_type="Boolean", + help="Whether to enable the exception profiler", + ) + + sampling_interval = DDConfig.v( + int, + "sampling_interval", + default=100, + help_type="Integer", + help=( + "Average number of exceptions between samples (uses Poisson distribution). " + "Lower values sample more frequently but add more overhead." + ), + ) + + collect_message = DDConfig.v( + bool, + "collect_message", + default=True, + help_type="Boolean", + help="Whether to collect exception messages, which can contain sensitive data.", + ) + + # Include all the sub-configs ProfilingConfig.include(ProfilingConfigStack, namespace="stack") ProfilingConfig.include(ProfilingConfigLock, namespace="lock") ProfilingConfig.include(ProfilingConfigMemory, namespace="memory") ProfilingConfig.include(ProfilingConfigHeap, namespace="heap") ProfilingConfig.include(ProfilingConfigPytorch, namespace="pytorch") +ProfilingConfig.include(ProfilingConfigException, namespace="exception") config = ProfilingConfig() report_configuration(config) @@ -454,6 +486,8 @@ def config_str(config) -> str: configured_features.append("heap") if config.pytorch.enabled: configured_features.append("pytorch") + if config.exception.enabled: + configured_features.append("exception") configured_features.append("exp_dd") configured_features.append("CAP" + str(config.capture_pct)) configured_features.append("MAXF" + str(config.max_frames)) diff --git a/ddtrace/internal/settings/profiling.pyi b/ddtrace/internal/settings/profiling.pyi index 343052310e1..369286aaf1c 100644 --- a/ddtrace/internal/settings/profiling.pyi +++ b/ddtrace/internal/settings/profiling.pyi @@ -23,6 +23,7 @@ class ProfilingConfig(DDConfig): memory: ProfilingConfigMemory heap: ProfilingConfigHeap pytorch: ProfilingConfigPytorch + exception: ProfilingConfigException class ProfilingConfigStack(DDConfig): enabled: bool @@ -46,6 +47,9 @@ class ProfilingConfigPytorch(DDConfig): enabled: bool events_limit: int +class 
ProfilingConfigException(DDConfig): + enabled: bool + config: ProfilingConfig ddup_failure_msg: Optional[str] ddup_is_available: bool diff --git a/ddtrace/profiling/collector/_exception.pyi b/ddtrace/profiling/collector/_exception.pyi new file mode 100644 index 00000000000..cdd2dabf297 --- /dev/null +++ b/ddtrace/profiling/collector/_exception.pyi @@ -0,0 +1,19 @@ +from typing import Any +from typing import Optional + +from ddtrace.profiling import collector + +class ExceptionCollector(collector.Collector): + collect_message: bool + + def __init__( + self, + sampling_interval: Optional[int] = None, + collect_message: Optional[bool] = None, + ) -> None: ... + def _start_service(self) -> None: ... + def _stop_service(self) -> None: ... + +def _on_exception_handled(code: Any, instruction_offset: int, exception: BaseException) -> None: ... + +# TODO: Define bytecode injection handler for < versions 3.12 diff --git a/ddtrace/profiling/collector/_exception.pyx b/ddtrace/profiling/collector/_exception.pyx new file mode 100644 index 00000000000..77b67ab9510 --- /dev/null +++ b/ddtrace/profiling/collector/_exception.pyx @@ -0,0 +1,104 @@ +import logging +import sys +import threading + +from ddtrace.internal.datadog.profiling import ddup +from ddtrace.internal.settings.profiling import config +from ddtrace.profiling import collector +from ddtrace.profiling.collector import _fast_poisson + + +LOG = logging.getLogger(__name__) +HAS_MONITORING = hasattr(sys, "monitoring") +_current_thread = threading.current_thread + + +# These are global variables. 
We are okay with this because this is only ever accessed +# with the GIL held +cdef int _sampling_interval = 100 +cdef bool _collect_message = False +cdef int _sample_counter = 0 +cdef int _next_sample = 100 + + +cdef void _collect_exception(object exc_type, object exc_value, object exc_traceback): + if not ddup.is_available: + return + + cdef str module = exc_type.__module__ + cdef str exception_type = f"{module}.{exc_type.__name__}" if module else exc_type.__name__ + cdef str exception_message = str(exc_value) if _collect_message else "" + cdef object handle = ddup.SampleHandle() + + try: + handle.push_exceptioninfo(exception_type, 1) + if exception_message: + handle.push_exception_message(exception_message) + + thread = _current_thread() + handle.push_threadinfo(thread.ident or 0, getattr(thread, "native_id", 0) or 0, thread.name) + + handle.push_pytraceback(exc_traceback) + + handle.flush_sample() + except: + handle.drop_sample() + + +cpdef void _on_exception_handled(object code, int instruction_offset, object exception): + # sys.monitoring.EXCEPTION_HANDLED callback - HOT PATH + global _sample_counter, _next_sample + + _sample_counter += 1 + + if _sample_counter < _next_sample: + return + + _next_sample = _fast_poisson.sample(_sampling_interval) or 1 + _sample_counter = 0 + + _collect_exception(type(exception), exception, exception.__traceback__) + + +class ExceptionCollector(collector.Collector): + # Collects exception samples using sys.monitoring (Python 3.12+) + + def __init__(self, sampling_interval: int = None, collect_message: bool = None): + global _sampling_interval, _next_sample, _sample_counter + global _collect_message + + super().__init__() + _sampling_interval = sampling_interval if sampling_interval is not None else config.exception.sampling_interval + _collect_message = collect_message if collect_message is not None else config.exception.collect_message + + _next_sample = _sampling_interval + _sample_counter = 0 + + def _start_service(self) -> 
None: + if sys.version_info >= (3, 12) and HAS_MONITORING: + # Python 3.12+: Use sys.monitoring + try: + sys.monitoring.use_tool_id(sys.monitoring.PROFILER_ID, "dd-trace-exception-profiler") + sys.monitoring.set_events(sys.monitoring.PROFILER_ID, sys.monitoring.events.EXCEPTION_HANDLED) + sys.monitoring.register_callback( + sys.monitoring.PROFILER_ID, + sys.monitoring.events.EXCEPTION_HANDLED, + _on_exception_handled, + ) + LOG.debug("Using sys.monitoring.EXCEPTION_HANDLED") + except Exception: + LOG.exception("Failed to set up exception monitoring") + return + else: + LOG.debug("Exception profiling only supports Python 3.12+, skipping") + return + + LOG.info("ExceptionCollector started: interval=%d", _sampling_interval) + + def _stop_service(self) -> None: + if sys.version_info >= (3, 12) and HAS_MONITORING: + try: + sys.monitoring.set_events(sys.monitoring.PROFILER_ID, 0) + sys.monitoring.free_tool_id(sys.monitoring.PROFILER_ID) + except: + pass diff --git a/ddtrace/profiling/collector/_fast_poisson.pyi b/ddtrace/profiling/collector/_fast_poisson.pyi new file mode 100644 index 00000000000..0afe349401b --- /dev/null +++ b/ddtrace/profiling/collector/_fast_poisson.pyi @@ -0,0 +1,3 @@ +def seed(s: int) -> None: ... +def sample(lam: float) -> int: ... +def sample_n(lam: float, n: int) -> list[int]: ... 
diff --git a/ddtrace/profiling/collector/_fast_poisson.pyx b/ddtrace/profiling/collector/_fast_poisson.pyx new file mode 100644 index 00000000000..5e42a106e90 --- /dev/null +++ b/ddtrace/profiling/collector/_fast_poisson.pyx @@ -0,0 +1,85 @@ +from libc.math cimport log, exp, sqrt, floor +from libc.stdint cimport uint64_t + +cdef uint64_t SPLITMIX_INC = 0x9E3779B97F4A7C15ULL +cdef uint64_t SPLITMIX_MUL1 = 0xBF58476D1CE4E5B9ULL +cdef uint64_t SPLITMIX_MUL2 = 0x94D049BB133111EBULL +cdef double INV_2_53 = 1.0 / 9007199254740992.0 +cdef double TWO_PI = 6.283185307179586 + +# Module-level RNG state (seeded randomly at import) +cdef uint64_t _rng_state + +import os as _os +_rng_state = int.from_bytes(_os.urandom(8), "little") + +# SplitMix64: Taken from https://prng.di.unimi.it/splitmix64.c +cdef inline uint64_t _splitmix64() noexcept nogil: + global _rng_state + cdef uint64_t z + _rng_state = _rng_state + SPLITMIX_INC + z = _rng_state + z = (z ^ (z >> 30)) * SPLITMIX_MUL1 + z = (z ^ (z >> 27)) * SPLITMIX_MUL2 + return z ^ (z >> 31) + +# Convert to double in [0, 1) +cdef inline double _uniform() noexcept nogil: + cdef uint64_t x = _splitmix64() >> 11 + return (x + 0.5) * INV_2_53 + + +# Taken from: https://en.wikipedia.org/wiki/Poisson_distribution#Generating_Poisson-distributed_random_variables +cdef inline int _poisson_knuth(double lam) noexcept nogil: + cdef double L = exp(-lam) + cdef int k = 0 + cdef double p = 1.0 + while p > L: + k = k + 1 + p = p * _uniform() + return k - 1 if k > 0 else 0 + +# Taken from: https://hpaulkeeler.com/simulating-poisson-random-variables-with-large-means-in-c/ +cdef inline int _poisson_ptrs(double lam) noexcept nogil: + cdef double slam = sqrt(lam) + cdef double loglam = log(lam) + cdef double b = 0.931 + 2.53 * slam + cdef double a = -0.059 + 0.02483 * b + cdef double inv_alpha = 1.1239 + 1.1328 / (b - 3.4) + cdef double vr = 0.9277 - 3.6224 / (b - 2.0) + cdef double U, V, us, k, v, x, rhs + cdef int ik + + while True: + U = _uniform() 
- 0.5 + V = _uniform() + if U < 0: + us = 0.5 + U + else: + us = 0.5 - U + k = floor((2.0 * a / us + b) * U + lam + 0.43) + if us >= 0.07 and V <= vr: + ik = k + if ik >= 0: + return ik + if k < 0: + continue + v = log(V * inv_alpha / (a / (us * us) + b)) + x = k + 1.0 + rhs = -lam + k * loglam - (k * log(x) - k + 0.5 * log(TWO_PI * x)) + if v <= rhs: + return k + + +cpdef void seed(uint64_t s): + global _rng_state + _rng_state = s + + +cpdef int sample(double lam): + if lam <= 0.0: + return 0 + elif lam < 30.0: + return _poisson_knuth(lam) + else: + return _poisson_ptrs(lam) diff --git a/ddtrace/profiling/collector/exception.py b/ddtrace/profiling/collector/exception.py new file mode 100644 index 00000000000..af0877d07d7 --- /dev/null +++ b/ddtrace/profiling/collector/exception.py @@ -0,0 +1,4 @@ +from ddtrace.profiling.collector._exception import ExceptionCollector + + +__all__ = ["ExceptionCollector"] diff --git a/ddtrace/profiling/profiler.py b/ddtrace/profiling/profiler.py index 134e237c659..5a78a486f8d 100644 --- a/ddtrace/profiling/profiler.py +++ b/ddtrace/profiling/profiler.py @@ -24,6 +24,7 @@ from ddtrace.profiling import collector from ddtrace.profiling import scheduler from ddtrace.profiling.collector import asyncio +from ddtrace.profiling.collector import exception from ddtrace.profiling.collector import memalloc from ddtrace.profiling.collector import pytorch from ddtrace.profiling.collector import stack @@ -133,6 +134,7 @@ def __init__( _stack_collector_enabled: bool = profiling_config.stack.enabled, _lock_collector_enabled: bool = profiling_config.lock.enabled, _pytorch_collector_enabled: bool = profiling_config.pytorch.enabled, + _exception_profiling_enabled: bool = profiling_config.exception.enabled, enable_code_provenance: bool = profiling_config.code_provenance, endpoint_collection_enabled: bool = profiling_config.endpoint_collection, ): @@ -148,6 +150,7 @@ def __init__( self._stack_collector_enabled: bool = _stack_collector_enabled 
self._lock_collector_enabled: bool = _lock_collector_enabled self._pytorch_collector_enabled: bool = _pytorch_collector_enabled + self._exception_profiling_enabled: bool = _exception_profiling_enabled self.enable_code_provenance: bool = enable_code_provenance self.endpoint_collection_enabled: bool = endpoint_collection_enabled @@ -198,6 +201,18 @@ def _build_default_exporters(self) -> None: ddup.start() def __post_init__(self) -> None: + if self._exception_profiling_enabled: + LOG.debug("Profiling collector (exception) enabled") + try: + exc_collector = exception.ExceptionCollector( + sampling_interval=profiling_config.exception.sampling_interval, + collect_message=profiling_config.exception.collect_message, + ) + self._collectors.append(exc_collector) + LOG.debug("Profiling collector (exception) initialized") + except Exception: + LOG.error("Failed to start exception collector, disabling.", exc_info=True) + if self._stack_collector_enabled: LOG.debug("Profiling collector (stack) enabled") try: diff --git a/setup.py b/setup.py index 79598d19338..cb4092d35df 100644 --- a/setup.py +++ b/setup.py @@ -1267,6 +1267,16 @@ def get_exts_for(name): sources=["ddtrace/profiling/collector/_task.pyx"], language="c", ), + Cython.Distutils.Extension( + "ddtrace.profiling.collector._exception", + sources=["ddtrace/profiling/collector/_exception.pyx"], + language="c", + ), + Cython.Distutils.Extension( + "ddtrace.profiling.collector._fast_poisson", + sources=["ddtrace/profiling/collector/_fast_poisson.pyx"], + language="c", + ), ], compile_time_env={ "PY_MAJOR_VERSION": sys.version_info.major, diff --git a/tests/internal/crashtracker/test_crashtracker.py b/tests/internal/crashtracker/test_crashtracker.py index 889dc00c29f..bb31c100aa0 100644 --- a/tests/internal/crashtracker/test_crashtracker.py +++ b/tests/internal/crashtracker/test_crashtracker.py @@ -542,7 +542,7 @@ def test_crashtracker_set_tag_profiler_config(snapshot_context, run_python_code_ report = 
utils.get_crash_report(client) # Now check for the profiler_config tag assert b"profiler_config" in report["body"] - profiler_config = "stack_v2_lock_mem_heap_exp_dd_CAP1.0_MAXF64" + profiler_config = "stack_v2_lock_mem_heap_exception_exp_dd_CAP1.0_MAXF64" assert profiler_config.encode() in report["body"] diff --git a/tests/profiling/collector/pprof_utils.py b/tests/profiling/collector/pprof_utils.py index 55f3c403975..a684174dd0e 100644 --- a/tests/profiling/collector/pprof_utils.py +++ b/tests/profiling/collector/pprof_utils.py @@ -104,10 +104,16 @@ def __init__( class StackEvent(EventBaseClass): def __init__( - self, locations: Optional[Sequence[StackLocation]] = None, exception_type: Optional[str] = None, *args, **kwargs + self, + locations: Optional[Sequence[StackLocation]] = None, + exception_type: Optional[str] = None, + exception_message: Optional[str] = None, + *args, + **kwargs, ) -> None: self.locations = locations self.exception_type = exception_type + self.exception_message = exception_message super().__init__(*args, **kwargs) @@ -500,6 +506,7 @@ def assert_stack_event( try: # Check that the sample has label "exception type" with value (no-op if expected_event.exception_type is None) assert_str_label(profile.string_table, sample, "exception type", expected_event.exception_type) + assert_str_label(profile.string_table, sample, "exception message", expected_event.exception_message) assert_sample_has_locations(profile, sample, expected_event.locations) assert_base_event(profile.string_table, sample, expected_event) except AssertionError as e: diff --git a/tests/profiling/collector/test_exception.py b/tests/profiling/collector/test_exception.py new file mode 100644 index 00000000000..908c2e98da0 --- /dev/null +++ b/tests/profiling/collector/test_exception.py @@ -0,0 +1,547 @@ +import _thread +import os +from pathlib import Path +import sys +import threading +import time + +import pytest + +from ddtrace.internal.datadog.profiling import ddup +from 
ddtrace.profiling.collector import exception +from ddtrace.trace import Tracer +from tests.profiling.collector import pprof_utils + + +# Exception profiling requires Python 3.12 +pytestmark = pytest.mark.skipif(sys.version_info < (3, 12), reason="Exception profiling requires Python 3.12+") + + +def _setup_profiler(tmp_path: Path, test_name: str) -> str: + """Configure ddup and return the output filename for profile parsing.""" + pprof_prefix = str(tmp_path / test_name) + output_filename = pprof_prefix + "." + str(os.getpid()) + assert ddup.is_available + ddup.config(env="test", service=test_name, version="1.0", output_filename=pprof_prefix) + ddup.start() + return output_filename + + +# Helper functions to throw exceptions + + +def _raise_value_error() -> None: + raise ValueError("test value error") + + +def _handle_value_error() -> None: + try: + _raise_value_error() + except ValueError: + pass + + +def _level_3() -> None: + raise RuntimeError("deep error") + + +def _level_2() -> None: + _level_3() + + +def _level_1() -> None: + try: + _level_2() + except RuntimeError: + pass + + +def _raise_value_error_handled() -> None: + try: + raise ValueError("value error") + except ValueError: + pass + + +def _raise_type_error_handled() -> None: + try: + raise TypeError("type error") + except TypeError: + pass + + +def _raise_runtime_error_handled() -> None: + try: + raise RuntimeError("runtime error") + except RuntimeError: + pass + + +def _nested_exception_handling() -> None: + try: + try: + raise ValueError("inner error") + except ValueError: + raise RuntimeError("outer error") + except RuntimeError: + pass + + +def _deep_recursion(depth: int) -> None: + if depth == 0: + raise ValueError("deep error") + return _deep_recursion(depth - 1) + + +def _handle_deep_exception() -> None: + try: + _deep_recursion(20) + except ValueError: + pass + + +class CustomError(Exception): + pass + + +def _raise_custom_error() -> None: + try: + raise CustomError("custom exception") + except 
CustomError: + pass + + +def _thread_raise_value_errors() -> None: + for _ in range(5): + try: + raise ValueError("thread exception") + except ValueError: + pass + + +def _thread_raise_runtime_errors() -> None: + for _ in range(5): + try: + raise RuntimeError("thread exception") + except RuntimeError: + pass + + +def test_exception_config_defaults() -> None: + """Test that exception profiling config has expected default values.""" + from ddtrace.internal.settings.profiling import config as profiling_config + + assert profiling_config.exception.enabled is True + assert profiling_config.exception.sampling_interval == 100 + assert profiling_config.exception.collect_message is True + + +def test_poisson_sampling_distribution() -> None: + """Test that Poisson sampling mean is close to the configured lambda.""" + from ddtrace.profiling.collector import _fast_poisson + + samples = [_fast_poisson.sample(100) for _ in range(1000)] + + assert all(s >= 0 for s in samples), "All samples should be non-negative" + + mean = sum(samples) / len(samples) + # Mean of 1000 exponential(100) samples has std ≈ 3.16; use wide bounds to avoid flakes + assert 80 <= mean <= 120, f"Expected mean ~100, got {mean}" + + +# Pprof profile tests + + +def test_simple_exception_profiling(tmp_path: Path) -> None: + """Test that exception type, stack locations, and thread info are captured.""" + output_filename = _setup_profiler(tmp_path, "test_simple_exception") + + with exception.ExceptionCollector(sampling_interval=1): + for _ in range(10): + _handle_value_error() + + ddup.upload() + + profile = pprof_utils.parse_newest_profile(output_filename) + samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + assert len(samples) > 0 + + pprof_utils.assert_profile_has_sample( + profile, + samples=samples, + expected_sample=pprof_utils.StackEvent( + thread_id=_thread.get_ident(), + thread_name="MainThread", + exception_type="builtins\\.ValueError", + locations=[ + 
pprof_utils.StackLocation(function_name="_raise_value_error", filename="test_exception.py", line_no=-1), + pprof_utils.StackLocation( + function_name="_handle_value_error", filename="test_exception.py", line_no=-1 + ), + ], + ), + print_samples_on_failure=True, + ) + + +def test_exception_stack_trace(tmp_path: Path) -> None: + """Test that a multi-level call chain is captured in the stack trace.""" + output_filename = _setup_profiler(tmp_path, "test_exception_stack") + + with exception.ExceptionCollector(sampling_interval=1): + for _ in range(10): + _level_1() + + ddup.upload() + + profile = pprof_utils.parse_newest_profile(output_filename) + samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + assert len(samples) > 0 + + pprof_utils.assert_profile_has_sample( + profile, + samples=samples, + expected_sample=pprof_utils.StackEvent( + exception_type="builtins\\.RuntimeError", + locations=[ + pprof_utils.StackLocation(function_name="_level_3", filename="test_exception.py", line_no=-1), + pprof_utils.StackLocation(function_name="_level_2", filename="test_exception.py", line_no=-1), + pprof_utils.StackLocation(function_name="_level_1", filename="test_exception.py", line_no=-1), + ], + ), + print_samples_on_failure=True, + ) + + +def test_multiple_exception_types(tmp_path: Path) -> None: + """Test that all distinct exception types are captured.""" + output_filename = _setup_profiler(tmp_path, "test_multiple_exceptions") + + with exception.ExceptionCollector(sampling_interval=1): + for _ in range(10): + _raise_value_error_handled() + _raise_type_error_handled() + _raise_runtime_error_handled() + + ddup.upload() + + profile = pprof_utils.parse_newest_profile(output_filename) + samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + assert len(samples) > 0 + + # Verify all three exception types are present + for exc_type in ["builtins\\.ValueError", "builtins\\.TypeError", "builtins\\.RuntimeError"]: + 
pprof_utils.assert_profile_has_sample( + profile, + samples=samples, + expected_sample=pprof_utils.StackEvent(exception_type=exc_type), + print_samples_on_failure=True, + ) + + +def test_nested_exception_handling(tmp_path: Path) -> None: + """Test that both inner and outer exceptions are captured in nested try-except.""" + output_filename = _setup_profiler(tmp_path, "test_nested_exceptions") + + with exception.ExceptionCollector(sampling_interval=1): + for _ in range(10): + _nested_exception_handling() + + ddup.upload() + + profile = pprof_utils.parse_newest_profile(output_filename) + samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + assert len(samples) > 0 + + # Both the inner ValueError and outer RuntimeError should be captured + for exc_type in ["builtins\\.ValueError", "builtins\\.RuntimeError"]: + pprof_utils.assert_profile_has_sample( + profile, + samples=samples, + expected_sample=pprof_utils.StackEvent( + exception_type=exc_type, + locations=[ + pprof_utils.StackLocation( + function_name="_nested_exception_handling", filename="test_exception.py", line_no=-1 + ), + ], + ), + print_samples_on_failure=True, + ) + + +def test_custom_exception_class(tmp_path: Path) -> None: + """Test that custom exception classes are tracked with module-qualified names.""" + output_filename = _setup_profiler(tmp_path, "test_custom_exception") + + with exception.ExceptionCollector(sampling_interval=1): + for _ in range(10): + _raise_custom_error() + + ddup.upload() + + profile = pprof_utils.parse_newest_profile(output_filename) + samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + assert len(samples) > 0 + + pprof_utils.assert_profile_has_sample( + profile, + samples=samples, + expected_sample=pprof_utils.StackEvent( + exception_type=".*\\.CustomError", + locations=[ + pprof_utils.StackLocation( + function_name="_raise_custom_error", filename="test_exception.py", line_no=-1 + ), + ], + ), + 
print_samples_on_failure=True, + ) + + +def test_multithreaded_exception_profiling(tmp_path: Path) -> None: + """Test exceptions from multiple threads are captured with correct types.""" + output_filename = _setup_profiler(tmp_path, "test_multithreaded") + + with exception.ExceptionCollector(sampling_interval=1): + threads: list[threading.Thread] = [] + for i in range(3): + t_val = threading.Thread(target=_thread_raise_value_errors, name=f"ExcThread-{i}") + t_val.start() + threads.append(t_val) + t_rt = threading.Thread(target=_thread_raise_runtime_errors, name=f"RtThread-{i}") + t_rt.start() + threads.append(t_rt) + + for t in threads: + t.join() + + ddup.upload() + + profile = pprof_utils.parse_newest_profile(output_filename) + samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + assert len(samples) > 0 + + # Both exception types should be present + for exc_type in ["builtins\\.ValueError", "builtins\\.RuntimeError"]: + pprof_utils.assert_profile_has_sample( + profile, + samples=samples, + expected_sample=pprof_utils.StackEvent(exception_type=exc_type), + print_samples_on_failure=True, + ) + + # Verify samples came from multiple threads + thread_names: set[str] = set() + for sample in samples: + label = pprof_utils.get_label_with_key(profile.string_table, sample, "thread name") + if label: + thread_names.add(profile.string_table[label.str]) + assert len(thread_names) > 1, f"Expected multiple thread names, got: {thread_names}" + + +def test_exception_with_tracer(tmp_path: Path, tracer: Tracer) -> None: + """Test exception profiling captures samples during active tracer spans.""" + from ddtrace import ext + from ddtrace.profiling.collector import stack + + output_filename = _setup_profiler(tmp_path, "test_exception_with_tracer") + + tracer._endpoint_call_counter_span_processor.enable() + + with exception.ExceptionCollector(sampling_interval=1): + with stack.StackCollector(tracer=tracer): + with tracer.trace("foobar", 
resource="resource", span_type=ext.SpanTypes.WEB): + for _ in range(10): + try: + raise ValueError("traced exception") + except ValueError: + pass + time.sleep(0.5) + + ddup.upload(tracer=tracer) + + profile = pprof_utils.parse_newest_profile(output_filename) + samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + assert len(samples) > 0 + + pprof_utils.assert_profile_has_sample( + profile, + samples=samples, + expected_sample=pprof_utils.StackEvent( + exception_type="builtins\\.ValueError", + ), + print_samples_on_failure=True, + ) + + +def test_exception_message_collection(tmp_path: Path) -> None: + """Test that exception messages are collected.""" + output_filename = _setup_profiler(tmp_path, "test_exception_message_collection") + + with exception.ExceptionCollector(sampling_interval=1, collect_message=True): + for _ in range(10): + try: + raise ValueError("test exception message") + except ValueError: + pass + + ddup.upload() + + profile = pprof_utils.parse_newest_profile(output_filename) + samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + assert len(samples) > 0 + + pprof_utils.assert_profile_has_sample( + profile, + samples=samples, + expected_sample=pprof_utils.StackEvent(exception_message="test exception message"), + print_samples_on_failure=True, + ) + + +# Callable type helpers for instrumentation coverage tests + + +class _ExceptionInMethod: + def handle(self) -> None: + try: + raise ValueError("method error") + except ValueError: + pass + + +class _ExceptionInStaticMethod: + @staticmethod + def handle() -> None: + try: + raise ValueError("static error") + except ValueError: + pass + + +class _ExceptionInClassMethod: + @classmethod + def handle(cls) -> None: + try: + raise ValueError("classmethod error") + except ValueError: + pass + + +class _CallableWithException: + def __call__(self) -> None: + try: + raise ValueError("callable error") + except ValueError: + pass + + +def 
test_exception_in_instance_method(tmp_path: Path) -> None: + """Test that exceptions in instance methods are captured.""" + output_filename = _setup_profiler(tmp_path, "test_instance_method") + + obj = _ExceptionInMethod() + with exception.ExceptionCollector(sampling_interval=1): + for _ in range(10): + obj.handle() + + ddup.upload() + + profile = pprof_utils.parse_newest_profile(output_filename) + samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + assert len(samples) > 0 + + pprof_utils.assert_profile_has_sample( + profile, + samples=samples, + expected_sample=pprof_utils.StackEvent( + exception_type="builtins\\.ValueError", + locations=[ + pprof_utils.StackLocation(function_name="handle", filename="test_exception.py", line_no=-1), + ], + ), + print_samples_on_failure=True, + ) + + +def test_exception_in_static_method(tmp_path: Path) -> None: + """Test that exceptions in static methods are captured.""" + output_filename = _setup_profiler(tmp_path, "test_static_method") + + with exception.ExceptionCollector(sampling_interval=1): + for _ in range(10): + _ExceptionInStaticMethod.handle() + + ddup.upload() + + profile = pprof_utils.parse_newest_profile(output_filename) + samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + assert len(samples) > 0 + + pprof_utils.assert_profile_has_sample( + profile, + samples=samples, + expected_sample=pprof_utils.StackEvent( + exception_type="builtins\\.ValueError", + locations=[ + pprof_utils.StackLocation(function_name="handle", filename="test_exception.py", line_no=-1), + ], + ), + print_samples_on_failure=True, + ) + + +def test_exception_in_class_method(tmp_path: Path) -> None: + """Test that exceptions in class methods are captured.""" + output_filename = _setup_profiler(tmp_path, "test_class_method") + + with exception.ExceptionCollector(sampling_interval=1): + for _ in range(10): + _ExceptionInClassMethod.handle() + + ddup.upload() + + profile = 
pprof_utils.parse_newest_profile(output_filename) + samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + assert len(samples) > 0 + + pprof_utils.assert_profile_has_sample( + profile, + samples=samples, + expected_sample=pprof_utils.StackEvent( + exception_type="builtins\\.ValueError", + locations=[ + pprof_utils.StackLocation(function_name="handle", filename="test_exception.py", line_no=-1), + ], + ), + print_samples_on_failure=True, + ) + + +def test_exception_in_callable_instance(tmp_path: Path) -> None: + """Test that exceptions in callable instances (__call__) are captured.""" + output_filename = _setup_profiler(tmp_path, "test_callable_instance") + + obj = _CallableWithException() + with exception.ExceptionCollector(sampling_interval=1): + for _ in range(10): + obj() + + ddup.upload() + + profile = pprof_utils.parse_newest_profile(output_filename) + samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + assert len(samples) > 0 + + pprof_utils.assert_profile_has_sample( + profile, + samples=samples, + expected_sample=pprof_utils.StackEvent( + exception_type="builtins\\.ValueError", + locations=[ + pprof_utils.StackLocation(function_name="__call__", filename="test_exception.py", line_no=-1), + ], + ), + print_samples_on_failure=True, + ) diff --git a/tests/profiling/collector/test_stack.py b/tests/profiling/collector/test_stack.py index 19f3900b9f1..976e51aa00d 100644 --- a/tests/profiling/collector/test_stack.py +++ b/tests/profiling/collector/test_stack.py @@ -351,94 +351,6 @@ def test_push_span_none_span_type(tmp_path: Path, tracer: Tracer) -> None: ) -def test_exception_collection(tmp_path: Path) -> None: - test_name = "test_exception_collection" - pprof_prefix = str(tmp_path / test_name) - output_filename = pprof_prefix + "." 
+ str(os.getpid()) - - assert ddup.is_available - ddup.config(env="test", service=test_name, version="my_version", output_filename=pprof_prefix) - ddup.start() - ddup.upload() - - with stack.StackCollector(): - try: - raise ValueError("hello") - except Exception: - time.sleep(1) - - ddup.upload() - - profile = pprof_utils.parse_newest_profile(output_filename) - samples = pprof_utils.get_samples_with_label_key(profile, "exception type") - - # DEV: update the test once we have exception profiling for stack v2 - # using echion - assert len(samples) == 0 - - -def test_exception_collection_threads(tmp_path: Path) -> None: - test_name = "test_exception_collection_threads" - pprof_prefix = str(tmp_path / test_name) - output_filename = pprof_prefix + "." + str(os.getpid()) - - assert ddup.is_available - ddup.config(env="test", service=test_name, version="my_version", output_filename=pprof_prefix) - ddup.start() - ddup.upload() - - with stack.StackCollector(): - - def target_fun() -> None: - try: - raise ValueError("hello") - except Exception: - time.sleep(1) - - threads = [] - for _ in range(10): - t = threading.Thread(target=target_fun) - threads.append(t) - t.start() - - for t in threads: - t.join() - - ddup.upload() - - profile = pprof_utils.parse_newest_profile(output_filename) - samples = pprof_utils.get_samples_with_label_key(profile, "exception type") - - assert len(samples) == 0 - - -def test_exception_collection_trace(tmp_path: Path, tracer: Tracer) -> None: - test_name = "test_exception_collection_trace" - pprof_prefix = str(tmp_path / test_name) - output_filename = pprof_prefix + "." 
+ str(os.getpid()) - - tracer._endpoint_call_counter_span_processor.enable() - - assert ddup.is_available - ddup.config(env="test", service=test_name, version="my_version", output_filename=pprof_prefix) - ddup.start() - ddup.upload() - - with stack.StackCollector(tracer=tracer): - with tracer.trace("foobar", resource="resource", span_type=ext.SpanTypes.WEB): - try: - raise ValueError("hello") - except Exception: - time.sleep(1) - - ddup.upload(tracer=tracer) - - profile = pprof_utils.parse_newest_profile(output_filename) - samples = pprof_utils.get_samples_with_label_key(profile, "exception type") - - assert len(samples) == 0 - - def test_collect_once_with_class(tmp_path: Path) -> None: class SomeClass(object): @classmethod diff --git a/tests/telemetry/test_writer.py b/tests/telemetry/test_writer.py index 1e2e5956e94..1378fd5eafb 100644 --- a/tests/telemetry/test_writer.py +++ b/tests/telemetry/test_writer.py @@ -298,6 +298,9 @@ def test_app_started_event_configuration_override(test_agent_session, run_python {"name": "DD_PROFILING_ENABLE_ASSERTS", "origin": "default", "value": False}, {"name": "DD_PROFILING_ENABLE_CODE_PROVENANCE", "origin": "default", "value": True}, {"name": "DD_PROFILING_ENDPOINT_COLLECTION_ENABLED", "origin": "default", "value": True}, + {"name": "DD_PROFILING_EXCEPTION_COLLECT_MESSAGE", "origin": "default", "value": True}, + {"name": "DD_PROFILING_EXCEPTION_ENABLED", "origin": "default", "value": True}, + {"name": "DD_PROFILING_EXCEPTION_SAMPLING_INTERVAL", "origin": "default", "value": 100}, {"name": "DD_PROFILING_HEAP_ENABLED", "origin": "env_var", "value": False}, {"name": "DD_PROFILING_HEAP_SAMPLE_SIZE", "origin": "default", "value": None}, {"name": "DD_PROFILING_IGNORE_PROFILER", "origin": "default", "value": False}, From b7fbd782eaa4a6326f5c34113aba392b23ece294 Mon Sep 17 00:00:00 2001 From: Gyuheon Oh Date: Thu, 19 Feb 2026 14:35:04 +0000 Subject: [PATCH 2/9] Document traceback direction, set default to false --- 
.../profiling/dd_wrapper/src/sample.cpp | 10 ++--
 ddtrace/internal/settings/profiling.py | 2 +-
 .../crashtracker/test_crashtracker.py | 2 +-
 tests/profiling/collector/test_exception.py | 54 ++++++++++---------
 tests/telemetry/test_writer.py | 2 +-
 5 files changed, 36 insertions(+), 34 deletions(-)

diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp b/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp
index 86bd3059256..9a17b23ade2 100644
--- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp
+++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp
@@ -264,11 +264,9 @@ Datadog::Sample::push_pytraceback(PyTracebackObject* tb)
  * The chain goes from outermost (root) to innermost (leaf) via tb_next.
  * We collect frames first, then push in reverse (leaf-to-root) order to
  * match the convention used by push_pyframes and the rest of the profiler.
- *
- * tb_lineno is used instead of PyFrame_GetLineNumber() because it records
- * the exact line where the exception was raised/re-raised at each level,
- * giving more accurate exception-site attribution than the frame's current
- * execution line.
+ * This is because in the traceback chain tb_next is the next level in
+ * the stack trace (towards the frame where the exception occurred)
+ * https://docs.python.org/3/reference/datamodel.html#traceback.tb_next
  *
  * Ownership: tb_frame is a borrowed reference owned by the traceback.
  * PyFrame_GetCode() returns a new reference that we DECREF internally. */
@@ -293,7 +291,7 @@ Datadog::Sample::push_pytraceback(PyTracebackObject* tb)
     for (int i = static_cast(frames.size()) - 1; i >= 0; --i) {
         // Early exit: once we've hit the frame limit, count all remaining
         // frames as dropped and release their code refs without further
-        // expensive string extraction.
+        // string extraction.
if (locations.size() > max_nframes) { for (int j = i; j >= 0; --j) { ++dropped_frames; diff --git a/ddtrace/internal/settings/profiling.py b/ddtrace/internal/settings/profiling.py index 27eb4c179fe..0ad9b65aefc 100644 --- a/ddtrace/internal/settings/profiling.py +++ b/ddtrace/internal/settings/profiling.py @@ -407,7 +407,7 @@ class ProfilingConfigException(DDConfig): enabled = DDConfig.v( bool, "enabled", - default=True, + default=False, help_type="Boolean", help="Whether to enable the exception profiler", ) diff --git a/tests/internal/crashtracker/test_crashtracker.py b/tests/internal/crashtracker/test_crashtracker.py index bb31c100aa0..889dc00c29f 100644 --- a/tests/internal/crashtracker/test_crashtracker.py +++ b/tests/internal/crashtracker/test_crashtracker.py @@ -542,7 +542,7 @@ def test_crashtracker_set_tag_profiler_config(snapshot_context, run_python_code_ report = utils.get_crash_report(client) # Now check for the profiler_config tag assert b"profiler_config" in report["body"] - profiler_config = "stack_v2_lock_mem_heap_exception_exp_dd_CAP1.0_MAXF64" + profiler_config = "stack_v2_lock_mem_heap_exp_dd_CAP1.0_MAXF64" assert profiler_config.encode() in report["body"] diff --git a/tests/profiling/collector/test_exception.py b/tests/profiling/collector/test_exception.py index 908c2e98da0..e9a3a681333 100644 --- a/tests/profiling/collector/test_exception.py +++ b/tests/profiling/collector/test_exception.py @@ -4,6 +4,7 @@ import sys import threading import time +from typing import TYPE_CHECKING import pytest @@ -13,6 +14,9 @@ from tests.profiling.collector import pprof_utils +if TYPE_CHECKING: + from tests.profiling.collector import pprof_pb2 + # Exception profiling requires Python 3.12 pytestmark = pytest.mark.skipif(sys.version_info < (3, 12), reason="Exception profiling requires Python 3.12+") @@ -131,7 +135,7 @@ def test_exception_config_defaults() -> None: """Test that exception profiling config has expected default values.""" from 
ddtrace.internal.settings.profiling import config as profiling_config - assert profiling_config.exception.enabled is True + assert profiling_config.exception.enabled is False assert profiling_config.exception.sampling_interval == 100 assert profiling_config.exception.collect_message is True @@ -162,8 +166,8 @@ def test_simple_exception_profiling(tmp_path: Path) -> None: ddup.upload() - profile = pprof_utils.parse_newest_profile(output_filename) - samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + profile: pprof_pb2.Profile = pprof_utils.parse_newest_profile(output_filename) + samples: list[pprof_pb2.Sample] = pprof_utils.get_samples_with_value_type(profile, "exception-samples") assert len(samples) > 0 pprof_utils.assert_profile_has_sample( @@ -194,8 +198,8 @@ def test_exception_stack_trace(tmp_path: Path) -> None: ddup.upload() - profile = pprof_utils.parse_newest_profile(output_filename) - samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + profile: pprof_pb2.Profile = pprof_utils.parse_newest_profile(output_filename) + samples: list[pprof_pb2.Sample] = pprof_utils.get_samples_with_value_type(profile, "exception-samples") assert len(samples) > 0 pprof_utils.assert_profile_has_sample( @@ -225,8 +229,8 @@ def test_multiple_exception_types(tmp_path: Path) -> None: ddup.upload() - profile = pprof_utils.parse_newest_profile(output_filename) - samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + profile: pprof_pb2.Profile = pprof_utils.parse_newest_profile(output_filename) + samples: list[pprof_pb2.Sample] = pprof_utils.get_samples_with_value_type(profile, "exception-samples") assert len(samples) > 0 # Verify all three exception types are present @@ -249,8 +253,8 @@ def test_nested_exception_handling(tmp_path: Path) -> None: ddup.upload() - profile = pprof_utils.parse_newest_profile(output_filename) - samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + 
profile: pprof_pb2.Profile = pprof_utils.parse_newest_profile(output_filename) + samples: list[pprof_pb2.Sample] = pprof_utils.get_samples_with_value_type(profile, "exception-samples") assert len(samples) > 0 # Both the inner ValueError and outer RuntimeError should be captured @@ -280,8 +284,8 @@ def test_custom_exception_class(tmp_path: Path) -> None: ddup.upload() - profile = pprof_utils.parse_newest_profile(output_filename) - samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + profile: pprof_pb2.Profile = pprof_utils.parse_newest_profile(output_filename) + samples: list[pprof_pb2.Sample] = pprof_utils.get_samples_with_value_type(profile, "exception-samples") assert len(samples) > 0 pprof_utils.assert_profile_has_sample( @@ -318,8 +322,8 @@ def test_multithreaded_exception_profiling(tmp_path: Path) -> None: ddup.upload() - profile = pprof_utils.parse_newest_profile(output_filename) - samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + profile: pprof_pb2.Profile = pprof_utils.parse_newest_profile(output_filename) + samples: list[pprof_pb2.Sample] = pprof_utils.get_samples_with_value_type(profile, "exception-samples") assert len(samples) > 0 # Both exception types should be present @@ -361,8 +365,8 @@ def test_exception_with_tracer(tmp_path: Path, tracer: Tracer) -> None: ddup.upload(tracer=tracer) - profile = pprof_utils.parse_newest_profile(output_filename) - samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + profile: pprof_pb2.Profile = pprof_utils.parse_newest_profile(output_filename) + samples: list[pprof_pb2.Sample] = pprof_utils.get_samples_with_value_type(profile, "exception-samples") assert len(samples) > 0 pprof_utils.assert_profile_has_sample( @@ -388,8 +392,8 @@ def test_exception_message_collection(tmp_path: Path) -> None: ddup.upload() - profile = pprof_utils.parse_newest_profile(output_filename) - samples = pprof_utils.get_samples_with_value_type(profile, 
"exception-samples") + profile: pprof_pb2.Profile = pprof_utils.parse_newest_profile(output_filename) + samples: list[pprof_pb2.Sample] = pprof_utils.get_samples_with_value_type(profile, "exception-samples") assert len(samples) > 0 pprof_utils.assert_profile_has_sample( @@ -448,8 +452,8 @@ def test_exception_in_instance_method(tmp_path: Path) -> None: ddup.upload() - profile = pprof_utils.parse_newest_profile(output_filename) - samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + profile: pprof_pb2.Profile = pprof_utils.parse_newest_profile(output_filename) + samples: list[pprof_pb2.Sample] = pprof_utils.get_samples_with_value_type(profile, "exception-samples") assert len(samples) > 0 pprof_utils.assert_profile_has_sample( @@ -475,8 +479,8 @@ def test_exception_in_static_method(tmp_path: Path) -> None: ddup.upload() - profile = pprof_utils.parse_newest_profile(output_filename) - samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + profile: pprof_pb2.Profile = pprof_utils.parse_newest_profile(output_filename) + samples: list[pprof_pb2.Sample] = pprof_utils.get_samples_with_value_type(profile, "exception-samples") assert len(samples) > 0 pprof_utils.assert_profile_has_sample( @@ -502,8 +506,8 @@ def test_exception_in_class_method(tmp_path: Path) -> None: ddup.upload() - profile = pprof_utils.parse_newest_profile(output_filename) - samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + profile: pprof_pb2.Profile = pprof_utils.parse_newest_profile(output_filename) + samples: list[pprof_pb2.Sample] = pprof_utils.get_samples_with_value_type(profile, "exception-samples") assert len(samples) > 0 pprof_utils.assert_profile_has_sample( @@ -530,8 +534,8 @@ def test_exception_in_callable_instance(tmp_path: Path) -> None: ddup.upload() - profile = pprof_utils.parse_newest_profile(output_filename) - samples = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + profile: 
pprof_pb2.Profile = pprof_utils.parse_newest_profile(output_filename) + samples: list[pprof_pb2.Sample] = pprof_utils.get_samples_with_value_type(profile, "exception-samples") assert len(samples) > 0 pprof_utils.assert_profile_has_sample( diff --git a/tests/telemetry/test_writer.py b/tests/telemetry/test_writer.py index 1378fd5eafb..2a2cba058d1 100644 --- a/tests/telemetry/test_writer.py +++ b/tests/telemetry/test_writer.py @@ -299,7 +299,7 @@ def test_app_started_event_configuration_override(test_agent_session, run_python {"name": "DD_PROFILING_ENABLE_CODE_PROVENANCE", "origin": "default", "value": True}, {"name": "DD_PROFILING_ENDPOINT_COLLECTION_ENABLED", "origin": "default", "value": True}, {"name": "DD_PROFILING_EXCEPTION_COLLECT_MESSAGE", "origin": "default", "value": True}, - {"name": "DD_PROFILING_EXCEPTION_ENABLED", "origin": "default", "value": True}, + {"name": "DD_PROFILING_EXCEPTION_ENABLED", "origin": "default", "value": False}, {"name": "DD_PROFILING_EXCEPTION_SAMPLING_INTERVAL", "origin": "default", "value": 100}, {"name": "DD_PROFILING_HEAP_ENABLED", "origin": "env_var", "value": False}, {"name": "DD_PROFILING_HEAP_SAMPLE_SIZE", "origin": "default", "value": None}, From ac996ebfb232dd97d827aec7590af9cee432faef Mon Sep 17 00:00:00 2001 From: Gyuheon Oh Date: Thu, 19 Feb 2026 23:01:34 +0000 Subject: [PATCH 3/9] Make sampler into a class, enhance tests, cleanups, from TK review --- .../profiling/dd_wrapper/include/sample.hpp | 4 +- .../profiling/dd_wrapper/src/sample.cpp | 34 ++-- .../internal/datadog/profiling/ddup/_ddup.pyi | 1 + .../internal/datadog/profiling/ddup/_ddup.pyx | 3 +- ddtrace/internal/settings/profiling.py | 3 +- ddtrace/internal/settings/profiling.pyi | 2 + ddtrace/profiling/collector/_exception.pyi | 10 +- ddtrace/profiling/collector/_exception.pyx | 160 ++++++++++++------ ddtrace/profiling/collector/_fast_poisson.pyi | 6 +- ddtrace/profiling/collector/_fast_poisson.pyx | 53 +++--- ddtrace/profiling/profiler.py | 6 +- 
tests/profiling/collector/test_exception.py | 157 +++++++++++++----
 tests/telemetry/test_writer.py | 2 +-
 13 files changed, 307 insertions(+), 134 deletions(-)

diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/include/sample.hpp b/ddtrace/internal/datadog/profiling/dd_wrapper/include/sample.hpp
index dc324df31a5..2e91b7a2565 100644
--- a/ddtrace/internal/datadog/profiling/dd_wrapper/include/sample.hpp
+++ b/ddtrace/internal/datadog/profiling/dd_wrapper/include/sample.hpp
@@ -167,10 +167,12 @@ class Sample
     void push_pyframes(PyFrameObject* frame);
 
     // Push frames from a Python traceback chain to the sample.
-    // Walks tb → tb_next (root→leaf) and pushes frames in leaf-to-root order,
+    // Walks tb -> tb_next (root->leaf) and pushes frames in leaf-to-root order,
     // using tb_lineno for accurate exception site line numbers.
     // Ownership: does not take ownership of `tb`; all code object references
     // obtained via PyFrame_GetCode() are released internally.
+    // The GIL must be held when calling this function. Some of its operations
+    // call Python APIs, such as PyFrame_GetCode()
     void push_pytraceback(PyTracebackObject* tb);
 
     // Flushes the current buffer, clearing it
diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp b/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp
index 9a17b23ade2..76700a1f19d 100644
--- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp
+++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp
@@ -282,33 +282,48 @@ Datadog::Sample::push_pytraceback(PyTracebackObject* tb)
     // Collect frame info root→leaf by following tb_next.
     std::vector frames;
     for (; tb != nullptr; tb = reinterpret_cast(tb->tb_next)) {
-        int lineno = tb->tb_lineno < 0 ? 0 : tb->tb_lineno;
+        int lineno = tb->tb_lineno;
+        if (lineno < 0) {
+            // In Python 3.12+, tb_lineno can be -1 (lazy). Resolve it through
+            // the Python property which calls PyCode_Addr2Line internally.
+ PyObject* lineno_obj = PyObject_GetAttrString(reinterpret_cast(tb), "tb_lineno"); + if (lineno_obj != nullptr) { + lineno = PyLong_AsLong(lineno_obj); + Py_DECREF(lineno_obj); + if (lineno < 0) { + lineno = 0; + } + } else { + PyErr_Clear(); + lineno = 0; + } + } PyCodeObject* code = (tb->tb_frame != nullptr) ? PyFrame_GetCode(tb->tb_frame) : nullptr; frames.push_back({ code, lineno }); } // Push in leaf-to-root order (reverse of collected). - for (int i = static_cast(frames.size()) - 1; i >= 0; --i) { + for (auto it = frames.rbegin(); it != frames.rend(); ++it) { // Early exit: once we've hit the frame limit, count all remaining // frames as dropped and release their code refs without further // string extraction. - if (locations.size() > max_nframes) { - for (int j = i; j >= 0; --j) { + if (locations.size() >= max_nframes) { + for (auto jt = it; jt != frames.rend(); ++jt) { ++dropped_frames; - Py_XDECREF(frames[j].code); + Py_XDECREF(jt->code); } break; } - PyCodeObject* code = frames[i].code; - int lineno = frames[i].lineno; + PyCodeObject* code = it->code; + int lineno = it->lineno; std::string_view name_sv = ""; std::string_view filename_sv = ""; if (code != nullptr) { // Use co_qualname for Python 3.11+ for better context (e.g. Class.method) -#if defined(_PY311_AND_LATER) +#if defined(PY311_AND_LATER) PyObject* name_obj = code->co_qualname ? code->co_qualname : code->co_name; #else PyObject* name_obj = code->co_name; @@ -317,8 +332,7 @@ Datadog::Sample::push_pytraceback(PyTracebackObject* tb) filename_sv = unicode_to_string_view(code->co_filename); } - // push_frame_impl copies strings immediately into the StringArena. - push_frame_impl(name_sv, filename_sv, 0, lineno); + push_frame(name_sv, filename_sv, 0, lineno); Py_XDECREF(code); } // Error state is automatically restored by error_restorer destructor. 
diff --git a/ddtrace/internal/datadog/profiling/ddup/_ddup.pyi b/ddtrace/internal/datadog/profiling/ddup/_ddup.pyi index 1349a762ba0..116022b2ff2 100644 --- a/ddtrace/internal/datadog/profiling/ddup/_ddup.pyi +++ b/ddtrace/internal/datadog/profiling/ddup/_ddup.pyi @@ -45,6 +45,7 @@ class SampleHandle: def push_class_name(self, class_name: StringType) -> None: ... def push_cputime(self, value: int, count: int) -> None: ... def push_exceptioninfo(self, exc_type: Union[None, bytes, str, type], count: int) -> None: ... + def push_exception_message(self, exception_message: StringType) -> None: ... def push_frame(self, name: StringType, filename: StringType, address: int, line: int) -> None: ... def push_pyframes(self, frame: FrameType) -> None: ... def push_pytraceback(self, tb: TracebackType) -> None: ... diff --git a/ddtrace/internal/datadog/profiling/ddup/_ddup.pyx b/ddtrace/internal/datadog/profiling/ddup/_ddup.pyx index 1f182923cc6..703729f6c96 100644 --- a/ddtrace/internal/datadog/profiling/ddup/_ddup.pyx +++ b/ddtrace/internal/datadog/profiling/ddup/_ddup.pyx @@ -548,7 +548,8 @@ cdef class SampleHandle: if self.ptr is not NULL: exc_name = None if isinstance(exc_type, type): - exc_name = exc_type.__module__ + "." + exc_type.__name__ + module = exc_type.__module__ + exc_name = f"{module}.{exc_type.__name__}" if module else exc_type.__name__ else: exc_name = exc_type call_ddup_push_exceptioninfo(self.ptr, exc_name, clamp_to_uint64_unsigned(count)) diff --git a/ddtrace/internal/settings/profiling.py b/ddtrace/internal/settings/profiling.py index 0ad9b65aefc..b9d7724b0cc 100644 --- a/ddtrace/internal/settings/profiling.py +++ b/ddtrace/internal/settings/profiling.py @@ -420,13 +420,14 @@ class ProfilingConfigException(DDConfig): help=( "Average number of exceptions between samples (uses Poisson distribution). " "Lower values sample more frequently but add more overhead." + "This value must be >= 1. If set to less than 1, it will default to 100." 
), ) collect_message = DDConfig.v( bool, "collect_message", - default=True, + default=False, help_type="Boolean", help="Whether to collect exception messages, which can contain sensitive data.", ) diff --git a/ddtrace/internal/settings/profiling.pyi b/ddtrace/internal/settings/profiling.pyi index 369286aaf1c..fa022ab7a5a 100644 --- a/ddtrace/internal/settings/profiling.pyi +++ b/ddtrace/internal/settings/profiling.pyi @@ -49,6 +49,8 @@ class ProfilingConfigPytorch(DDConfig): class ProfilingConfigException(DDConfig): enabled: bool + sampling_interval: int + collect_message: bool config: ProfilingConfig ddup_failure_msg: Optional[str] diff --git a/ddtrace/profiling/collector/_exception.pyi b/ddtrace/profiling/collector/_exception.pyi index cdd2dabf297..60eab52d511 100644 --- a/ddtrace/profiling/collector/_exception.pyi +++ b/ddtrace/profiling/collector/_exception.pyi @@ -3,9 +3,13 @@ from typing import Optional from ddtrace.profiling import collector -class ExceptionCollector(collector.Collector): - collect_message: bool +HAS_MONITORING: bool +MAX_EXCEPTION_MESSAGE_LEN: int +class ExceptionCollector(collector.Collector): + _sampling_interval: int + _collect_message: bool + _monitoring_registered: bool def __init__( self, sampling_interval: Optional[int] = None, @@ -15,5 +19,3 @@ class ExceptionCollector(collector.Collector): def _stop_service(self) -> None: ... def _on_exception_handled(code: Any, instruction_offset: int, exception: BaseException) -> None: ... 
- -# TODO: Define bytecode injection handler for < versions 3.12 diff --git a/ddtrace/profiling/collector/_exception.pyx b/ddtrace/profiling/collector/_exception.pyx index 77b67ab9510..e1518cbd07a 100644 --- a/ddtrace/profiling/collector/_exception.pyx +++ b/ddtrace/profiling/collector/_exception.pyx @@ -1,104 +1,162 @@ import logging +import sysconfig as _sysconfig import sys import threading +import time from ddtrace.internal.datadog.profiling import ddup from ddtrace.internal.settings.profiling import config from ddtrace.profiling import collector -from ddtrace.profiling.collector import _fast_poisson +from ddtrace.profiling.collector._fast_poisson import PoissonSampler LOG = logging.getLogger(__name__) HAS_MONITORING = hasattr(sys, "monitoring") +_GIL_DISABLED = _sysconfig.get_config_var("Py_GIL_DISABLED") _current_thread = threading.current_thread +MAX_EXCEPTION_MESSAGE_LEN = 128 -# These are global variables. We are okay with this because this is only ever accessed -# with the GIL held -cdef int _sampling_interval = 100 -cdef bool _collect_message = False -cdef int _sample_counter = 0 -cdef int _next_sample = 100 +cdef class _SamplerState: + """ + Accessed via the single module-level ``_state`` global from the + sys.monitoring callback -cdef void _collect_exception(object exc_type, object exc_value, object exc_traceback): + Free-threaded CPython is guarded in + ExceptionCollector._start_service; if free-threading support is + added, these fields need synchronization + """ + cdef int sampling_interval + cdef bint collect_message + cdef int counter + cdef int next_sample + cdef object sampler # PoissonSampler + + def __init__(self, int sampling_interval, bint collect_message): + self.sampling_interval = sampling_interval + self.collect_message = collect_message + self.counter = 0 + self.next_sample = sampling_interval + self.sampler = PoissonSampler() + + +# Single module-level global: None when inactive, set by ExceptionCollector. This +# is okay to be global. 
Data race is protected by the GIL and we actually want threads +# to share sampler state, because the profiler is process scoped, not thread scoped +cdef _SamplerState _state = None + + +cdef void _collect_exception(_SamplerState state, object exc_type, object exc_value, object exc_traceback) except *: if not ddup.is_available: return cdef str module = exc_type.__module__ cdef str exception_type = f"{module}.{exc_type.__name__}" if module else exc_type.__name__ - cdef str exception_message = str(exc_value) if _collect_message else "" + cdef object handle = ddup.SampleHandle() + handle.push_exceptioninfo(exception_type, 1) - try: - handle.push_exceptioninfo(exception_type, 1) - if exception_message: - handle.push_exception_message(exception_message) + # Custom exception __str__ can raise, so guard with fallbacks. + if state.collect_message: + try: + msg = str(exc_value) + if len(msg) > MAX_EXCEPTION_MESSAGE_LEN: + exception_message = msg[:MAX_EXCEPTION_MESSAGE_LEN] + "... (truncated)" + else: + exception_message = msg + # We don't know the internal implementation of a potential custom exception + # raise, so we have to catch all exception types + except Exception: + exception_message = "" - thread = _current_thread() - handle.push_threadinfo(thread.ident or 0, getattr(thread, "native_id", 0) or 0, thread.name) + handle.push_exception_message(exception_message) - handle.push_pytraceback(exc_traceback) + thread = _current_thread() + handle.push_threadinfo(thread.ident or 0, getattr(thread, "native_id", 0) or 0, thread.name) - handle.flush_sample() - except: - handle.drop_sample() + handle.push_pytraceback(exc_traceback) + handle.push_monotonic_ns(time.monotonic_ns()) + + handle.flush_sample() cpdef void _on_exception_handled(object code, int instruction_offset, object exception): - # sys.monitoring.EXCEPTION_HANDLED callback - HOT PATH - global _sample_counter, _next_sample + """sys.monitoring.EXCEPTION_HANDLED callback — HOT PATH.""" + cdef _SamplerState state = 
_state + if state is None: + return - _sample_counter += 1 + state.counter += 1 - if _sample_counter < _next_sample: + if state.counter < state.next_sample: return - _next_sample = _fast_poisson.sample(_sampling_interval) or 1 - _sample_counter = 0 + state.next_sample = max(state.sampler.sample(state.sampling_interval), 1) + state.counter = 0 - _collect_exception(type(exception), exception, exception.__traceback__) + # _collect_exception may trigger internal EXCEPTION_HANDLED events (via the + # try/except in message collection), but re-entrant calls are safe: the counter + # was just reset to 0 so re-entrant invocations will increment and return. + _collect_exception(state, type(exception), exception, exception.__traceback__) class ExceptionCollector(collector.Collector): - # Collects exception samples using sys.monitoring (Python 3.12+) + """Collects exception samples using sys.monitoring (Python 3.12+).""" def __init__(self, sampling_interval: int = None, collect_message: bool = None): - global _sampling_interval, _next_sample, _sample_counter - global _collect_message - super().__init__() - _sampling_interval = sampling_interval if sampling_interval is not None else config.exception.sampling_interval - _collect_message = collect_message if collect_message is not None else config.exception.collect_message - _next_sample = _sampling_interval - _sample_counter = 0 + raw_interval = sampling_interval if sampling_interval is not None else config.exception.sampling_interval + self._sampling_interval = raw_interval if raw_interval >= 1 else 100 + self._collect_message = collect_message if collect_message is not None else config.exception.collect_message + self._monitoring_registered = False def _start_service(self) -> None: - if sys.version_info >= (3, 12) and HAS_MONITORING: - # Python 3.12+: Use sys.monitoring + global _state + + if _GIL_DISABLED: + LOG.debug("Exception profiling is not supported on free-threaded CPython, skipping") + return + + if HAS_MONITORING: 
try: sys.monitoring.use_tool_id(sys.monitoring.PROFILER_ID, "dd-trace-exception-profiler") - sys.monitoring.set_events(sys.monitoring.PROFILER_ID, sys.monitoring.events.EXCEPTION_HANDLED) - sys.monitoring.register_callback( - sys.monitoring.PROFILER_ID, - sys.monitoring.events.EXCEPTION_HANDLED, - _on_exception_handled, - ) - LOG.debug("Using sys.monitoring.EXCEPTION_HANDLED") - except Exception: + except ValueError: LOG.exception("Failed to set up exception monitoring") return + _state = _SamplerState(self._sampling_interval, self._collect_message) + sys.monitoring.set_events(sys.monitoring.PROFILER_ID, sys.monitoring.events.EXCEPTION_HANDLED) + sys.monitoring.register_callback( + sys.monitoring.PROFILER_ID, + sys.monitoring.events.EXCEPTION_HANDLED, + _on_exception_handled, + ) + self._monitoring_registered = True + LOG.debug("Using sys.monitoring.EXCEPTION_HANDLED") else: LOG.debug("Exception profiling only supports Python 3.12+, skipping") return - LOG.info("ExceptionCollector started: interval=%d", _sampling_interval) + LOG.debug("ExceptionCollector started: interval=%d", _state.sampling_interval) def _stop_service(self) -> None: - if sys.version_info >= (3, 12) and HAS_MONITORING: - try: - sys.monitoring.set_events(sys.monitoring.PROFILER_ID, 0) - sys.monitoring.free_tool_id(sys.monitoring.PROFILER_ID) - except: - pass + global _state + + if not self._monitoring_registered: + _state = None + return + + try: + sys.monitoring.register_callback( + sys.monitoring.PROFILER_ID, + sys.monitoring.events.EXCEPTION_HANDLED, + None, + ) + sys.monitoring.set_events(sys.monitoring.PROFILER_ID, 0) + sys.monitoring.free_tool_id(sys.monitoring.PROFILER_ID) + except Exception: + LOG.debug("Failed to clean up exception monitoring", exc_info=True) + finally: + self._monitoring_registered = False + _state = None diff --git a/ddtrace/profiling/collector/_fast_poisson.pyi b/ddtrace/profiling/collector/_fast_poisson.pyi index 0afe349401b..1918eda4aa0 100644 --- 
a/ddtrace/profiling/collector/_fast_poisson.pyi +++ b/ddtrace/profiling/collector/_fast_poisson.pyi @@ -1,3 +1,3 @@ -def seed(s: int) -> None: ... -def sample(lam: float) -> int: ... -def sample_n(lam: float, n: int) -> list[int]: ... +class PoissonSampler: + def __init__(self) -> None: ... + def sample(self, lam: float) -> int: ... diff --git a/ddtrace/profiling/collector/_fast_poisson.pyx b/ddtrace/profiling/collector/_fast_poisson.pyx index 5e42a106e90..9d526dc2b91 100644 --- a/ddtrace/profiling/collector/_fast_poisson.pyx +++ b/ddtrace/profiling/collector/_fast_poisson.pyx @@ -7,40 +7,35 @@ cdef uint64_t SPLITMIX_MUL2 = 0x94D049BB133111EBULL cdef double INV_2_53 = 1.0 / 9007199254740992.0 cdef double TWO_PI = 6.283185307179586 -# Module-level RNG state (seeded randomly at import) -cdef uint64_t _rng_state - import os as _os -_rng_state = int.from_bytes(_os.urandom(8), "little") # SplitMix64: Taken from https://prng.di.unimi.it/splitmix64.c -cdef inline uint64_t _splitmix64() noexcept nogil: - global _rng_state +cdef inline uint64_t _splitmix64(uint64_t* state): cdef uint64_t z - _rng_state = _rng_state + SPLITMIX_INC - z = _rng_state + state[0] = state[0] + SPLITMIX_INC + z = state[0] z = (z ^ (z >> 30)) * SPLITMIX_MUL1 z = (z ^ (z >> 27)) * SPLITMIX_MUL2 return z ^ (z >> 31) # Convert to double in [0, 1) -cdef inline double _uniform() noexcept nogil: - cdef uint64_t x = _splitmix64() >> 11 +cdef inline double _uniform(uint64_t* state): + cdef uint64_t x = _splitmix64(state) >> 11 return (x + 0.5) * INV_2_53 # Taken from: https://en.wikipedia.org/wiki/Poisson_distribution#Generating_Poisson-distributed_random_variables -cdef inline int _poisson_knuth(double lam) noexcept nogil: +cdef inline int _poisson_knuth(double lam, uint64_t* state): cdef double L = exp(-lam) cdef int k = 0 cdef double p = 1.0 while p > L: k = k + 1 - p = p * _uniform() + p = p * _uniform(state) return k - 1 if k > 0 else 0 # Taken from: 
https://hpaulkeeler.com/simulating-poisson-random-variables-with-large-means-in-c/ -cdef inline int _poisson_ptrs(double lam) noexcept nogil: +cdef inline int _poisson_ptrs(double lam, uint64_t* state): cdef double slam = sqrt(lam) cdef double loglam = log(lam) cdef double b = 0.931 + 2.53 * slam @@ -51,8 +46,8 @@ cdef inline int _poisson_ptrs(double lam) noexcept nogil: cdef int ik while True: - U = _uniform() - 0.5 - V = _uniform() + U = _uniform(state) - 0.5 + V = _uniform(state) if U < 0: us = 0.5 + U else: @@ -71,15 +66,23 @@ cdef inline int _poisson_ptrs(double lam) noexcept nogil: return k -cpdef void seed(uint64_t s): - global _rng_state - _rng_state = s +cdef class PoissonSampler: + """Poisson sampler with per-instance RNG state. + + Each instance maintains its own SplitMix64 state, seeded from os.urandom + at construction. This ensures that separate callers get independent + sequences, and that state is fresh after fork (when the profiler is + restarted and a new instance is created). 
+ """ + cdef uint64_t _state + def __init__(self): + self._state = int.from_bytes(_os.urandom(8), "little") -cpdef int sample(double lam): - if lam <= 0.0: - return 0 - elif lam < 30.0: - return _poisson_knuth(lam) - else: - return _poisson_ptrs(lam) + cpdef int sample(self, double lam): + if lam <= 0.0: + return 0 + elif lam < 30.0: + return _poisson_knuth(lam, &self._state) + else: + return _poisson_ptrs(lam, &self._state) diff --git a/ddtrace/profiling/profiler.py b/ddtrace/profiling/profiler.py index 5a78a486f8d..47c797072fc 100644 --- a/ddtrace/profiling/profiler.py +++ b/ddtrace/profiling/profiler.py @@ -204,11 +204,7 @@ def __post_init__(self) -> None: if self._exception_profiling_enabled: LOG.debug("Profiling collector (exception) enabled") try: - exc_collector = exception.ExceptionCollector( - sampling_interval=profiling_config.exception.sampling_interval, - collect_message=profiling_config.exception.collect_message, - ) - self._collectors.append(exc_collector) + self._collectors.append(exception.ExceptionCollector()) LOG.debug("Profiling collector (exception) initialized") except Exception: LOG.error("Failed to start exception collector, disabling.", exc_info=True) diff --git a/tests/profiling/collector/test_exception.py b/tests/profiling/collector/test_exception.py index e9a3a681333..2eb6cd0e567 100644 --- a/tests/profiling/collector/test_exception.py +++ b/tests/profiling/collector/test_exception.py @@ -1,4 +1,5 @@ import _thread +import inspect import os from pathlib import Path import sys @@ -81,7 +82,7 @@ def _raise_runtime_error_handled() -> None: pass -def _nested_exception_handling() -> None: +def _wrapped_exception_handling() -> None: try: try: raise ValueError("inner error") @@ -131,20 +132,37 @@ def _thread_raise_runtime_errors() -> None: pass +def _raise_long_exception_message() -> None: + try: + raise ValueError("a" * 1000) + except ValueError: + pass + + +def _lineno_of(func, substring): + """Return the 1-based line number of the first 
source line of func containing substring.""" + source_lines, start_lineno = inspect.getsourcelines(func) + for offset, line in enumerate(source_lines): + if substring in line: + return start_lineno + offset + raise AssertionError(f"{substring!r} not found in source of {func.__name__}") + + def test_exception_config_defaults() -> None: """Test that exception profiling config has expected default values.""" from ddtrace.internal.settings.profiling import config as profiling_config assert profiling_config.exception.enabled is False assert profiling_config.exception.sampling_interval == 100 - assert profiling_config.exception.collect_message is True + assert profiling_config.exception.collect_message is False def test_poisson_sampling_distribution() -> None: """Test that Poisson sampling mean is close to the configured lambda.""" - from ddtrace.profiling.collector import _fast_poisson + from ddtrace.profiling.collector._fast_poisson import PoissonSampler - samples = [_fast_poisson.sample(100) for _ in range(1000)] + sampler = PoissonSampler() + samples = [sampler.sample(100) for _ in range(1000)] assert all(s >= 0 for s in samples), "All samples should be non-negative" @@ -178,9 +196,15 @@ def test_simple_exception_profiling(tmp_path: Path) -> None: thread_name="MainThread", exception_type="builtins\\.ValueError", locations=[ - pprof_utils.StackLocation(function_name="_raise_value_error", filename="test_exception.py", line_no=-1), pprof_utils.StackLocation( - function_name="_handle_value_error", filename="test_exception.py", line_no=-1 + function_name="_raise_value_error", + filename="test_exception.py", + line_no=_lineno_of(_raise_value_error, "raise ValueError"), + ), + pprof_utils.StackLocation( + function_name="_handle_value_error", + filename="test_exception.py", + line_no=_lineno_of(_handle_value_error, "_raise_value_error()"), ), ], ), @@ -208,9 +232,21 @@ def test_exception_stack_trace(tmp_path: Path) -> None: expected_sample=pprof_utils.StackEvent( 
exception_type="builtins\\.RuntimeError", locations=[ - pprof_utils.StackLocation(function_name="_level_3", filename="test_exception.py", line_no=-1), - pprof_utils.StackLocation(function_name="_level_2", filename="test_exception.py", line_no=-1), - pprof_utils.StackLocation(function_name="_level_1", filename="test_exception.py", line_no=-1), + pprof_utils.StackLocation( + function_name="_level_3", + filename="test_exception.py", + line_no=_lineno_of(_level_3, "raise RuntimeError"), + ), + pprof_utils.StackLocation( + function_name="_level_2", + filename="test_exception.py", + line_no=_lineno_of(_level_2, "_level_3()"), + ), + pprof_utils.StackLocation( + function_name="_level_1", + filename="test_exception.py", + line_no=_lineno_of(_level_1, "_level_2()"), + ), ], ), print_samples_on_failure=True, @@ -243,13 +279,13 @@ def test_multiple_exception_types(tmp_path: Path) -> None: ) -def test_nested_exception_handling(tmp_path: Path) -> None: - """Test that both inner and outer exceptions are captured in nested try-except.""" - output_filename = _setup_profiler(tmp_path, "test_nested_exceptions") +def test_wrapped_exception_handling(tmp_path: Path) -> None: + """Test that both inner and outer exceptions are captured in wrapped try-except.""" + output_filename = _setup_profiler(tmp_path, "test_wrapped_exceptions") with exception.ExceptionCollector(sampling_interval=1): for _ in range(10): - _nested_exception_handling() + _wrapped_exception_handling() ddup.upload() @@ -258,20 +294,37 @@ def test_nested_exception_handling(tmp_path: Path) -> None: assert len(samples) > 0 # Both the inner ValueError and outer RuntimeError should be captured - for exc_type in ["builtins\\.ValueError", "builtins\\.RuntimeError"]: - pprof_utils.assert_profile_has_sample( - profile, - samples=samples, - expected_sample=pprof_utils.StackEvent( - exception_type=exc_type, - locations=[ - pprof_utils.StackLocation( - function_name="_nested_exception_handling", filename="test_exception.py", 
line_no=-1 - ), - ], - ), - print_samples_on_failure=True, - ) + pprof_utils.assert_profile_has_sample( + profile, + samples=samples, + expected_sample=pprof_utils.StackEvent( + exception_type="builtins\\.ValueError", + locations=[ + pprof_utils.StackLocation( + function_name="_wrapped_exception_handling", + filename="test_exception.py", + line_no=_lineno_of(_wrapped_exception_handling, 'raise ValueError("inner error")'), + ), + ], + ), + print_samples_on_failure=True, + ) + + pprof_utils.assert_profile_has_sample( + profile, + samples=samples, + expected_sample=pprof_utils.StackEvent( + exception_type="builtins\\.RuntimeError", + locations=[ + pprof_utils.StackLocation( + function_name="_wrapped_exception_handling", + filename="test_exception.py", + line_no=_lineno_of(_wrapped_exception_handling, 'raise RuntimeError("outer error")'), + ), + ], + ), + print_samples_on_failure=True, + ) def test_custom_exception_class(tmp_path: Path) -> None: @@ -295,7 +348,9 @@ def test_custom_exception_class(tmp_path: Path) -> None: exception_type=".*\\.CustomError", locations=[ pprof_utils.StackLocation( - function_name="_raise_custom_error", filename="test_exception.py", line_no=-1 + function_name="_raise_custom_error", + filename="test_exception.py", + line_no=_lineno_of(_raise_custom_error, "raise CustomError"), ), ], ), @@ -303,6 +358,28 @@ def test_custom_exception_class(tmp_path: Path) -> None: ) +def test_long_exception_message(tmp_path: Path) -> None: + """Test that long exception messages are truncated.""" + output_filename = _setup_profiler(tmp_path, "test_long_exception_message") + + with exception.ExceptionCollector(sampling_interval=1, collect_message=True): + for _ in range(10): + _raise_long_exception_message() + + ddup.upload() + + profile: pprof_pb2.Profile = pprof_utils.parse_newest_profile(output_filename) + samples: list[pprof_pb2.Sample] = pprof_utils.get_samples_with_value_type(profile, "exception-samples") + assert len(samples) > 0 + + 
pprof_utils.assert_profile_has_sample( + profile, + samples=samples, + expected_sample=pprof_utils.StackEvent(exception_message=f"{'a' * 128}\\.\\.\\. \\(truncated\\)"), + print_samples_on_failure=True, + ) + + def test_multithreaded_exception_profiling(tmp_path: Path) -> None: """Test exceptions from multiple threads are captured with correct types.""" output_filename = _setup_profiler(tmp_path, "test_multithreaded") @@ -462,7 +539,11 @@ def test_exception_in_instance_method(tmp_path: Path) -> None: expected_sample=pprof_utils.StackEvent( exception_type="builtins\\.ValueError", locations=[ - pprof_utils.StackLocation(function_name="handle", filename="test_exception.py", line_no=-1), + pprof_utils.StackLocation( + function_name="handle", + filename="test_exception.py", + line_no=_lineno_of(_ExceptionInMethod.handle, "raise ValueError"), + ), ], ), print_samples_on_failure=True, @@ -489,7 +570,11 @@ def test_exception_in_static_method(tmp_path: Path) -> None: expected_sample=pprof_utils.StackEvent( exception_type="builtins\\.ValueError", locations=[ - pprof_utils.StackLocation(function_name="handle", filename="test_exception.py", line_no=-1), + pprof_utils.StackLocation( + function_name="handle", + filename="test_exception.py", + line_no=_lineno_of(_ExceptionInStaticMethod.handle, "raise ValueError"), + ), ], ), print_samples_on_failure=True, @@ -516,7 +601,11 @@ def test_exception_in_class_method(tmp_path: Path) -> None: expected_sample=pprof_utils.StackEvent( exception_type="builtins\\.ValueError", locations=[ - pprof_utils.StackLocation(function_name="handle", filename="test_exception.py", line_no=-1), + pprof_utils.StackLocation( + function_name="handle", + filename="test_exception.py", + line_no=_lineno_of(_ExceptionInClassMethod.handle, "raise ValueError"), + ), ], ), print_samples_on_failure=True, @@ -544,7 +633,11 @@ def test_exception_in_callable_instance(tmp_path: Path) -> None: expected_sample=pprof_utils.StackEvent( 
exception_type="builtins\\.ValueError", locations=[ - pprof_utils.StackLocation(function_name="__call__", filename="test_exception.py", line_no=-1), + pprof_utils.StackLocation( + function_name="__call__", + filename="test_exception.py", + line_no=_lineno_of(_CallableWithException.__call__, "raise ValueError"), + ), ], ), print_samples_on_failure=True, diff --git a/tests/telemetry/test_writer.py b/tests/telemetry/test_writer.py index 2a2cba058d1..08d676ab436 100644 --- a/tests/telemetry/test_writer.py +++ b/tests/telemetry/test_writer.py @@ -298,7 +298,7 @@ def test_app_started_event_configuration_override(test_agent_session, run_python {"name": "DD_PROFILING_ENABLE_ASSERTS", "origin": "default", "value": False}, {"name": "DD_PROFILING_ENABLE_CODE_PROVENANCE", "origin": "default", "value": True}, {"name": "DD_PROFILING_ENDPOINT_COLLECTION_ENABLED", "origin": "default", "value": True}, - {"name": "DD_PROFILING_EXCEPTION_COLLECT_MESSAGE", "origin": "default", "value": True}, + {"name": "DD_PROFILING_EXCEPTION_COLLECT_MESSAGE", "origin": "default", "value": False}, {"name": "DD_PROFILING_EXCEPTION_ENABLED", "origin": "default", "value": False}, {"name": "DD_PROFILING_EXCEPTION_SAMPLING_INTERVAL", "origin": "default", "value": 100}, {"name": "DD_PROFILING_HEAP_ENABLED", "origin": "env_var", "value": False}, From 732f2cab58812910f78ebc77ccff671456ba331d Mon Sep 17 00:00:00 2001 From: Gyuheon Oh Date: Fri, 13 Mar 2026 21:40:53 +0000 Subject: [PATCH 4/9] Lazily extract frame info --- .../profiling/dd_wrapper/src/sample.cpp | 64 ++++++++----------- ddtrace/profiling/collector/_exception.pyx | 2 +- 2 files changed, 26 insertions(+), 40 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp b/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp index 76700a1f19d..9587d730099 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp @@ -262,31 +262,37 @@ 
Datadog::Sample::push_pytraceback(PyTracebackObject* tb) { /* Walk the Python traceback chain and push each frame to the sample. * The chain goes from outermost (root) to innermost (leaf) via tb_next. - * We collect frames first, then push in reverse (leaf-to-root) order to - * match the convention used by push_pyframes and the rest of the profiler. - * This is because the in the traceback chain tb_next is the next level in - * the stack trace (towards the frame where the exception occurred) - * https://docs.python.org/3/reference/datamodel.html#traceback.tb_next - * - * Ownership: tb_frame is a borrowed reference owned by the traceback. - * PyFrame_GetCode() returns a new reference that we DECREF internally. */ + * We collect raw traceback pointers first, then extract frame info only + * for the frames we actually keep (up to max_nframes from the leaf end). + * Frames are pushed in leaf-to-root order to match the convention used + * by push_pyframes and the rest of the profiler. + * https://docs.python.org/3/reference/datamodel.html#traceback.tb_next */ PythonErrorRestorer error_restorer; - struct TracebackFrameInfo - { - PyCodeObject* code; // new reference from PyFrame_GetCode; must be DECREF'd - int lineno; - }; - - // Collect frame info root→leaf by following tb_next. - std::vector frames; + // First pass: collect raw traceback pointers root->leaf. + // These are borrowed references owned by the traceback chain, so no + // ref-counting is needed here. + std::vector tb_nodes; for (; tb != nullptr; tb = reinterpret_cast(tb->tb_next)) { - int lineno = tb->tb_lineno; + tb_nodes.push_back(tb); + } + + // Second pass: iterate leaf->root (reverse), only extracting frame info + // for frames we will actually keep (up to max_nframes). 
+ for (auto it = tb_nodes.rbegin(); it != tb_nodes.rend(); ++it) { + if (locations.size() >= max_nframes) { + dropped_frames += std::distance(it, tb_nodes.rend()); + break; + } + + PyTracebackObject* node = *it; + + int lineno = node->tb_lineno; if (lineno < 0) { // In Python 3.12+, tb_lineno can be -1 (lazy). Resolve it through // the Python property which calls PyCode_Addr2Line internally. - PyObject* lineno_obj = PyObject_GetAttrString(reinterpret_cast(tb), "tb_lineno"); + PyObject* lineno_obj = PyObject_GetAttrString(reinterpret_cast(node), "tb_lineno"); if (lineno_obj != nullptr) { lineno = PyLong_AsLong(lineno_obj); Py_DECREF(lineno_obj); @@ -298,31 +304,12 @@ Datadog::Sample::push_pytraceback(PyTracebackObject* tb) lineno = 0; } } - PyCodeObject* code = (tb->tb_frame != nullptr) ? PyFrame_GetCode(tb->tb_frame) : nullptr; - frames.push_back({ code, lineno }); - } - - // Push in leaf-to-root order (reverse of collected). - for (auto it = frames.rbegin(); it != frames.rend(); ++it) { - // Early exit: once we've hit the frame limit, count all remaining - // frames as dropped and release their code refs without further - // string extraction. - if (locations.size() >= max_nframes) { - for (auto jt = it; jt != frames.rend(); ++jt) { - ++dropped_frames; - Py_XDECREF(jt->code); - } - break; - } - - PyCodeObject* code = it->code; - int lineno = it->lineno; std::string_view name_sv = ""; std::string_view filename_sv = ""; + PyCodeObject* code = (node->tb_frame != nullptr) ? PyFrame_GetCode(node->tb_frame) : nullptr; if (code != nullptr) { - // Use co_qualname for Python 3.11+ for better context (e.g. Class.method) #if defined(PY311_AND_LATER) PyObject* name_obj = code->co_qualname ? code->co_qualname : code->co_name; #else @@ -335,7 +322,6 @@ Datadog::Sample::push_pytraceback(PyTracebackObject* tb) push_frame(name_sv, filename_sv, 0, lineno); Py_XDECREF(code); } - // Error state is automatically restored by error_restorer destructor. 
} void diff --git a/ddtrace/profiling/collector/_exception.pyx b/ddtrace/profiling/collector/_exception.pyx index e1518cbd07a..691a0bf4ce5 100644 --- a/ddtrace/profiling/collector/_exception.pyx +++ b/ddtrace/profiling/collector/_exception.pyx @@ -122,11 +122,11 @@ class ExceptionCollector(collector.Collector): if HAS_MONITORING: try: sys.monitoring.use_tool_id(sys.monitoring.PROFILER_ID, "dd-trace-exception-profiler") + sys.monitoring.set_events(sys.monitoring.PROFILER_ID, sys.monitoring.events.EXCEPTION_HANDLED) except ValueError: LOG.exception("Failed to set up exception monitoring") return _state = _SamplerState(self._sampling_interval, self._collect_message) - sys.monitoring.set_events(sys.monitoring.PROFILER_ID, sys.monitoring.events.EXCEPTION_HANDLED) sys.monitoring.register_callback( sys.monitoring.PROFILER_ID, sys.monitoring.events.EXCEPTION_HANDLED, From 0cc56ae5556ab1eb1d5cf07a6574046286fe5676 Mon Sep 17 00:00:00 2001 From: Gyuheon Oh Date: Fri, 13 Mar 2026 22:16:32 +0000 Subject: [PATCH 5/9] Reentrancy guard, leaking exception guard, register order clean up --- ddtrace/profiling/collector/_exception.pyx | 35 ++++++++++++------- ddtrace/profiling/collector/_fast_poisson.pyx | 3 ++ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/ddtrace/profiling/collector/_exception.pyx b/ddtrace/profiling/collector/_exception.pyx index 691a0bf4ce5..c2526c364e9 100644 --- a/ddtrace/profiling/collector/_exception.pyx +++ b/ddtrace/profiling/collector/_exception.pyx @@ -83,6 +83,11 @@ cdef void _collect_exception(_SamplerState state, object exc_type, object exc_va cpdef void _on_exception_handled(object code, int instruction_offset, object exception): """sys.monitoring.EXCEPTION_HANDLED callback — HOT PATH.""" + cdef bint _collecting = False + if _collecting: + return + + _collecting = True cdef _SamplerState state = _state if state is None: return @@ -95,11 +100,17 @@ cpdef void _on_exception_handled(object code, int instruction_offset, object exc 
state.next_sample = max(state.sampler.sample(state.sampling_interval), 1) state.counter = 0 - # _collect_exception may trigger internal EXCEPTION_HANDLED events (via the - # try/except in message collection), but re-entrant calls are safe: the counter - # was just reset to 0 so re-entrant invocations will increment and return. - _collect_exception(state, type(exception), exception, exception.__traceback__) - + # If an exception ever leaks from _collect_exception, this will silently disable + # sys.monitoring. This is rare, but we should catch all exception here to avoid this + # + # Rare, but if next_sample is 1, then reentrant calls will cause this to fire again + # We should guard against re-entrancy explictly here + try: + _collect_exception(state, type(exception), exception, exception.__traceback__) + except Exception: + LOG.exception("Failed to collect exception") + finally: + _collecting = False class ExceptionCollector(collector.Collector): """Collects exception samples using sys.monitoring (Python 3.12+).""" @@ -121,19 +132,19 @@ class ExceptionCollector(collector.Collector): if HAS_MONITORING: try: + _state = _SamplerState(self._sampling_interval, self._collect_message) sys.monitoring.use_tool_id(sys.monitoring.PROFILER_ID, "dd-trace-exception-profiler") sys.monitoring.set_events(sys.monitoring.PROFILER_ID, sys.monitoring.events.EXCEPTION_HANDLED) + sys.monitoring.register_callback( + sys.monitoring.PROFILER_ID, + sys.monitoring.events.EXCEPTION_HANDLED, + _on_exception_handled, + ) except ValueError: LOG.exception("Failed to set up exception monitoring") return - _state = _SamplerState(self._sampling_interval, self._collect_message) - sys.monitoring.register_callback( - sys.monitoring.PROFILER_ID, - sys.monitoring.events.EXCEPTION_HANDLED, - _on_exception_handled, - ) + self._monitoring_registered = True - LOG.debug("Using sys.monitoring.EXCEPTION_HANDLED") else: LOG.debug("Exception profiling only supports Python 3.12+, skipping") return diff --git 
a/ddtrace/profiling/collector/_fast_poisson.pyx b/ddtrace/profiling/collector/_fast_poisson.pyx index 9d526dc2b91..244a743b43c 100644 --- a/ddtrace/profiling/collector/_fast_poisson.pyx +++ b/ddtrace/profiling/collector/_fast_poisson.pyx @@ -73,6 +73,9 @@ cdef class PoissonSampler: at construction. This ensures that separate callers get independent sequences, and that state is fresh after fork (when the profiler is restarted and a new instance is created). + + This Poisson sampler is only intended to be used with the exception profile + collector """ cdef uint64_t _state From 5d91f47a054daccb53540ea0a63a9ee14cfa038b Mon Sep 17 00:00:00 2001 From: Gyuheon Oh Date: Fri, 13 Mar 2026 22:37:25 +0000 Subject: [PATCH 6/9] Upfront allocation and fix reentrancy guard --- .../profiling/dd_wrapper/src/sample.cpp | 3 +++ ddtrace/profiling/collector/_exception.pyx | 24 ++++++++++--------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp b/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp index 9587d730099..836381a7378 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp @@ -274,6 +274,9 @@ Datadog::Sample::push_pytraceback(PyTracebackObject* tb) // These are borrowed references owned by the traceback chain, so no // ref-counting is needed here. 
std::vector<PyTracebackObject*> tb_nodes; + + // Bias toward one bigger upfront allocation rather than multiple reallocations (can revisit this with DOE) + tb_nodes.reserve(max_nframes); for (; tb != nullptr; tb = reinterpret_cast<PyTracebackObject*>(tb->tb_next)) { tb_nodes.push_back(tb); } diff --git a/ddtrace/profiling/collector/_exception.pyx b/ddtrace/profiling/collector/_exception.pyx index c2526c364e9..f454913b401 100644 --- a/ddtrace/profiling/collector/_exception.pyx +++ b/ddtrace/profiling/collector/_exception.pyx @@ -46,16 +46,18 @@ cdef class _SamplerState: # to share sampler state, because the profiler is process scoped, not thread scoped cdef _SamplerState _state = None +# Reentrancy guard: _collect_exception can trigger EXCEPTION_HANDLED callbacks +# (via str(exc_value) raising, or ddup internals). Without this guard, +# the callback would recurse. +cdef bint _collecting = False + cdef void _collect_exception(_SamplerState state, object exc_type, object exc_value, object exc_traceback) except *: if not ddup.is_available: return - cdef str module = exc_type.__module__ - cdef str exception_type = f"{module}.{exc_type.__name__}" if module else exc_type.__name__ - cdef object handle = ddup.SampleHandle() - handle.push_exceptioninfo(exception_type, 1) + handle.push_exceptioninfo(exc_type, 1) # Custom exception __str__ can raise, so guard with fallbacks.
if state.collect_message: @@ -83,11 +85,11 @@ cdef void _collect_exception(_SamplerState state, object exc_type, object exc_va cpdef void _on_exception_handled(object code, int instruction_offset, object exception): """sys.monitoring.EXCEPTION_HANDLED callback — HOT PATH.""" - cdef bint _collecting = False + global _collecting + if _collecting: return - _collecting = True cdef _SamplerState state = _state if state is None: return @@ -100,11 +102,11 @@ cpdef void _on_exception_handled(object code, int instruction_offset, object exc state.next_sample = max(state.sampler.sample(state.sampling_interval), 1) state.counter = 0 - # If an exception ever leaks from _collect_exception, this will silently disable - # sys.monitoring. This is rare, but we should catch all exception here to avoid this - # - # Rare, but if next_sample is 1, then reentrant calls will cause this to fire again - # We should guard against re-entrancy explictly here + # If an exception ever leaks from _collect_exception, this will silently + # disable sys.monitoring. Guard against that and against reentrancy + # (next_sample == 1 and _collect_exception triggers another + # EXCEPTION_HANDLED callback internally). 
+ _collecting = True try: _collect_exception(state, type(exception), exception, exception.__traceback__) except Exception: From 17a37b14e21d38c4a8eb56646a2b740819c225c5 Mon Sep 17 00:00:00 2001 From: Gyuheon Oh Date: Mon, 16 Mar 2026 17:33:22 +0000 Subject: [PATCH 7/9] Clean up sample pushing, add validator to setting, clean log level --- .../datadog/profiling/dd_wrapper/src/sample.cpp | 10 ++-------- ddtrace/internal/settings/profiling.py | 8 +++++++- ddtrace/profiling/collector/_exception.pyi | 4 ++-- ddtrace/profiling/collector/_exception.pyx | 7 ++++--- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp b/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp index 836381a7378..392e02e01ec 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp @@ -308,9 +308,6 @@ Datadog::Sample::push_pytraceback(PyTracebackObject* tb) } } - std::string_view name_sv = ""; - std::string_view filename_sv = ""; - PyCodeObject* code = (node->tb_frame != nullptr) ? 
PyFrame_GetCode(node->tb_frame) : nullptr; if (code != nullptr) { #if defined(PY311_AND_LATER) @@ -318,12 +315,9 @@ Datadog::Sample::push_pytraceback(PyTracebackObject* tb) #else PyObject* name_obj = code->co_name; #endif - name_sv = unicode_to_string_view(name_obj); - filename_sv = unicode_to_string_view(code->co_filename); + push_frame(unicode_to_string_view(name_obj), unicode_to_string_view(code->co_filename), 0, lineno); + Py_DECREF(code); } - - push_frame(name_sv, filename_sv, 0, lineno); - Py_XDECREF(code); } } diff --git a/ddtrace/internal/settings/profiling.py b/ddtrace/internal/settings/profiling.py index b9d7724b0cc..cb48311e1f0 100644 --- a/ddtrace/internal/settings/profiling.py +++ b/ddtrace/internal/settings/profiling.py @@ -380,6 +380,11 @@ def _validate_non_negative_int(value: int) -> None: raise ValueError("value must be non negative") +def _validate_positive_int(value: int) -> None: + if value < 1: + raise ValueError("value must be >= 1") + + class ProfilingConfigPytorch(DDConfig): __item__ = __prefix__ = "pytorch" @@ -417,10 +422,11 @@ class ProfilingConfigException(DDConfig): "sampling_interval", default=100, help_type="Integer", + validator=_validate_positive_int, help=( "Average number of exceptions between samples (uses Poisson distribution). " "Lower values sample more frequently but add more overhead." - "This value must be >= 1. If set to less than 1, it will default to 100." + "This value must be >= 1." ), ) diff --git a/ddtrace/profiling/collector/_exception.pyi b/ddtrace/profiling/collector/_exception.pyi index 60eab52d511..fc904d141b0 100644 --- a/ddtrace/profiling/collector/_exception.pyi +++ b/ddtrace/profiling/collector/_exception.pyi @@ -1,4 +1,4 @@ -from typing import Any +from types import CodeType from typing import Optional from ddtrace.profiling import collector @@ -18,4 +18,4 @@ class ExceptionCollector(collector.Collector): def _start_service(self) -> None: ... def _stop_service(self) -> None: ... 
-def _on_exception_handled(code: Any, instruction_offset: int, exception: BaseException) -> None: ... +def _on_exception_handled(code: CodeType, instruction_offset: int, exception: BaseException) -> None: ... diff --git a/ddtrace/profiling/collector/_exception.pyx b/ddtrace/profiling/collector/_exception.pyx index f454913b401..b48c37d3bf8 100644 --- a/ddtrace/profiling/collector/_exception.pyx +++ b/ddtrace/profiling/collector/_exception.pyx @@ -110,7 +110,7 @@ cpdef void _on_exception_handled(object code, int instruction_offset, object exc try: _collect_exception(state, type(exception), exception, exception.__traceback__) except Exception: - LOG.exception("Failed to collect exception") + LOG.debug("Failed to collect exception") finally: _collecting = False @@ -119,9 +119,10 @@ class ExceptionCollector(collector.Collector): def __init__(self, sampling_interval: int = None, collect_message: bool = None): super().__init__() - raw_interval = sampling_interval if sampling_interval is not None else config.exception.sampling_interval - self._sampling_interval = raw_interval if raw_interval >= 1 else 100 + assert raw_interval >= 1, "sampling_interval must be >= 1" + self._sampling_interval = raw_interval + self._collect_message = collect_message if collect_message is not None else config.exception.collect_message self._monitoring_registered = False From ec8d30483def67704a07209aaabd3b839368f26b Mon Sep 17 00:00:00 2001 From: Gyuheon Oh Date: Mon, 16 Mar 2026 18:04:10 +0000 Subject: [PATCH 8/9] Use tool id 4 --- ddtrace/profiling/collector/_exception.pyx | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/ddtrace/profiling/collector/_exception.pyx b/ddtrace/profiling/collector/_exception.pyx index b48c37d3bf8..b8e38a99527 100644 --- a/ddtrace/profiling/collector/_exception.pyx +++ b/ddtrace/profiling/collector/_exception.pyx @@ -17,6 +17,16 @@ _current_thread = threading.current_thread MAX_EXCEPTION_MESSAGE_LEN = 128 +# sys.monitoring 
tool ID for the exception profiler. +# CPython provides IDs 0-5: +# 0 = DEBUGGER_ID +# 1 = COVERAGE_ID (used by dd-trace-py coverage) +# 2 = PROFILER_ID (used by the native stack profiler) +# 3 = used by error tracking (handled exceptions) +# 4 = **used here** +# 5 = OPTIMIZER_ID +_MONITORING_TOOL_ID = 4 + cdef class _SamplerState: """ @@ -136,10 +146,10 @@ class ExceptionCollector(collector.Collector): if HAS_MONITORING: try: _state = _SamplerState(self._sampling_interval, self._collect_message) - sys.monitoring.use_tool_id(sys.monitoring.PROFILER_ID, "dd-trace-exception-profiler") - sys.monitoring.set_events(sys.monitoring.PROFILER_ID, sys.monitoring.events.EXCEPTION_HANDLED) + sys.monitoring.use_tool_id(_MONITORING_TOOL_ID, "dd-trace-exception-profiler") + sys.monitoring.set_events(_MONITORING_TOOL_ID, sys.monitoring.events.EXCEPTION_HANDLED) sys.monitoring.register_callback( - sys.monitoring.PROFILER_ID, + _MONITORING_TOOL_ID, sys.monitoring.events.EXCEPTION_HANDLED, _on_exception_handled, ) @@ -163,12 +173,12 @@ class ExceptionCollector(collector.Collector): try: sys.monitoring.register_callback( - sys.monitoring.PROFILER_ID, + _MONITORING_TOOL_ID, sys.monitoring.events.EXCEPTION_HANDLED, None, ) - sys.monitoring.set_events(sys.monitoring.PROFILER_ID, 0) - sys.monitoring.free_tool_id(sys.monitoring.PROFILER_ID) + sys.monitoring.set_events(_MONITORING_TOOL_ID, 0) + sys.monitoring.free_tool_id(_MONITORING_TOOL_ID) except Exception: LOG.debug("Failed to clean up exception monitoring", exc_info=True) finally: From 97e337d393c7a4a02e011d7c7a5ca1934c583c48 Mon Sep 17 00:00:00 2001 From: Gyuheon Oh Date: Mon, 16 Mar 2026 18:25:00 +0000 Subject: [PATCH 9/9] Changelog --- .../notes/python-exception-profiling-2efbb5289d133cb9.yaml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 releasenotes/notes/python-exception-profiling-2efbb5289d133cb9.yaml diff --git a/releasenotes/notes/python-exception-profiling-2efbb5289d133cb9.yaml 
b/releasenotes/notes/python-exception-profiling-2efbb5289d133cb9.yaml new file mode 100644 index 00000000000..8f567e73d62 --- /dev/null +++ b/releasenotes/notes/python-exception-profiling-2efbb5289d133cb9.yaml @@ -0,0 +1,4 @@ +features: + - | + Python profiling: This introduces Python exception profiling using sys.monitoring hooks. This feature is disabled by default + and the exception collector only ever registers when explicitly enabled with `DD_PROFILING_EXCEPTION_ENABLED`.