From 0a1c37aae4235b94a90e7fcca055abc7396dc65c Mon Sep 17 00:00:00 2001 From: ashokchatharajupalli Date: Thu, 20 Nov 2025 03:24:20 +0000 Subject: [PATCH 1/8] Add heartbeat-based dynamic profiling system This PR introduces a heartbeat protocol that enables dynamic profiling control. Agents periodically send heartbeats to a Performance Studio backend and receive start/stop profiling commands, allowing on-demand profiling without agent restarts. Key features: - HeartbeatClient for server communication - DynamicGProfilerManager for profiler lifecycle management - Command idempotency to prevent duplicate execution - Support for dynamic profiler configuration - PerfSpect hardware metrics integration - Comprehensive test suite with mock and live modes - Complete documentation with examples Files added: - gprofiler/heartbeat.py (627 lines) - docs/HEARTBEAT_SYSTEM_README.md (634 lines) - tests/test_heartbeat_system.py (358 lines) - tests/run_heartbeat_agent.py (136 lines) Files modified: - gprofiler/main.py (heartbeat initialization) Source: Pinterest's gprofiler repository Testing: Mock tests pass, live tests verified with backend --- docs/HEARTBEAT_SYSTEM_README.md | 634 ++++++++++++++++++++++++++++++++ gprofiler/heartbeat.py | 627 +++++++++++++++++++++++++++++++ gprofiler/main.py | 113 ++++-- tests/run_heartbeat_agent.py | 136 +++++++ tests/test_heartbeat_system.py | 358 ++++++++++++++++++ 5 files changed, 1838 insertions(+), 30 deletions(-) create mode 100644 docs/HEARTBEAT_SYSTEM_README.md create mode 100644 gprofiler/heartbeat.py create mode 100644 tests/run_heartbeat_agent.py create mode 100644 tests/test_heartbeat_system.py diff --git a/docs/HEARTBEAT_SYSTEM_README.md b/docs/HEARTBEAT_SYSTEM_README.md new file mode 100644 index 000000000..3006ef9a0 --- /dev/null +++ b/docs/HEARTBEAT_SYSTEM_README.md @@ -0,0 +1,634 @@ +# Profiling Control System with Heartbeat Protocol + +This document describes the implementation of a centralized profiling control system where a 
Performance Studio backend can dynamically issue profiling commands (start/stop) to gProfiler agents via a heartbeat protocol. + +## System Overview + +``` +┌─────────────────────┐ Heartbeat ┌──────────────────────┐ +│ │ ◄──────────────► │ │ +│ Performance Studio │ │ gProfiler Agent │ +│ Backend │ Commands │ │ +│ │ ────────────────► │ │ +└─────────────────────┘ └──────────────────────┘ +``` + +### Key Components + +1. **Performance Studio Backend** - Central control server that: + - Receives profiling requests via REST API + - Manages profiling commands for hosts/services + - Responds to agent heartbeats with pending commands + - Tracks command execution status + +2. **gProfiler Agent** - Profiling agent that: + - Sends periodic heartbeats to the backend + - Receives and executes profiling commands + - Ensures idempotent command execution + - Reports command completion status + +## Features + +### ✅ Backend Features +- **REST API** for submitting profiling requests +- **Heartbeat endpoint** for agent communication +- **Command merging** for multiple requests targeting same host +- **Process-level and host-level** stop commands +- **Idempotent command execution** using unique command IDs +- **Command completion tracking** +- **PerfSpect integration** for hardware metrics collection + +### ✅ Agent Features +- **Heartbeat communication** with configurable intervals +- **Dynamic profiling** based on server commands +- **Command-driven execution** (start/stop profiling) +- **Idempotency** to prevent duplicate command execution +- **Persistent command tracking** across agent restarts +- **Graceful error handling** and retry logic +- **PerfSpect auto-installation** for hardware metrics collection +- **Hardware metrics integration** with CPU profiling data + +## API Endpoints + +### 1. 
Submit Profiling Request + +```http +POST /api/metrics/profile_request +``` + +**Request Body:** +```json +{ + "service_name": "my-service", + "command_type": "start", // "start" or "stop" + "duration": 60, + "frequency": 11, + "profiling_mode": "cpu", + "target_hostnames": ["host1", "host2"], + "pids": [1234, 5678], // Optional: specific PIDs + "stop_level": "process", // "process" or "host" (for stop commands) + "additional_args": { + "enable_perfspect": true // Optional: enable hardware metrics collection + } +} +``` + +**Response:** +```json +{ + "success": true, + "message": "Start profiling request submitted successfully", + "request_id": "req-uuid", + "command_id": "cmd-uuid", + "estimated_completion_time": "2025-01-08T12:00:00Z" +} +``` + +### 2. Agent Heartbeat + +```http +POST /api/metrics/heartbeat +``` + +**Request Body:** +```json +{ + "ip_address": "192.168.1.100", + "hostname": "worker-01", + "service_name": "my-service", + "last_command_id": "cmd-uuid", + "available_pids" : [java:{}, python:{}], + "namespaces" : [{namespace: kube_system, pods : [{pod_name: gprofiler, containers : {{pid:123, name: metrics-exporter},{pid:123, name: metrics-exporter}},{pod_name: webapp, containers : {{pid:123, name: metrics-exporter},{pid:123, name: metrics-exporter}}]}], + "status": "active", + "timestamp": "2025-01-08T11:00:00Z" +} +"containers" -> "host" Table -> {container_name, array_of_hosts} +"pod" -> "host" Table -> {pod_name, array_of_hosts} +"namespace" -> "host" Table -> {namespace, array_of_hosts} + +1. add k8s namespace hierarchy info as part of heartbeat +2. save k8s information in hostheartbeats table and create de-normalized table for containersToHosts, podsToHost and namespaceToHosts, +3. perform profiling : support profiling request by namespaces, pods and containers ( 5 ) +4. test e2e ( 3 ) +``` + +**Response:** +```json +{ + "success": true, + "message": "Heartbeat received. 
New profiling command available.", + "profiling_command": { + "command_type": "start", + "combined_config": { + "duration": 60, + "frequency": 11, + "profiling_mode": "cpu", + "pids": "" + } + }, + "command_id": "cmd-uuid" +} +``` + +### 3. Report Command Completion + +```http +POST /api/metrics/command_completion +``` + +**Request Body:** +```json +{ + "command_id": "cmd-uuid", + "hostname": "worker-01", + "status": "completed", // "completed" or "failed" + "execution_time": 65, + "error_message": null, + "results_path": "s3://bucket/path/to/results" +} +``` + +## PerfSpect Hardware Metrics Integration + +The heartbeat system supports Intel PerfSpect integration for collecting hardware performance metrics alongside CPU profiling data. This feature enables comprehensive performance analysis by combining software-level profiling with hardware-level metrics. + +### Overview + +When `enable_perfspect: true` is included in the `additional_args` of a profiling request, the gProfiler agent will: + +1. **Auto-install PerfSpect**: Downloads and extracts the latest PerfSpect binary from GitHub releases +2. **Configure hardware collection**: Enables `--enable-hw-metrics-collection` flag +3. **Set PerfSpect path**: Configures `--perfspect-path` to the auto-installed binary +4. **Collect metrics**: Runs PerfSpect alongside CPU profiling to gather hardware metrics + +### Agent Behavior + +#### Command Processing +When the agent receives a heartbeat response with `enable_perfspect: true` in the `combined_config`: + +```python +# Agent processes the configuration +if combined_config.get("enable_perfspect", False): + new_args.collect_hw_metrics = True + + # Auto-install PerfSpect + from gprofiler.perfspect_installer import get_or_install_perfspect + perfspect_path = get_or_install_perfspect() + if perfspect_path: + new_args.tool_perfspect_path = str(perfspect_path) + logger.info(f"PerfSpect auto-installed at: {perfspect_path}") +``` + +#### Installation Process +1. 
**Download**: Fetches `perfspect.tgz` from `https://github.com/intel/PerfSpect/releases/latest/download/perfspect.tgz` +2. **Extract**: Unpacks to `/tmp/gprofiler_perfspect/perfspect/` +3. **Verify**: Checks binary exists and is executable +4. **Configure**: Sets path for gProfiler to use + +#### Data Collection +PerfSpect runs with the following command: +```bash +/tmp/gprofiler_perfspect/perfspect/perfspect metrics \ + --duration 60 \ + --output /tmp/perfspect_data +``` + +### Output Files + +When PerfSpect is enabled, additional files are generated: + +- **Hardware Metrics CSV**: `/tmp/perfspect_data/{hostname}_metrics.csv` +- **Hardware Summary CSV**: `/tmp/perfspect_data/{hostname}_metrics_summary.csv` +- **Hardware HTML Report**: `/tmp/perfspect_data/{hostname}_metrics_summary.html` +- **Latest Metrics**: `/tmp/perfspect_data/{hostname}_metrics_summary_latest.csv` +- **Latest HTML**: `/tmp/perfspect_data/{hostname}_metrics_summary_latest.html` + +### Example Request with PerfSpect + +```bash +curl -X POST http://localhost:8000/api/metrics/profile_request \ + -H "Content-Type: application/json" \ + -d '{ + "service_name": "web-service", + "command_type": "start", + "duration": 60, + "frequency": 11, + "profiling_mode": "cpu", + "target_hostnames": ["worker-01", "worker-02"], + "additional_args": { + "enable_perfspect": true + } + }' +``` + +### Combined Config Example + +The agent receives the following `combined_config` in heartbeat responses: + +```json +{ + "duration": 60, + "frequency": 11, + "continuous": true, + "command_type": "start", + "profiling_mode": "cpu", + "enable_perfspect": true +} +``` + +### Requirements + +- **Platform**: Linux x86_64 (PerfSpect requirement) +- **Permissions**: Root access for hardware performance counter access +- **Network**: Internet access to download PerfSpect binary +- **Storage**: ~50MB for PerfSpect installation and data files + +### Troubleshooting + +#### Common Issues + +1. 
**Permission Denied**: Ensure agent runs with sufficient privileges + ```bash + sudo ./gprofiler --enable-heartbeat-server ... + ``` + +2. **Download Failures**: Check network connectivity and GitHub access + ```bash + curl -I https://github.com/intel/PerfSpect/releases/latest/download/perfspect.tgz + ``` + +3. **Binary Not Found**: Verify installation directory permissions + ```bash + ls -la /tmp/gprofiler_perfspect/perfspect/ + ``` + +#### Debug Logging + +Enable verbose logging to see PerfSpect installation and execution details: +```bash +./gprofiler --enable-heartbeat-server --verbose +``` + +Look for log messages: +- `PerfSpect auto-installed at: /path/to/binary` +- `Using perfspect path: /path/to/binary` +- `Failed to auto-install PerfSpect, hardware metrics disabled` + +## Usage Examples + +### Backend - Submit Start Command + +```bash +curl -X POST http://localhost:8000/api/metrics/profile_request \ + -H "Content-Type: application/json" \ + -d '{ + "service_name": "web-service", + "command_type": "start", + "duration": 120, + "frequency": 11, + "profiling_mode": "cpu", + "target_hostnames": ["web-01", "web-02"], + "containers" : [], + "pods" : [], + "namespaces" : [] + }' +``` + +### Backend - Submit Stop Command + +```bash +curl -X POST http://localhost:8000/api/metrics/profile_request \ + -H "Content-Type: application/json" \ + -d '{ + "service_name": "web-service", + "command_type": "stop", + "stop_level": "host", + "target_hostnames": ["web-01"] + }' +``` + +### Agent - Run in Heartbeat Mode + +**Basic heartbeat mode:** +```bash +python gprofiler/main.py \ + --enable-heartbeat-server \ + --upload-results \ + --token "your-token" \ + --service-name "web-service" \ + --api-server "http://performance-studio:8000" \ + --heartbeat-interval 30 \ + --output-dir /tmp/profiles \ + --verbose +``` + +**Production deployment with all optimizations:** +```bash +# Set environment variables first +export GPROFILER_TOKEN="my_token" +export 
GPROFILER_SERVICE="your-service-name" +export GPROFILER_SERVER="http://localhost:8080" + +# Production command (can also source /opt/gprofiler/envs.sh for variables) +/opt/gprofiler/gprofiler \ + -u \ + --token=$GPROFILER_TOKEN \ + --service-name=$GPROFILER_SERVICE \ + --server-host $GPROFILER_SERVER \ + --dont-send-logs \ + --server-upload-timeout 10 \ + -c \ + --disable-metrics-collection \ + --java-safemode= \ + -d 60 \ + --java-no-version-check +``` + +## Implementation Details + +### Backend Logic + +1. **Command Generation**: Each profiling request generates a unique `command_id` +2. **Command Merging**: Multiple requests for the same host are merged into single commands +3. **Stop Handling**: + - Process-level stops remove specific PIDs from commands + - Host-level stops terminate all profiling for the host +4. **Heartbeat Response**: Returns pending commands with `command_type` and configuration + +### Agent Logic + +1. **Heartbeat Loop**: Sends heartbeats at configured intervals +2. **Command Processing**: + - `start`: Stop current profiler (if any) and start new one with given config + - `stop`: Stop current profiler without starting a new one +3. **Idempotency**: Track executed command IDs to prevent duplicates +4. **Persistence**: Executed command IDs are kept in memory only (bounded history), so they are not preserved across agent restarts + +### Command Flow + +``` +1. User submits profiling request to backend + ↓ +2. Backend creates command with unique ID + ↓ +3. Agent sends heartbeat to backend + ↓ +4. Backend responds with pending command + ↓ +5. Agent executes command (start/stop profiling) + ↓ +6. Agent reports completion to backend + ↓ +7. 
Backend updates command status +``` + +## Configuration + +### Backend Configuration +- Database connection for command storage +- API endpoints for profiling control +- Command merging and deduplication logic + +### Agent Configuration +```bash +--enable-heartbeat-server # Enable heartbeat mode +--heartbeat-interval 30 # Heartbeat frequency (seconds) +--api-server URL # Backend server URL +--upload-results # Required for heartbeat mode +--token TOKEN # Authentication token +--service-name NAME # Service identifier +``` + +## Testing + +### Test Scripts + +1. **test_heartbeat_system.py** - Test backend API and heartbeat flow +2. **run_heartbeat_agent.py** - Run agent in heartbeat mode for testing + +### Test Workflow + +1. Start Performance Studio backend +2. Run test agent: `python run_heartbeat_agent.py` +3. Submit test commands: `python test_heartbeat_system.py` +4. Verify agent receives and executes commands +5. Check idempotency and error handling + +## Error Handling + +### Backend +- Validates profiling request parameters +- Handles database connection errors +- Returns appropriate HTTP status codes +- Logs all operations for debugging + +### Agent +- Retries failed heartbeats at the next heartbeat interval (fixed interval, no backoff) +- Continues heartbeat loop on command execution errors +- Tracks executed command IDs in memory (not persisted across restarts) +- Graceful shutdown on termination signals + +## Security Considerations + +- **Authentication**: Token-based authentication for agent-backend communication +- **Authorization**: Service-based access control for profiling commands +- **Command Validation**: Validate all command parameters before execution +- **Rate Limiting**: Prevent abuse of profiling requests +- **Audit Logging**: Track all profiling activities for compliance + +## Future Enhancements + +- **Real-time Status**: WebSocket connection for real-time agent status +- **Command Scheduling**: Schedule profiling commands for future execution +- **Resource Monitoring**: Check system resources before starting 
profiling +- **Multi-tenant Support**: Isolation between different services/teams +- **Command Prioritization**: Priority queues for urgent profiling requests +- **Distributed Coordination**: Coordinate profiling across multiple agents + +## Troubleshooting + +### Common Issues + +1. **Agent not receiving commands** + - Check network connectivity to backend + - Verify authentication token + - Check service name matching + +2. **Commands not executing** + - Check agent logs for errors + - Verify command parameters are valid + - Check system permissions for profiling + +3. **Duplicate commands** + - Verify idempotency implementation + - Check command ID persistence + - Review heartbeat timing + +4. **PerfSpect hardware metrics not working** + - Ensure Linux x86_64 platform (PerfSpect requirement) + - Verify root/sudo permissions for hardware counters + - Check internet connectivity for auto-installation + - Look for "PerfSpect auto-installed" or "Failed to auto-install" log messages + - Verify `/tmp/gprofiler_perfspect/perfspect/perfspect` binary exists and is executable + +### Debugging + +- Enable verbose logging: `--verbose` +- Check heartbeat logs: `/tmp/gprofiler-heartbeat.log` +- Monitor backend API logs +- Use test scripts to isolate issues +- For PerfSpect issues: + - Check PerfSpect installation: `ls -la /tmp/gprofiler_perfspect/perfspect/` + - Test PerfSpect manually: `/tmp/gprofiler_perfspect/perfspect/perfspect --help` + - Check PerfSpect data directory: `ls -la /tmp/perfspect_data/` + - Monitor hardware metrics collection in agent logs + +## Building and Running gProfiler Locally + +### Prerequisites +- Linux system (x86_64 or Aarch64) +- Python 3.10+ for source builds +- Docker for containerized builds +- 16GB+ RAM for full builds +- Root access for profiling operations + +### Build Options + +#### 1. 
Build Executable (Recommended) + +```bash +cd gprofiler + +# Full build (takes 20-30 minutes, builds all profilers from source) +./scripts/build_x86_64_executable.sh + +# Fast build (for development, skips some optimizations) +./scripts/build_x86_64_executable.sh --fast +``` + +The executable will be created at `build/x86_64/gprofiler`. + +#### 2. Build Docker Image + +```bash +./scripts/build_x86_64_container.sh -t gprofiler +``` + +#### 3. Run from Source (Development) + +```bash +# Install dependencies +pip3 install -r requirements.txt + +# Copy required resources +./scripts/copy_resources_from_image.sh + +# Run directly from source (requires root) +sudo python3 -m gprofiler [options] +``` + +### Running Locally + +#### Basic Local Profiling + +```bash +# Make executable and run basic profiling +chmod +x build/x86_64/gprofiler +sudo ./build/x86_64/gprofiler -o /tmp/gprofiler-output -d 30 +``` + +#### Production-Style Local Run + +```bash +# Set environment variables +export GPROFILER_TOKEN="my_token" +export GPROFILER_SERVICE="your-service-name" +export GPROFILER_SERVER="http://localhost:8080" + +# Run with production flags +sudo ./build/x86_64/gprofiler \ + -u \ + --token=$GPROFILER_TOKEN \ + --service-name=$GPROFILER_SERVICE \ + --server-host $GPROFILER_SERVER \ + --dont-send-logs \ + --server-upload-timeout 10 \ + -c \ + --disable-metrics-collection \ + --java-safemode= \ + -d 60 \ + --java-no-version-check +``` + +#### Local Heartbeat Mode Testing + +```bash +# Run agent in heartbeat mode for testing +sudo ./build/x86_64/gprofiler \ + --enable-heartbeat-server \ + --upload-results \ + --token=$GPROFILER_TOKEN \ + --service-name=$GPROFILER_SERVICE \ + --api-server $GPROFILER_SERVER \ + --heartbeat-interval 30 \ + --output-dir /tmp/profiles \ + --dont-send-logs \ + --server-upload-timeout 10 \ + --disable-metrics-collection \ + --java-safemode= \ + --java-no-version-check \ + --verbose +``` + +#### Local PerfSpect Testing (Manual) + +```bash +# Test PerfSpect 
integration manually (Linux x86_64 only) +sudo ./build/x86_64/gprofiler \ + --enable-hw-metrics-collection \ + --perfspect-path /path/to/perfspect \ + --perfspect-duration 60 \ + --output-dir /tmp/profiles \ + --duration 60 \ + --verbose +``` + +### Command Line Options Explained + +```bash +-u, --upload-results # Upload results to Performance Studio +--token=$GPROFILER_TOKEN # Authentication token +--service-name=$GPROFILER_SERVICE # Service identifier +--server-host $GPROFILER_SERVER # Performance Studio backend URL +--dont-send-logs # Disable log transmission +--server-upload-timeout 10 # Upload timeout (seconds) +-c, --continuous # Continuous profiling mode +--disable-metrics-collection # Disable system metrics collection +--java-safemode= # Disable Java safe mode (empty value) +-d 60 # Profiling duration (seconds) +--java-no-version-check # Skip Java version check +--enable-heartbeat-server # Enable heartbeat communication +--heartbeat-interval 30 # Heartbeat frequency (seconds) +--api-server URL # Heartbeat API server URL +-o, --output-dir PATH # Local output directory +--verbose # Enable verbose logging + +# PerfSpect Hardware Metrics Options (Linux x86_64 only) +--enable-hw-metrics-collection # Enable hardware metrics via PerfSpect +--perfspect-path PATH # Path to PerfSpect binary (auto-installed in heartbeat mode) +--perfspect-duration SECONDS # PerfSpect collection duration (default: 60) +``` + +### Development Workflow + +1. **Build**: `./scripts/build_x86_64_executable.sh --fast` +2. **Test locally**: `sudo ./build/x86_64/gprofiler -o /tmp/results -d 30` +3. **View results**: Open `/tmp/results/last_flamegraph.html` in browser +4. 
**Test heartbeat**: Run with `--enable-heartbeat-server` flag + +### Troubleshooting Local Builds + +- **Build fails**: Ensure 16GB+ RAM available +- **Permission errors**: Run profiling commands with `sudo` +- **Docker issues**: Ensure Docker daemon is running +- **Missing dependencies**: Install build requirements with package manager diff --git a/gprofiler/heartbeat.py b/gprofiler/heartbeat.py new file mode 100644 index 000000000..3eabcc0c3 --- /dev/null +++ b/gprofiler/heartbeat.py @@ -0,0 +1,627 @@ +# +# Copyright (C) 2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class HeartbeatClient:
    """Client for sending heartbeats to the server and receiving profiling commands.

    Besides the HTTP calls, the client keeps a bounded, in-memory history of
    executed command IDs so the caller can skip commands it has already run.
    The history is not written to disk, so it is lost when the agent restarts.
    """

    def __init__(self, api_server: str, service_name: str, server_token: str, verify: bool = True):
        """
        Args:
            api_server: Base URL of the backend; a trailing '/' is stripped.
            service_name: Service identifier sent with every heartbeat.
            server_token: Bearer token; when empty no auth headers are set.
            verify: Passed to ``requests`` as the TLS ``verify`` option.
        """
        self.api_server = api_server.rstrip('/')
        self.service_name = service_name
        self.server_token = server_token
        self.verify = verify
        self.hostname = get_hostname()
        self.ip_address = self._get_local_ip()
        self.last_command_id: Optional[str] = None
        self.executed_command_ids: set = set()  # Track executed command IDs for idempotency (in-memory)
        # Insertion-ordered companion of executed_command_ids (dicts preserve
        # insertion order), used only to decide which ID is the oldest when
        # the history has to be trimmed.
        self._command_order: Dict[str, None] = {}
        self.max_command_history = 1000  # Limit command history to prevent memory growth
        self.session = requests.Session()

        # Set up authentication headers
        if self.server_token:
            self.session.headers.update({
                'Authorization': f'Bearer {self.server_token}',
                'Content-Type': 'application/json'
            })

    def _get_local_ip(self) -> str:
        """Return the local IP address, or "127.0.0.1" if it cannot be determined."""
        try:
            # Connect a UDP socket to a remote address and read back the local
            # address the OS picked for that route.
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                s.connect(("8.8.8.8", 80))
                return s.getsockname()[0]
        except Exception:
            return "127.0.0.1"

    def send_heartbeat(self) -> Optional[Dict[str, Any]]:
        """Send a heartbeat to the server and return any pending profiling command.

        Returns:
            The decoded JSON response when the server answered 200 with a truthy
            ``success`` and a ``profiling_command``; ``None`` in every other case
            (no pending command, non-200 status, or any exception).
        """
        try:
            heartbeat_data = {
                "ip_address": self.ip_address,
                "hostname": self.hostname,
                "service_name": self.service_name,
                "last_command_id": self.last_command_id,
                "status": "active",
                # NOTE(review): this is a naive local-time timestamp (no UTC
                # offset) - confirm that is what the backend expects.
                "timestamp": datetime.datetime.now().isoformat()
            }

            url = f"{self.api_server}/api/metrics/heartbeat"
            response = self.session.post(
                url,
                json=heartbeat_data,
                verify=self.verify,
                timeout=30
            )

            if response.status_code != 200:
                logger.warning(f"Heartbeat failed with status {response.status_code}: {response.text}")
                # Emit failure metric (SLI tracking) using singleton
                MetricsPublisher.get_instance().send_sli_metric(
                    response_type=RESPONSE_TYPE_FAILURE,
                    method_name='send_heartbeat',
                    extra_tags={'status_code': response.status_code}
                )
                return None

            result = response.json()
            # Emit success metric (SLI tracking) using singleton
            MetricsPublisher.get_instance().send_sli_metric(
                response_type=RESPONSE_TYPE_SUCCESS,
                method_name='send_heartbeat'
            )

            if result.get("success") and result.get("profiling_command"):
                logger.info(f"Received profiling command from server: {result.get('command_id')}")
                return result

            logger.debug("Heartbeat successful, no pending commands")
            return None

        except Exception as e:
            logger.error(f"Failed to send heartbeat: {e}")
            # Emit failure metric (SLI tracking) using singleton
            MetricsPublisher.get_instance().send_sli_metric(
                response_type=RESPONSE_TYPE_FAILURE,
                method_name='send_heartbeat',
                extra_tags={'error': str(e)}
            )
            return None

    def send_command_completion(self, command_id: str, status: str, execution_time: Optional[int] = None,
                                error_message: Optional[str] = None, results_path: Optional[str] = None) -> bool:
        """
        Send command completion status to the server.

        Args:
            command_id: The ID of the completed command
            status: 'completed' or 'failed'
            execution_time: Duration of execution in seconds
            error_message: Error message if status is 'failed'
            results_path: Path to profiling results if available

        Returns:
            bool: True if completion was successfully reported, False otherwise
        """
        try:
            completion_data = {
                "command_id": command_id,
                "hostname": self.hostname,
                "status": status,
                "execution_time": execution_time,
                "error_message": error_message,
                "results_path": results_path
            }

            url = f"{self.api_server}/api/metrics/command_completion"
            response = self.session.post(
                url,
                json=completion_data,
                verify=self.verify,
                timeout=30
            )

            if response.status_code == 200:
                logger.info(f"Successfully reported command completion for {command_id} with status: {status}")
                return True

            logger.error(f"Failed to report command completion for {command_id}. Status: {response.status_code}, Response: {response.text}")
            return False

        except Exception as e:
            logger.error(f"Failed to send command completion for {command_id}: {e}")
            return False

    def mark_command_executed(self, command_id: str):
        """Mark a command as executed (in-memory).

        Marking an ID again refreshes it as the newest entry. When the history
        grows past ``max_command_history`` the oldest entries are evicted, so
        the ID passed in here always survives the cleanup.
        """
        # Re-insert so a repeated ID moves to the newest end of the ordering.
        self._command_order.pop(command_id, None)
        self._command_order[command_id] = None
        self.executed_command_ids.add(command_id)

        # Cleanup old command IDs if we exceed the limit
        if len(self.executed_command_ids) > self.max_command_history:
            self._cleanup_old_command_ids()

        logger.debug(f"Marked command ID {command_id} as executed")

    def _cleanup_old_command_ids(self):
        """Trim the history back to ``max_command_history`` entries, oldest first.

        Does nothing when the history is within the limit. Failures are logged
        and swallowed: cleanup is best-effort and must not break the caller.
        """
        try:
            overflow = len(self.executed_command_ids) - self.max_command_history
            if overflow <= 0:
                return

            # IDs that were put into the set directly (not through
            # mark_command_executed) have no recorded age; evict those first,
            # then the oldest tracked IDs.
            untracked = [cid for cid in self.executed_command_ids if cid not in self._command_order]
            victims = untracked[:overflow]
            victims.extend(list(self._command_order)[: overflow - len(victims)])

            for cid in victims:
                self.executed_command_ids.discard(cid)
                self._command_order.pop(cid, None)

            logger.info(f"Cleaned up command ID history in memory, keeping {len(self.executed_command_ids)} entries")
        except Exception as e:
            logger.warning(f"Failed to cleanup old command IDs: {e}")
logger.info("Starting heartbeat loop...") + + while not self.stop_event.is_set(): + try: + # Send heartbeat and check for commands + command_response = self.heartbeat_client.send_heartbeat() + + if command_response and command_response.get("profiling_command"): + profiling_command = command_response["profiling_command"] + command_id = command_response["command_id"] + command_type = profiling_command.get("command_type", "start") + + logger.info(f"Received profiling command: {profiling_command}") + + # Check for idempotency - skip if command already executed + if command_id in self.heartbeat_client.executed_command_ids: + logger.info(f"Command ID {command_id} already executed, skipping...") + + # Wait for next heartbeat + self.stop_event.wait(self.heartbeat_interval) + + continue + + logger.info(f"Received {command_type} command: {command_id}") + + # Mark command as executed for idempotency + self.heartbeat_client.mark_command_executed(command_id) + self.heartbeat_client.last_command_id = command_id + + if command_type == "stop": + # Stop current profiler without starting a new one + logger.info(f"RECEIVED STOP COMMAND for command ID: {command_id}") + logger.info(f"STOP command details: {profiling_command}") + self._stop_current_profiler() + # TODO: important comment to make sure profiler has stopped successful to avoid leak + # Report completion for stop command + self.heartbeat_client.send_command_completion( + command_id=command_id, + status="completed", + execution_time=0, + error_message=None, + results_path=None + ) + elif command_type == "start": + # Stop current profiler if running, then start new one + logger.info("Starting new profiler due to start command") + # TODO: important comment to make sure profiler has stopped successful to avoid leak + self._stop_current_profiler() + self._start_new_profiler(profiling_command, command_id) + # Note: command completion still needs since it will wait for successful profiling + # Report command completion to the 
server + try: + self.heartbeat_client.send_command_completion( + command_id=command_id, + status="completed", + execution_time=0, + error_message=None, + results_path=None + ) + except Exception as e: + logger.error(f"Failed to report command completion for {command_id}: {e}") + else: + logger.warning(f"Unknown command type: {command_type}") + # Report completion for unknown command type + self.heartbeat_client.send_command_completion( + command_id=command_id, + status="failed", + execution_time=0, + error_message=f"Unknown command type: {command_type}", + results_path=None + ) + + + # Wait for next heartbeat + self.stop_event.wait(self.heartbeat_interval) + + except Exception as e: + logger.error(f"Error in heartbeat loop: {e}", exc_info=True) + self.stop_event.wait(self.heartbeat_interval) + + def _stop_current_profiler(self): + """Stop the currently running profiler""" + if self.current_gprofiler: + logger.info("STOPPING current gProfiler instance...") + try: + self.current_gprofiler.stop() # This sets the stop_event! 
+ logger.info("Successfully called gprofiler.stop()") + except Exception as e: + # TODO: This is a huge leak, report it + logger.error(f"Error stopping gProfiler: {e}") + + # ALWAYS cleanup subprocesses regardless of stop() success/failure + try: + logger.info("Starting comprehensive cleanup after heartbeat stop...") + self.current_gprofiler.maybe_cleanup_subprocesses() + logger.info("Comprehensive cleanup completed") + except Exception as cleanup_error: + # Cleanup errors are non-fatal - log and continue + logger.info(f"Cleanup completed with minor errors (expected during stop): {cleanup_error}") + + # Always clear the reference + self.current_gprofiler = None + + if self.current_thread and self.current_thread.is_alive(): + # No need to actively kill the thread, the self.current_gprofiler.stop() already handles it using events + logger.info("Waiting for profiler thread to finish...") + self.current_thread.join(timeout=10) + self.current_thread = None + + def _start_new_profiler(self, profiling_command: Dict[str, Any], command_id: str): + """Start a new profiler with the given configuration""" + try: + # Import here to avoid circular imports + from gprofiler.main import DEFAULT_PROFILING_DURATION + + # Create modified args for the new profiler + new_args = self._create_profiler_args(profiling_command) + + # Create new GProfiler instance + self.current_gprofiler = self._create_gprofiler_instance(new_args) + + # Start profiler in a separate thread + self.current_thread = threading.Thread( + target=self._run_profiler, + args=( + self.current_gprofiler, + new_args.continuous, + getattr(new_args, "duration", DEFAULT_PROFILING_DURATION), + command_id, + ), + daemon=True + ) + self.current_thread.start() + + logger.info(f"Started new gProfiler instance with command ID: {command_id}") + + except Exception as e: + logger.error(f"Failed to start new profiler: {e}", exc_info=True) + # Report failure to the server + self.heartbeat_client.send_command_completion( + 
command_id=command_id, + status="failed", + execution_time=0, + error_message=str(e), + results_path=None + ) + + def _create_profiler_args(self, profiling_command: Dict[str, Any]) -> configargparse.Namespace: + """Create modified args based on profiling command""" + # Copy base args + new_args = configargparse.Namespace(**vars(self.base_args)) + + # Update with profiling command parameters from combined_config + combined_config = profiling_command.get("combined_config", {}) + if "duration" in combined_config: + new_args.duration = combined_config["duration"] + if "frequency" in combined_config: + new_args.frequency = combined_config["frequency"] + if "profiling_mode" in combined_config: + new_args.profiling_mode = combined_config["profiling_mode"] + if "target_hostnames" in combined_config and combined_config["target_hostnames"]: + # Only profile if this hostname is in the target list or no specific targets + if self.heartbeat_client.hostname not in combined_config["target_hostnames"]: + logger.info(f"Hostname {self.heartbeat_client.hostname} not in target list, skipping profiling") + return None + if "pids" in combined_config and combined_config["pids"]: + new_args.pids_to_profile = combined_config["pids"] + + # Set continuous mode + new_args.continuous = combined_config.get("continuous", False) + + # Handle PerfSpect configuration + enable_perfspect = combined_config.get("enable_perfspect", False) + if enable_perfspect: + new_args.collect_hw_metrics = True + + # Assume PerfSpect is pre-installed as a resource + perfspect_path = resource_path("perfspect/perfspect") + + # Check if PerfSpect binary exists + if os.path.exists(perfspect_path) and os.access(perfspect_path, os.X_OK): + new_args.tool_perfspect_path = perfspect_path + logger.info(f"Using pre-installed PerfSpect at: {perfspect_path}") + else: + logger.error(f"PerfSpect not found at {perfspect_path}, hardware metrics disabled") + new_args.collect_hw_metrics = False + + # Handle max_processes configuration 
+ max_processes = combined_config.get("max_processes", 10) + new_args.max_processes_per_profiler = max_processes + logger.info(f"Setting max processes per profiler: {max_processes}") + + # Handle Profiler Configurations + profiler_configs = combined_config.get("profiler_configs", {}) + if profiler_configs: + logger.info(f"Applying profiler configurations: {profiler_configs}") + + # Handle Perf Profiler configuration + perf_config = profiler_configs.get("perf", "enabled_restricted") + if perf_config == "enabled_restricted": + new_args.max_system_processes_for_system_profilers = 600 + new_args.perf_max_docker_containers = 2 + logger.info("Perf profiler: enabled restricted mode") + elif perf_config == "enabled_aggressive": + new_args.max_system_processes_for_system_profilers = 1500 + new_args.perf_max_docker_containers = 50 + logger.info("Perf profiler: enabled aggressive mode") + elif perf_config == "disabled": + new_args.perf_mode = "disabled" + logger.info("Perf profiler: disabled") + + # Handle Pyperf configuration + pyperf_config = profiler_configs.get("pyperf", "enabled") + if pyperf_config == "enabled": + new_args.python_skip_pyperf_profiler_above = 1500 + new_args.python_mode = "pyperf" + logger.info("Pyperf profiler: enabled") + elif pyperf_config == "disabled": + new_args.python_mode = "disabled" + logger.info("Pyperf profiler: disabled, using pyspy") + + # Handle Pyspy configuration + pyspy_config = profiler_configs.get("pyspy", "enabled_fallback") + if pyspy_config == "enabled_fallback": + new_args.python_mode = "auto" + logger.info("Pyspy profiler: enabled as fallback") + elif pyspy_config == "enabled": + new_args.python_mode = "pyspy" + logger.info("Pyspy profiler: enabled") + elif pyspy_config == "disabled" and pyperf_config == "disabled": + new_args.python_mode = "disabled" + logger.info("Pyspy profiler: disabled") + + # Handle Java Async Profiler configuration + async_profiler_config = profiler_configs.get("async_profiler", "enabled") + if 
async_profiler_config == "disabled": + new_args.java_mode = "disabled" + logger.info("Java async profiler: disabled") + else: + logger.info("Java async profiler: enabled") + + # Handle PHP configuration + phpspy_config = profiler_configs.get("phpspy", "enabled") + if phpspy_config == "disabled": + new_args.php_mode = "disabled" + logger.info("PHP profiler: disabled") + else: + logger.info("PHP profiler: enabled") + + # Handle Ruby configuration + rbspy_config = profiler_configs.get("rbspy", "enabled") + if rbspy_config == "disabled": + new_args.ruby_mode = "disabled" + logger.info("Ruby profiler: disabled") + else: + logger.info("Ruby profiler: enabled") + + # Handle .NET configuration + dotnet_config = profiler_configs.get("dotnet_trace", "enabled") + if dotnet_config == "disabled": + new_args.dotnet_mode = "disabled" + logger.info(".NET profiler: disabled") + else: + logger.info(".NET profiler: enabled") + + # Handle NodeJS configuration + nodejs_config = profiler_configs.get("nodejs_perf", "enabled") + if nodejs_config == "disabled": + new_args.nodejs_mode = "none" + logger.info("NodeJS profiler: disabled") + else: + logger.info("NodeJS profiler: enabled") + + return new_args + + def _create_gprofiler_instance(self, args: configargparse.Namespace) -> 'GProfiler': + """Create a new GProfiler instance with the given args""" + if args is None: + return None + + # Import here to avoid circular imports + from gprofiler.main import GProfiler, pids_to_processes + + processes_to_profile = pids_to_processes(args) + state = get_state() + + # Create profiler API client + profiler_api_client = None + if args.upload_results: + profiler_api_client = ProfilerAPIClient( + token=args.server_token, + service_name=args.service_name, + server_address=args.server_host, + curlify_requests=getattr(args, 'curlify_requests', False), + hostname=get_hostname(), + verify=args.verify, + upload_timeout=getattr(args, 'server-upload-timeout', 120) # Default to 120 seconds + ) + + 
enrichment_options = EnrichmentOptions( + profile_api_version=args.profile_api_version, + container_names=args.container_names, + application_identifiers=args.collect_appids, + application_identifier_args_filters=args.app_id_args_filters, + application_metadata=args.application_metadata, + ) + + # Create external metadata path if specified + external_metadata_path = None + if hasattr(args, 'external_metadata') and args.external_metadata: + external_metadata_path = Path(args.external_metadata) + + # Create heartbeat file path if specified + heartbeat_file_path = None + if hasattr(args, 'heartbeat_file') and args.heartbeat_file: + heartbeat_file_path = Path(args.heartbeat_file) + + # Create perfspect path if specified + perfspect_path = None + if hasattr(args, "tool_perfspect_path") and args.tool_perfspect_path: + perfspect_path = Path(args.tool_perfspect_path) + + return GProfiler( + output_dir=getattr(args, 'output_dir', None), + flamegraph=getattr(args, 'flamegraph', True), + rotating_output=getattr(args, 'rotating_output', False), + rootless=getattr(args, 'rootless', False), + profiler_api_client=profiler_api_client, + collect_metrics=getattr(args, 'collect_metrics', True), + collect_metadata=getattr(args, 'collect_metadata', True), + enrichment_options=enrichment_options, + state=state, + usage_logger=NoopUsageLogger(), # Simplified for dynamic profiling + user_args=args.__dict__, + duration=args.duration, + profile_api_version=args.profile_api_version, + profiling_mode=args.profiling_mode, + collect_hw_metrics=getattr(args, "collect_hw_metrics", False), + profile_spawned_processes=getattr(args, 'profile_spawned_processes', False), + remote_logs_handler=None, # Simplified for dynamic profiling + controller_process=None, + processes_to_profile=processes_to_profile, + external_metadata_path=external_metadata_path, + heartbeat_file_path=heartbeat_file_path, + perfspect_path=perfspect_path, + perfspect_duration=getattr(args, "tool_perfspect_duration", 60), + ) + + 
def _run_profiler(self, gprofiler: 'GProfiler', continuous: bool, duration: int, command_id: str): + """Run the profiler with specified args""" + if gprofiler is None: + return + + start_time = datetime.datetime.now() + error_message = None + results_path = None + + try: + if continuous: + logger.info(f"Running continuous profiler for command ID: {command_id}") + gprofiler.run_continuous() + else: + logger.info(f"Running profiler for {duration} seconds (command ID: {command_id})...") + gprofiler.run_single() + + # After run completes, check if it was stopped or completed + if gprofiler._profiler_state.stop_event.is_set(): + logger.info(f"Profiler run was stopped before completion for command ID: {command_id}") + else: + logger.info(f"Profiler run completed successfully for command ID: {command_id}") + + # Try to get results path if available + if hasattr(gprofiler, 'output_dir') and gprofiler.output_dir: + results_path = str(gprofiler.output_dir) + + except Exception as e: + # Internal exceptions can occur during profiling stop + # Only consider a failure if it was not due to a stop event + if not gprofiler._profiler_state.stop_event.is_set(): + error_message = str(e) + logger.error(f"Profiler run failed for command ID {command_id}: {e}", exc_info=True) + else: + logger.info(f"Profiler run was stopped before completion for command ID: {command_id}") + + finally: + # Calculate execution time + end_time = datetime.datetime.now() + execution_time = int((end_time - start_time).total_seconds()) + + # Clear the current profiler reference + if self.current_gprofiler == gprofiler: + self.current_gprofiler = None + + def stop(self): + """Stop the heartbeat manager""" + logger.info("Stopping heartbeat manager...") + self.stop_event.set() + self._stop_current_profiler() diff --git a/gprofiler/main.py b/gprofiler/main.py index ca932a267..ac1b3d6e9 100644 --- a/gprofiler/main.py +++ b/gprofiler/main.py @@ -48,6 +48,7 @@ from gprofiler.diagnostics import log_diagnostics, 
set_diagnostics from gprofiler.exceptions import APIError, NoProfilersEnabledError from gprofiler.gprofiler_types import ProcessToProfileData, UserArgs, integers_list, positive_integer +from gprofiler.heartbeat import DynamicGProfilerManager, HeartbeatClient from gprofiler.hw_metrics import HWMetricsMonitor, HWMetricsMonitorBase, NoopHWMetricsMonitor from gprofiler.log import RemoteLogsHandler, initial_root_logger_setup from gprofiler.merge import concatenate_from_external_file, concatenate_profiles, merge_profiles @@ -861,6 +862,22 @@ def parse_cmd_args() -> configargparse.Namespace: "The file modification indicates the last snapshot time.", ) + parser.add_argument( + "--enable-heartbeat-server", + action="store_true", + dest="enable_heartbeat_server", + default=False, + help="Enable heartbeat communication with server for dynamic profiling commands", + ) + + parser.add_argument( + "--heartbeat-interval", + type=positive_integer, + dest="heartbeat_interval", + default=30, + help="Interval in seconds for sending heartbeats to server (default: %(default)s)", + ) + if is_linux() and not is_aarch64(): hw_metrics_options = parser.add_argument_group("hardware metrics") hw_metrics_options.add_argument( @@ -936,6 +953,14 @@ def parse_cmd_args() -> configargparse.Namespace: if args.profile_spawned_processes and args.pids_to_profile is not None: parser.error("--pids is not allowed when profiling spawned processes") + if args.enable_heartbeat_server: + if not args.upload_results: + parser.error("--enable-heartbeat-server requires --upload-results to be enabled") + if not args.server_token: + parser.error("--enable-heartbeat-server requires --token to be provided") + if not args.service_name: + parser.error("--enable-heartbeat-server requires --service-name to be provided") + return args @@ -1215,37 +1240,65 @@ def main() -> None: ApplicationIdentifiers.init(enrichment_options) set_diagnostics(args.diagnostics) - gprofiler = GProfiler( - output_dir=args.output_dir, - 
flamegraph=args.flamegraph, - rotating_output=args.rotating_output, - rootless=args.rootless, - profiler_api_client=profiler_api_client, - collect_metrics=args.collect_metrics, - collect_metadata=args.collect_metadata, - enrichment_options=enrichment_options, - state=state, - usage_logger=usage_logger, - user_args=args.__dict__, - duration=args.duration, - profile_api_version=args.profile_api_version, - profiling_mode=args.profiling_mode, - collect_hw_metrics=getattr(args, "collect_hw_metrics", False), - profile_spawned_processes=args.profile_spawned_processes, - remote_logs_handler=remote_logs_handler, - controller_process=controller_process, - processes_to_profile=processes_to_profile, - external_metadata_path=external_metadata_path, - heartbeat_file_path=heartbeat_file_path, - perfspect_path=perfspect_path, - perfspect_duration=getattr(args, "tool_perfspect_duration", 60), - verbose=args.verbose, - ) - logger.info("gProfiler initialized and ready to start profiling") - if args.continuous: - gprofiler.run_continuous() + + # Check if heartbeat server mode is enabled FIRST + if args.enable_heartbeat_server: + if not args.upload_results: + logger.error("Heartbeat server mode requires --upload-results to be enabled") + sys.exit(1) + + # Create heartbeat client + heartbeat_client = HeartbeatClient( + api_server=args.api_server, + service_name=args.service_name, + server_token=args.server_token, + verify=args.verify, + ) + + # Create dynamic profiler manager + manager = DynamicGProfilerManager(args, heartbeat_client) + manager.heartbeat_interval = args.heartbeat_interval + + try: + logger.info("Starting heartbeat mode - waiting for server commands...") + manager.start_heartbeat_loop() + except KeyboardInterrupt: + logger.info("Received interrupt signal, stopping heartbeat mode...") + finally: + manager.stop() else: - gprofiler.run_single() + # Normal profiling mode + gprofiler = GProfiler( + output_dir=args.output_dir, + flamegraph=args.flamegraph, + 
rotating_output=args.rotating_output, + rootless=args.rootless, + profiler_api_client=profiler_api_client, + collect_metrics=args.collect_metrics, + collect_metadata=args.collect_metadata, + enrichment_options=enrichment_options, + state=state, + usage_logger=usage_logger, + user_args=args.__dict__, + duration=args.duration, + profile_api_version=args.profile_api_version, + profiling_mode=args.profiling_mode, + collect_hw_metrics=getattr(args, "collect_hw_metrics", False), + profile_spawned_processes=args.profile_spawned_processes, + remote_logs_handler=remote_logs_handler, + controller_process=controller_process, + processes_to_profile=processes_to_profile, + external_metadata_path=external_metadata_path, + heartbeat_file_path=heartbeat_file_path, + perfspect_path=perfspect_path, + perfspect_duration=getattr(args, "tool_perfspect_duration", 60), + verbose=args.verbose, + ) + logger.info("gProfiler initialized and ready to start profiling") + if args.continuous: + gprofiler.run_continuous() + else: + gprofiler.run_single() except KeyboardInterrupt: pass diff --git a/tests/run_heartbeat_agent.py b/tests/run_heartbeat_agent.py new file mode 100644 index 000000000..d37736b56 --- /dev/null +++ b/tests/run_heartbeat_agent.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +""" +Test runner for the gProfiler agent with heartbeat mode enabled. + +This script demonstrates how to run the gProfiler agent in heartbeat mode +to receive dynamic profiling commands from the Performance Studio backend. 
+""" + +import subprocess +import sys +import os +import signal +import time +from pathlib import Path + +def run_gprofiler_heartbeat_mode(): + """Run gProfiler in heartbeat mode""" + + # Configuration - adjust these values for your environment + config = { + "server_token": "test-token", + "service_name": "test-service", + "api_server": "http://localhost:8000", # Performance Studio backend URL + "server_host": "http://localhost:8000", # Profile upload server URL (can be same) + "output_dir": "/tmp/gprofiler-test", + "log_file": "/tmp/gprofiler-heartbeat.log", + "heartbeat_interval": "10", # seconds + "verbose": True + } + + # Ensure output directory exists + os.makedirs(config["output_dir"], exist_ok=True) + + # Build the command + gprofiler_path = Path(__file__).parent.parent / "gprofiler" / "main.py" + + cmd = [ + sys.executable, + str(gprofiler_path), + "--enable-heartbeat-server", + "--upload-results", + "--token", config["server_token"], + "--service-name", config["service_name"], + "--api-server", config["api_server"], + "--server-host", config["server_host"], + "--output-dir", config["output_dir"], + "--log-file", config["log_file"], + "--heartbeat-interval", config["heartbeat_interval"], + "--no-verify", # For testing with localhost + ] + + if config["verbose"]: + cmd.append("--verbose") + + print("🤖 Starting gProfiler in heartbeat mode...") + print(f"📝 Command: {' '.join(cmd)}") + print("="*60) + print("The agent will:") + print("1. Send heartbeats to the backend every 10 seconds") + print("2. Wait for profiling commands from the server") + print("3. Execute start/stop commands as received") + print("4. Maintain idempotency for duplicate commands") + print("="*60) + print("💡 To test the system:") + print("1. Start the Performance Studio backend") + print("2. Run this script to start the agent") + print("3. Use the backend API to send profiling requests") + print("4. 
Watch the agent logs to see command execution") + print("="*60) + print("\n🚀 Starting agent... (Press Ctrl+C to stop)") + + try: + # Start the process + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True, + bufsize=1 + ) + + # Monitor output + for line in iter(process.stdout.readline, ''): + print(f"[AGENT] {line.rstrip()}") + + process.wait() + + except KeyboardInterrupt: + print("\n🛑 Received interrupt signal, stopping agent...") + if process: + process.send_signal(signal.SIGINT) + try: + process.wait(timeout=10) + except subprocess.TimeoutExpired: + print("⚠️ Process didn't stop gracefully, forcing termination...") + process.kill() + process.wait() + + except Exception as e: + print(f"❌ Error running gProfiler: {e}") + return 1 + + print("✅ Agent stopped") + return 0 + +def print_usage(): + """Print usage instructions""" + print("📖 gProfiler Heartbeat Mode Test Runner") + print("="*50) + print("\nThis script runs gProfiler in heartbeat mode for testing.") + print("\nPrerequisites:") + print("1. Performance Studio backend running on http://localhost:8000") + print("2. gProfiler agent code in the expected location") + print("3. Python dependencies installed") + print("\nUsage:") + print(f" {sys.argv[0]}") + print("\nConfiguration:") + print("- Edit the 'config' dictionary in this script to customize settings") + print("- Logs will be written to /tmp/gprofiler-heartbeat.log") + print("- Profiles will be saved to /tmp/gprofiler-test/") + print("\nTesting flow:") + print("1. Start the backend server") + print("2. Run this script to start the agent") + print("3. Use test_heartbeat_system.py to send commands") + print("4. 
Watch the agent respond to commands") + +def main(): + """Main function""" + if len(sys.argv) > 1 and sys.argv[1] in ["-h", "--help"]: + print_usage() + return 0 + + return run_gprofiler_heartbeat_mode() + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/test_heartbeat_system.py b/tests/test_heartbeat_system.py new file mode 100644 index 000000000..41158e824 --- /dev/null +++ b/tests/test_heartbeat_system.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python3 +""" +Test script to verify the heartbeat-based profiling control system. + +This script demonstrates: +1. Agent sending heartbeat to backend +2. Backend responding with start/stop commands +3. Agent acting on commands with idempotency +4. Command completion acknowledgments + +Supports both mock mode (default) and live mode with real backend. +""" + +import json +import requests +import time +from datetime import datetime +from typing import Dict, Any, Optional +import unittest.mock +import sys + +# Configuration +BACKEND_URL = "http://localhost:8000" # Adjust based on your setup +SERVICE_NAME = "test-service" +HOSTNAME = "test-host" +IP_ADDRESS = "127.0.0.1" + +# Check if we should run in mock mode (no real backend) +MOCK_MODE = "--live" not in sys.argv # Default to mock mode unless --live specified + +class HeartbeatClient: + """Client to simulate agent heartbeat behavior""" + + def __init__(self, backend_url: str, service_name: str, hostname: str, ip_address: str): + self.backend_url = backend_url.rstrip('/') + self.service_name = service_name + self.hostname = hostname + self.ip_address = ip_address + self.last_command_id: Optional[str] = None + self.executed_commands = set() + + def send_heartbeat(self) -> Optional[Dict[str, Any]]: + """Send heartbeat to backend and return response""" + heartbeat_data = { + "ip_address": self.ip_address, + "hostname": self.hostname, + "service_name": self.service_name, + "last_command_id": self.last_command_id, + "status": "active", + "timestamp": 
datetime.now().isoformat() + } + + try: + response = requests.post( + f"{self.backend_url}/api/metrics/heartbeat", + json=heartbeat_data, + timeout=10 + ) + + if response.status_code == 200: + result = response.json() + print(f"✓ Heartbeat successful: {result.get('message')}") + + if result.get("profiling_command") and result.get("command_id"): + command_id = result["command_id"] + profiling_command = result["profiling_command"] + command_type = profiling_command.get("command_type", "unknown") + + print(f"📋 Received command: {command_type} (ID: {command_id})") + + # Check idempotency + if command_id in self.executed_commands: + print(f"⚠️ Command {command_id} already executed, skipping...") + return None + + # Mark as executed + self.executed_commands.add(command_id) + self.last_command_id = command_id + + return { + "command_type": command_type, + "command_id": command_id, + "profiling_command": profiling_command + } + else: + print("📭 No pending commands") + return None + else: + print(f"❌ Heartbeat failed: {response.status_code} - {response.text}") + return None + + except Exception as e: + print(f"❌ Heartbeat error: {e}") + return None + + def send_command_completion(self, command_id: str, status: str, execution_time: int = 0, + error_message: str = None, results_path: str = None) -> bool: + """Send command completion status to backend""" + completion_data = { + "command_id": command_id, + "hostname": self.hostname, + "status": status, + "execution_time": execution_time, + "error_message": error_message, + "results_path": results_path + } + + try: + response = requests.post( + f"{self.backend_url}/api/metrics/command_completion", + json=completion_data, + timeout=10 + ) + + if response.status_code == 200: + print(f"✅ Command completion sent successfully for {command_id} with status: {status}") + return True + else: + print(f"❌ Failed to send command completion: {response.status_code} - {response.text}") + return False + + except Exception as e: + print(f"❌ 
Error sending command completion: {e}") + return False + + def simulate_profiling_action(self, command_type: str, command_id: str): + """Simulate profiling action (start/stop)""" + if command_type == "start": + print(f"🚀 Starting profiler for command {command_id}") + # Simulate profiling work + time.sleep(2) + print(f"✅ Profiler completed successfully") + # Send completion acknowledgment + self.send_command_completion(command_id, "completed", execution_time=2) + elif command_type == "stop": + print(f"🛑 Stopping profiler for command {command_id}") + # Simulate stopping + time.sleep(1) + print(f"✅ Profiler stopped successfully") + # Send completion acknowledgment + self.send_command_completion(command_id, "completed", execution_time=1) + else: + print(f"⚠️ Unknown command type: {command_type}") + # Send failure acknowledgment + self.send_command_completion(command_id, "failed", error_message=f"Unknown command type: {command_type}") + +def create_test_profiling_request(backend_url: str, service_name: str, command_type: str = "start") -> bool: + """Create a test profiling request""" + request_data = { + "service_name": service_name, + "command_type": command_type, + "duration": 60, + "frequency": 11, + "profiling_mode": "cpu", + "target_hostnames": [HOSTNAME], + "additional_args": {"test": True} + } + + try: + response = requests.post( + f"{backend_url}/api/metrics/profile_request", + json=request_data, + timeout=10 + ) + + if response.status_code == 200: + result = response.json() + print(f"✅ Profiling request created: {result.get('message')}") + print(f" Request ID: {result.get('request_id')}") + print(f" Command ID: {result.get('command_id')}") + return True + else: + print(f"❌ Failed to create profiling request: {response.status_code} - {response.text}") + return False + + except Exception as e: + print(f"❌ Error creating profiling request: {e}") + return False + +def create_mock_responses(): + """Create mock responses for testing without a real backend""" + 
mock_state = { + "pending_commands": [], + "completed_commands": [], + "heartbeat_count": 0 + } + + def mock_heartbeat_post(url, json=None, timeout=None): + """Mock heartbeat endpoint""" + mock_state["heartbeat_count"] += 1 + + # Mock response object + response = unittest.mock.Mock() + response.status_code = 200 + + # Check if there are pending commands + if mock_state["pending_commands"]: + command = mock_state["pending_commands"].pop(0) + response.json.return_value = { + "message": "Heartbeat received", + "command_id": command["command_id"], + "profiling_command": command["profiling_command"] + } + else: + response.json.return_value = { + "message": "Heartbeat received, no pending commands" + } + + return response + + def mock_profile_request_post(url, json=None, timeout=None): + """Mock profile request endpoint""" + # Generate unique IDs based on total requests made + total_requests = len(mock_state["completed_commands"]) + len(mock_state["pending_commands"]) + 1 + command_id = f"cmd_{total_requests}" + request_id = f"req_{total_requests}" + + # Add command to pending queue + mock_state["pending_commands"].append({ + "command_id": command_id, + "profiling_command": { + "command_type": json.get("command_type", "start"), + "combined_config": { + "duration": json.get("duration", 60), + "frequency": json.get("frequency", 11), + "profiling_mode": json.get("profiling_mode", "cpu") + } + } + }) + + response = unittest.mock.Mock() + response.status_code = 200 + response.json.return_value = { + "message": f"Profiling request created", + "request_id": request_id, + "command_id": command_id + } + + return response + + def mock_command_completion_post(url, json=None, timeout=None): + """Mock command completion endpoint""" + mock_state["completed_commands"].append({ + "command_id": json.get("command_id"), + "status": json.get("status"), + "execution_time": json.get("execution_time") + }) + + response = unittest.mock.Mock() + response.status_code = 200 + 
response.json.return_value = { + "message": "Command completion received" + } + + return response + + def mock_post(url, json=None, timeout=None): + """Route mock requests to appropriate handlers""" + if "/heartbeat" in url: + return mock_heartbeat_post(url, json, timeout) + elif "/profile_request" in url: + return mock_profile_request_post(url, json, timeout) + elif "/command_completion" in url: + return mock_command_completion_post(url, json, timeout) + else: + # Unknown endpoint + response = unittest.mock.Mock() + response.status_code = 404 + response.text = "Not found" + return response + + return mock_post, mock_state + +def run_tests(): + """Run the actual test logic""" + + # Initialize test client + client = HeartbeatClient(BACKEND_URL, SERVICE_NAME, HOSTNAME, IP_ADDRESS) + + # Test 1: Send initial heartbeat (should have no commands) + print("\n1️⃣ Test: Initial heartbeat (no commands expected)") + client.send_heartbeat() + + # Test 2: Create a START profiling request + print("\n2️⃣ Test: Create START profiling request") + if create_test_profiling_request(BACKEND_URL, SERVICE_NAME, "start"): + time.sleep(0.1) # Give backend time to process + + # Send heartbeat to receive the command + print("\n 📡 Sending heartbeat to receive command...") + command = client.send_heartbeat() + + if command: + client.simulate_profiling_action(command["command_type"], command["command_id"]) + + # Test idempotency - send heartbeat again + print("\n 🔄 Testing idempotency - sending heartbeat again...") + command = client.send_heartbeat() + if command is None: + print("✅ Idempotency working - no duplicate command received") + + # Test 3: Create a STOP profiling request + print("\n3️⃣ Test: Create STOP profiling request") + if create_test_profiling_request(BACKEND_URL, SERVICE_NAME, "stop"): + time.sleep(0.1) # Give backend time to process + + # Send heartbeat to receive the stop command + print("\n 📡 Sending heartbeat to receive stop command...") + command = client.send_heartbeat() 
+ + if command: + client.simulate_profiling_action(command["command_type"], command["command_id"]) + + # Test 4: Multiple heartbeats with no commands + print("\n4️⃣ Test: Multiple heartbeats with no pending commands") + for i in range(3): + print(f"\n Heartbeat {i+1}/3:") + client.send_heartbeat() + time.sleep(0.1) + + print("\n✅ Test completed!") + print("\nTest Summary:") + print(f" - Executed commands: {len(client.executed_commands)}") + print(f" - Last command ID: {client.last_command_id}") + print(f" - Commands executed: {list(client.executed_commands)}") + +def main(): + """Main test function""" + print("🧪 Testing Heartbeat-Based Profiling Control System") + + if MOCK_MODE: + print("🎭 Running in MOCK MODE (no real backend required)") + print(" Use --live flag to test against real backend on localhost:8000") + mock_post, mock_state = create_mock_responses() + + # Patch requests.post for mock mode + with unittest.mock.patch('requests.post', side_effect=mock_post): + print("=" * 60) + run_tests() + + # Print mock state summary + print(f"\n📊 Mock Backend State:") + print(f" - Total heartbeats: {mock_state['heartbeat_count']}") + print(f" - Pending commands: {len(mock_state['pending_commands'])}") + print(f" - Completed commands: {len(mock_state['completed_commands'])}") + + if mock_state['completed_commands']: + print(" - Command completions:") + for cmd in mock_state['completed_commands']: + print(f" * {cmd['command_id']}: {cmd['status']} ({cmd['execution_time']}s)") + + else: + print("🌐 Running in LIVE MODE (requires backend on localhost:8000)") + print("=" * 60) + run_tests() + +if __name__ == "__main__": + main() From 5456e44784ae544a1ea24986e859fd8c5fe7be11 Mon Sep 17 00:00:00 2001 From: ashokchatharajupalli Date: Sat, 6 Dec 2025 17:21:34 +0000 Subject: [PATCH 2/8] Remove metrics_publisher dependency for Intel compatibility Removed Pinterest-specific MetricsPublisher imports and calls. requiring additional Pinterest-specific dependencies. 
--- gprofiler/heartbeat.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/gprofiler/heartbeat.py b/gprofiler/heartbeat.py index 3eabcc0c3..31fde5d24 100644 --- a/gprofiler/heartbeat.py +++ b/gprofiler/heartbeat.py @@ -36,12 +36,6 @@ from gprofiler.metadata.enrichment import EnrichmentOptions from gprofiler.metadata.metadata_collector import get_static_metadata from gprofiler.metadata.system_metadata import get_hostname -from gprofiler.metrics_publisher import ( - MetricsPublisher, - METRIC_BASE_NAME, - RESPONSE_TYPE_SUCCESS, - RESPONSE_TYPE_FAILURE -) from gprofiler.profiler_state import ProfilerState from gprofiler.profilers.factory import get_profilers from gprofiler.profilers.profiler_base import NoopProfiler @@ -109,11 +103,6 @@ def send_heartbeat(self) -> Optional[Dict[str, Any]]: if response.status_code == 200: result = response.json() - # Emit success metric (SLI tracking) using singleton - MetricsPublisher.get_instance().send_sli_metric( - response_type=RESPONSE_TYPE_SUCCESS, - method_name='send_heartbeat' - ) if result.get("success") and result.get("profiling_command"): logger.info(f"Received profiling command from server: {result.get('command_id')}") @@ -123,22 +112,10 @@ def send_heartbeat(self) -> Optional[Dict[str, Any]]: return None else: logger.warning(f"Heartbeat failed with status {response.status_code}: {response.text}") - # Emit failure metric (SLI tracking) using singleton - MetricsPublisher.get_instance().send_sli_metric( - response_type=RESPONSE_TYPE_FAILURE, - method_name='send_heartbeat', - extra_tags={'status_code': response.status_code} - ) return None except Exception as e: logger.error(f"Failed to send heartbeat: {e}") - # Emit failure metric (SLI tracking) using singleton - MetricsPublisher.get_instance().send_sli_metric( - response_type=RESPONSE_TYPE_FAILURE, - method_name='send_heartbeat', - extra_tags={'error': str(e)} - ) return None def send_command_completion(self, command_id: str, status: str, 
execution_time: Optional[int] = None, From f60a4d024d32ef22e576c33b5ce3c95b7cad71ca Mon Sep 17 00:00:00 2001 From: ashokchatharajupalli Date: Tue, 9 Dec 2025 00:23:57 +0000 Subject: [PATCH 3/8] WIP: Add cgroup support and upgrade PyInstaller --- exe-requirements.txt | 2 +- gprofiler/main.py | 48 ++- gprofiler/profiler_state.py | 2 + gprofiler/profilers/perf.py | 43 +++ gprofiler/profilers/profiler_base.py | 67 ++++ gprofiler/profilers/python.py | 26 ++ gprofiler/profilers/python_ebpf.py | 52 +++ gprofiler/utils/cgroup_utils.py | 532 +++++++++++++++++++++++++++ gprofiler/utils/perf_process.py | 76 +++- 9 files changed, 844 insertions(+), 4 deletions(-) create mode 100644 gprofiler/utils/cgroup_utils.py diff --git a/exe-requirements.txt b/exe-requirements.txt index a5f892587..527ddc3d1 100644 --- a/exe-requirements.txt +++ b/exe-requirements.txt @@ -1,2 +1,2 @@ -pyinstaller==6.12.0 +pyinstaller==6.17.0 staticx @ git+https://github.com/Granulate/staticx.git@33eefdadc72832d5aa67c0792768c9e76afb746d; platform.machine == "x86_64" diff --git a/gprofiler/main.py b/gprofiler/main.py index ac1b3d6e9..0389b7a5b 100644 --- a/gprofiler/main.py +++ b/gprofiler/main.py @@ -33,7 +33,7 @@ from granulate_utils.linux.ns import is_root, is_running_in_init_pid from granulate_utils.linux.process import is_process_running from granulate_utils.metadata.cloud import get_aws_execution_env -from psutil import NoSuchProcess, Process +from psutil import NoSuchProcess, Process, process_iter from requests import RequestException, Timeout from gprofiler import __version__ @@ -168,6 +168,8 @@ def __init__( profiling_mode=profiling_mode, container_names_client=container_names_client, processes_to_profile=processes_to_profile, + max_processes_per_profiler=user_args.get("max_processes_per_profiler", 0), + max_system_processes_for_system_profilers=user_args.get("max_system_processes_for_system_profilers", 0), ) self.system_profiler, self.process_profilers = get_profilers(user_args, 
profiler_state=self._profiler_state) self._usage_logger = usage_logger @@ -280,8 +282,33 @@ def start(self) -> None: self._system_metrics_monitor.start() self._hw_metrics_monitor.start() + # Check if system should skip continuous profilers due to process count + skip_system_profilers = False + if self._profiler_state.max_system_processes_for_system_profilers > 0: + try: + total_processes = len(list(process_iter())) + if total_processes > self._profiler_state.max_system_processes_for_system_profilers: + skip_system_profilers = True + logger.warning( + f"Skipping system profilers (perf) - {total_processes} processes exceed threshold " + f"of {self._profiler_state.max_system_processes_for_system_profilers}. " + f"Runtime profilers (py-spy, Java, etc.) will continue normally." + ) + else: + logger.debug( + f"System process count: {total_processes} " + f"(threshold: {self._profiler_state.max_system_processes_for_system_profilers})" + ) + except Exception as e: + logger.warning(f"Could not count system processes, continuing with all profilers: {e}") + for prof in list(self.all_profilers): try: + # Skip system profilers if threshold exceeded + if skip_system_profilers and hasattr(prof, '_is_system_wide_profiler') and prof._is_system_wide_profiler(): + logger.info(f"Skipping {prof.__class__.__name__} due to high system process count") + continue + prof.start() except Exception: # the SystemProfiler is handled separately - let the user run with '--perf-mode none' if they @@ -595,6 +622,25 @@ def parse_cmd_args() -> configargparse.Namespace: help="Comma separated list of processes that will be filtered to profile," " given multiple times will append pids to one list", ) + parser.add_argument( + "--max-processes-runtime-profiler", + dest="max_processes_per_profiler", + type=positive_integer, + default=0, + help="Maximum number of processes to profile per runtime profiler (0=unlimited). " + "When exceeded, profiles only the top N processes by CPU usage. 
" + "Does not affect system-wide profilers (perf, eBPF). Default: %(default)s", + ) + parser.add_argument( + "--skip-system-profilers-above", + dest="max_system_processes_for_system_profilers", + type=positive_integer, + default=0, + help="Skip system-wide profilers (perf only) when total system processes exceed this threshold (0=unlimited). " + "When exceeded, prevents perf profiler from starting to reduce resource usage on busy systems. " + "PyPerf has its own threshold via --python-skip-pyperf-profiler-above. " + "Runtime profilers (py-spy, Java, etc.) continue normally with --max-processes limiting. Default: %(default)s", + ) parser.add_argument( "--rootless", action="store_true", diff --git a/gprofiler/profiler_state.py b/gprofiler/profiler_state.py index 58597778d..64fa64314 100644 --- a/gprofiler/profiler_state.py +++ b/gprofiler/profiler_state.py @@ -25,6 +25,8 @@ class ProfilerState: profiling_mode: str container_names_client: Optional[ContainerNamesClient] processes_to_profile: Optional[List[Process]] + max_processes_per_profiler: int + max_system_processes_for_system_profilers: int def __post_init__(self) -> None: self._temporary_dir = TemporaryDirectoryWithMode(dir=self.storage_dir, mode=0o755) diff --git a/gprofiler/profilers/perf.py b/gprofiler/profilers/perf.py index d48676e07..28a60fba1 100644 --- a/gprofiler/profilers/perf.py +++ b/gprofiler/profilers/perf.py @@ -125,6 +125,30 @@ def add_highest_avg_depth_stacks_per_process( action="store_false", dest="perf_memory_restart", ), + ProfilerArgument( + "--perf-use-cgroups", + help="Use cgroup-based profiling instead of PID-based profiling for better reliability. " + "Profiles the top N cgroups by resource usage, avoiding crashes from invalid PIDs.", + action="store_true", + default=False, + dest="perf_use_cgroups", + ), + ProfilerArgument( + "--perf-max-cgroups", + help="Maximum number of cgroups to profile when using --perf-use-cgroups. 
Default: %(default)s", + type=int, + default=50, + dest="perf_max_cgroups", + ), + ProfilerArgument( + "--perf-max-docker-containers", + help="Maximum number of individual Docker containers to profile instead of the broad 'docker' cgroup. " + "When set, profiles the top N highest-resource individual containers rather than all containers together. " + "Set to 0 to use the broad 'docker' cgroup (default behavior). Default: %(default)s", + type=int, + default=0, + dest="perf_max_docker_containers", + ), ], disablement_help="Disable the global perf of processes," " and instead only concatenate runtime-specific profilers results", @@ -138,6 +162,10 @@ class SystemProfiler(ProfilerBase): versions of Go processes. """ + def _is_system_wide_profiler(self) -> bool: + """Perf is a system-wide profiler that can be disabled on busy systems.""" + return True + def __init__( self, frequency: int, @@ -148,6 +176,9 @@ def __init__( perf_inject: bool, perf_node_attach: bool, perf_memory_restart: bool, + perf_use_cgroups: bool = False, + perf_max_cgroups: int = 50, + perf_max_docker_containers: int = 0, min_duration: int = 0, ): super().__init__(frequency, duration, profiler_state, min_duration) @@ -159,6 +190,12 @@ def __init__( self._node_processes: List[Process] = [] self._node_processes_attached: List[Process] = [] self._perf_memory_restart = perf_memory_restart + self._perf_mode = perf_mode + self._perf_dwarf_stack_size = perf_dwarf_stack_size + self._perf_inject = perf_inject + self._perf_use_cgroups = perf_use_cgroups + self._perf_max_cgroups = perf_max_cgroups + self._perf_max_docker_containers = perf_max_docker_containers switch_timeout_s = duration * 3 # allow gprofiler to be delayed up to 3 intervals before timing out. 
extra_args = [] try: @@ -184,6 +221,9 @@ def __init__( extra_args=extra_args, processes_to_profile=self._profiler_state.processes_to_profile, switch_timeout_s=switch_timeout_s, + use_cgroups=self._perf_use_cgroups, + max_cgroups=self._perf_max_cgroups, + max_docker_containers=self._perf_max_docker_containers, ) self._perfs.append(self._perf_fp) else: @@ -200,6 +240,9 @@ def __init__( extra_args=extra_args, processes_to_profile=self._profiler_state.processes_to_profile, switch_timeout_s=switch_timeout_s, + use_cgroups=self._perf_use_cgroups, + max_cgroups=self._perf_max_cgroups, + max_docker_containers=self._perf_max_docker_containers, ) self._perfs.append(self._perf_dwarf) else: diff --git a/gprofiler/profilers/profiler_base.py b/gprofiler/profilers/profiler_base.py index 78d2ba2c4..9e2210b5c 100644 --- a/gprofiler/profilers/profiler_base.py +++ b/gprofiler/profilers/profiler_base.py @@ -180,6 +180,65 @@ def _profile_process(self, process: Process, duration: int, spawned: bool) -> Pr def _notify_selected_processes(self, processes: List[Process]) -> None: pass + def _should_limit_processes(self) -> bool: + """ + Override this in profilers that should NOT respect the max_processes_per_profiler limit. + System-wide profilers (perf, eBPF) should return False. + Runtime profilers (py-spy, Java, Ruby, etc.) should return True (default). + """ + return True + + def _is_system_wide_profiler(self) -> bool: + """ + Override this in system-wide profilers (perf, eBPF) to return True. + These profilers can be disabled when system has too many processes. + """ + return False + + def _get_top_processes_by_cpu(self, processes: List[Process], max_processes: int) -> List[Process]: + """ + Filter processes to the top N by CPU usage to reduce memory consumption. 
+ + Args: + processes: List of processes to filter + max_processes: Maximum number of processes to return + + Returns: + List of top N processes by CPU usage, or all processes if max_processes <= 0 + """ + if max_processes <= 0 or len(processes) <= max_processes: + return processes + + logger.info( + f"{self.__class__.__name__}: Limiting to top {max_processes} processes " + f"(from {len(processes)}) by CPU usage to reduce memory consumption" + ) + + # Get CPU usage for each process, handling exceptions gracefully + processes_with_cpu = [] + for process in processes: + try: + # Use short interval for CPU measurement to avoid blocking + cpu_percent = process.cpu_percent(interval=0.1) + processes_with_cpu.append((process, cpu_percent)) + except (NoSuchProcess, ZombieProcess, PermissionError): + # Process may have died or we don't have permission + # Still include it with 0% CPU so it's considered but deprioritized + processes_with_cpu.append((process, 0.0)) + except Exception as e: + logger.debug(f"Error getting CPU usage for process {process.pid}: {e}") + processes_with_cpu.append((process, 0.0)) + + # Sort by CPU usage (descending) and take top N + processes_with_cpu.sort(key=lambda x: x[1], reverse=True) + top_processes = [proc for proc, cpu in processes_with_cpu[:max_processes]] + + if logger.isEnabledFor(logging.DEBUG): + top_cpu_info = [(proc.pid, cpu) for proc, cpu in processes_with_cpu[:min(5, max_processes)]] + logger.debug(f"{self.__class__.__name__}: Selected top processes by CPU: {top_cpu_info}") + + return top_processes + def _get_process_age(self, process: Process) -> float: """Get the age of a process in seconds.""" try: @@ -206,6 +265,14 @@ def snapshot(self) -> ProcessToProfileData: process for process in processes_to_profile if process in self._profiler_state.processes_to_profile ] logger.debug(f"{self.__class__.__name__}: processes left after filtering: {len(processes_to_profile)}") + + # Apply max_processes_per_profiler limit for runtime profilers 
(not system-wide profilers) + if self._should_limit_processes() and self._profiler_state.max_processes_per_profiler > 0: + processes_to_profile = self._get_top_processes_by_cpu( + processes_to_profile, + self._profiler_state.max_processes_per_profiler + ) + self._notify_selected_processes(processes_to_profile) if not processes_to_profile: diff --git a/gprofiler/profilers/python.py b/gprofiler/profilers/python.py index 61d47e863..d6b14243d 100644 --- a/gprofiler/profilers/python.py +++ b/gprofiler/profilers/python.py @@ -45,6 +45,7 @@ StackToSampleCount, integers_list, nonnegative_integer, + positive_integer, ) from gprofiler.log import get_logger_adapter from gprofiler.metadata import application_identifiers @@ -378,6 +379,15 @@ def _should_skip_process(self, process: Process) -> bool: " they are not recognized by gProfiler as Python processes." " Note - gProfiler assumes that the given processes are kept running as long as gProfiler runs.", ), + ProfilerArgument( + name="--python-skip-pyperf-profiler-above", + dest="python_skip_pyperf_profiler_above", + type=positive_integer, + default=0, + help="Skip PyPerf (eBPF Python profiler) when Python processes exceed this threshold (0=unlimited). " + "When exceeded, prevents PyPerf from starting but allows py-spy fallback for Python profiling. " + "This provides fine-grained control over PyPerf resource usage independent of system profilers. 
Default: %(default)s", + ), ], supported_profiling_modes=["cpu"], ) @@ -398,6 +408,7 @@ def __init__( python_pyperf_verbose: bool, python_pyspy_process: List[int], min_duration: int = 0, + python_skip_pyperf_profiler_above: int = 0, ): if python_mode == "py-spy": python_mode = "pyspy" @@ -422,6 +433,7 @@ def __init__( python_pyperf_user_stacks_pages, python_pyperf_verbose, min_duration, + python_skip_pyperf_profiler_above, ) else: self._ebpf_profiler = None @@ -449,6 +461,7 @@ def _create_ebpf_profiler( user_stacks_pages: Optional[int], verbose: bool, min_duration: int, + python_skip_pyperf_profiler_above: int, ) -> Optional[PythonEbpfProfiler]: try: profiler = PythonEbpfProfiler( @@ -459,6 +472,7 @@ def _create_ebpf_profiler( user_stacks_pages=user_stacks_pages, verbose=verbose, min_duration=min_duration, + python_skip_pyperf_profiler_above=python_skip_pyperf_profiler_above, ) profiler.test() return profiler @@ -468,6 +482,18 @@ def _create_ebpf_profiler( return None def start(self) -> None: + # Check PyPerf-specific skip logic first + if self._ebpf_profiler is not None: + if self._ebpf_profiler.should_skip_due_to_python_threshold(): + # Skip PyPerf but keep py-spy as fallback + logger.info("PyPerf skipped due to Python process threshold, falling back to py-spy") + self._ebpf_profiler = None + + # Ensure py-spy profiler exists as fallback + if self._pyspy_profiler is None: + logger.warning("PyPerf skipped but no py-spy fallback available") + + # Start the appropriate profiler if self._ebpf_profiler is not None: self._ebpf_profiler.start() elif self._pyspy_profiler is not None: diff --git a/gprofiler/profilers/python_ebpf.py b/gprofiler/profilers/python_ebpf.py index cf76ca698..3e417cf87 100644 --- a/gprofiler/profilers/python_ebpf.py +++ b/gprofiler/profilers/python_ebpf.py @@ -78,6 +78,7 @@ def __init__( user_stacks_pages: Optional[int] = None, verbose: bool, min_duration: int = 0, + python_skip_pyperf_profiler_above: int = 0, ): super().__init__(frequency, 
duration, profiler_state, min_duration) self.process: Optional[Popen] = None @@ -89,10 +90,61 @@ def __init__( self._metadata = python.PythonMetadata(self._profiler_state.stop_event) self._verbose = verbose self._pyperf_staticx_tmpdir: Optional[Path] = None + self._python_skip_pyperf_profiler_above = python_skip_pyperf_profiler_above if os.environ.get("TMPDIR", None) is not None: # We want to create a new level of hirerachy in our current staticx tempdir. self._pyperf_staticx_tmpdir = Path(os.environ["TMPDIR"]) / ("pyperf_" + random_prefix()) + def _count_python_processes(self) -> int: + """ + Count Python processes using the same detection logic as py-spy. + This ensures consistent counting between PyPerf skip logic and py-spy process selection. + """ + try: + from gprofiler.utils import pgrep_maps, pgrep_exe + + # Count all processes that match Python detection criteria + python_pattern = "python" + python_processes = set() + + # Check via maps (memory mappings contain libpython) + try: + python_processes.update(pgrep_maps(python_pattern)) + except Exception: + pass + + # Check via executable name + try: + python_processes.update(pgrep_exe(python_pattern)) + except Exception: + pass + + return len(python_processes) + except Exception as e: + logger.debug(f"Error counting Python processes: {e}") + return 0 + + def should_skip_due_to_python_threshold(self) -> bool: + """ + Check if PyPerf should be skipped due to too many Python processes. + This provides PyPerf-specific resource management independent of system profiler logic. + """ + if self._python_skip_pyperf_profiler_above <= 0: + return False # No threshold set, don't skip + + python_process_count = self._count_python_processes() + should_skip = python_process_count > self._python_skip_pyperf_profiler_above + + if should_skip: + logger.info( + f"Skipping PyPerf - {python_process_count} Python processes exceed threshold " + f"of {self._python_skip_pyperf_profiler_above}. 
py-spy fallback will be used for Python profiling." + ) + else: + logger.debug(f"PyPerf: Python process count {python_process_count} (threshold: {self._python_skip_pyperf_profiler_above})") + + return should_skip + @classmethod def _check_output(cls, process: Popen, output_path: Path) -> None: if not glob.glob(f"{str(output_path)}.*"): diff --git a/gprofiler/utils/cgroup_utils.py b/gprofiler/utils/cgroup_utils.py new file mode 100644 index 000000000..852f44ae5 --- /dev/null +++ b/gprofiler/utils/cgroup_utils.py @@ -0,0 +1,532 @@ +# +# Copyright (C) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import logging +from pathlib import Path +from typing import List, Optional, Tuple, Dict +from dataclasses import dataclass +from enum import Enum + +logger = logging.getLogger(__name__) + + +class CgroupVersion(Enum): + """Cgroup version enumeration""" + V1 = "v1" + V2 = "v2" + UNKNOWN = "unknown" + +@dataclass +class CgroupResourceUsage: + """Represents resource usage for a cgroup""" + cgroup_path: str + name: str + cpu_usage: int # CPU usage in nanoseconds + memory_usage: int # Memory usage in bytes + + @property + def total_score(self) -> float: + """Calculate a combined score for ranking cgroups by resource usage + + Prioritizes CPU usage over memory since CPU indicates active processes + that are more interesting for profiling. 
+ """ + # Normalize CPU (ns) and memory (bytes) to comparable scales + cpu_score = self.cpu_usage / 1_000_000_000 # ns to seconds + memory_score = self.memory_usage / (1024 * 1024) # bytes to MB + + # Weight CPU heavily (10x) since active CPU usage is more important for profiling + # than static memory usage + return (cpu_score * 10) + memory_score + + +def detect_cgroup_version() -> CgroupVersion: + """Detect which cgroup version is in use for Docker containers""" + try: + # Check if Docker containers are using cgroup v1 paths (hybrid systems) + if os.path.exists("/sys/fs/cgroup/memory/docker") or os.path.exists("/sys/fs/cgroup/cpu,cpuacct/docker"): + return CgroupVersion.V1 + + # Check if cgroup v2 is mounted and being used + with open("/proc/mounts", "r") as f: + mounts = f.read() + if "cgroup2" in mounts and "/sys/fs/cgroup" in mounts: + # Check if Docker containers exist in v2 paths + v2_docker_paths = [ + "/sys/fs/cgroup/system.slice", + "/sys/fs/cgroup/docker" + ] + for path in v2_docker_paths: + if os.path.exists(path): + try: + entries = os.listdir(path) + if any("docker" in entry.lower() for entry in entries): + return CgroupVersion.V2 + except (OSError, PermissionError): + continue + + # If cgroup2 is mounted but no Docker containers found in v2, check v1 + if "/sys/fs/cgroup/memory" in mounts or "/sys/fs/cgroup/cpu" in mounts: + return CgroupVersion.V1 + else: + return CgroupVersion.V2 + elif "cgroup" in mounts and ("/sys/fs/cgroup/memory" in mounts or "/sys/fs/cgroup/cpu" in mounts): + return CgroupVersion.V1 + except (IOError, OSError) as e: + logger.debug(f"Failed to read /proc/mounts: {e}") + + # Fallback: check filesystem structure + if os.path.exists("/sys/fs/cgroup/memory") or os.path.exists("/sys/fs/cgroup/cpu,cpuacct"): + return CgroupVersion.V1 + elif os.path.exists("/sys/fs/cgroup/cgroup.controllers"): + return CgroupVersion.V2 + + return CgroupVersion.UNKNOWN + + +def is_cgroup_available() -> bool: + """Check if cgroup filesystem is 
available and mounted""" + return os.path.exists("/sys/fs/cgroup") and detect_cgroup_version() != CgroupVersion.UNKNOWN + + +def get_cgroup_cpu_usage(cgroup_path: str) -> Optional[int]: + """Get CPU usage for a cgroup in nanoseconds""" + cgroup_version = detect_cgroup_version() + + if cgroup_version == CgroupVersion.V2: + # cgroup v2 uses cpu.stat file + cpu_stat_file = os.path.join(cgroup_path, "cpu.stat") + if os.path.exists(cpu_stat_file): + try: + with open(cpu_stat_file, 'r') as f: + for line in f: + if line.startswith("usage_usec "): + # Convert microseconds to nanoseconds + return int(line.split()[1]) * 1000 + except (IOError, ValueError) as e: + logger.debug(f"Failed to read CPU usage from {cpu_stat_file}: {e}") + return None + + else: # cgroup v1 + usage_file = os.path.join(cgroup_path, "cpuacct.usage") + if not os.path.exists(usage_file): + # Try alternative path + alt_path = cgroup_path.replace("/cpu,cpuacct/", "/cpuacct/") + usage_file = os.path.join(alt_path, "cpuacct.usage") + if not os.path.exists(usage_file): + return None + + try: + with open(usage_file, 'r') as f: + return int(f.read().strip()) + except (IOError, ValueError) as e: + logger.debug(f"Failed to read CPU usage from {usage_file}: {e}") + return None + + +def get_cgroup_memory_usage(cgroup_path: str) -> Optional[int]: + """Get memory usage for a cgroup in bytes""" + cgroup_version = detect_cgroup_version() + + if cgroup_version == CgroupVersion.V2: + # cgroup v2 uses memory.current file + usage_file = os.path.join(cgroup_path, "memory.current") + else: # cgroup v1 + usage_file = os.path.join(cgroup_path, "memory.usage_in_bytes") + + if not os.path.exists(usage_file): + return None + + try: + with open(usage_file, 'r') as f: + return int(f.read().strip()) + except (IOError, ValueError) as e: + logger.debug(f"Failed to read memory usage from {usage_file}: {e}") + return None + + +def find_all_cgroups() -> List[str]: + """Find all available cgroups in the system""" + cgroups = [] + 
cgroup_version = detect_cgroup_version() + + if cgroup_version == CgroupVersion.V2: + # cgroup v2 unified hierarchy + base = "/sys/fs/cgroup" + try: + for root, dirs, files in os.walk(base): + # Skip the root directory itself + if root == base: + continue + + # Check if this directory has the necessary files for v2 + cpu_file = os.path.join(root, "cpu.stat") + memory_file = os.path.join(root, "memory.current") + + if os.path.exists(cpu_file) or os.path.exists(memory_file): + cgroups.append(root) + except OSError as e: + logger.debug(f"Error walking cgroup v2 directory {base}: {e}") + + else: # cgroup v1 + # Common cgroup mount points to check + cgroup_bases = [ + "/sys/fs/cgroup/cpu,cpuacct", + "/sys/fs/cgroup/memory", + "/sys/fs/cgroup/cpuacct", + ] + + for base in cgroup_bases: + if os.path.exists(base): + try: + # Walk through all subdirectories + for root, dirs, files in os.walk(base): + # Skip the base directory itself + if root == base: + continue + + # Check if this directory has the necessary files + cpu_file = os.path.join(root, "cpuacct.usage") + memory_file = root.replace("/cpu,cpuacct/", "/memory/") + "/memory.usage_in_bytes" + + if os.path.exists(cpu_file) or os.path.exists(memory_file): + cgroups.append(root) + except OSError as e: + logger.debug(f"Error walking cgroup directory {base}: {e}") + continue + + return list(set(cgroups)) # Remove duplicates + + +def get_cgroup_resource_usage(cgroup_path: str) -> Optional[CgroupResourceUsage]: + """Get resource usage for a single cgroup""" + cpu_usage = get_cgroup_cpu_usage(cgroup_path) + + # For memory, try to find the corresponding memory cgroup path + memory_path = cgroup_path.replace("/cpu,cpuacct/", "/memory/") + if not os.path.exists(memory_path): + memory_path = cgroup_path.replace("/cpuacct/", "/memory/") + + memory_usage = get_cgroup_memory_usage(memory_path) + + # If we can't get any usage data, skip this cgroup + if cpu_usage is None and memory_usage is None: + return None + + # Use 0 as default 
if one metric is missing + cpu_usage = cpu_usage or 0 + memory_usage = memory_usage or 0 + + # Extract a readable name from the path + name = os.path.basename(cgroup_path) + if len(name) > 12: # Truncate long container IDs + name = name[:12] + + return CgroupResourceUsage( + cgroup_path=cgroup_path, + name=name, + cpu_usage=cpu_usage, + memory_usage=memory_usage + ) + + +def get_top_cgroups_by_usage(limit: int = 50) -> List[CgroupResourceUsage]: + """Get the top N cgroups by resource usage""" + if not is_cgroup_available(): + logger.warning("Cgroup filesystem not available") + return [] + + all_cgroups = find_all_cgroups() + logger.debug(f"Found {len(all_cgroups)} cgroups to analyze") + + cgroup_usages = [] + for cgroup_path in all_cgroups: + usage = get_cgroup_resource_usage(cgroup_path) + if usage: + cgroup_usages.append(usage) + + # Sort by total resource usage score (descending) + cgroup_usages.sort(key=lambda x: x.total_score, reverse=True) + + logger.debug(f"Analyzed {len(cgroup_usages)} cgroups with resource data") + + return cgroup_usages[:limit] + + +def cgroup_to_perf_name(cgroup_path: str) -> str: + """Convert a cgroup path to the name format expected by perf -G option""" + # perf expects the cgroup name relative to the cgroup mount point + # For example: /sys/fs/cgroup/memory/docker/abc123 -> docker/abc123 + + # Find the relative path from the cgroup mount point + for base in ["/sys/fs/cgroup/memory/", "/sys/fs/cgroup/cpu,cpuacct/", "/sys/fs/cgroup/cpuacct/"]: + if cgroup_path.startswith(base): + return cgroup_path[len(base):] + + # Fallback: just use the basename + return os.path.basename(cgroup_path) + + +def convert_cgroupv2_path_to_perf_name(cgroup_path: str) -> str: + """Convert a cgroup v2 path to perf-compatible name""" + # Remove the base cgroup path + if cgroup_path.startswith("/sys/fs/cgroup/"): + relative_path = cgroup_path[len("/sys/fs/cgroup/"):] + else: + relative_path = cgroup_path + + # Handle Docker container paths in cgroup v2 + if 
"docker-" in relative_path and ".scope" in relative_path: + # Extract container ID from system.slice/docker-.scope + import re + match = re.search(r'docker-([a-f0-9]{64})\.scope', relative_path) + if match: + container_id = match.group(1) + return f"docker/{container_id}" + + # Handle other Docker paths + if relative_path.startswith("docker/"): + return relative_path + + # For other cgroups, use the relative path + return relative_path + + +def validate_cgroup_perf_event_access(cgroup_name: str) -> bool: + """Check if a cgroup is available for perf profiling""" + cgroup_version = detect_cgroup_version() + + if cgroup_version == CgroupVersion.V2: + # In cgroup v2, perf events are handled differently + # The cgroup path should exist in the unified hierarchy + if cgroup_name.startswith("docker/"): + # For Docker containers in cgroup v2, check common paths + container_id = cgroup_name.replace("docker/", "") + possible_paths = [ + f"/sys/fs/cgroup/system.slice/docker-{container_id}.scope", + f"/sys/fs/cgroup/docker/{container_id}", + f"/sys/fs/cgroup/system.slice/docker.service/docker/{container_id}", + ] + for path in possible_paths: + if os.path.exists(path) and os.path.isdir(path): + return True + return False + else: + # For other cgroups in v2, check if the path exists + # Handle both absolute and relative paths + if cgroup_name.startswith("/sys/fs/cgroup/"): + cgroup_path = cgroup_name + else: + cgroup_path = f"/sys/fs/cgroup/{cgroup_name}" + return os.path.exists(cgroup_path) and os.path.isdir(cgroup_path) + + else: # cgroup v1 + perf_event_path = f"/sys/fs/cgroup/perf_event/{cgroup_name}" + return os.path.exists(perf_event_path) and os.path.isdir(perf_event_path) + + +def get_top_docker_containers_for_perf(limit: int) -> List[str]: + """Get top Docker containers by resource usage for perf profiling + + Returns individual Docker container cgroup names that exist in perf_event controller. 
+ """ + import subprocess + + docker_containers = [] + cgroup_version = detect_cgroup_version() + + try: + # Get running Docker containers with resource stats + result = subprocess.run( + ["docker", "stats", "--no-stream", "--format", "{{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}"], + capture_output=True, + text=True, + timeout=10 + ) + + if result.returncode == 0: + container_stats = [] + for line in result.stdout.strip().split('\n'): + if line.strip(): + parts = line.split('\t') + if len(parts) >= 2: + container_id = parts[0] + cpu_percent_str = parts[1].replace('%', '') + try: + cpu_percent = float(cpu_percent_str) + container_stats.append((container_id, cpu_percent)) + except ValueError: + continue + + # Sort by CPU usage (descending) + container_stats.sort(key=lambda x: x[1], reverse=True) + + # Get full container IDs and check perf_event access + for container_id, cpu_percent in container_stats[:limit * 2]: # Get more than needed in case some don't have perf access + try: + # Get full container ID + full_id_result = subprocess.run( + ["docker", "inspect", "--format", "{{.Id}}", container_id], + capture_output=True, + text=True, + timeout=5 + ) + + if full_id_result.returncode == 0: + full_id = full_id_result.stdout.strip() + + if cgroup_version == CgroupVersion.V2: + # For cgroup v2, we need to find the actual cgroup path + # and use the relative path for perf + possible_paths = [ + f"/sys/fs/cgroup/system.slice/docker-{full_id}.scope", + f"/sys/fs/cgroup/docker/{full_id}", + f"/sys/fs/cgroup/system.slice/docker.service/docker/{full_id}", + ] + + docker_cgroup = None + for path in possible_paths: + if os.path.exists(path) and os.path.isdir(path): + # For cgroup v2, perf expects the relative path from /sys/fs/cgroup/ + docker_cgroup = path.replace("/sys/fs/cgroup/", "") + logger.debug(f"Found cgroup v2 path for container {container_id}: {path} -> {docker_cgroup}") + break + + if not docker_cgroup: + # Fallback: try to find any docker-related path for this 
container + try: + import glob + pattern = f"/sys/fs/cgroup/**/docker*{full_id[:12]}*" + matches = glob.glob(pattern, recursive=True) + if matches: + docker_cgroup = matches[0].replace("/sys/fs/cgroup/", "") + logger.debug(f"Found fallback cgroup v2 path: {matches[0]} -> {docker_cgroup}") + else: + docker_cgroup = f"docker/{full_id}" # Last resort fallback + logger.debug(f"No cgroup v2 path found, using fallback: {docker_cgroup}") + except Exception as e: + docker_cgroup = f"docker/{full_id}" + logger.debug(f"Error finding cgroup v2 path: {e}, using fallback: {docker_cgroup}") + else: + # cgroup v1 format + docker_cgroup = f"docker/{full_id}" + + # Check if this container has perf_event access + if validate_cgroup_perf_event_access(docker_cgroup): + docker_containers.append(docker_cgroup) + logger.debug(f"Added Docker container for profiling: {container_id} (CPU: {cpu_percent}%) -> {docker_cgroup}") + + if len(docker_containers) >= limit: + break + else: + logger.debug(f"Docker container {container_id} not available for perf profiling") + + except (subprocess.TimeoutExpired, subprocess.CalledProcessError) as e: + logger.debug(f"Failed to get full ID for container {container_id}: {e}") + continue + + except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError) as e: + logger.debug(f"Failed to get Docker container stats: {e}") + + return docker_containers + + +def get_top_cgroup_names_for_perf(limit: int = 50, max_docker_containers: int = 0) -> List[str]: + """Get top cgroup names in the format needed for perf -G option + + Args: + limit: Maximum total number of cgroups to return + max_docker_containers: If > 0, profile individual Docker containers instead of broad 'docker' cgroup + + Only returns cgroups that exist in both resource controllers (memory/cpu) + and the perf_event controller, since perf needs access to both. 
+ """ + if max_docker_containers > 0: + # Use individual Docker container profiling + docker_containers = get_top_docker_containers_for_perf(max_docker_containers) + + # Get other non-Docker cgroups + top_cgroups = get_top_cgroups_by_usage(limit) + other_cgroups = [] + seen_names = set(docker_containers) # Track unique cgroup names to avoid duplicates + + for cgroup in top_cgroups: + cgroup_name = cgroup_to_perf_name(cgroup.cgroup_path) + + # Skip Docker cgroups (we're handling them individually) + if cgroup_name.startswith("docker"): + continue + + # Skip duplicates + if cgroup_name in seen_names: + logger.debug(f"Skipping duplicate cgroup name {cgroup_name}") + continue + + if validate_cgroup_perf_event_access(cgroup_name): + other_cgroups.append(cgroup_name) + seen_names.add(cgroup_name) + + # Respect total limit + if len(docker_containers) + len(other_cgroups) >= limit: + break + else: + logger.debug(f"Skipping cgroup {cgroup_name} - not available in perf_event controller") + + valid_cgroups = docker_containers + other_cgroups + + if docker_containers: + logger.info(f"Using individual Docker container profiling: {len(docker_containers)} containers, {len(other_cgroups)} other cgroups") + + else: + # Use traditional cgroup profiling (including broad 'docker' cgroup) + top_cgroups = get_top_cgroups_by_usage(limit) + valid_cgroups = [] + seen_names = set() # Track unique cgroup names to avoid duplicates + + for cgroup in top_cgroups: + cgroup_name = cgroup_to_perf_name(cgroup.cgroup_path) + + # Skip duplicates (same cgroup from different controllers) + if cgroup_name in seen_names: + logger.debug(f"Skipping duplicate cgroup name {cgroup_name}") + continue + + if validate_cgroup_perf_event_access(cgroup_name): + valid_cgroups.append(cgroup_name) + seen_names.add(cgroup_name) + else: + logger.debug(f"Skipping cgroup {cgroup_name} - not available in perf_event controller") + + if len(valid_cgroups) < limit: + logger.info(f"Filtered cgroups for perf: 
{len(valid_cgroups)}/{limit} cgroups have perf_event access") + + return valid_cgroups + + +def validate_perf_cgroup_support() -> bool: + """Check if the current perf binary supports cgroup filtering""" + try: + import subprocess + result = subprocess.run( + ["perf", "record", "--help"], + capture_output=True, + text=True, + timeout=10 + ) + return "--cgroup" in result.stdout or "-G" in result.stdout + except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError): + return False diff --git a/gprofiler/utils/perf_process.py b/gprofiler/utils/perf_process.py index ef8d623b8..45e1060be 100644 --- a/gprofiler/utils/perf_process.py +++ b/gprofiler/utils/perf_process.py @@ -48,6 +48,9 @@ def __init__( extra_args: List[str], processes_to_profile: Optional[List[Process]], switch_timeout_s: int, + use_cgroups: bool = False, + max_cgroups: int = 50, + max_docker_containers: int = 0, ): self._start_time = 0.0 self._frequency = frequency @@ -55,12 +58,58 @@ def __init__( self._output_path = output_path self._type = "dwarf" if is_dwarf else "fp" self._inject_jit = inject_jit + self._use_cgroups = use_cgroups + self._max_cgroups = max_cgroups self._pid_args = [] - if processes_to_profile is not None: + self._cgroup_args = [] + + # Determine profiling strategy + if use_cgroups: + from gprofiler.utils.cgroup_utils import ( + is_cgroup_available, + validate_perf_cgroup_support, + get_top_cgroup_names_for_perf, + ) + # Use cgroup-based profiling for better reliability + if is_cgroup_available() and validate_perf_cgroup_support(): + try: + top_cgroups = get_top_cgroup_names_for_perf(max_cgroups, max_docker_containers) + if top_cgroups: + # Cgroup monitoring requires system-wide mode (-a) + self._pid_args.append("-a") + self._cgroup_args.extend(["-G", ",".join(top_cgroups)]) + logger.info(f"Using cgroup-based profiling with {len(top_cgroups)} top cgroups: {top_cgroups[:3]}{'...' 
if len(top_cgroups) > 3 else ''}") + else: + # Never fall back to system-wide profiling when cgroups are explicitly requested + from gprofiler.exceptions import PerfNoSupportedEvent + if max_docker_containers > 0: + logger.error(f"No Docker containers found for profiling despite --perf-max-docker-containers={max_docker_containers}. " + "This could indicate cgroup v2 compatibility issues or no running containers. " + "Perf profiler will be disabled to prevent system-wide profiling.") + raise PerfNoSupportedEvent("Docker container profiling requested but no containers available") + elif max_cgroups > 0: + logger.error(f"No cgroups found for profiling despite --perf-max-cgroups={max_cgroups}. " + "This could indicate cgroup compatibility issues or no active cgroups. " + "Perf profiler will be disabled to prevent system-wide profiling.") + raise PerfNoSupportedEvent("Cgroup profiling requested but no cgroups available") + else: + logger.error("Cgroup profiling was requested (--perf-use-cgroups) but no specific limits were set. " + "Perf profiler will be disabled to prevent system-wide profiling.") + raise PerfNoSupportedEvent("Cgroup profiling requested but no containers or cgroups specified") + except Exception as e: + # Never fall back to system-wide profiling when cgroups are explicitly requested + from gprofiler.exceptions import PerfNoSupportedEvent + logger.error(f"Failed to get cgroups for profiling: {e}. 
" + "Perf profiler will be disabled to prevent system-wide profiling.") + raise PerfNoSupportedEvent(f"Cgroup profiling failed: {e}") + elif processes_to_profile is not None: + # Traditional PID-based profiling self._pid_args.append("--pid") self._pid_args.append(",".join([str(process.pid) for process in processes_to_profile])) else: + # System-wide profiling self._pid_args.append("-a") + self._extra_args = extra_args self._switch_timeout_s = switch_timeout_s self._process: Optional[Popen] = None @@ -70,6 +119,28 @@ def _log_name(self) -> str: return f"perf ({self._type} mode)" def _get_perf_cmd(self) -> List[str]: + # When using cgroups, perf requires events to be specified before cgroups. + # If no explicit events are provided but cgroups are used, add default event. + # For multiple cgroups, perf requires one event per cgroup. + extra_args = self._extra_args + if self._cgroup_args and not extra_args: + # Count the number of cgroups (they are comma-separated in -G argument) + cgroup_arg = None + for i, arg in enumerate(self._cgroup_args): + if arg == "-G" and i + 1 < len(self._cgroup_args): + cgroup_arg = self._cgroup_args[i + 1] + break + + if cgroup_arg: + num_cgroups = len(cgroup_arg.split(",")) + # Add one event per cgroup (perf requirement) + extra_args = [] + for _ in range(num_cgroups): + extra_args.extend(["-e", "cycles"]) + else: + # Fallback: single event + extra_args = ["-e", "cycles"] + return ( [ perf_path(), @@ -88,9 +159,10 @@ def _get_perf_cmd(self) -> List[str]: "-m", str(self._MMAP_SIZES[self._type]), ] + + extra_args # Events must come before cgroups + self._pid_args + + self._cgroup_args + (["-k", "1"] if self._inject_jit else []) - + self._extra_args ) def start(self) -> None: From 18a8ccbd77999554bcc20f047ce36c61c93eb0cc Mon Sep 17 00:00:00 2001 From: ashokchatharajupalli Date: Tue, 9 Dec 2025 00:59:44 +0000 Subject: [PATCH 4/8] Enable cgroup-based profiling for perf restricted/aggressive modes --- gprofiler/heartbeat.py | 2 ++ 1 file 
changed, 2 insertions(+) diff --git a/gprofiler/heartbeat.py b/gprofiler/heartbeat.py index 31fde5d24..538dfb6e2 100644 --- a/gprofiler/heartbeat.py +++ b/gprofiler/heartbeat.py @@ -405,10 +405,12 @@ def _create_profiler_args(self, profiling_command: Dict[str, Any]) -> configargp perf_config = profiler_configs.get("perf", "enabled_restricted") if perf_config == "enabled_restricted": new_args.max_system_processes_for_system_profilers = 600 + new_args.perf_use_cgroups = True new_args.perf_max_docker_containers = 2 logger.info("Perf profiler: enabled restricted mode") elif perf_config == "enabled_aggressive": new_args.max_system_processes_for_system_profilers = 1500 + new_args.perf_use_cgroups = True new_args.perf_max_docker_containers = 50 logger.info("Perf profiler: enabled aggressive mode") elif perf_config == "disabled": From 46b728b386c8caec320ade62f89b86549c907041 Mon Sep 17 00:00:00 2001 From: ashokchatharajupalli Date: Tue, 9 Dec 2025 16:53:30 +0000 Subject: [PATCH 5/8] Address PR review comments: improve error handling and logging - Remove redundant upload_results validation check in main.py - Change heartbeat failure logging from warning to error for better monitoring - Separate success and error cases in send_heartbeat() response handling - Reduce duplicate logging: change redundant 'Received profiling command' to debug level - Validate command type before marking as executed to ensure proper idempotency - Add unified error handling for start/stop command execution with try-except - Ensure backend always receives command completion status (success or failure) - Keep command details logging for operational visibility Addresses all review comments from @mlim19 on PR #1009. 
--- gprofiler/heartbeat.py | 81 ++++++++++++++++++++++++++---------------- gprofiler/main.py | 4 --- 2 files changed, 50 insertions(+), 35 deletions(-) diff --git a/gprofiler/heartbeat.py b/gprofiler/heartbeat.py index 538dfb6e2..11e154b01 100644 --- a/gprofiler/heartbeat.py +++ b/gprofiler/heartbeat.py @@ -104,14 +104,20 @@ def send_heartbeat(self) -> Optional[Dict[str, Any]]: if response.status_code == 200: result = response.json() - if result.get("success") and result.get("profiling_command"): + # Check if the response indicates success + if not result.get("success"): + logger.error(f"Heartbeat returned unsuccessful status: {result}") + return None + + # Check if there's a profiling command + if result.get("profiling_command"): logger.info(f"Received profiling command from server: {result.get('command_id')}") return result else: logger.debug("Heartbeat successful, no pending commands") return None else: - logger.warning(f"Heartbeat failed with status {response.status_code}: {response.text}") + logger.error(f"Heartbeat failed with status {response.status_code}: {response.text}") return None except Exception as e: @@ -216,7 +222,7 @@ def start_heartbeat_loop(self): command_id = command_response["command_id"] command_type = profiling_command.get("command_type", "start") - logger.info(f"Received profiling command: {profiling_command}") + logger.debug(f"Processing profiling command: {profiling_command}") # Check for idempotency - skip if command already executed if command_id in self.heartbeat_client.executed_command_ids: @@ -229,33 +235,33 @@ def start_heartbeat_loop(self): logger.info(f"Received {command_type} command: {command_id}") - # Mark command as executed for idempotency - self.heartbeat_client.mark_command_executed(command_id) - self.heartbeat_client.last_command_id = command_id - - if command_type == "stop": - # Stop current profiler without starting a new one - logger.info(f"RECEIVED STOP COMMAND for command ID: {command_id}") - logger.info(f"STOP command 
details: {profiling_command}") - self._stop_current_profiler() - # TODO: important comment to make sure profiler has stopped successful to avoid leak - # Report completion for stop command + # Validate command type first + if command_type not in ["start", "stop"]: + logger.warning(f"Unknown command type: {command_type}") + # Mark invalid command as executed to prevent retry spam + self.heartbeat_client.mark_command_executed(command_id) + # Report completion for unknown command type self.heartbeat_client.send_command_completion( command_id=command_id, - status="completed", + status="failed", execution_time=0, - error_message=None, + error_message=f"Unknown command type: {command_type}", results_path=None ) - elif command_type == "start": - # Stop current profiler if running, then start new one - logger.info("Starting new profiler due to start command") - # TODO: important comment to make sure profiler has stopped successful to avoid leak - self._stop_current_profiler() - self._start_new_profiler(profiling_command, command_id) - # Note: command completion still needs since it will wait for successful profiling - # Report command completion to the server - try: + continue + + # Mark valid command as executed for idempotency + self.heartbeat_client.mark_command_executed(command_id) + self.heartbeat_client.last_command_id = command_id + + try: + if command_type == "stop": + # Stop current profiler without starting a new one + logger.info(f"Executing STOP command for command ID: {command_id}") + logger.info(f"STOP command details: {profiling_command}") + self._stop_current_profiler() + + # Report completion for stop command self.heartbeat_client.send_command_completion( command_id=command_id, status="completed", @@ -263,16 +269,29 @@ def start_heartbeat_loop(self): error_message=None, results_path=None ) - except Exception as e: - logger.error(f"Failed to report command completion for {command_id}: {e}") - else: - logger.warning(f"Unknown command type: {command_type}") - 
# Report completion for unknown command type + elif command_type == "start": + # Stop current profiler if running, then start new one + logger.info(f"Executing START command for command ID: {command_id}") + logger.info(f"START command details: {profiling_command}") + self._stop_current_profiler() + self._start_new_profiler(profiling_command, command_id) + + # Report command completion to the server + self.heartbeat_client.send_command_completion( + command_id=command_id, + status="completed", + execution_time=0, + error_message=None, + results_path=None + ) + except Exception as e: + logger.error(f"Failed to execute {command_type} command {command_id}: {e}", exc_info=True) + # Report failure to the server self.heartbeat_client.send_command_completion( command_id=command_id, status="failed", execution_time=0, - error_message=f"Unknown command type: {command_type}", + error_message=str(e), results_path=None ) diff --git a/gprofiler/main.py b/gprofiler/main.py index 0389b7a5b..f8c70f8f1 100644 --- a/gprofiler/main.py +++ b/gprofiler/main.py @@ -1289,10 +1289,6 @@ def main() -> None: # Check if heartbeat server mode is enabled FIRST if args.enable_heartbeat_server: - if not args.upload_results: - logger.error("Heartbeat server mode requires --upload-results to be enabled") - sys.exit(1) - # Create heartbeat client heartbeat_client = HeartbeatClient( api_server=args.api_server, From 0c0a511db5d176cbb1624123a67773e5ccde080a Mon Sep 17 00:00:00 2001 From: ashokchatharajupalli Date: Wed, 10 Dec 2025 22:55:43 +0000 Subject: [PATCH 6/8] Fix linting issues for CI compliance - Fix import sorting and code formatting - Remove unused imports and variables - Add missing logging import to profiler_base.py --- gprofiler/heartbeat.py | 273 +++++++++++++-------------- gprofiler/main.py | 12 +- gprofiler/profilers/profiler_base.py | 24 +-- gprofiler/profilers/python.py | 7 +- gprofiler/profilers/python_ebpf.py | 21 ++- gprofiler/utils/cgroup_utils.py | 212 ++++++++++----------- 
gprofiler/utils/perf_process.py | 55 ++++-- tests/run_heartbeat_agent.py | 76 ++++---- tests/test_heartbeat_system.py | 213 ++++++++++----------- 9 files changed, 451 insertions(+), 442 deletions(-) diff --git a/gprofiler/heartbeat.py b/gprofiler/heartbeat.py index 11e154b01..92ee05bf8 100644 --- a/gprofiler/heartbeat.py +++ b/gprofiler/heartbeat.py @@ -20,40 +20,30 @@ import socket import threading from pathlib import Path -from typing import Dict, Any, Optional, List, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Dict, Optional import configargparse import requests -from psutil import Process # Use TYPE_CHECKING to avoid circular imports if TYPE_CHECKING: from gprofiler.main import GProfiler from gprofiler.client import ProfilerAPIClient -from gprofiler.containers_client import ContainerNamesClient -from gprofiler.metadata.application_identifiers import ApplicationIdentifiers from gprofiler.metadata.enrichment import EnrichmentOptions -from gprofiler.metadata.metadata_collector import get_static_metadata from gprofiler.metadata.system_metadata import get_hostname -from gprofiler.profiler_state import ProfilerState -from gprofiler.profilers.factory import get_profilers -from gprofiler.profilers.profiler_base import NoopProfiler -from gprofiler.state import State, init_state, get_state -from gprofiler.system_metrics import NoopSystemMetricsMonitor, SystemMetricsMonitor, SystemMetricsMonitorBase +from gprofiler.state import get_state from gprofiler.usage_loggers import NoopUsageLogger -from gprofiler.utils import TEMPORARY_STORAGE_PATH, resource_path -from gprofiler.hw_metrics import HWMetricsMonitor, HWMetricsMonitorBase, NoopHWMetricsMonitor -from gprofiler.exceptions import NoProfilersEnabledError +from gprofiler.utils import resource_path logger = logging.getLogger(__name__) class HeartbeatClient: """Client for sending heartbeats to the server and receiving profiling commands""" - + def __init__(self, api_server: str, service_name: str, server_token: 
str, verify: bool = True): - self.api_server = api_server.rstrip('/') + self.api_server = api_server.rstrip("/") self.service_name = service_name self.server_token = server_token self.verify = verify @@ -63,14 +53,13 @@ def __init__(self, api_server: str, service_name: str, server_token: str, verify self.executed_command_ids: set = set() # Track executed command IDs for idempotency (in-memory) self.max_command_history = 1000 # Limit command history to prevent memory growth self.session = requests.Session() - + # Set up authentication headers if self.server_token: - self.session.headers.update({ - 'Authorization': f'Bearer {self.server_token}', - 'Content-Type': 'application/json' - }) - + self.session.headers.update( + {"Authorization": f"Bearer {self.server_token}", "Content-Type": "application/json"} + ) + def _get_local_ip(self) -> str: """Get the local IP address""" try: @@ -80,7 +69,7 @@ def _get_local_ip(self) -> str: return s.getsockname()[0] except Exception: return "127.0.0.1" - + def send_heartbeat(self) -> Optional[Dict[str, Any]]: """Send heartbeat to server and return any profiling commands""" try: @@ -90,25 +79,20 @@ def send_heartbeat(self) -> Optional[Dict[str, Any]]: "service_name": self.service_name, "last_command_id": self.last_command_id, "status": "active", - "timestamp": datetime.datetime.now().isoformat() + "timestamp": datetime.datetime.now().isoformat(), } - + url = f"{self.api_server}/api/metrics/heartbeat" - response = self.session.post( - url, - json=heartbeat_data, - verify=self.verify, - timeout=30 - ) - + response = self.session.post(url, json=heartbeat_data, verify=self.verify, timeout=30) + if response.status_code == 200: result = response.json() - + # Check if the response indicates success if not result.get("success"): logger.error(f"Heartbeat returned unsuccessful status: {result}") return None - + # Check if there's a profiling command if result.get("profiling_command"): logger.info(f"Received profiling command from server: 
{result.get('command_id')}") @@ -119,23 +103,29 @@ def send_heartbeat(self) -> Optional[Dict[str, Any]]: else: logger.error(f"Heartbeat failed with status {response.status_code}: {response.text}") return None - + except Exception as e: logger.error(f"Failed to send heartbeat: {e}") return None - - def send_command_completion(self, command_id: str, status: str, execution_time: Optional[int] = None, - error_message: Optional[str] = None, results_path: Optional[str] = None) -> bool: + + def send_command_completion( + self, + command_id: str, + status: str, + execution_time: Optional[int] = None, + error_message: Optional[str] = None, + results_path: Optional[str] = None, + ) -> bool: """ Send command completion status to the server. - + Args: command_id: The ID of the completed command status: 'completed' or 'failed' execution_time: Duration of execution in seconds error_message: Error message if status is 'failed' results_path: Path to profiling results if available - + Returns: bool: True if completion was successfully reported, False otherwise """ @@ -146,40 +136,36 @@ def send_command_completion(self, command_id: str, status: str, execution_time: "status": status, "execution_time": execution_time, "error_message": error_message, - "results_path": results_path + "results_path": results_path, } - + url = f"{self.api_server}/api/metrics/command_completion" - response = self.session.post( - url, - json=completion_data, - verify=self.verify, - timeout=30 - ) - + response = self.session.post(url, json=completion_data, verify=self.verify, timeout=30) + if response.status_code == 200: logger.info(f"Successfully reported command completion for {command_id} with status: {status}") return True else: - logger.error(f"Failed to report command completion for {command_id}. Status: {response.status_code}, Response: {response.text}") + logger.error( + f"Failed to report command completion for {command_id}. 
" + f"Status: {response.status_code}, Response: {response.text}" + ) return False - + except Exception as e: logger.error(f"Failed to send command completion for {command_id}: {e}") return False - - def mark_command_executed(self, command_id: str): """Mark a command as executed (in-memory)""" self.executed_command_ids.add(command_id) - + # Cleanup old command IDs if we exceed the limit if len(self.executed_command_ids) > self.max_command_history: self._cleanup_old_command_ids() - + logger.debug(f"Marked command ID {command_id} as executed") - + def _cleanup_old_command_ids(self): """Remove old command IDs to prevent memory growth""" try: @@ -190,40 +176,42 @@ def _cleanup_old_command_ids(self): command_list = list(self.executed_command_ids) # Since UUIDs don't sort chronologically, we'll just remove some arbitrary ones # In a real implementation, you'd want to track timestamps - commands_to_keep = command_list[-self.max_command_history:] + commands_to_keep = command_list[-self.max_command_history :] self.executed_command_ids = set(commands_to_keep) - logger.info(f"Cleaned up command ID history in memory, keeping {len(self.executed_command_ids)} entries") + logger.info( + f"Cleaned up command ID history in memory, keeping {len(self.executed_command_ids)} entries" + ) except Exception as e: logger.warning(f"Failed to cleanup old command IDs: {e}") class DynamicGProfilerManager: """Manager for dynamically starting/stopping gProfiler instances based on server commands""" - + def __init__(self, base_args: configargparse.Namespace, heartbeat_client: HeartbeatClient): self.base_args = base_args self.heartbeat_client = heartbeat_client - self.current_gprofiler: Optional['GProfiler'] = None + self.current_gprofiler: Optional["GProfiler"] = None self.current_thread: Optional[threading.Thread] = None self.stop_event = threading.Event() self.heartbeat_interval = 30 # seconds - + def start_heartbeat_loop(self): """Start the main heartbeat loop""" logger.info("Starting heartbeat 
loop...") - + while not self.stop_event.is_set(): try: # Send heartbeat and check for commands command_response = self.heartbeat_client.send_heartbeat() - + if command_response and command_response.get("profiling_command"): profiling_command = command_response["profiling_command"] command_id = command_response["command_id"] command_type = profiling_command.get("command_type", "start") - + logger.debug(f"Processing profiling command: {profiling_command}") - + # Check for idempotency - skip if command already executed if command_id in self.heartbeat_client.executed_command_ids: logger.info(f"Command ID {command_id} already executed, skipping...") @@ -232,9 +220,9 @@ def start_heartbeat_loop(self): self.stop_event.wait(self.heartbeat_interval) continue - + logger.info(f"Received {command_type} command: {command_id}") - + # Validate command type first if command_type not in ["start", "stop"]: logger.warning(f"Unknown command type: {command_type}") @@ -246,28 +234,28 @@ def start_heartbeat_loop(self): status="failed", execution_time=0, error_message=f"Unknown command type: {command_type}", - results_path=None + results_path=None, ) continue - + # Mark valid command as executed for idempotency self.heartbeat_client.mark_command_executed(command_id) self.heartbeat_client.last_command_id = command_id - + try: if command_type == "stop": # Stop current profiler without starting a new one logger.info(f"Executing STOP command for command ID: {command_id}") logger.info(f"STOP command details: {profiling_command}") self._stop_current_profiler() - + # Report completion for stop command self.heartbeat_client.send_command_completion( command_id=command_id, status="completed", execution_time=0, error_message=None, - results_path=None + results_path=None, ) elif command_type == "start": # Stop current profiler if running, then start new one @@ -275,14 +263,14 @@ def start_heartbeat_loop(self): logger.info(f"START command details: {profiling_command}") self._stop_current_profiler() 
self._start_new_profiler(profiling_command, command_id) - + # Report command completion to the server self.heartbeat_client.send_command_completion( command_id=command_id, status="completed", execution_time=0, error_message=None, - results_path=None + results_path=None, ) except Exception as e: logger.error(f"Failed to execute {command_type} command {command_id}: {e}", exc_info=True) @@ -292,17 +280,16 @@ def start_heartbeat_loop(self): status="failed", execution_time=0, error_message=str(e), - results_path=None + results_path=None, ) - - + # Wait for next heartbeat self.stop_event.wait(self.heartbeat_interval) - + except Exception as e: logger.error(f"Error in heartbeat loop: {e}", exc_info=True) self.stop_event.wait(self.heartbeat_interval) - + def _stop_current_profiler(self): """Stop the currently running profiler""" if self.current_gprofiler: @@ -311,9 +298,9 @@ def _stop_current_profiler(self): self.current_gprofiler.stop() # This sets the stop_event! logger.info("Successfully called gprofiler.stop()") except Exception as e: - # TODO: This is a huge leak, report it + # TODO: This is a huge leak, report it logger.error(f"Error stopping gProfiler: {e}") - + # ALWAYS cleanup subprocesses regardless of stop() success/failure try: logger.info("Starting comprehensive cleanup after heartbeat stop...") @@ -322,28 +309,28 @@ def _stop_current_profiler(self): except Exception as cleanup_error: # Cleanup errors are non-fatal - log and continue logger.info(f"Cleanup completed with minor errors (expected during stop): {cleanup_error}") - + # Always clear the reference self.current_gprofiler = None - + if self.current_thread and self.current_thread.is_alive(): # No need to actively kill the thread, the self.current_gprofiler.stop() already handles it using events logger.info("Waiting for profiler thread to finish...") self.current_thread.join(timeout=10) self.current_thread = None - + def _start_new_profiler(self, profiling_command: Dict[str, Any], command_id: str): 
"""Start a new profiler with the given configuration""" try: # Import here to avoid circular imports from gprofiler.main import DEFAULT_PROFILING_DURATION - + # Create modified args for the new profiler new_args = self._create_profiler_args(profiling_command) - + # Create new GProfiler instance self.current_gprofiler = self._create_gprofiler_instance(new_args) - + # Start profiler in a separate thread self.current_thread = threading.Thread( target=self._run_profiler, @@ -353,28 +340,24 @@ def _start_new_profiler(self, profiling_command: Dict[str, Any], command_id: str getattr(new_args, "duration", DEFAULT_PROFILING_DURATION), command_id, ), - daemon=True + daemon=True, ) self.current_thread.start() - + logger.info(f"Started new gProfiler instance with command ID: {command_id}") - + except Exception as e: logger.error(f"Failed to start new profiler: {e}", exc_info=True) # Report failure to the server self.heartbeat_client.send_command_completion( - command_id=command_id, - status="failed", - execution_time=0, - error_message=str(e), - results_path=None + command_id=command_id, status="failed", execution_time=0, error_message=str(e), results_path=None ) - + def _create_profiler_args(self, profiling_command: Dict[str, Any]) -> configargparse.Namespace: """Create modified args based on profiling command""" # Copy base args new_args = configargparse.Namespace(**vars(self.base_args)) - + # Update with profiling command parameters from combined_config combined_config = profiling_command.get("combined_config", {}) if "duration" in combined_config: @@ -390,15 +373,15 @@ def _create_profiler_args(self, profiling_command: Dict[str, Any]) -> configargp return None if "pids" in combined_config and combined_config["pids"]: new_args.pids_to_profile = combined_config["pids"] - + # Set continuous mode new_args.continuous = combined_config.get("continuous", False) - + # Handle PerfSpect configuration enable_perfspect = combined_config.get("enable_perfspect", False) if 
enable_perfspect: new_args.collect_hw_metrics = True - + # Assume PerfSpect is pre-installed as a resource perfspect_path = resource_path("perfspect/perfspect") @@ -409,17 +392,17 @@ def _create_profiler_args(self, profiling_command: Dict[str, Any]) -> configargp else: logger.error(f"PerfSpect not found at {perfspect_path}, hardware metrics disabled") new_args.collect_hw_metrics = False - + # Handle max_processes configuration max_processes = combined_config.get("max_processes", 10) new_args.max_processes_per_profiler = max_processes logger.info(f"Setting max processes per profiler: {max_processes}") - + # Handle Profiler Configurations profiler_configs = combined_config.get("profiler_configs", {}) if profiler_configs: logger.info(f"Applying profiler configurations: {profiler_configs}") - + # Handle Perf Profiler configuration perf_config = profiler_configs.get("perf", "enabled_restricted") if perf_config == "enabled_restricted": @@ -435,7 +418,7 @@ def _create_profiler_args(self, profiling_command: Dict[str, Any]) -> configargp elif perf_config == "disabled": new_args.perf_mode = "disabled" logger.info("Perf profiler: disabled") - + # Handle Pyperf configuration pyperf_config = profiler_configs.get("pyperf", "enabled") if pyperf_config == "enabled": @@ -445,7 +428,7 @@ def _create_profiler_args(self, profiling_command: Dict[str, Any]) -> configargp elif pyperf_config == "disabled": new_args.python_mode = "disabled" logger.info("Pyperf profiler: disabled, using pyspy") - + # Handle Pyspy configuration pyspy_config = profiler_configs.get("pyspy", "enabled_fallback") if pyspy_config == "enabled_fallback": @@ -457,7 +440,7 @@ def _create_profiler_args(self, profiling_command: Dict[str, Any]) -> configargp elif pyspy_config == "disabled" and pyperf_config == "disabled": new_args.python_mode = "disabled" logger.info("Pyspy profiler: disabled") - + # Handle Java Async Profiler configuration async_profiler_config = profiler_configs.get("async_profiler", "enabled") if 
async_profiler_config == "disabled": @@ -465,7 +448,7 @@ def _create_profiler_args(self, profiling_command: Dict[str, Any]) -> configargp logger.info("Java async profiler: disabled") else: logger.info("Java async profiler: enabled") - + # Handle PHP configuration phpspy_config = profiler_configs.get("phpspy", "enabled") if phpspy_config == "disabled": @@ -473,7 +456,7 @@ def _create_profiler_args(self, profiling_command: Dict[str, Any]) -> configargp logger.info("PHP profiler: disabled") else: logger.info("PHP profiler: enabled") - + # Handle Ruby configuration rbspy_config = profiler_configs.get("rbspy", "enabled") if rbspy_config == "disabled": @@ -481,7 +464,7 @@ def _create_profiler_args(self, profiling_command: Dict[str, Any]) -> configargp logger.info("Ruby profiler: disabled") else: logger.info("Ruby profiler: enabled") - + # Handle .NET configuration dotnet_config = profiler_configs.get("dotnet_trace", "enabled") if dotnet_config == "disabled": @@ -489,7 +472,7 @@ def _create_profiler_args(self, profiling_command: Dict[str, Any]) -> configargp logger.info(".NET profiler: disabled") else: logger.info(".NET profiler: enabled") - + # Handle NodeJS configuration nodejs_config = profiler_configs.get("nodejs_perf", "enabled") if nodejs_config == "disabled": @@ -497,20 +480,20 @@ def _create_profiler_args(self, profiling_command: Dict[str, Any]) -> configargp logger.info("NodeJS profiler: disabled") else: logger.info("NodeJS profiler: enabled") - + return new_args - - def _create_gprofiler_instance(self, args: configargparse.Namespace) -> 'GProfiler': + + def _create_gprofiler_instance(self, args: configargparse.Namespace) -> "GProfiler": """Create a new GProfiler instance with the given args""" if args is None: return None - + # Import here to avoid circular imports from gprofiler.main import GProfiler, pids_to_processes - + processes_to_profile = pids_to_processes(args) state = get_state() - + # Create profiler API client profiler_api_client = None if 
args.upload_results: @@ -518,12 +501,12 @@ def _create_gprofiler_instance(self, args: configargparse.Namespace) -> 'GProfil token=args.server_token, service_name=args.service_name, server_address=args.server_host, - curlify_requests=getattr(args, 'curlify_requests', False), + curlify_requests=getattr(args, "curlify_requests", False), hostname=get_hostname(), verify=args.verify, - upload_timeout=getattr(args, 'server-upload-timeout', 120) # Default to 120 seconds + upload_timeout=getattr(args, "server-upload-timeout", 120), # Default to 120 seconds ) - + enrichment_options = EnrichmentOptions( profile_api_version=args.profile_api_version, container_names=args.container_names, @@ -531,30 +514,30 @@ def _create_gprofiler_instance(self, args: configargparse.Namespace) -> 'GProfil application_identifier_args_filters=args.app_id_args_filters, application_metadata=args.application_metadata, ) - + # Create external metadata path if specified external_metadata_path = None - if hasattr(args, 'external_metadata') and args.external_metadata: + if hasattr(args, "external_metadata") and args.external_metadata: external_metadata_path = Path(args.external_metadata) - + # Create heartbeat file path if specified heartbeat_file_path = None - if hasattr(args, 'heartbeat_file') and args.heartbeat_file: + if hasattr(args, "heartbeat_file") and args.heartbeat_file: heartbeat_file_path = Path(args.heartbeat_file) - + # Create perfspect path if specified perfspect_path = None if hasattr(args, "tool_perfspect_path") and args.tool_perfspect_path: perfspect_path = Path(args.tool_perfspect_path) - + return GProfiler( - output_dir=getattr(args, 'output_dir', None), - flamegraph=getattr(args, 'flamegraph', True), - rotating_output=getattr(args, 'rotating_output', False), - rootless=getattr(args, 'rootless', False), + output_dir=getattr(args, "output_dir", None), + flamegraph=getattr(args, "flamegraph", True), + rotating_output=getattr(args, "rotating_output", False), + rootless=getattr(args, 
"rootless", False), profiler_api_client=profiler_api_client, - collect_metrics=getattr(args, 'collect_metrics', True), - collect_metadata=getattr(args, 'collect_metadata', True), + collect_metrics=getattr(args, "collect_metrics", True), + collect_metadata=getattr(args, "collect_metadata", True), enrichment_options=enrichment_options, state=state, usage_logger=NoopUsageLogger(), # Simplified for dynamic profiling @@ -563,7 +546,7 @@ def _create_gprofiler_instance(self, args: configargparse.Namespace) -> 'GProfil profile_api_version=args.profile_api_version, profiling_mode=args.profiling_mode, collect_hw_metrics=getattr(args, "collect_hw_metrics", False), - profile_spawned_processes=getattr(args, 'profile_spawned_processes', False), + profile_spawned_processes=getattr(args, "profile_spawned_processes", False), remote_logs_handler=None, # Simplified for dynamic profiling controller_process=None, processes_to_profile=processes_to_profile, @@ -572,16 +555,14 @@ def _create_gprofiler_instance(self, args: configargparse.Namespace) -> 'GProfil perfspect_path=perfspect_path, perfspect_duration=getattr(args, "tool_perfspect_duration", 60), ) - - def _run_profiler(self, gprofiler: 'GProfiler', continuous: bool, duration: int, command_id: str): + + def _run_profiler(self, gprofiler: "GProfiler", continuous: bool, duration: int, command_id: str): """Run the profiler with specified args""" if gprofiler is None: return - + start_time = datetime.datetime.now() - error_message = None - results_path = None - + try: if continuous: logger.info(f"Running continuous profiler for command ID: {command_id}") @@ -595,29 +576,29 @@ def _run_profiler(self, gprofiler: 'GProfiler', continuous: bool, duration: int, logger.info(f"Profiler run was stopped before completion for command ID: {command_id}") else: logger.info(f"Profiler run completed successfully for command ID: {command_id}") - + # Try to get results path if available - if hasattr(gprofiler, 'output_dir') and gprofiler.output_dir: - 
results_path = str(gprofiler.output_dir) - + if hasattr(gprofiler, "output_dir") and gprofiler.output_dir: + _ = str(gprofiler.output_dir) # Available for future use + except Exception as e: # Internal exceptions can occur during profiling stop # Only consider a failure if it was not due to a stop event if not gprofiler._profiler_state.stop_event.is_set(): - error_message = str(e) + _ = str(e) # Available for future error reporting logger.error(f"Profiler run failed for command ID {command_id}: {e}", exc_info=True) else: logger.info(f"Profiler run was stopped before completion for command ID: {command_id}") - + finally: # Calculate execution time end_time = datetime.datetime.now() - execution_time = int((end_time - start_time).total_seconds()) - + _ = int((end_time - start_time).total_seconds()) # Available for future use + # Clear the current profiler reference if self.current_gprofiler == gprofiler: self.current_gprofiler = None - + def stop(self): """Stop the heartbeat manager""" logger.info("Stopping heartbeat manager...") diff --git a/gprofiler/main.py b/gprofiler/main.py index f8c70f8f1..e832f5c44 100644 --- a/gprofiler/main.py +++ b/gprofiler/main.py @@ -305,10 +305,14 @@ def start(self) -> None: for prof in list(self.all_profilers): try: # Skip system profilers if threshold exceeded - if skip_system_profilers and hasattr(prof, '_is_system_wide_profiler') and prof._is_system_wide_profiler(): + if ( + skip_system_profilers + and hasattr(prof, "_is_system_wide_profiler") + and prof._is_system_wide_profiler() + ): logger.info(f"Skipping {prof.__class__.__name__} due to high system process count") continue - + prof.start() except Exception: # the SystemProfiler is handled separately - let the user run with '--perf-mode none' if they @@ -1286,7 +1290,7 @@ def main() -> None: ApplicationIdentifiers.init(enrichment_options) set_diagnostics(args.diagnostics) - + # Check if heartbeat server mode is enabled FIRST if args.enable_heartbeat_server: # Create heartbeat 
client @@ -1297,7 +1301,7 @@ def main() -> None: verify=args.verify, ) - # Create dynamic profiler manager + # Create dynamic profiler manager manager = DynamicGProfilerManager(args, heartbeat_client) manager.heartbeat_interval = args.heartbeat_interval diff --git a/gprofiler/profilers/profiler_base.py b/gprofiler/profilers/profiler_base.py index 9e2210b5c..61e26103e 100644 --- a/gprofiler/profilers/profiler_base.py +++ b/gprofiler/profilers/profiler_base.py @@ -16,6 +16,7 @@ import concurrent.futures import contextlib +import logging import os import sched import time @@ -198,22 +199,22 @@ def _is_system_wide_profiler(self) -> bool: def _get_top_processes_by_cpu(self, processes: List[Process], max_processes: int) -> List[Process]: """ Filter processes to the top N by CPU usage to reduce memory consumption. - + Args: processes: List of processes to filter max_processes: Maximum number of processes to return - + Returns: List of top N processes by CPU usage, or all processes if max_processes <= 0 """ if max_processes <= 0 or len(processes) <= max_processes: return processes - + logger.info( f"{self.__class__.__name__}: Limiting to top {max_processes} processes " f"(from {len(processes)}) by CPU usage to reduce memory consumption" ) - + # Get CPU usage for each process, handling exceptions gracefully processes_with_cpu = [] for process in processes: @@ -228,15 +229,15 @@ def _get_top_processes_by_cpu(self, processes: List[Process], max_processes: int except Exception as e: logger.debug(f"Error getting CPU usage for process {process.pid}: {e}") processes_with_cpu.append((process, 0.0)) - + # Sort by CPU usage (descending) and take top N processes_with_cpu.sort(key=lambda x: x[1], reverse=True) top_processes = [proc for proc, cpu in processes_with_cpu[:max_processes]] - + if logger.isEnabledFor(logging.DEBUG): - top_cpu_info = [(proc.pid, cpu) for proc, cpu in processes_with_cpu[:min(5, max_processes)]] + top_cpu_info = [(proc.pid, cpu) for proc, cpu in 
processes_with_cpu[: min(5, max_processes)]] logger.debug(f"{self.__class__.__name__}: Selected top processes by CPU: {top_cpu_info}") - + return top_processes def _get_process_age(self, process: Process) -> float: @@ -265,14 +266,13 @@ def snapshot(self) -> ProcessToProfileData: process for process in processes_to_profile if process in self._profiler_state.processes_to_profile ] logger.debug(f"{self.__class__.__name__}: processes left after filtering: {len(processes_to_profile)}") - + # Apply max_processes_per_profiler limit for runtime profilers (not system-wide profilers) if self._should_limit_processes() and self._profiler_state.max_processes_per_profiler > 0: processes_to_profile = self._get_top_processes_by_cpu( - processes_to_profile, - self._profiler_state.max_processes_per_profiler + processes_to_profile, self._profiler_state.max_processes_per_profiler ) - + self._notify_selected_processes(processes_to_profile) if not processes_to_profile: diff --git a/gprofiler/profilers/python.py b/gprofiler/profilers/python.py index d6b14243d..42d7bf4bf 100644 --- a/gprofiler/profilers/python.py +++ b/gprofiler/profilers/python.py @@ -386,7 +386,8 @@ def _should_skip_process(self, process: Process) -> bool: default=0, help="Skip PyPerf (eBPF Python profiler) when Python processes exceed this threshold (0=unlimited). " "When exceeded, prevents PyPerf from starting but allows py-spy fallback for Python profiling. " - "This provides fine-grained control over PyPerf resource usage independent of system profilers. Default: %(default)s", + "This provides fine-grained control over PyPerf resource usage independent of system profilers. 
" + "Default: %(default)s", ), ], supported_profiling_modes=["cpu"], @@ -488,11 +489,11 @@ def start(self) -> None: # Skip PyPerf but keep py-spy as fallback logger.info("PyPerf skipped due to Python process threshold, falling back to py-spy") self._ebpf_profiler = None - + # Ensure py-spy profiler exists as fallback if self._pyspy_profiler is None: logger.warning("PyPerf skipped but no py-spy fallback available") - + # Start the appropriate profiler if self._ebpf_profiler is not None: self._ebpf_profiler.start() diff --git a/gprofiler/profilers/python_ebpf.py b/gprofiler/profilers/python_ebpf.py index 3e417cf87..aae0792ec 100644 --- a/gprofiler/profilers/python_ebpf.py +++ b/gprofiler/profilers/python_ebpf.py @@ -101,24 +101,24 @@ def _count_python_processes(self) -> int: This ensures consistent counting between PyPerf skip logic and py-spy process selection. """ try: - from gprofiler.utils import pgrep_maps, pgrep_exe - + from gprofiler.utils import pgrep_exe, pgrep_maps + # Count all processes that match Python detection criteria python_pattern = "python" python_processes = set() - + # Check via maps (memory mappings contain libpython) try: python_processes.update(pgrep_maps(python_pattern)) except Exception: pass - + # Check via executable name try: python_processes.update(pgrep_exe(python_pattern)) except Exception: pass - + return len(python_processes) except Exception as e: logger.debug(f"Error counting Python processes: {e}") @@ -131,18 +131,21 @@ def should_skip_due_to_python_threshold(self) -> bool: """ if self._python_skip_pyperf_profiler_above <= 0: return False # No threshold set, don't skip - + python_process_count = self._count_python_processes() should_skip = python_process_count > self._python_skip_pyperf_profiler_above - + if should_skip: logger.info( f"Skipping PyPerf - {python_process_count} Python processes exceed threshold " f"of {self._python_skip_pyperf_profiler_above}. py-spy fallback will be used for Python profiling." 
) else: - logger.debug(f"PyPerf: Python process count {python_process_count} (threshold: {self._python_skip_pyperf_profiler_above})") - + logger.debug( + f"PyPerf: Python process count {python_process_count} " + f"(threshold: {self._python_skip_pyperf_profiler_above})" + ) + return should_skip @classmethod diff --git a/gprofiler/utils/cgroup_utils.py b/gprofiler/utils/cgroup_utils.py index 852f44ae5..7b6090926 100644 --- a/gprofiler/utils/cgroup_utils.py +++ b/gprofiler/utils/cgroup_utils.py @@ -14,41 +14,43 @@ # limitations under the License. # -import os import logging -from pathlib import Path -from typing import List, Optional, Tuple, Dict +import os from dataclasses import dataclass from enum import Enum +from typing import List, Optional logger = logging.getLogger(__name__) class CgroupVersion(Enum): """Cgroup version enumeration""" + V1 = "v1" V2 = "v2" UNKNOWN = "unknown" + @dataclass class CgroupResourceUsage: """Represents resource usage for a cgroup""" + cgroup_path: str name: str cpu_usage: int # CPU usage in nanoseconds memory_usage: int # Memory usage in bytes - + @property def total_score(self) -> float: """Calculate a combined score for ranking cgroups by resource usage - + Prioritizes CPU usage over memory since CPU indicates active processes that are more interesting for profiling. 
""" # Normalize CPU (ns) and memory (bytes) to comparable scales cpu_score = self.cpu_usage / 1_000_000_000 # ns to seconds memory_score = self.memory_usage / (1024 * 1024) # bytes to MB - + # Weight CPU heavily (10x) since active CPU usage is more important for profiling # than static memory usage return (cpu_score * 10) + memory_score @@ -60,16 +62,13 @@ def detect_cgroup_version() -> CgroupVersion: # Check if Docker containers are using cgroup v1 paths (hybrid systems) if os.path.exists("/sys/fs/cgroup/memory/docker") or os.path.exists("/sys/fs/cgroup/cpu,cpuacct/docker"): return CgroupVersion.V1 - + # Check if cgroup v2 is mounted and being used with open("/proc/mounts", "r") as f: mounts = f.read() if "cgroup2" in mounts and "/sys/fs/cgroup" in mounts: # Check if Docker containers exist in v2 paths - v2_docker_paths = [ - "/sys/fs/cgroup/system.slice", - "/sys/fs/cgroup/docker" - ] + v2_docker_paths = ["/sys/fs/cgroup/system.slice", "/sys/fs/cgroup/docker"] for path in v2_docker_paths: if os.path.exists(path): try: @@ -78,7 +77,7 @@ def detect_cgroup_version() -> CgroupVersion: return CgroupVersion.V2 except (OSError, PermissionError): continue - + # If cgroup2 is mounted but no Docker containers found in v2, check v1 if "/sys/fs/cgroup/memory" in mounts or "/sys/fs/cgroup/cpu" in mounts: return CgroupVersion.V1 @@ -88,13 +87,13 @@ def detect_cgroup_version() -> CgroupVersion: return CgroupVersion.V1 except (IOError, OSError) as e: logger.debug(f"Failed to read /proc/mounts: {e}") - + # Fallback: check filesystem structure if os.path.exists("/sys/fs/cgroup/memory") or os.path.exists("/sys/fs/cgroup/cpu,cpuacct"): return CgroupVersion.V1 elif os.path.exists("/sys/fs/cgroup/cgroup.controllers"): return CgroupVersion.V2 - + return CgroupVersion.UNKNOWN @@ -106,13 +105,13 @@ def is_cgroup_available() -> bool: def get_cgroup_cpu_usage(cgroup_path: str) -> Optional[int]: """Get CPU usage for a cgroup in nanoseconds""" cgroup_version = detect_cgroup_version() - + if 
cgroup_version == CgroupVersion.V2: # cgroup v2 uses cpu.stat file cpu_stat_file = os.path.join(cgroup_path, "cpu.stat") if os.path.exists(cpu_stat_file): try: - with open(cpu_stat_file, 'r') as f: + with open(cpu_stat_file, "r") as f: for line in f: if line.startswith("usage_usec "): # Convert microseconds to nanoseconds @@ -120,7 +119,7 @@ def get_cgroup_cpu_usage(cgroup_path: str) -> Optional[int]: except (IOError, ValueError) as e: logger.debug(f"Failed to read CPU usage from {cpu_stat_file}: {e}") return None - + else: # cgroup v1 usage_file = os.path.join(cgroup_path, "cpuacct.usage") if not os.path.exists(usage_file): @@ -129,9 +128,9 @@ def get_cgroup_cpu_usage(cgroup_path: str) -> Optional[int]: usage_file = os.path.join(alt_path, "cpuacct.usage") if not os.path.exists(usage_file): return None - + try: - with open(usage_file, 'r') as f: + with open(usage_file, "r") as f: return int(f.read().strip()) except (IOError, ValueError) as e: logger.debug(f"Failed to read CPU usage from {usage_file}: {e}") @@ -141,18 +140,18 @@ def get_cgroup_cpu_usage(cgroup_path: str) -> Optional[int]: def get_cgroup_memory_usage(cgroup_path: str) -> Optional[int]: """Get memory usage for a cgroup in bytes""" cgroup_version = detect_cgroup_version() - + if cgroup_version == CgroupVersion.V2: # cgroup v2 uses memory.current file usage_file = os.path.join(cgroup_path, "memory.current") else: # cgroup v1 usage_file = os.path.join(cgroup_path, "memory.usage_in_bytes") - + if not os.path.exists(usage_file): return None - + try: - with open(usage_file, 'r') as f: + with open(usage_file, "r") as f: return int(f.read().strip()) except (IOError, ValueError) as e: logger.debug(f"Failed to read memory usage from {usage_file}: {e}") @@ -163,7 +162,7 @@ def find_all_cgroups() -> List[str]: """Find all available cgroups in the system""" cgroups = [] cgroup_version = detect_cgroup_version() - + if cgroup_version == CgroupVersion.V2: # cgroup v2 unified hierarchy base = "/sys/fs/cgroup" @@ 
-172,16 +171,16 @@ def find_all_cgroups() -> List[str]: # Skip the root directory itself if root == base: continue - + # Check if this directory has the necessary files for v2 cpu_file = os.path.join(root, "cpu.stat") memory_file = os.path.join(root, "memory.current") - + if os.path.exists(cpu_file) or os.path.exists(memory_file): cgroups.append(root) except OSError as e: logger.debug(f"Error walking cgroup v2 directory {base}: {e}") - + else: # cgroup v1 # Common cgroup mount points to check cgroup_bases = [ @@ -189,7 +188,7 @@ def find_all_cgroups() -> List[str]: "/sys/fs/cgroup/memory", "/sys/fs/cgroup/cpuacct", ] - + for base in cgroup_bases: if os.path.exists(base): try: @@ -198,50 +197,45 @@ def find_all_cgroups() -> List[str]: # Skip the base directory itself if root == base: continue - + # Check if this directory has the necessary files cpu_file = os.path.join(root, "cpuacct.usage") memory_file = root.replace("/cpu,cpuacct/", "/memory/") + "/memory.usage_in_bytes" - + if os.path.exists(cpu_file) or os.path.exists(memory_file): cgroups.append(root) except OSError as e: logger.debug(f"Error walking cgroup directory {base}: {e}") continue - + return list(set(cgroups)) # Remove duplicates def get_cgroup_resource_usage(cgroup_path: str) -> Optional[CgroupResourceUsage]: """Get resource usage for a single cgroup""" cpu_usage = get_cgroup_cpu_usage(cgroup_path) - + # For memory, try to find the corresponding memory cgroup path memory_path = cgroup_path.replace("/cpu,cpuacct/", "/memory/") if not os.path.exists(memory_path): memory_path = cgroup_path.replace("/cpuacct/", "/memory/") - + memory_usage = get_cgroup_memory_usage(memory_path) - + # If we can't get any usage data, skip this cgroup if cpu_usage is None and memory_usage is None: return None - + # Use 0 as default if one metric is missing cpu_usage = cpu_usage or 0 memory_usage = memory_usage or 0 - + # Extract a readable name from the path name = os.path.basename(cgroup_path) if len(name) > 12: # Truncate 
long container IDs name = name[:12] - - return CgroupResourceUsage( - cgroup_path=cgroup_path, - name=name, - cpu_usage=cpu_usage, - memory_usage=memory_usage - ) + + return CgroupResourceUsage(cgroup_path=cgroup_path, name=name, cpu_usage=cpu_usage, memory_usage=memory_usage) def get_top_cgroups_by_usage(limit: int = 50) -> List[CgroupResourceUsage]: @@ -249,21 +243,21 @@ def get_top_cgroups_by_usage(limit: int = 50) -> List[CgroupResourceUsage]: if not is_cgroup_available(): logger.warning("Cgroup filesystem not available") return [] - + all_cgroups = find_all_cgroups() logger.debug(f"Found {len(all_cgroups)} cgroups to analyze") - + cgroup_usages = [] for cgroup_path in all_cgroups: usage = get_cgroup_resource_usage(cgroup_path) if usage: cgroup_usages.append(usage) - + # Sort by total resource usage score (descending) cgroup_usages.sort(key=lambda x: x.total_score, reverse=True) - + logger.debug(f"Analyzed {len(cgroup_usages)} cgroups with resource data") - + return cgroup_usages[:limit] @@ -271,12 +265,12 @@ def cgroup_to_perf_name(cgroup_path: str) -> str: """Convert a cgroup path to the name format expected by perf -G option""" # perf expects the cgroup name relative to the cgroup mount point # For example: /sys/fs/cgroup/memory/docker/abc123 -> docker/abc123 - + # Find the relative path from the cgroup mount point for base in ["/sys/fs/cgroup/memory/", "/sys/fs/cgroup/cpu,cpuacct/", "/sys/fs/cgroup/cpuacct/"]: if cgroup_path.startswith(base): - return cgroup_path[len(base):] - + return cgroup_path[len(base) :] + # Fallback: just use the basename return os.path.basename(cgroup_path) @@ -285,23 +279,24 @@ def convert_cgroupv2_path_to_perf_name(cgroup_path: str) -> str: """Convert a cgroup v2 path to perf-compatible name""" # Remove the base cgroup path if cgroup_path.startswith("/sys/fs/cgroup/"): - relative_path = cgroup_path[len("/sys/fs/cgroup/"):] + relative_path = cgroup_path[len("/sys/fs/cgroup/") :] else: relative_path = cgroup_path - + # Handle Docker 
container paths in cgroup v2 if "docker-" in relative_path and ".scope" in relative_path: # Extract container ID from system.slice/docker-.scope import re - match = re.search(r'docker-([a-f0-9]{64})\.scope', relative_path) + + match = re.search(r"docker-([a-f0-9]{64})\.scope", relative_path) if match: container_id = match.group(1) return f"docker/{container_id}" - + # Handle other Docker paths if relative_path.startswith("docker/"): return relative_path - + # For other cgroups, use the relative path return relative_path @@ -309,7 +304,7 @@ def convert_cgroupv2_path_to_perf_name(cgroup_path: str) -> str: def validate_cgroup_perf_event_access(cgroup_name: str) -> bool: """Check if a cgroup is available for perf profiling""" cgroup_version = detect_cgroup_version() - + if cgroup_version == CgroupVersion.V2: # In cgroup v2, perf events are handled differently # The cgroup path should exist in the unified hierarchy @@ -333,7 +328,7 @@ def validate_cgroup_perf_event_access(cgroup_name: str) -> bool: else: cgroup_path = f"/sys/fs/cgroup/{cgroup_name}" return os.path.exists(cgroup_path) and os.path.isdir(cgroup_path) - + else: # cgroup v1 perf_event_path = f"/sys/fs/cgroup/perf_event/{cgroup_name}" return os.path.exists(perf_event_path) and os.path.isdir(perf_event_path) @@ -341,54 +336,56 @@ def validate_cgroup_perf_event_access(cgroup_name: str) -> bool: def get_top_docker_containers_for_perf(limit: int) -> List[str]: """Get top Docker containers by resource usage for perf profiling - + Returns individual Docker container cgroup names that exist in perf_event controller. 
""" import subprocess - + docker_containers = [] cgroup_version = detect_cgroup_version() - + try: # Get running Docker containers with resource stats result = subprocess.run( ["docker", "stats", "--no-stream", "--format", "{{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}"], capture_output=True, text=True, - timeout=10 + timeout=10, ) - + if result.returncode == 0: container_stats = [] - for line in result.stdout.strip().split('\n'): + for line in result.stdout.strip().split("\n"): if line.strip(): - parts = line.split('\t') + parts = line.split("\t") if len(parts) >= 2: container_id = parts[0] - cpu_percent_str = parts[1].replace('%', '') + cpu_percent_str = parts[1].replace("%", "") try: cpu_percent = float(cpu_percent_str) container_stats.append((container_id, cpu_percent)) except ValueError: continue - + # Sort by CPU usage (descending) container_stats.sort(key=lambda x: x[1], reverse=True) - + # Get full container IDs and check perf_event access - for container_id, cpu_percent in container_stats[:limit * 2]: # Get more than needed in case some don't have perf access + for container_id, cpu_percent in container_stats[ + : limit * 2 + ]: # Get more than needed in case some don't have perf access try: # Get full container ID full_id_result = subprocess.run( ["docker", "inspect", "--format", "{{.Id}}", container_id], capture_output=True, text=True, - timeout=5 + timeout=5, ) - + if full_id_result.returncode == 0: full_id = full_id_result.stdout.strip() - + if cgroup_version == CgroupVersion.V2: # For cgroup v2, we need to find the actual cgroup path # and use the relative path for perf @@ -397,19 +394,22 @@ def get_top_docker_containers_for_perf(limit: int) -> List[str]: f"/sys/fs/cgroup/docker/{full_id}", f"/sys/fs/cgroup/system.slice/docker.service/docker/{full_id}", ] - + docker_cgroup = None for path in possible_paths: if os.path.exists(path) and os.path.isdir(path): # For cgroup v2, perf expects the relative path from /sys/fs/cgroup/ docker_cgroup = 
path.replace("/sys/fs/cgroup/", "") - logger.debug(f"Found cgroup v2 path for container {container_id}: {path} -> {docker_cgroup}") + logger.debug( + f"Found cgroup v2 path for container {container_id}: {path} -> {docker_cgroup}" + ) break - + if not docker_cgroup: # Fallback: try to find any docker-related path for this container try: import glob + pattern = f"/sys/fs/cgroup/**/docker*{full_id[:12]}*" matches = glob.glob(pattern, recursive=True) if matches: @@ -424,96 +424,102 @@ def get_top_docker_containers_for_perf(limit: int) -> List[str]: else: # cgroup v1 format docker_cgroup = f"docker/{full_id}" - + # Check if this container has perf_event access if validate_cgroup_perf_event_access(docker_cgroup): docker_containers.append(docker_cgroup) - logger.debug(f"Added Docker container for profiling: {container_id} (CPU: {cpu_percent}%) -> {docker_cgroup}") - + logger.debug( + f"Added Docker container for profiling: {container_id} " + f"(CPU: {cpu_percent}%) -> {docker_cgroup}" + ) + if len(docker_containers) >= limit: break else: logger.debug(f"Docker container {container_id} not available for perf profiling") - + except (subprocess.TimeoutExpired, subprocess.CalledProcessError) as e: logger.debug(f"Failed to get full ID for container {container_id}: {e}") continue - + except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError) as e: logger.debug(f"Failed to get Docker container stats: {e}") - + return docker_containers def get_top_cgroup_names_for_perf(limit: int = 50, max_docker_containers: int = 0) -> List[str]: """Get top cgroup names in the format needed for perf -G option - + Args: limit: Maximum total number of cgroups to return max_docker_containers: If > 0, profile individual Docker containers instead of broad 'docker' cgroup - - Only returns cgroups that exist in both resource controllers (memory/cpu) + + Only returns cgroups that exist in both resource controllers (memory/cpu) and the perf_event controller, since perf needs 
access to both. """ if max_docker_containers > 0: # Use individual Docker container profiling docker_containers = get_top_docker_containers_for_perf(max_docker_containers) - + # Get other non-Docker cgroups top_cgroups = get_top_cgroups_by_usage(limit) other_cgroups = [] seen_names = set(docker_containers) # Track unique cgroup names to avoid duplicates - + for cgroup in top_cgroups: cgroup_name = cgroup_to_perf_name(cgroup.cgroup_path) - + # Skip Docker cgroups (we're handling them individually) if cgroup_name.startswith("docker"): continue - + # Skip duplicates if cgroup_name in seen_names: logger.debug(f"Skipping duplicate cgroup name {cgroup_name}") continue - + if validate_cgroup_perf_event_access(cgroup_name): other_cgroups.append(cgroup_name) seen_names.add(cgroup_name) - + # Respect total limit if len(docker_containers) + len(other_cgroups) >= limit: break else: logger.debug(f"Skipping cgroup {cgroup_name} - not available in perf_event controller") - + valid_cgroups = docker_containers + other_cgroups - + if docker_containers: - logger.info(f"Using individual Docker container profiling: {len(docker_containers)} containers, {len(other_cgroups)} other cgroups") - + logger.info( + f"Using individual Docker container profiling: {len(docker_containers)} containers, " + f"{len(other_cgroups)} other cgroups" + ) + else: # Use traditional cgroup profiling (including broad 'docker' cgroup) top_cgroups = get_top_cgroups_by_usage(limit) valid_cgroups = [] seen_names = set() # Track unique cgroup names to avoid duplicates - + for cgroup in top_cgroups: cgroup_name = cgroup_to_perf_name(cgroup.cgroup_path) - + # Skip duplicates (same cgroup from different controllers) if cgroup_name in seen_names: logger.debug(f"Skipping duplicate cgroup name {cgroup_name}") continue - + if validate_cgroup_perf_event_access(cgroup_name): valid_cgroups.append(cgroup_name) seen_names.add(cgroup_name) else: logger.debug(f"Skipping cgroup {cgroup_name} - not available in perf_event 
controller") - + if len(valid_cgroups) < limit: logger.info(f"Filtered cgroups for perf: {len(valid_cgroups)}/{limit} cgroups have perf_event access") - + return valid_cgroups @@ -521,12 +527,8 @@ def validate_perf_cgroup_support() -> bool: """Check if the current perf binary supports cgroup filtering""" try: import subprocess - result = subprocess.run( - ["perf", "record", "--help"], - capture_output=True, - text=True, - timeout=10 - ) + + result = subprocess.run(["perf", "record", "--help"], capture_output=True, text=True, timeout=10) return "--cgroup" in result.stdout or "-G" in result.stdout except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError): return False diff --git a/gprofiler/utils/perf_process.py b/gprofiler/utils/perf_process.py index 45e1060be..a5df15a9f 100644 --- a/gprofiler/utils/perf_process.py +++ b/gprofiler/utils/perf_process.py @@ -62,14 +62,15 @@ def __init__( self._max_cgroups = max_cgroups self._pid_args = [] self._cgroup_args = [] - + # Determine profiling strategy if use_cgroups: from gprofiler.utils.cgroup_utils import ( + get_top_cgroup_names_for_perf, is_cgroup_available, validate_perf_cgroup_support, - get_top_cgroup_names_for_perf, ) + # Use cgroup-based profiling for better reliability if is_cgroup_available() and validate_perf_cgroup_support(): try: @@ -78,29 +79,47 @@ def __init__( # Cgroup monitoring requires system-wide mode (-a) self._pid_args.append("-a") self._cgroup_args.extend(["-G", ",".join(top_cgroups)]) - logger.info(f"Using cgroup-based profiling with {len(top_cgroups)} top cgroups: {top_cgroups[:3]}{'...' if len(top_cgroups) > 3 else ''}") + logger.info( + f"Using cgroup-based profiling with {len(top_cgroups)} top cgroups: " + f"{top_cgroups[:3]}{'...' 
if len(top_cgroups) > 3 else ''}" + ) else: # Never fall back to system-wide profiling when cgroups are explicitly requested from gprofiler.exceptions import PerfNoSupportedEvent + if max_docker_containers > 0: - logger.error(f"No Docker containers found for profiling despite --perf-max-docker-containers={max_docker_containers}. " - "This could indicate cgroup v2 compatibility issues or no running containers. " - "Perf profiler will be disabled to prevent system-wide profiling.") - raise PerfNoSupportedEvent("Docker container profiling requested but no containers available") + logger.error( + f"No Docker containers found for profiling despite " + f"--perf-max-docker-containers={max_docker_containers}. " + "This could indicate cgroup v2 compatibility issues or no running containers. " + "Perf profiler will be disabled to prevent system-wide profiling." + ) + raise PerfNoSupportedEvent( + "Docker container profiling requested but no containers available" + ) elif max_cgroups > 0: - logger.error(f"No cgroups found for profiling despite --perf-max-cgroups={max_cgroups}. " - "This could indicate cgroup compatibility issues or no active cgroups. " - "Perf profiler will be disabled to prevent system-wide profiling.") + logger.error( + f"No cgroups found for profiling despite --perf-max-cgroups={max_cgroups}. " + "This could indicate cgroup compatibility issues or no active cgroups. " + "Perf profiler will be disabled to prevent system-wide profiling." + ) raise PerfNoSupportedEvent("Cgroup profiling requested but no cgroups available") else: - logger.error("Cgroup profiling was requested (--perf-use-cgroups) but no specific limits were set. " - "Perf profiler will be disabled to prevent system-wide profiling.") - raise PerfNoSupportedEvent("Cgroup profiling requested but no containers or cgroups specified") + logger.error( + "Cgroup profiling was requested (--perf-use-cgroups) but no specific limits were set. 
" + "Perf profiler will be disabled to prevent system-wide profiling." + ) + raise PerfNoSupportedEvent( + "Cgroup profiling requested but no containers or cgroups specified" + ) except Exception as e: # Never fall back to system-wide profiling when cgroups are explicitly requested from gprofiler.exceptions import PerfNoSupportedEvent - logger.error(f"Failed to get cgroups for profiling: {e}. " - "Perf profiler will be disabled to prevent system-wide profiling.") + + logger.error( + f"Failed to get cgroups for profiling: {e}. " + "Perf profiler will be disabled to prevent system-wide profiling." + ) raise PerfNoSupportedEvent(f"Cgroup profiling failed: {e}") elif processes_to_profile is not None: # Traditional PID-based profiling @@ -109,7 +128,7 @@ def __init__( else: # System-wide profiling self._pid_args.append("-a") - + self._extra_args = extra_args self._switch_timeout_s = switch_timeout_s self._process: Optional[Popen] = None @@ -130,7 +149,7 @@ def _get_perf_cmd(self) -> List[str]: if arg == "-G" and i + 1 < len(self._cgroup_args): cgroup_arg = self._cgroup_args[i + 1] break - + if cgroup_arg: num_cgroups = len(cgroup_arg.split(",")) # Add one event per cgroup (perf requirement) @@ -140,7 +159,7 @@ def _get_perf_cmd(self) -> List[str]: else: # Fallback: single event extra_args = ["-e", "cycles"] - + return ( [ perf_path(), diff --git a/tests/run_heartbeat_agent.py b/tests/run_heartbeat_agent.py index d37736b56..14aed9e6a 100644 --- a/tests/run_heartbeat_agent.py +++ b/tests/run_heartbeat_agent.py @@ -6,85 +6,88 @@ to receive dynamic profiling commands from the Performance Studio backend. 
""" -import subprocess -import sys import os import signal -import time +import subprocess +import sys from pathlib import Path + def run_gprofiler_heartbeat_mode(): """Run gProfiler in heartbeat mode""" - + # Configuration - adjust these values for your environment config = { "server_token": "test-token", "service_name": "test-service", "api_server": "http://localhost:8000", # Performance Studio backend URL - "server_host": "http://localhost:8000", # Profile upload server URL (can be same) + "server_host": "http://localhost:8000", # Profile upload server URL (can be same) "output_dir": "/tmp/gprofiler-test", "log_file": "/tmp/gprofiler-heartbeat.log", "heartbeat_interval": "10", # seconds - "verbose": True + "verbose": True, } - + # Ensure output directory exists os.makedirs(config["output_dir"], exist_ok=True) - + # Build the command gprofiler_path = Path(__file__).parent.parent / "gprofiler" / "main.py" - + cmd = [ sys.executable, str(gprofiler_path), "--enable-heartbeat-server", "--upload-results", - "--token", config["server_token"], - "--service-name", config["service_name"], - "--api-server", config["api_server"], - "--server-host", config["server_host"], - "--output-dir", config["output_dir"], - "--log-file", config["log_file"], - "--heartbeat-interval", config["heartbeat_interval"], + "--token", + config["server_token"], + "--service-name", + config["service_name"], + "--api-server", + config["api_server"], + "--server-host", + config["server_host"], + "--output-dir", + config["output_dir"], + "--log-file", + config["log_file"], + "--heartbeat-interval", + config["heartbeat_interval"], "--no-verify", # For testing with localhost ] - + if config["verbose"]: cmd.append("--verbose") - + print("🤖 Starting gProfiler in heartbeat mode...") print(f"📝 Command: {' '.join(cmd)}") - print("="*60) + print("=" * 60) print("The agent will:") print("1. Send heartbeats to the backend every 10 seconds") print("2. Wait for profiling commands from the server") print("3. 
Execute start/stop commands as received") print("4. Maintain idempotency for duplicate commands") - print("="*60) + print("=" * 60) print("💡 To test the system:") print("1. Start the Performance Studio backend") print("2. Run this script to start the agent") print("3. Use the backend API to send profiling requests") print("4. Watch the agent logs to see command execution") - print("="*60) + print("=" * 60) print("\n🚀 Starting agent... (Press Ctrl+C to stop)") - + try: # Start the process process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - universal_newlines=True, - bufsize=1 + cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, bufsize=1 ) - + # Monitor output - for line in iter(process.stdout.readline, ''): + for line in iter(process.stdout.readline, ""): print(f"[AGENT] {line.rstrip()}") - + process.wait() - + except KeyboardInterrupt: print("\n🛑 Received interrupt signal, stopping agent...") if process: @@ -95,18 +98,19 @@ def run_gprofiler_heartbeat_mode(): print("⚠️ Process didn't stop gracefully, forcing termination...") process.kill() process.wait() - + except Exception as e: print(f"❌ Error running gProfiler: {e}") return 1 - + print("✅ Agent stopped") return 0 + def print_usage(): """Print usage instructions""" print("📖 gProfiler Heartbeat Mode Test Runner") - print("="*50) + print("=" * 50) print("\nThis script runs gProfiler in heartbeat mode for testing.") print("\nPrerequisites:") print("1. Performance Studio backend running on http://localhost:8000") @@ -124,13 +128,15 @@ def print_usage(): print("3. Use test_heartbeat_system.py to send commands") print("4. 
Watch the agent respond to commands") + def main(): """Main function""" if len(sys.argv) > 1 and sys.argv[1] in ["-h", "--help"]: print_usage() return 0 - + return run_gprofiler_heartbeat_mode() + if __name__ == "__main__": sys.exit(main()) diff --git a/tests/test_heartbeat_system.py b/tests/test_heartbeat_system.py index 41158e824..a0a15e976 100644 --- a/tests/test_heartbeat_system.py +++ b/tests/test_heartbeat_system.py @@ -11,13 +11,13 @@ Supports both mock mode (default) and live mode with real backend. """ -import json -import requests +import sys import time -from datetime import datetime -from typing import Dict, Any, Optional import unittest.mock -import sys +from datetime import datetime +from typing import Any, Dict, Optional + +import requests # Configuration BACKEND_URL = "http://localhost:8000" # Adjust based on your setup @@ -28,17 +28,18 @@ # Check if we should run in mock mode (no real backend) MOCK_MODE = "--live" not in sys.argv # Default to mock mode unless --live specified + class HeartbeatClient: """Client to simulate agent heartbeat behavior""" - + def __init__(self, backend_url: str, service_name: str, hostname: str, ip_address: str): - self.backend_url = backend_url.rstrip('/') + self.backend_url = backend_url.rstrip("/") self.service_name = service_name self.hostname = hostname self.ip_address = ip_address self.last_command_id: Optional[str] = None self.executed_commands = set() - + def send_heartbeat(self) -> Optional[Dict[str, Any]]: """Send heartbeat to backend and return response""" heartbeat_data = { @@ -47,40 +48,36 @@ def send_heartbeat(self) -> Optional[Dict[str, Any]]: "service_name": self.service_name, "last_command_id": self.last_command_id, "status": "active", - "timestamp": datetime.now().isoformat() + "timestamp": datetime.now().isoformat(), } - + try: - response = requests.post( - f"{self.backend_url}/api/metrics/heartbeat", - json=heartbeat_data, - timeout=10 - ) - + response = 
requests.post(f"{self.backend_url}/api/metrics/heartbeat", json=heartbeat_data, timeout=10) + if response.status_code == 200: result = response.json() print(f"✓ Heartbeat successful: {result.get('message')}") - + if result.get("profiling_command") and result.get("command_id"): command_id = result["command_id"] profiling_command = result["profiling_command"] command_type = profiling_command.get("command_type", "unknown") - + print(f"📋 Received command: {command_type} (ID: {command_id})") - + # Check idempotency if command_id in self.executed_commands: print(f"⚠️ Command {command_id} already executed, skipping...") return None - + # Mark as executed self.executed_commands.add(command_id) self.last_command_id = command_id - + return { "command_type": command_type, "command_id": command_id, - "profiling_command": profiling_command + "profiling_command": profiling_command, } else: print("📭 No pending commands") @@ -88,13 +85,14 @@ def send_heartbeat(self) -> Optional[Dict[str, Any]]: else: print(f"❌ Heartbeat failed: {response.status_code} - {response.text}") return None - + except Exception as e: print(f"❌ Heartbeat error: {e}") return None - - def send_command_completion(self, command_id: str, status: str, execution_time: int = 0, - error_message: str = None, results_path: str = None) -> bool: + + def send_command_completion( + self, command_id: str, status: str, execution_time: int = 0, error_message: str = None, results_path: str = None + ) -> bool: """Send command completion status to backend""" completion_data = { "command_id": command_id, @@ -102,23 +100,21 @@ def send_command_completion(self, command_id: str, status: str, execution_time: "status": status, "execution_time": execution_time, "error_message": error_message, - "results_path": results_path + "results_path": results_path, } - + try: response = requests.post( - f"{self.backend_url}/api/metrics/command_completion", - json=completion_data, - timeout=10 + 
f"{self.backend_url}/api/metrics/command_completion", json=completion_data, timeout=10 ) - + if response.status_code == 200: print(f"✅ Command completion sent successfully for {command_id} with status: {status}") return True else: print(f"❌ Failed to send command completion: {response.status_code} - {response.text}") return False - + except Exception as e: print(f"❌ Error sending command completion: {e}") return False @@ -129,14 +125,14 @@ def simulate_profiling_action(self, command_type: str, command_id: str): print(f"🚀 Starting profiler for command {command_id}") # Simulate profiling work time.sleep(2) - print(f"✅ Profiler completed successfully") + print("✅ Profiler completed successfully") # Send completion acknowledgment self.send_command_completion(command_id, "completed", execution_time=2) elif command_type == "stop": print(f"🛑 Stopping profiler for command {command_id}") # Simulate stopping time.sleep(1) - print(f"✅ Profiler stopped successfully") + print("✅ Profiler stopped successfully") # Send completion acknowledgment self.send_command_completion(command_id, "completed", execution_time=1) else: @@ -144,6 +140,7 @@ def simulate_profiling_action(self, command_type: str, command_id: str): # Send failure acknowledgment self.send_command_completion(command_id, "failed", error_message=f"Unknown command type: {command_type}") + def create_test_profiling_request(backend_url: str, service_name: str, command_type: str = "start") -> bool: """Create a test profiling request""" request_data = { @@ -153,16 +150,12 @@ def create_test_profiling_request(backend_url: str, service_name: str, command_t "frequency": 11, "profiling_mode": "cpu", "target_hostnames": [HOSTNAME], - "additional_args": {"test": True} + "additional_args": {"test": True}, } - + try: - response = requests.post( - f"{backend_url}/api/metrics/profile_request", - json=request_data, - timeout=10 - ) - + response = requests.post(f"{backend_url}/api/metrics/profile_request", json=request_data, timeout=10) 
+ if response.status_code == 200: result = response.json() print(f"✅ Profiling request created: {result.get('message')}") @@ -172,89 +165,86 @@ def create_test_profiling_request(backend_url: str, service_name: str, command_t else: print(f"❌ Failed to create profiling request: {response.status_code} - {response.text}") return False - + except Exception as e: print(f"❌ Error creating profiling request: {e}") return False + def create_mock_responses(): """Create mock responses for testing without a real backend""" - mock_state = { - "pending_commands": [], - "completed_commands": [], - "heartbeat_count": 0 - } - - def mock_heartbeat_post(url, json=None, timeout=None): + mock_state = {"pending_commands": [], "completed_commands": [], "heartbeat_count": 0} + + def mock_heartbeat_post(url, json=None, timeout=None): # noqa: F811 """Mock heartbeat endpoint""" mock_state["heartbeat_count"] += 1 - + # Mock response object response = unittest.mock.Mock() response.status_code = 200 - + # Check if there are pending commands if mock_state["pending_commands"]: command = mock_state["pending_commands"].pop(0) response.json.return_value = { "message": "Heartbeat received", "command_id": command["command_id"], - "profiling_command": command["profiling_command"] + "profiling_command": command["profiling_command"], } else: - response.json.return_value = { - "message": "Heartbeat received, no pending commands" - } - + response.json.return_value = {"message": "Heartbeat received, no pending commands"} + return response - - def mock_profile_request_post(url, json=None, timeout=None): + + def mock_profile_request_post(url, json=None, timeout=None): # noqa: F811 """Mock profile request endpoint""" # Generate unique IDs based on total requests made total_requests = len(mock_state["completed_commands"]) + len(mock_state["pending_commands"]) + 1 command_id = f"cmd_{total_requests}" request_id = f"req_{total_requests}" - + # Add command to pending queue - mock_state["pending_commands"].append({ 
- "command_id": command_id, - "profiling_command": { - "command_type": json.get("command_type", "start"), - "combined_config": { - "duration": json.get("duration", 60), - "frequency": json.get("frequency", 11), - "profiling_mode": json.get("profiling_mode", "cpu") - } + mock_state["pending_commands"].append( + { + "command_id": command_id, + "profiling_command": { + "command_type": json.get("command_type", "start"), + "combined_config": { + "duration": json.get("duration", 60), + "frequency": json.get("frequency", 11), + "profiling_mode": json.get("profiling_mode", "cpu"), + }, + }, } - }) - + ) + response = unittest.mock.Mock() response.status_code = 200 response.json.return_value = { - "message": f"Profiling request created", + "message": "Profiling request created", "request_id": request_id, - "command_id": command_id + "command_id": command_id, } - + return response - - def mock_command_completion_post(url, json=None, timeout=None): + + def mock_command_completion_post(url, json=None, timeout=None): # noqa: F811 """Mock command completion endpoint""" - mock_state["completed_commands"].append({ - "command_id": json.get("command_id"), - "status": json.get("status"), - "execution_time": json.get("execution_time") - }) - + mock_state["completed_commands"].append( + { + "command_id": json.get("command_id"), + "status": json.get("status"), + "execution_time": json.get("execution_time"), + } + ) + response = unittest.mock.Mock() response.status_code = 200 - response.json.return_value = { - "message": "Command completion received" - } - + response.json.return_value = {"message": "Command completion received"} + return response - - def mock_post(url, json=None, timeout=None): + + def mock_post(url, json=None, timeout=None): # noqa: F811 """Route mock requests to appropriate handlers""" if "/heartbeat" in url: return mock_heartbeat_post(url, json, timeout) @@ -268,91 +258,94 @@ def mock_post(url, json=None, timeout=None): response.status_code = 404 response.text = "Not 
found" return response - + return mock_post, mock_state + def run_tests(): """Run the actual test logic""" - + # Initialize test client client = HeartbeatClient(BACKEND_URL, SERVICE_NAME, HOSTNAME, IP_ADDRESS) - + # Test 1: Send initial heartbeat (should have no commands) print("\n1️⃣ Test: Initial heartbeat (no commands expected)") client.send_heartbeat() - + # Test 2: Create a START profiling request print("\n2️⃣ Test: Create START profiling request") if create_test_profiling_request(BACKEND_URL, SERVICE_NAME, "start"): time.sleep(0.1) # Give backend time to process - + # Send heartbeat to receive the command print("\n 📡 Sending heartbeat to receive command...") command = client.send_heartbeat() - + if command: client.simulate_profiling_action(command["command_type"], command["command_id"]) - + # Test idempotency - send heartbeat again print("\n 🔄 Testing idempotency - sending heartbeat again...") command = client.send_heartbeat() if command is None: print("✅ Idempotency working - no duplicate command received") - + # Test 3: Create a STOP profiling request print("\n3️⃣ Test: Create STOP profiling request") if create_test_profiling_request(BACKEND_URL, SERVICE_NAME, "stop"): time.sleep(0.1) # Give backend time to process - + # Send heartbeat to receive the stop command print("\n 📡 Sending heartbeat to receive stop command...") command = client.send_heartbeat() - + if command: client.simulate_profiling_action(command["command_type"], command["command_id"]) - + # Test 4: Multiple heartbeats with no commands print("\n4️⃣ Test: Multiple heartbeats with no pending commands") for i in range(3): print(f"\n Heartbeat {i+1}/3:") client.send_heartbeat() time.sleep(0.1) - + print("\n✅ Test completed!") print("\nTest Summary:") print(f" - Executed commands: {len(client.executed_commands)}") print(f" - Last command ID: {client.last_command_id}") print(f" - Commands executed: {list(client.executed_commands)}") + def main(): """Main test function""" print("🧪 Testing 
Heartbeat-Based Profiling Control System") - + if MOCK_MODE: print("🎭 Running in MOCK MODE (no real backend required)") print(" Use --live flag to test against real backend on localhost:8000") mock_post, mock_state = create_mock_responses() - + # Patch requests.post for mock mode - with unittest.mock.patch('requests.post', side_effect=mock_post): + with unittest.mock.patch("requests.post", side_effect=mock_post): print("=" * 60) run_tests() - + # Print mock state summary - print(f"\n📊 Mock Backend State:") + print("\n📊 Mock Backend State:") print(f" - Total heartbeats: {mock_state['heartbeat_count']}") print(f" - Pending commands: {len(mock_state['pending_commands'])}") print(f" - Completed commands: {len(mock_state['completed_commands'])}") - - if mock_state['completed_commands']: + + if mock_state["completed_commands"]: print(" - Command completions:") - for cmd in mock_state['completed_commands']: + for cmd in mock_state["completed_commands"]: print(f" * {cmd['command_id']}: {cmd['status']} ({cmd['execution_time']}s)") - + else: print("🌐 Running in LIVE MODE (requires backend on localhost:8000)") print("=" * 60) run_tests() + if __name__ == "__main__": main() From 5a98709859efbdbe5de83f51a507f2c3a94553d7 Mon Sep 17 00:00:00 2001 From: ashokchatharajupalli Date: Thu, 22 Jan 2026 15:52:39 +0000 Subject: [PATCH 7/8] Fixed by ensuring -a comes before -- in the perf command --- gprofiler/utils/perf_process.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/gprofiler/utils/perf_process.py b/gprofiler/utils/perf_process.py index a5df15a9f..167af7fee 100644 --- a/gprofiler/utils/perf_process.py +++ b/gprofiler/utils/perf_process.py @@ -142,7 +142,23 @@ def _get_perf_cmd(self) -> List[str]: # If no explicit events are provided but cgroups are used, add default event. # For multiple cgroups, perf requires one event per cgroup. 
extra_args = self._extra_args - if self._cgroup_args and not extra_args: + + # Separate extra_args into perf options and application command + # The "--" separator marks the boundary between perf args and the app command + perf_extra_args = [] + app_command = [] + separator_found = False + + for arg in extra_args: + if arg == "--": + separator_found = True + app_command.append(arg) + elif separator_found: + app_command.append(arg) + else: + perf_extra_args.append(arg) + + if self._cgroup_args and not perf_extra_args: # Count the number of cgroups (they are comma-separated in -G argument) cgroup_arg = None for i, arg in enumerate(self._cgroup_args): @@ -153,12 +169,12 @@ def _get_perf_cmd(self) -> List[str]: if cgroup_arg: num_cgroups = len(cgroup_arg.split(",")) # Add one event per cgroup (perf requirement) - extra_args = [] + perf_extra_args = [] for _ in range(num_cgroups): - extra_args.extend(["-e", "cycles"]) + perf_extra_args.extend(["-e", "cycles"]) else: # Fallback: single event - extra_args = ["-e", "cycles"] + perf_extra_args = ["-e", "cycles"] return ( [ @@ -178,10 +194,11 @@ def _get_perf_cmd(self) -> List[str]: "-m", str(self._MMAP_SIZES[self._type]), ] - + extra_args # Events must come before cgroups + + perf_extra_args # Events must come before cgroups + self._pid_args + self._cgroup_args + (["-k", "1"] if self._inject_jit else []) + + app_command # Application command (with "--") must be last ) def start(self) -> None: From 01b3ad08535f26db7aff717ee1b30257a451b3b9 Mon Sep 17 00:00:00 2001 From: ashokchatharajupalli Date: Mon, 26 Jan 2026 22:19:42 +0000 Subject: [PATCH 8/8] Improve heartbeat mode logging with clearer status messages after stop command --- gprofiler/heartbeat.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gprofiler/heartbeat.py b/gprofiler/heartbeat.py index 92ee05bf8..6ebc6e904 100644 --- a/gprofiler/heartbeat.py +++ b/gprofiler/heartbeat.py @@ -98,7 +98,7 @@ def send_heartbeat(self) -> Optional[Dict[str, 
Any]]: logger.info(f"Received profiling command from server: {result.get('command_id')}") return result else: - logger.debug("Heartbeat successful, no pending commands") + logger.debug("Heartbeat sent. No pending commands, waiting for instructions...") return None else: logger.error(f"Heartbeat failed with status {response.status_code}: {response.text}") @@ -214,7 +214,7 @@ def start_heartbeat_loop(self): # Check for idempotency - skip if command already executed if command_id in self.heartbeat_client.executed_command_ids: - logger.info(f"Command ID {command_id} already executed, skipping...") + logger.debug(f"Command ID {command_id} already executed, skipping...") # Wait for next heartbeat self.stop_event.wait(self.heartbeat_interval) @@ -257,6 +257,9 @@ def start_heartbeat_loop(self): error_message=None, results_path=None, ) + + # Log a clear message after stop is completed + logger.info("Profiling stopped. Running in heartbeat mode, waiting for commands...") elif command_type == "start": # Stop current profiler if running, then start new one logger.info(f"Executing START command for command ID: {command_id}")