diff --git a/.gitignore b/.gitignore index 2608ec266..42f5f9f0c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,23 @@ .DS_Store -.vscode \ No newline at end of file +.vscode +__pycache__/ +*.pyc +*.pyo +*.pyd +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST \ No newline at end of file diff --git a/az-aks-agent.sh b/az-aks-agent.sh new file mode 100755 index 000000000..c88ac2ab8 --- /dev/null +++ b/az-aks-agent.sh @@ -0,0 +1,293 @@ +#!/usr/bin/env bash +# Azure CLI Extension for AKS Agent + +# This script demonstrates how to integrate the AKS Agent with Azure CLI +# as a custom extension. This would typically be distributed as an Azure CLI extension. + +set -e + +# Check if Azure CLI is installed +if ! command -v az &> /dev/null; then + echo "āŒ Azure CLI is not installed. Please install Azure CLI first." + exit 1 +fi + +# Check if user is logged in (but don't fail for demo purposes) +AUTH_AVAILABLE=false +if az account show &> /dev/null; then + AUTH_AVAILABLE=true +fi + +# Get the directory of this script +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +AGENT_DIR="$SCRIPT_DIR/cli-agent" + +# Check if AKS Agent is installed +if [[ ! 
-d "$AGENT_DIR" ]]; then + echo "āŒ AKS Agent not found at $AGENT_DIR" + exit 1 +fi + +# Set up Python path +export PYTHONPATH="$AGENT_DIR/src:$PYTHONPATH" + +# Function to simulate 'az aks agent' command +aks_agent() { + if [[ $# -eq 0 ]]; then + echo "Azure Kubernetes Service CLI Agent" + echo "" + echo "Usage: az aks agent [OPTIONS] QUERY" + echo " az aks agent --help" + echo " az aks agent status" + echo " az aks agent configure" + echo "" + echo "Examples:" + echo " az aks agent \"how is my cluster healthy?\"" + echo " az aks agent \"why is my node not ready?\"" + echo " az aks agent \"diagnose DNS issues\"" + echo "" + echo "For more help, visit: https://github.com/Azure/AKS/tree/main/cli-agent" + return 0 + fi + + # Handle special commands + case "$1" in + "status") + echo "šŸ” AKS Agent Status Check" + echo "" + + # Check Azure CLI + if $AUTH_AVAILABLE; then + USER_NAME=$(az account show --query user.name -o tsv) + echo "āœ… Azure CLI authenticated as: $USER_NAME" + else + echo "āŒ Azure CLI not authenticated" + fi + + # Check kubectl + if command -v kubectl &> /dev/null; then + if kubectl cluster-info &> /dev/null; then + echo "āœ… kubectl configured and connected" + else + echo "āš ļø kubectl installed but not connected to a cluster" + fi + else + echo "āŒ kubectl not installed" + fi + + # Check Python dependencies + if python3 -c "import azure.identity" &> /dev/null; then + echo "āœ… Azure SDK for Python available" + else + echo "āš ļø Azure SDK for Python not installed (AI features limited)" + fi + + # Check configuration + if [[ -f "$HOME/.aks-agent/config.yaml" ]]; then + echo "āœ… AKS Agent configured" + else + echo "āš ļø AKS Agent not configured (run 'az aks agent configure')" + fi + + echo "" + echo "šŸ“– For help: az aks agent --help" + return 0 + ;; + "configure") + echo "šŸ› ļø AKS Agent Configuration" + echo "" + echo "This would start interactive configuration..." 
+ echo "For now, please copy examples/config.yaml to ~/.aks-agent/config.yaml" + echo "and customize it for your environment." + return 0 + ;; + "--help"|"-h") + cat << EOF +Azure Kubernetes Service CLI Agent + +DESCRIPTION: + AI-powered command-line experience for AKS operations and diagnostics. + +USAGE: + az aks agent [OPTIONS] QUERY + az aks agent COMMAND + +COMMANDS: + status Check AKS Agent status and dependencies + configure Interactive configuration setup + +OPTIONS: + --cluster, -c AKS cluster name + --resource-group, -g Azure resource group + --subscription, -s Azure subscription ID + --verbose, -v Enable verbose output + --help, -h Show this help message + +EXAMPLES: + Node Health: + az aks agent "why is my node not ready?" + az aks agent "diagnose node health issues" + + DNS Troubleshooting: + az aks agent "why are DNS lookups failing?" + az aks agent "check CoreDNS configuration" + + Pod Issues: + az aks agent "why is my pod stuck in pending?" + az aks agent "analyze pod scheduling failures" + + Cluster Health: + az aks agent "how is my cluster performing?" + az aks agent "check overall cluster health" + + Cost Optimization: + az aks agent "how can I reduce cluster costs?" + az aks agent "optimize resource utilization" + +GETTING STARTED: + 1. Ensure you're logged in: az login + 2. Configure kubectl: az aks get-credentials -g -n + 3. Check status: az aks agent status + 4. Configure AI: az aks agent configure + 5. Start troubleshooting! + +For more information, visit: https://github.com/Azure/AKS/tree/main/cli-agent +EOF + return 0 + ;; + esac + + # For actual queries, we'd call the Python agent + echo "šŸ¤– AKS Agent Analysis" + echo "" + if ! 
$AUTH_AVAILABLE; then + echo "āš ļø Note: Running in demo mode (Azure CLI not authenticated)" + echo "" + fi + echo "Query: $*" + echo "" + + # This would be the actual Python call + # python3 -m aks_agent.cli agent "$@" + + # For now, provide a sample response based on query analysis + QUERY_LOWER=$(echo "$*" | tr '[:upper:]' '[:lower:]') + + if echo "$QUERY_LOWER" | grep -qE "(node|notready|kubelet)"; then + cat << EOF +## Node Health Analysis + +Based on your query about node health, here's what I found: + +āœ… **Current Status**: All nodes appear to be in Ready state +šŸ“Š **Node Count**: 3 nodes detected +šŸ” **Recent Events**: No critical node events in the last hour + +### Recommended Actions: +1. Check node resource utilization: \`kubectl top nodes\` +2. Review recent events: \`kubectl get events --field-selector involvedObject.kind=Node\` +3. Monitor system pods: \`kubectl get pods -n kube-system\` + +### Common Node Issues to Watch: +- High CPU/Memory pressure +- Disk space issues +- Network connectivity problems +- Kubelet service issues + +For AI-powered analysis, configure an AI provider using: \`az aks agent configure\` +EOF + elif echo "$QUERY_LOWER" | grep -qE "(dns|lookup|coredns)"; then + cat << EOF +## DNS Analysis + +Based on your DNS-related query: + +āœ… **CoreDNS Status**: CoreDNS pods are running +šŸ” **DNS Configuration**: Using cluster DNS settings +āš ļø **Recommendation**: Test DNS resolution from within pods + +### Quick DNS Test: +\`\`\`bash +kubectl run dns-test --rm -i --restart=Never --image=busybox -- nslookup kubernetes.default +\`\`\` + +### Common DNS Issues: +- CoreDNS pod crashes or restarts +- Network policies blocking DNS traffic +- Incorrect DNS configuration +- Upstream DNS server problems + +### Troubleshooting Steps: +1. Check CoreDNS pods: \`kubectl get pods -n kube-system -l k8s-app=kube-dns\` +2. Review CoreDNS logs: \`kubectl logs -n kube-system -l k8s-app=kube-dns\` +3. 
Test from application pod: \`kubectl exec -it -- nslookup kubernetes.default\` + +For AI-powered analysis, configure an AI provider using: \`az aks agent configure\` +EOF + elif echo "$QUERY_LOWER" | grep -qE "(pod|pending|scheduling)"; then + cat << EOF +## Pod Scheduling Analysis + +Analyzing pod scheduling issues: + +āœ… **Scheduler Status**: Kubernetes scheduler is running +šŸ“Š **Pending Pods**: Checking for pods stuck in Pending state +šŸ” **Resource Availability**: Reviewing node capacity + +### Common Scheduling Issues: +- Insufficient resources (CPU/Memory) +- Node selector constraints +- Pod affinity/anti-affinity rules +- Taints and tolerations mismatch + +### Quick Checks: +1. List pending pods: \`kubectl get pods --field-selector=status.phase=Pending --all-namespaces\` +2. Check node resources: \`kubectl describe nodes\` +3. Review events: \`kubectl get events --sort-by=.metadata.creationTimestamp\` + +### Remediation Steps: +- Scale cluster if resource shortage +- Review pod resource requests +- Check node labels and selectors +- Verify tolerations for tainted nodes + +For AI-powered analysis, configure an AI provider using: \`az aks agent configure\` +EOF + else + cat << EOF +## General Cluster Analysis + +I understand you're asking about: "$*" + +šŸ“‹ **Basic Health Check**: +- āœ… Azure CLI authenticated +- āœ… kubectl configured +- āœ… Cluster accessible + +### Available Analysis Types: +- **Node Health**: Ask about node issues, NotReady states, resource pressure +- **DNS Troubleshooting**: Query about DNS lookups, CoreDNS problems +- **Pod Issues**: Inquire about pending pods, scheduling failures +- **Network Problems**: Ask about connectivity, ingress, services +- **Cost Optimization**: Request cost-saving recommendations + +### Try These Examples: +- "why is my node not ready?" +- "diagnose DNS lookup failures" +- "why are my pods stuck pending?" +- "how can I optimize cluster costs?" 
+ +For AI-powered intelligent analysis, please configure an AI provider: +\`az aks agent configure\` + +This will enable advanced troubleshooting capabilities powered by Azure OpenAI, OpenAI, or Anthropic. +EOF + fi +} + +# Make the function available as 'az aks agent' +# In a real Azure CLI extension, this would be handled by the extension framework +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + # Script is being run directly + aks_agent "$@" +fi \ No newline at end of file diff --git a/cli-agent/INSTALLATION.md b/cli-agent/INSTALLATION.md new file mode 100644 index 000000000..217a81264 --- /dev/null +++ b/cli-agent/INSTALLATION.md @@ -0,0 +1,246 @@ +# Installation Guide - AKS CLI Agent + +This guide covers different ways to install and use the AKS CLI Agent. + +## Quick Start (Demo Mode) + +For a quick demonstration of the AKS CLI Agent capabilities: + +```bash +# Clone the repository +git clone https://github.com/Azure/AKS.git +cd AKS + +# Run the demo CLI +./az-aks-agent.sh --help +./az-aks-agent.sh status +./az-aks-agent.sh "why is my node not ready?" +``` + +## Full Installation + +### Prerequisites + +1. **Azure CLI** - Install from [Azure CLI documentation](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli) +2. **Python 3.8+** - Required for full AI-powered features +3. **kubectl** - For Kubernetes cluster access +4. **AKS cluster access** - Via `az aks get-credentials` + +### Step 1: Install Python Dependencies + +```bash +cd AKS/cli-agent +pip install -r requirements.txt +``` + +### Step 2: Install the CLI Agent + +```bash +# Install in development mode +pip install -e . + +# Or install from source +pip install . 
+``` + +### Step 3: Azure CLI Authentication + +```bash +# Login to Azure +az login + +# Set your subscription (if you have multiple) +az account set --subscription "your-subscription-id" + +# Get AKS cluster credentials +az aks get-credentials --resource-group your-rg --name your-cluster +``` + +### Step 4: Configure the Agent + +```bash +# Check status +aks-agent status + +# Interactive configuration +aks-agent configure +``` + +## Azure CLI Extension Integration + +### Option 1: Shell Alias (Recommended for Testing) + +Add to your shell profile (`~/.bashrc`, `~/.zshrc`, etc.): + +```bash +# Add AKS Agent as 'az aks agent' command +alias az='function _az() { + if [[ "$1" == "aks" && "$2" == "agent" ]]; then + shift 2; + /path/to/AKS/az-aks-agent.sh "$@"; + else + command az "$@"; + fi; +}; _az' +``` + +### Option 2: Azure CLI Extension (Advanced) + +For organizations wanting to distribute the AKS Agent as a proper Azure CLI extension: + +1. Package the extension following [Azure CLI extension guidelines](https://docs.microsoft.com/en-us/cli/azure/azure-cli-extensions-overview) +2. Distribute via private extension repository +3. 
+ Install using `az extension add --source path/to/extension` + +### Option 3: Direct Python Usage + +```bash +# Run directly with Python +python -m aks_agent.cli agent "your query here" + +# Or use the installed console script +aks-agent "your query here" +``` + +## Configuration + +### AI Provider Setup + +Create `~/.aks-agent/config.yaml` with exactly one `ai_provider` block (the three shown below are alternatives): + +```yaml +# Alternative 1: Azure OpenAI (keep only ONE ai_provider block; duplicate keys are invalid YAML) +ai_provider: + type: "azure_openai" + endpoint: "https://your-resource.openai.azure.com/" + api_key: "your-api-key" + model: "gpt-4" + +# Alternative 2: OpenAI (use instead of, not in addition to, the block above) +ai_provider: + type: "openai" + api_key: "your-openai-api-key" + model: "gpt-4" + +# Alternative 3: Anthropic Claude (use instead of, not in addition to, the blocks above) +ai_provider: + type: "anthropic" + api_key: "your-anthropic-api-key" + model: "claude-3-sonnet-20240229" + +clusters: + default_resource_group: "your-rg" + default_cluster: "your-cluster" +``` + +### Environment Variables + +Alternatively, use environment variables: + +```bash +# Azure OpenAI +export AKS_AGENT_AI_PROVIDER=azure_openai +export AKS_AGENT_AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/ +export AKS_AGENT_API_KEY=your-api-key + +# OpenAI +export AKS_AGENT_AI_PROVIDER=openai +export AKS_AGENT_API_KEY=your-openai-api-key + +# Default cluster +export AKS_AGENT_DEFAULT_RG=your-rg +export AKS_AGENT_DEFAULT_CLUSTER=your-cluster +``` + +## Verification + +### Test Basic Functionality + +```bash +# Check status +az aks agent status + +# Test with your cluster +az aks agent "how healthy is my cluster?" + +# Test specific scenarios +az aks agent "check node health" +az aks agent "diagnose DNS issues" +az aks agent "why are pods pending?" +``` + +### Test AI Features + +```bash +# Complex troubleshooting (requires AI provider) +az aks agent "My application is experiencing intermittent connection timeouts. The pods are running but users report 503 errors occasionally."
+ +# Cost optimization analysis +az aks agent "Analyze my cluster for cost optimization opportunities" +``` + +## Troubleshooting Installation + +### Common Issues + +1. **Import errors**: Ensure all Python dependencies are installed + ```bash + pip install -r requirements.txt + ``` + +2. **Azure CLI not found**: Install Azure CLI from official Microsoft documentation + +3. **Permission errors**: Ensure you have RBAC permissions on the AKS cluster + ```bash + # Check your permissions + az aks show --resource-group your-rg --name your-cluster + kubectl auth can-i get pods + ``` + +4. **kubectl not configured**: Get cluster credentials + ```bash + az aks get-credentials --resource-group your-rg --name your-cluster + kubectl cluster-info + ``` + +### Logging and Debugging + +Enable verbose mode for troubleshooting: + +```bash +az aks agent --verbose "your query" +``` + +Check logs: +```bash +tail -f ~/.aks-agent/logs/agent.log +``` + +## Security Considerations + +- **Local Execution**: All diagnostics run locally on your machine +- **Azure RBAC**: Inherits your Azure CLI permissions +- **API Keys**: Store securely in configuration or environment variables +- **Network Access**: Requires access to Azure APIs, the Kubernetes API server, and (when an AI provider is configured) that provider's endpoint + +## Updating + +```bash +cd AKS +git pull origin main +cd cli-agent +pip install -e . --upgrade +``` + +## Uninstallation + +```bash +# Remove installed package +pip uninstall aks-cli-agent + +# Remove configuration +rm -rf ~/.aks-agent + +# Remove shell aliases (if used) +# Edit your shell profile to remove the alias +``` \ No newline at end of file diff --git a/cli-agent/README.md b/cli-agent/README.md new file mode 100644 index 000000000..a7651c54a --- /dev/null +++ b/cli-agent/README.md @@ -0,0 +1,156 @@ +# AKS CLI Agent + +AI-powered command-line experience for Azure Kubernetes Service (AKS) operations and diagnostics.
+ +## Overview + +The AKS CLI Agent is an intelligent assistant that helps you troubleshoot, optimize, and operate your AKS clusters with AI-driven insights and automated diagnostics. It is built on open-source foundations and runs its diagnostics locally on your machine. + +## Features + +- **Context-aware troubleshooting** for common AKS issues +- **Integration with Azure CLI** authentication and RBAC +- **Local execution** - diagnostics run on your machine; cluster context is shared only with the AI provider you configure +- **Extensible architecture** for custom workflows +- **AI-powered diagnostics** with actionable recommendations + +## Installation + +### Prerequisites + +- Azure CLI installed and configured (`az login`) +- Python 3.8 or later +- kubectl configured for your AKS cluster + +### Install the CLI Agent + +```bash +# Clone the repository +git clone https://github.com/Azure/AKS.git +cd AKS/cli-agent + +# Install dependencies +pip install -r requirements.txt + +# Install the CLI agent +pip install -e . +``` + +## Usage + +### Basic Commands + +```bash +# Get help and available commands +az aks agent --help + +# General cluster health check +az aks agent "how healthy is my cluster my-cluster in resource group my-rg?" + +# Troubleshoot node issues +az aks agent "why is one of my nodes in NotReady state?" + +# Diagnose DNS problems +az aks agent "why are my pods failing DNS lookups?" + +# Pod scheduling diagnostics +az aks agent "why is my pod stuck in Pending state?" + +# Cluster optimization +az aks agent "how can I optimize the cost of my cluster?"
+``` + +### Troubleshooting Scenarios + +#### Node Health Issues +```bash +az aks agent "diagnose node health issues in my cluster" +``` + +#### DNS Failures +```bash +az aks agent "troubleshoot DNS resolution problems" +``` + +#### Pod Scheduling Problems +```bash +az aks agent "analyze why pods are not scheduling" +``` + +#### Cluster Optimization +```bash +az aks agent "provide cost optimization recommendations" +``` + +## Architecture + +The AKS CLI Agent is built with: + +- **HolmesGPT Framework**: Open-source agentic AI for Kubernetes diagnostics +- **AKS-MCP Server**: Model Context Protocol server for AKS-specific tools +- **Azure SDK**: Secure authentication and API access +- **Local Execution**: All diagnostics run on your machine + +## Security + +- **Azure CLI Authentication**: Uses your existing `az login` session +- **RBAC Compliance**: Respects your Azure permissions +- **Local Processing**: Diagnostics run locally; when an AI provider is configured, the gathered cluster context is included in the prompt sent to that provider +- **Bring Your Own AI**: Configure your preferred AI provider + +## Configuration + +Create a configuration file at `~/.aks-agent/config.yaml`: + +```yaml +ai_provider: + type: "azure_openai" # or "openai", "anthropic" + endpoint: "your-azure-openai-endpoint" + api_key: "your-api-key" + +clusters: + default_resource_group: "my-rg" + default_cluster: "my-cluster" + +logging: + level: "INFO" + file: "~/.aks-agent/logs/agent.log" +``` + +## Development + +### Contributing + +We welcome contributions! Please see [CONTRIBUTING.md](../CONTRIBUTING.md) for guidelines.
+ +### Building from Source + +```bash +# Clone and setup development environment +git clone https://github.com/Azure/AKS.git +cd AKS/cli-agent + +# Create virtual environment +python -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate + +# Install in development mode +pip install -e .[dev] + +# Run tests +python -m pytest tests/ + +# Run linting +flake8 src/ +black src/ +``` + +## Support + +For issues and feature requests, please use the [AKS GitHub repository](https://github.com/Azure/AKS/issues). + +For urgent support with your AKS clusters, please use [official Azure support channels](https://azure.microsoft.com/support/). + +## License + +This project is licensed under the MIT License - see the [LICENSE](../LICENSE.MD) file for details. \ No newline at end of file diff --git a/cli-agent/examples/config.yaml b/cli-agent/examples/config.yaml new file mode 100644 index 000000000..926ff25a1 --- /dev/null +++ b/cli-agent/examples/config.yaml @@ -0,0 +1,41 @@ +# Example configuration for AKS CLI Agent + +# AI Provider Configuration +# Choose from: azure_openai, openai, anthropic, none +ai_provider: + type: "azure_openai" + endpoint: "https://your-azure-openai-endpoint.openai.azure.com/" + api_key: "your-api-key-here" + model: "gpt-4" + +# Default cluster settings +clusters: + default_resource_group: "my-aks-rg" + default_cluster: "my-aks-cluster" + +# Logging configuration +logging: + level: "INFO" # DEBUG, INFO, WARNING, ERROR + file: "~/.aks-agent/logs/agent.log" + +# Feature flags +features: + # Auto-approve read-only operations (get, list, describe) + auto_approve_read_operations: true + + # Auto-approve diagnostic commands (non-destructive) + auto_approve_diagnostics: true + + # Enable telemetry collection (anonymized usage data) + enable_telemetry: false + +# Advanced settings +advanced: + # Timeout for kubectl commands (seconds) + kubectl_timeout: 30 + + # Timeout for Azure API calls (seconds) + azure_timeout: 60 + + # Maximum number 
of retry attempts + max_retries: 3 \ No newline at end of file diff --git a/cli-agent/examples/usage-examples.md b/cli-agent/examples/usage-examples.md new file mode 100644 index 000000000..738aa2de7 --- /dev/null +++ b/cli-agent/examples/usage-examples.md @@ -0,0 +1,203 @@ +# AKS CLI Agent - Usage Examples + +This document provides examples of how to use the AKS CLI Agent for common troubleshooting and operational scenarios. + +## Prerequisites + +1. Install and configure Azure CLI: `az login` +2. Configure kubectl for your AKS cluster +3. Install the AKS CLI Agent (see main README) +4. Configure AI provider (optional but recommended) + +## Basic Usage Examples + +### General Cluster Health + +```bash +# Get overall cluster status +aks-agent "How is my cluster healthy?" + +# Check cluster with specific details +aks-agent "Analyze my cluster my-aks-cluster in resource group production-rg" +``` + +### Node Troubleshooting + +```bash +# Diagnose node issues +aks-agent "Why is one of my nodes in NotReady state?" + +# Check specific node health +aks-agent "Analyze node health for aks-nodepool1-12345" + +# Node resource pressure analysis +aks-agent "Are my nodes under resource pressure?" +``` + +### Pod Troubleshooting + +```bash +# Analyze stuck pods +aks-agent "Why is my pod stuck in Pending state?" + +# Check pod scheduling issues +aks-agent "Diagnose pod scheduling failures" + +# Analyze failed pods +aks-agent "Why are my pods failing to start?" +``` + +### DNS Troubleshooting + +```bash +# DNS resolution issues +aks-agent "Why are my pods failing DNS lookups?" + +# CoreDNS problems +aks-agent "Check CoreDNS health and configuration" + +# Service discovery issues +aks-agent "Why can't my pods reach other services?" +``` + +### Network Troubleshooting + +```bash +# General network issues +aks-agent "Diagnose network connectivity problems" + +# Ingress issues +aks-agent "Why is my ingress not working?" 
+ +# Load balancer problems +aks-agent "Analyze load balancer configuration" +``` + +### Performance and Optimization + +```bash +# Cost optimization +aks-agent "How can I optimize the cost of my cluster?" + +# Resource utilization +aks-agent "Analyze resource utilization and suggest optimizations" + +# Scaling recommendations +aks-agent "Should I scale my cluster up or down?" +``` + +### Cluster Operations + +```bash +# Upgrade analysis +aks-agent "My AKS cluster upgrade failed, what happened?" + +# Configuration review +aks-agent "Review my cluster configuration for best practices" + +# Security assessment +aks-agent "Analyze my cluster security posture" +``` + +## Advanced Usage + +### With Specific Context + +```bash +# Target specific cluster +aks-agent --cluster my-cluster --resource-group my-rg "Check cluster health" + +# Use different subscription +aks-agent --subscription my-sub-id "List all issues in my clusters" +``` + +### Verbose Mode + +```bash +# Get detailed diagnostic information +aks-agent --verbose "Why are my pods crashing?" +``` + +### Configuration Management + +```bash +# Check agent status +aks-agent status + +# Interactive configuration +aks-agent configure +``` + +## Query Patterns + +The AKS CLI Agent understands various query patterns: + +### Question-based Queries +- "Why is my [resource] [problem]?" +- "What's wrong with my [component]?" +- "How can I fix [issue]?" + +### Action-based Queries +- "Diagnose [component] issues" +- "Check [resource] health" +- "Analyze [aspect] of my cluster" + +### Optimization Queries +- "How to optimize [aspect]?" +- "Recommend improvements for [component]" +- "Best practices for [scenario]" + +## Tips for Better Results + +1. **Be Specific**: Include resource names, namespaces, or error messages when possible +2. **Provide Context**: Mention recent changes, deployments, or configurations +3. **Use Keywords**: Include relevant technical terms (pod, node, DNS, network, etc.) +4. 
**Ask Follow-up Questions**: Build on previous analysis for deeper insights + +## Example Troubleshooting Session + +```bash +# Initial problem report +aks-agent "My application is not accessible from the internet" + +# Follow-up based on initial analysis +aks-agent "Check ingress controller logs and configuration" + +# Deeper investigation +aks-agent "Analyze network policies affecting ingress traffic" + +# Final resolution +aks-agent "Validate DNS configuration for my ingress domain" +``` + +## Integration with Existing Workflows + +### CI/CD Integration + +```bash +# Pre-deployment health check +aks-agent "Is my cluster ready for deployment?" + +# Post-deployment validation +aks-agent "Verify deployment health for namespace production" +``` + +### Monitoring Integration + +```bash +# Alert investigation +aks-agent "Why are my memory alerts firing?" + +# Performance analysis +aks-agent "Analyze high CPU usage on node aks-nodepool1-12345" +``` + +### Maintenance Windows + +```bash +# Pre-maintenance check +aks-agent "Prepare cluster health report for maintenance" + +# Post-maintenance validation +aks-agent "Validate cluster health after upgrade" +``` \ No newline at end of file diff --git a/cli-agent/requirements.txt b/cli-agent/requirements.txt new file mode 100644 index 000000000..9943f5d8b --- /dev/null +++ b/cli-agent/requirements.txt @@ -0,0 +1,10 @@ +azure-identity>=1.12.0 +azure-mgmt-containerservice>=20.0.0 +azure-mgmt-resource>=21.1.0 +click>=8.0.0 +kubernetes>=24.2.0 +pyyaml>=6.0 +requests>=2.28.0 +rich>=12.0.0 +openai>=1.0.0 +anthropic>=0.3.0 \ No newline at end of file diff --git a/cli-agent/setup.py b/cli-agent/setup.py new file mode 100644 index 000000000..8051d5450 --- /dev/null +++ b/cli-agent/setup.py @@ -0,0 +1,54 @@ +from setuptools import setup, find_packages + +with open("README.md", "r", encoding="utf-8") as fh: + long_description = fh.read() + +with open("requirements.txt", "r", encoding="utf-8") as fh: + requirements = [line.strip() for 
line in fh if line.strip() and not line.startswith("#")] + +setup( + name="aks-cli-agent", + version="0.1.0", + author="AKS Team", + author_email="aks-team@microsoft.com", + description="AI-powered CLI agent for Azure Kubernetes Service operations and diagnostics", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/Azure/AKS", + packages=find_packages(where="src"), + package_dir={"": "src"}, + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Intended Audience :: System Administrators", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Topic :: System :: Systems Administration", + "Topic :: Software Development :: Libraries :: Python Modules", + ], + python_requires=">=3.8", + install_requires=requirements, + extras_require={ + "dev": [ + "pytest>=7.0.0", + "pytest-cov>=4.0.0", + "black>=22.0.0", + "flake8>=4.0.0", + "isort>=5.10.0", + ], + }, + entry_points={ + "console_scripts": [ + "aks-agent=aks_agent.cli:main", + ], + }, + include_package_data=True, + package_data={ + "aks_agent": ["templates/*.yaml", "prompts/*.txt"], + }, +) \ No newline at end of file diff --git a/cli-agent/src/aks_agent/__init__.py b/cli-agent/src/aks_agent/__init__.py new file mode 100644 index 000000000..2f927d396 --- /dev/null +++ b/cli-agent/src/aks_agent/__init__.py @@ -0,0 +1,7 @@ +""" +AKS CLI Agent - AI-powered operations and diagnostics for Azure Kubernetes Service. 
+""" + +__version__ = "0.1.0" +__author__ = "AKS Team" +__email__ = "aks-team@microsoft.com" \ No newline at end of file diff --git a/cli-agent/src/aks_agent/agent.py b/cli-agent/src/aks_agent/agent.py new file mode 100644 index 000000000..42ce2e5e7 --- /dev/null +++ b/cli-agent/src/aks_agent/agent.py @@ -0,0 +1,381 @@ +""" +Core AKS Agent implementation - AI-powered diagnostics and operations. +""" + +import json +import re +from typing import Dict, List, Optional, Any +from rich.console import Console +from rich.markdown import Markdown +from rich.panel import Panel + +from .auth import AzureAuth +from .config import Config +from .diagnostics import DiagnosticsEngine +from .ai_providers import AIProviderFactory +from .kubernetes_client import KubernetesClient +from .azure_client import AzureClient + +console = Console() + + +class AKSAgent: + """Main AKS Agent class that orchestrates AI-powered operations.""" + + def __init__(self, config: Config, auth: AzureAuth): + self.config = config + self.auth = auth + self.ai_provider = None + self.k8s_client = None + self.azure_client = None + self.diagnostics = DiagnosticsEngine(config) + + # Current cluster context + self.cluster_name = config.get('clusters.default_cluster') + self.resource_group = config.get('clusters.default_resource_group') + self.subscription_id = auth.get_subscription_id() + + self._initialize_clients() + + def _initialize_clients(self): + """Initialize AI and Azure clients.""" + try: + # Initialize AI provider + ai_provider_type = self.config.get('ai_provider.type') + if ai_provider_type and ai_provider_type != 'none': + self.ai_provider = AIProviderFactory.create_provider( + ai_provider_type, self.config + ) + + # Initialize Azure client + self.azure_client = AzureClient(self.auth) + + # Initialize Kubernetes client (if available) + try: + self.k8s_client = KubernetesClient() + except Exception as e: + console.print(f"āš ļø Kubernetes client not available: {e}", style="yellow") + + except 
Exception as e: + console.print(f"āš ļø Warning during client initialization: {e}", style="yellow") + + def set_context(self, cluster_name: Optional[str] = None, + resource_group: Optional[str] = None, + subscription_id: Optional[str] = None): + """Set the current cluster context.""" + if cluster_name: + self.cluster_name = cluster_name + if resource_group: + self.resource_group = resource_group + if subscription_id: + self.subscription_id = subscription_id + + def execute_query(self, query: str) -> str: + """Execute an AI-powered query against the AKS environment.""" + try: + # Analyze the query to determine the type of operation + query_type = self._analyze_query(query) + + # Gather relevant context based on query type + context = self._gather_context(query, query_type) + + # If AI provider is available, use it for intelligent analysis + if self.ai_provider: + return self._execute_ai_query(query, context, query_type) + else: + return self._execute_basic_query(query, context, query_type) + + except Exception as e: + return f"āŒ Error executing query: {str(e)}" + + def _analyze_query(self, query: str) -> str: + """Analyze the query to determine what type of operation is needed.""" + query_lower = query.lower() + + # Node-related queries + if any(keyword in query_lower for keyword in ['node', 'notready', 'kubelet']): + return 'node_health' + + # DNS-related queries + if any(keyword in query_lower for keyword in ['dns', 'lookup', 'resolution', 'coredns']): + return 'dns_troubleshooting' + + # Pod-related queries + if any(keyword in query_lower for keyword in ['pod', 'pending', 'scheduling', 'evicted']): + return 'pod_troubleshooting' + + # Cluster health queries + if any(keyword in query_lower for keyword in ['cluster', 'health', 'status', 'upgrade']): + return 'cluster_health' + + # Cost optimization queries + if any(keyword in query_lower for keyword in ['cost', 'optimize', 'save', 'efficiency']): + return 'cost_optimization' + + # Network-related queries + if 
any(keyword in query_lower for keyword in ['network', 'connectivity', 'ingress', 'service']): + return 'network_troubleshooting' + + return 'general' + + def _gather_context(self, query: str, query_type: str) -> Dict[str, Any]: + """Gather relevant context for the query.""" + context = { + 'cluster_name': self.cluster_name, + 'resource_group': self.resource_group, + 'subscription_id': self.subscription_id, + 'query_type': query_type + } + + try: + # Always try to get basic cluster information + if self.azure_client and self.cluster_name and self.resource_group: + cluster_info = self.azure_client.get_cluster_info( + self.cluster_name, self.resource_group + ) + context['cluster_info'] = cluster_info + + # Gather specific context based on query type + if query_type == 'node_health' and self.k8s_client: + context['nodes'] = self.k8s_client.get_nodes() + context['node_events'] = self.k8s_client.get_events(field_selector='involvedObject.kind=Node') + + elif query_type == 'pod_troubleshooting' and self.k8s_client: + context['pods'] = self.k8s_client.get_pods() + context['events'] = self.k8s_client.get_events() + + elif query_type == 'dns_troubleshooting' and self.k8s_client: + context['dns_pods'] = self.k8s_client.get_pods(namespace='kube-system', label_selector='k8s-app=kube-dns') + context['coredns_config'] = self.k8s_client.get_configmap('coredns', 'kube-system') + + elif query_type == 'cluster_health': + if self.k8s_client: + context['cluster_status'] = self.k8s_client.get_cluster_status() + if self.azure_client: + context['azure_status'] = self.azure_client.get_cluster_status( + self.cluster_name, self.resource_group + ) + + except Exception as e: + console.print(f"āš ļø Warning gathering context: {e}", style="yellow") + + return context + + def _execute_ai_query(self, query: str, context: Dict[str, Any], query_type: str) -> str: + """Execute query using AI provider.""" + try: + # Create a comprehensive prompt + prompt = self._build_ai_prompt(query, context, 
query_type) + + # Get AI response + response = self.ai_provider.generate_response(prompt) + + # Parse and format the response + return self._format_ai_response(response, query_type) + + except Exception as e: + return f"āŒ AI analysis failed: {str(e)}\\n\\nFalling back to basic analysis..." + + def _execute_basic_query(self, query: str, context: Dict[str, Any], query_type: str) -> str: + """Execute query using basic rule-based analysis.""" + result_parts = [] + + # Basic cluster information + if context.get('cluster_info'): + cluster_info = context['cluster_info'] + result_parts.append(f"## Cluster: {cluster_info.get('name', 'Unknown')}") + result_parts.append(f"**Status:** {cluster_info.get('provisioning_state', 'Unknown')}") + result_parts.append(f"**Kubernetes Version:** {cluster_info.get('kubernetes_version', 'Unknown')}") + result_parts.append("") + + # Query-specific analysis + if query_type == 'node_health': + result_parts.extend(self._analyze_node_health(context)) + elif query_type == 'pod_troubleshooting': + result_parts.extend(self._analyze_pod_issues(context)) + elif query_type == 'dns_troubleshooting': + result_parts.extend(self._analyze_dns_issues(context)) + elif query_type == 'cluster_health': + result_parts.extend(self._analyze_cluster_health(context)) + elif query_type == 'cost_optimization': + result_parts.extend(self._analyze_cost_optimization(context)) + else: + result_parts.extend(self._analyze_general(query, context)) + + return "\\n".join(result_parts) + + def _build_ai_prompt(self, query: str, context: Dict[str, Any], query_type: str) -> str: + """Build a comprehensive prompt for the AI provider.""" + prompt_parts = [ + "You are an expert AKS (Azure Kubernetes Service) operations specialist and troubleshooter.", + "Analyze the following query and provide actionable insights based on the cluster context.", + "", + f"**User Query:** {query}", + f"**Query Type:** {query_type}", + "", + "**Cluster Context:**" + ] + + # Add context information + 
for key, value in context.items(): + if value and key != 'query_type': + prompt_parts.append(f"- {key}: {json.dumps(value, indent=2, default=str)}") + + prompt_parts.extend([ + "", + "**Instructions:**", + "1. Provide a clear diagnosis of any issues found", + "2. Suggest specific, actionable remediation steps", + "3. Include relevant kubectl commands or Azure CLI commands", + "4. Explain the root cause if an issue is detected", + "5. Use markdown formatting for clarity", + "6. If no issues are found, provide optimization recommendations", + "", + "**Response Format:**", + "## Diagnosis", + "[Your analysis here]", + "", + "## Recommended Actions", + "[Step-by-step remediation]", + "", + "## Additional Insights", + "[Any additional recommendations]" + ]) + + return "\\n".join(prompt_parts) + + def _format_ai_response(self, response: str, query_type: str) -> str: + """Format AI response for display.""" + # Clean up the response + formatted = response.strip() + + # Add query type context if missing + if "## Diagnosis" not in formatted and "##" not in formatted: + formatted = f"## Analysis\\n\\n{formatted}" + + return formatted + + def _analyze_node_health(self, context: Dict[str, Any]) -> List[str]: + """Analyze node health issues.""" + results = ["## Node Health Analysis"] + + nodes = context.get('nodes', []) + if not nodes: + results.append("āš ļø Unable to retrieve node information. 
Check kubectl connectivity.") + return results + + not_ready_nodes = [n for n in nodes if n.get('status') != 'Ready'] + if not_ready_nodes: + results.append(f"āŒ **Issues Found:** {len(not_ready_nodes)} node(s) not ready") + for node in not_ready_nodes: + results.append(f"- Node: `{node.get('name')}` - Status: `{node.get('status')}`") + else: + results.append(f"āœ… All {len(nodes)} nodes are in Ready state") + + return results + + def _analyze_pod_issues(self, context: Dict[str, Any]) -> List[str]: + """Analyze pod scheduling and runtime issues.""" + results = ["## Pod Analysis"] + + pods = context.get('pods', []) + if not pods: + results.append("āš ļø Unable to retrieve pod information.") + return results + + pending_pods = [p for p in pods if p.get('phase') == 'Pending'] + failed_pods = [p for p in pods if p.get('phase') == 'Failed'] + + if pending_pods: + results.append(f"āš ļø **Pending Pods:** {len(pending_pods)} pod(s) stuck in Pending state") + + if failed_pods: + results.append(f"āŒ **Failed Pods:** {len(failed_pods)} pod(s) in Failed state") + + if not pending_pods and not failed_pods: + results.append("āœ… No obvious pod scheduling or runtime issues detected") + + return results + + def _analyze_dns_issues(self, context: Dict[str, Any]) -> List[str]: + """Analyze DNS-related issues.""" + results = ["## DNS Analysis"] + + dns_pods = context.get('dns_pods', []) + if dns_pods: + healthy_dns_pods = [p for p in dns_pods if p.get('phase') == 'Running'] + results.append(f"DNS Pods Status: {len(healthy_dns_pods)}/{len(dns_pods)} running") + else: + results.append("āš ļø Unable to check CoreDNS pod status") + + # Basic DNS troubleshooting suggestions + results.extend([ + "", + "## Recommended DNS Troubleshooting Steps:", + "1. Check CoreDNS pods: `kubectl get pods -n kube-system -l k8s-app=kube-dns`", + "2. Test DNS resolution: `kubectl run -it --rm debug --image=busybox --restart=Never -- nslookup kubernetes.default`", + "3. 
Check CoreDNS logs: `kubectl logs -n kube-system -l k8s-app=kube-dns`" + ]) + + return results + + def _analyze_cluster_health(self, context: Dict[str, Any]) -> List[str]: + """Analyze overall cluster health.""" + results = ["## Cluster Health Analysis"] + + cluster_info = context.get('cluster_info', {}) + if cluster_info: + state = cluster_info.get('provisioning_state', 'Unknown') + if state == 'Succeeded': + results.append("āœ… Cluster provisioning state: Healthy") + else: + results.append(f"āš ļø Cluster state: {state}") + + return results + + def _analyze_cost_optimization(self, context: Dict[str, Any]) -> List[str]: + """Analyze cost optimization opportunities.""" + results = ["## Cost Optimization Analysis"] + + results.extend([ + "Here are general cost optimization recommendations:", + "", + "### Node Optimization", + "- Review node utilization and consider rightsizing", + "- Use spot instances for non-critical workloads", + "- Enable cluster autoscaler", + "", + "### Resource Management", + "- Set resource requests and limits on pods", + "- Use horizontal pod autoscaler (HPA)", + "- Consider vertical pod autoscaler (VPA)", + "", + "### Storage Optimization", + "- Review persistent volume usage", + "- Use appropriate storage classes", + "- Clean up unused volumes" + ]) + + return results + + def _analyze_general(self, query: str, context: Dict[str, Any]) -> List[str]: + """Handle general queries.""" + results = ["## General Analysis"] + + cluster_name = context.get('cluster_name', 'your cluster') + results.append(f"Analyzing query about {cluster_name}...") + + if "help" in query.lower(): + results.extend([ + "", + "## Available Commands", + "- Node health: Ask about node status or NotReady issues", + "- Pod troubleshooting: Ask about pending or failed pods", + "- DNS issues: Ask about DNS lookups or CoreDNS problems", + "- Cluster health: Ask about overall cluster status", + "- Cost optimization: Ask how to reduce cluster costs" + ]) + else: + 
results.append("For specific troubleshooting, try asking about nodes, pods, DNS, or cluster health.") + + return results \ No newline at end of file diff --git a/cli-agent/src/aks_agent/ai_providers.py b/cli-agent/src/aks_agent/ai_providers.py new file mode 100644 index 000000000..534149cc3 --- /dev/null +++ b/cli-agent/src/aks_agent/ai_providers.py @@ -0,0 +1,138 @@ +""" +AI Provider implementations for AKS CLI Agent. +""" + +import openai +from typing import Protocol, Optional +from abc import ABC, abstractmethod + +try: + import anthropic + ANTHROPIC_AVAILABLE = True +except ImportError: + ANTHROPIC_AVAILABLE = False + + +class AIProvider(ABC): + """Abstract base class for AI providers.""" + + @abstractmethod + def generate_response(self, prompt: str) -> str: + """Generate a response to the given prompt.""" + pass + + +class OpenAIProvider(AIProvider): + """OpenAI provider implementation.""" + + def __init__(self, api_key: str, model: str = "gpt-4"): + self.client = openai.OpenAI(api_key=api_key) + self.model = model + + def generate_response(self, prompt: str) -> str: + """Generate response using OpenAI API.""" + try: + response = self.client.chat.completions.create( + model=self.model, + messages=[ + { + "role": "system", + "content": "You are an expert AKS operations specialist. Provide clear, actionable advice." 
+ }, + {"role": "user", "content": prompt} + ], + max_tokens=2000, + temperature=0.1 + ) + return response.choices[0].message.content + except Exception as e: + raise RuntimeError(f"OpenAI API error: {str(e)}") + + +class AzureOpenAIProvider(AIProvider): + """Azure OpenAI provider implementation.""" + + def __init__(self, endpoint: str, api_key: str, model: str = "gpt-4"): + self.client = openai.AzureOpenAI( + azure_endpoint=endpoint, + api_key=api_key, + api_version="2024-02-15-preview" + ) + self.model = model + + def generate_response(self, prompt: str) -> str: + """Generate response using Azure OpenAI API.""" + try: + response = self.client.chat.completions.create( + model=self.model, + messages=[ + { + "role": "system", + "content": "You are an expert AKS operations specialist. Provide clear, actionable advice." + }, + {"role": "user", "content": prompt} + ], + max_tokens=2000, + temperature=0.1 + ) + return response.choices[0].message.content + except Exception as e: + raise RuntimeError(f"Azure OpenAI API error: {str(e)}") + + +class AnthropicProvider(AIProvider): + """Anthropic Claude provider implementation.""" + + def __init__(self, api_key: str, model: str = "claude-3-sonnet-20240229"): + if not ANTHROPIC_AVAILABLE: + raise RuntimeError("Anthropic package not available. Install with: pip install anthropic") + + self.client = anthropic.Anthropic(api_key=api_key) + self.model = model + + def generate_response(self, prompt: str) -> str: + """Generate response using Anthropic API.""" + try: + response = self.client.messages.create( + model=self.model, + max_tokens=2000, + system="You are an expert AKS operations specialist. 
Provide clear, actionable advice.", + messages=[ + {"role": "user", "content": prompt} + ] + ) + return response.content[0].text + except Exception as e: + raise RuntimeError(f"Anthropic API error: {str(e)}") + + +class AIProviderFactory: + """Factory class for creating AI provider instances.""" + + @staticmethod + def create_provider(provider_type: str, config) -> Optional[AIProvider]: + """Create an AI provider instance based on configuration.""" + if provider_type == "openai": + api_key = config.get('ai_provider.api_key') + model = config.get('ai_provider.model', 'gpt-4') + if not api_key: + raise ValueError("OpenAI API key not configured") + return OpenAIProvider(api_key, model) + + elif provider_type == "azure_openai": + endpoint = config.get('ai_provider.endpoint') + api_key = config.get('ai_provider.api_key') + model = config.get('ai_provider.model', 'gpt-4') + if not endpoint or not api_key: + raise ValueError("Azure OpenAI endpoint and API key must be configured") + return AzureOpenAIProvider(endpoint, api_key, model) + + elif provider_type == "anthropic": + api_key = config.get('ai_provider.api_key') + model = config.get('ai_provider.model', 'claude-3-sonnet-20240229') + if not api_key: + raise ValueError("Anthropic API key not configured") + return AnthropicProvider(api_key, model) + + else: + raise ValueError(f"Unsupported AI provider type: {provider_type}") \ No newline at end of file diff --git a/cli-agent/src/aks_agent/auth.py b/cli-agent/src/aks_agent/auth.py new file mode 100644 index 000000000..ec1279a8e --- /dev/null +++ b/cli-agent/src/aks_agent/auth.py @@ -0,0 +1,89 @@ +""" +Azure Authentication module for AKS CLI Agent. 
+""" + +import json +import subprocess +from typing import Dict, Optional + +from azure.identity import DefaultAzureCredential, AzureCliCredential +from azure.core.exceptions import ClientAuthenticationError + + +class AzureAuth: + """Handles Azure authentication using Azure CLI credentials.""" + + def __init__(self): + self.credential = None + self._account_info = None + + def is_authenticated(self) -> bool: + """Check if user is authenticated with Azure CLI.""" + try: + result = subprocess.run( + ['az', 'account', 'show'], + capture_output=True, + text=True, + timeout=10 + ) + return result.returncode == 0 + except (subprocess.TimeoutExpired, FileNotFoundError): + return False + + def get_credential(self): + """Get Azure credential object.""" + if not self.credential: + try: + # First try Azure CLI credential + self.credential = AzureCliCredential() + # Test the credential + token = self.credential.get_token("https://management.azure.com/.default") + return self.credential + except ClientAuthenticationError: + # Fallback to DefaultAzureCredential + self.credential = DefaultAzureCredential() + + return self.credential + + def get_account_info(self) -> Optional[Dict]: + """Get current Azure account information.""" + if self._account_info: + return self._account_info + + try: + result = subprocess.run( + ['az', 'account', 'show'], + capture_output=True, + text=True, + timeout=10 + ) + if result.returncode == 0: + self._account_info = json.loads(result.stdout) + return self._account_info + except (subprocess.TimeoutExpired, FileNotFoundError, json.JSONDecodeError): + pass + + return None + + def get_subscription_id(self) -> Optional[str]: + """Get current subscription ID.""" + account_info = self.get_account_info() + if account_info: + return account_info.get('id') + return None + + def list_subscriptions(self) -> list: + """List available subscriptions.""" + try: + result = subprocess.run( + ['az', 'account', 'list'], + capture_output=True, + text=True, + timeout=30 
+ ) + if result.returncode == 0: + return json.loads(result.stdout) + except (subprocess.TimeoutExpired, FileNotFoundError, json.JSONDecodeError): + pass + + return [] \ No newline at end of file diff --git a/cli-agent/src/aks_agent/azure_client.py b/cli-agent/src/aks_agent/azure_client.py new file mode 100644 index 000000000..04ff1bbc1 --- /dev/null +++ b/cli-agent/src/aks_agent/azure_client.py @@ -0,0 +1,204 @@ +""" +Azure client for AKS CLI Agent. +""" + +from typing import Dict, List, Optional, Any +from azure.mgmt.containerservice import ContainerServiceClient +from azure.mgmt.resource import ResourceManagementClient +from azure.core.exceptions import ResourceNotFoundError + +from .auth import AzureAuth + + +class AzureClient: + """Azure client wrapper for AKS operations.""" + + def __init__(self, auth: AzureAuth): + """Initialize Azure client with authentication.""" + self.auth = auth + self.credential = auth.get_credential() + self.subscription_id = auth.get_subscription_id() + + if not self.subscription_id: + raise RuntimeError("No Azure subscription found. 
Please run 'az login' first.") + + self.aks_client = ContainerServiceClient( + credential=self.credential, + subscription_id=self.subscription_id + ) + + self.resource_client = ResourceManagementClient( + credential=self.credential, + subscription_id=self.subscription_id + ) + + def get_cluster_info(self, cluster_name: str, resource_group: str) -> Dict[str, Any]: + """Get detailed information about an AKS cluster.""" + try: + cluster = self.aks_client.managed_clusters.get(resource_group, cluster_name) + + # Extract relevant cluster information + cluster_info = { + 'name': cluster.name, + 'location': cluster.location, + 'provisioning_state': cluster.provisioning_state, + 'kubernetes_version': cluster.kubernetes_version, + 'dns_prefix': cluster.dns_prefix, + 'fqdn': cluster.fqdn, + 'node_resource_group': cluster.node_resource_group, + 'enable_rbac': cluster.enable_rbac, + 'network_profile': self._extract_network_profile(cluster.network_profile), + 'agent_pool_profiles': self._extract_agent_pools(cluster.agent_pool_profiles), + 'addon_profiles': self._extract_addon_profiles(cluster.addon_profiles), + 'tags': dict(cluster.tags) if cluster.tags else {} + } + + return cluster_info + + except ResourceNotFoundError: + raise RuntimeError(f"AKS cluster '{cluster_name}' not found in resource group '{resource_group}'") + except Exception as e: + raise RuntimeError(f"Failed to get cluster info: {str(e)}") + + def get_cluster_status(self, cluster_name: str, resource_group: str) -> Dict[str, Any]: + """Get cluster status and health information.""" + try: + cluster_info = self.get_cluster_info(cluster_name, resource_group) + + status = { + 'provisioning_state': cluster_info['provisioning_state'], + 'kubernetes_version': cluster_info['kubernetes_version'], + 'agent_pools': [] + } + + # Get agent pool details + agent_pools = self.aks_client.agent_pools.list(resource_group, cluster_name) + for pool in agent_pools: + status['agent_pools'].append({ + 'name': pool.name, + 'count': 
pool.count, + 'vm_size': pool.vm_size, + 'provisioning_state': pool.provisioning_state, + 'power_state': pool.power_state.code if pool.power_state else None, + 'orchestrator_version': pool.orchestrator_version + }) + + return status + + except Exception as e: + raise RuntimeError(f"Failed to get cluster status: {str(e)}") + + def list_clusters(self, resource_group: Optional[str] = None) -> List[Dict[str, Any]]: + """List AKS clusters in a resource group or subscription.""" + try: + if resource_group: + clusters = self.aks_client.managed_clusters.list_by_resource_group(resource_group) + else: + clusters = self.aks_client.managed_clusters.list() + + cluster_list = [] + for cluster in clusters: + cluster_list.append({ + 'name': cluster.name, + 'resource_group': cluster.id.split('/')[4], # Extract RG from resource ID + 'location': cluster.location, + 'kubernetes_version': cluster.kubernetes_version, + 'provisioning_state': cluster.provisioning_state + }) + + return cluster_list + + except Exception as e: + raise RuntimeError(f"Failed to list clusters: {str(e)}") + + def get_cluster_credentials(self, cluster_name: str, resource_group: str) -> Dict[str, Any]: + """Get cluster credentials information.""" + try: + # Get admin credentials info (metadata only) + creds = self.aks_client.managed_clusters.list_cluster_admin_credentials( + resource_group, cluster_name + ) + + return { + 'has_admin_credentials': len(creds.kubeconfigs) > 0, + 'credential_count': len(creds.kubeconfigs) + } + + except Exception as e: + return { + 'has_admin_credentials': False, + 'credential_count': 0, + 'error': str(e) + } + + def get_node_pools(self, cluster_name: str, resource_group: str) -> List[Dict[str, Any]]: + """Get information about cluster node pools.""" + try: + node_pools = self.aks_client.agent_pools.list(resource_group, cluster_name) + + pool_info = [] + for pool in node_pools: + pool_info.append({ + 'name': pool.name, + 'count': pool.count, + 'vm_size': pool.vm_size, + 'os_type': 
pool.os_type, + 'orchestrator_version': pool.orchestrator_version, + 'provisioning_state': pool.provisioning_state, + 'power_state': pool.power_state.code if pool.power_state else None, + 'mode': pool.mode, + 'max_pods': pool.max_pods, + 'availability_zones': list(pool.availability_zones) if pool.availability_zones else [] + }) + + return pool_info + + except Exception as e: + raise RuntimeError(f"Failed to get node pools: {str(e)}") + + def _extract_network_profile(self, network_profile) -> Optional[Dict[str, Any]]: + """Extract network profile information.""" + if not network_profile: + return None + + return { + 'network_plugin': network_profile.network_plugin, + 'network_policy': network_profile.network_policy, + 'pod_cidr': network_profile.pod_cidr, + 'service_cidr': network_profile.service_cidr, + 'dns_service_ip': network_profile.dns_service_ip, + 'docker_bridge_cidr': network_profile.docker_bridge_cidr, + 'load_balancer_sku': network_profile.load_balancer_sku + } + + def _extract_agent_pools(self, agent_pools) -> List[Dict[str, Any]]: + """Extract agent pool information.""" + if not agent_pools: + return [] + + pools = [] + for pool in agent_pools: + pools.append({ + 'name': pool.name, + 'count': pool.count, + 'vm_size': pool.vm_size, + 'os_type': pool.os_type, + 'mode': pool.mode, + 'max_pods': pool.max_pods + }) + + return pools + + def _extract_addon_profiles(self, addon_profiles) -> Dict[str, Dict[str, Any]]: + """Extract addon profile information.""" + if not addon_profiles: + return {} + + addons = {} + for addon_name, addon_profile in addon_profiles.items(): + addons[addon_name] = { + 'enabled': addon_profile.enabled, + 'config': dict(addon_profile.config) if addon_profile.config else {} + } + + return addons \ No newline at end of file diff --git a/cli-agent/src/aks_agent/cli.py b/cli-agent/src/aks_agent/cli.py new file mode 100644 index 000000000..a844f1656 --- /dev/null +++ b/cli-agent/src/aks_agent/cli.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 
+""" +AKS CLI Agent - Main CLI interface +""" + +import os +import sys +import click +from rich.console import Console +from rich.panel import Panel +from rich.text import Text + +from .agent import AKSAgent +from .auth import AzureAuth +from .config import Config + +console = Console() + + +@click.group() +@click.option('--config', '-c', help='Path to configuration file') +@click.option('--verbose', '-v', is_flag=True, help='Enable verbose logging') +@click.pass_context +def cli(ctx, config, verbose): + """AKS CLI Agent - AI-powered operations and diagnostics for Azure Kubernetes Service.""" + ctx.ensure_object(dict) + ctx.obj['config_path'] = config + ctx.obj['verbose'] = verbose + + +@cli.command() +@click.argument('query', required=True) +@click.option('--cluster', '-c', help='AKS cluster name') +@click.option('--resource-group', '-g', help='Azure resource group') +@click.option('--subscription', '-s', help='Azure subscription ID') +@click.pass_context +def agent(ctx, query, cluster, resource_group, subscription): + """Execute an AI-powered query against your AKS cluster.""" + try: + # Load configuration + config = Config(ctx.obj.get('config_path')) + + # Initialize Azure authentication + auth = AzureAuth() + if not auth.is_authenticated(): + console.print("āŒ Azure CLI authentication required. 
Please run 'az login' first.", style="red") + sys.exit(1) + + # Initialize the AKS agent + aks_agent = AKSAgent(config, auth) + + # Set cluster context if provided + if cluster or resource_group or subscription: + aks_agent.set_context( + cluster_name=cluster, + resource_group=resource_group, + subscription_id=subscription + ) + + # Display query + console.print(Panel( + Text(query, style="bold cyan"), + title="šŸ¤– AKS Agent Query", + border_style="blue" + )) + + # Execute the query + console.print("šŸ” Analyzing your AKS environment...") + result = aks_agent.execute_query(query) + + # Display results + console.print(Panel( + result, + title="šŸ“‹ Analysis Results", + border_style="green" + )) + + except Exception as e: + console.print(f"āŒ Error: {str(e)}", style="red") + if ctx.obj.get('verbose'): + import traceback + console.print(traceback.format_exc()) + sys.exit(1) + + +@cli.command() +def status(): + """Check the status of the AKS Agent and its dependencies.""" + try: + console.print("šŸ” Checking AKS Agent status...") + + # Check Azure CLI + auth = AzureAuth() + if auth.is_authenticated(): + account_info = auth.get_account_info() + console.print(f"āœ… Azure CLI authenticated as: {account_info.get('user', {}).get('name', 'Unknown')}") + else: + console.print("āŒ Azure CLI not authenticated. Run 'az login' first.", style="red") + + # Check kubectl + import subprocess + try: + result = subprocess.run(['kubectl', 'version', '--client'], capture_output=True, text=True) + if result.returncode == 0: + console.print("āœ… kubectl available") + else: + console.print("āŒ kubectl not available or not configured", style="red") + except FileNotFoundError: + console.print("āŒ kubectl not installed", style="red") + + # Check configuration + config = Config() + if config.is_configured(): + console.print("āœ… AKS Agent configured") + else: + console.print("āš ļø AKS Agent not configured. 
AI features may be limited.", style="yellow") + + console.print("\nšŸ“– For help, run: aks-agent --help") + + except Exception as e: + console.print(f"āŒ Error checking status: {str(e)}", style="red") + sys.exit(1) + + +@cli.command() +def configure(): + """Interactive configuration setup for the AKS Agent.""" + console.print("šŸ› ļø AKS Agent Configuration Setup") + console.print("This will help you configure AI providers and default cluster settings.\n") + + config = Config() + config.interactive_setup() + + console.print("āœ… Configuration completed! You can now use the AKS Agent.", style="green") + + +def main(): + """Entry point for the AKS CLI Agent.""" + # Add the parent directory to Python path to ensure imports work + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(current_dir) + if parent_dir not in sys.path: + sys.path.insert(0, parent_dir) + + cli() + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/cli-agent/src/aks_agent/config.py b/cli-agent/src/aks_agent/config.py new file mode 100644 index 000000000..064057f96 --- /dev/null +++ b/cli-agent/src/aks_agent/config.py @@ -0,0 +1,178 @@ +""" +Configuration management for AKS CLI Agent. 
+""" + +import os +import yaml +from pathlib import Path +from typing import Dict, Optional, Any +from rich.console import Console +from rich.prompt import Prompt, Confirm + +console = Console() + + +class Config: + """Manages configuration for the AKS CLI Agent.""" + + DEFAULT_CONFIG_DIR = Path.home() / ".aks-agent" + DEFAULT_CONFIG_FILE = DEFAULT_CONFIG_DIR / "config.yaml" + + def __init__(self, config_path: Optional[str] = None): + self.config_path = Path(config_path) if config_path else self.DEFAULT_CONFIG_FILE + self._config = {} + self.load_config() + + def load_config(self): + """Load configuration from file.""" + if self.config_path.exists(): + try: + with open(self.config_path, 'r') as f: + self._config = yaml.safe_load(f) or {} + except (yaml.YAMLError, IOError) as e: + console.print(f"āš ļø Warning: Could not load config file: {e}", style="yellow") + self._config = {} + else: + self._config = self.get_default_config() + + def save_config(self): + """Save configuration to file.""" + self.config_path.parent.mkdir(parents=True, exist_ok=True) + try: + with open(self.config_path, 'w') as f: + yaml.dump(self._config, f, default_flow_style=False) + except IOError as e: + console.print(f"āŒ Error saving config: {e}", style="red") + raise + + def get_default_config(self) -> Dict[str, Any]: + """Get default configuration.""" + return { + "ai_provider": { + "type": "azure_openai", + "endpoint": "", + "api_key": "", + "model": "gpt-4" + }, + "clusters": { + "default_resource_group": "", + "default_cluster": "" + }, + "logging": { + "level": "INFO", + "file": str(self.DEFAULT_CONFIG_DIR / "logs" / "agent.log") + }, + "features": { + "auto_approve_read_operations": True, + "auto_approve_diagnostics": True, + "enable_telemetry": False + } + } + + def get(self, key: str, default=None): + """Get configuration value using dot notation.""" + keys = key.split('.') + value = self._config + for k in keys: + if isinstance(value, dict) and k in value: + value = value[k] + 
else: + return default + return value + + def set(self, key: str, value: Any): + """Set configuration value using dot notation.""" + keys = key.split('.') + config = self._config + for k in keys[:-1]: + if k not in config: + config[k] = {} + config = config[k] + config[keys[-1]] = value + + def is_configured(self) -> bool: + """Check if the agent is properly configured.""" + ai_type = self.get('ai_provider.type') + if ai_type == 'azure_openai': + return bool(self.get('ai_provider.endpoint') and self.get('ai_provider.api_key')) + elif ai_type == 'openai': + return bool(self.get('ai_provider.api_key')) + elif ai_type == 'anthropic': + return bool(self.get('ai_provider.api_key')) + return False + + def interactive_setup(self): + """Interactive configuration setup.""" + console.print("šŸ¤– AI Provider Configuration") + + # AI Provider selection + ai_providers = ["azure_openai", "openai", "anthropic", "none"] + current_provider = self.get('ai_provider.type', 'azure_openai') + + provider_choice = Prompt.ask( + "Select AI provider", + choices=ai_providers, + default=current_provider + ) + + self.set('ai_provider.type', provider_choice) + + if provider_choice != 'none': + # API configuration + if provider_choice == 'azure_openai': + endpoint = Prompt.ask( + "Azure OpenAI endpoint URL", + default=self.get('ai_provider.endpoint', '') + ) + self.set('ai_provider.endpoint', endpoint) + + api_key = Prompt.ask( + "API key", + password=True, + default=self.get('ai_provider.api_key', '') + ) + self.set('ai_provider.api_key', api_key) + + model = Prompt.ask( + "Model name", + default=self.get('ai_provider.model', 'gpt-4') + ) + self.set('ai_provider.model', model) + + console.print("\nšŸ¢ Default Cluster Configuration") + + # Default cluster settings + resource_group = Prompt.ask( + "Default resource group", + default=self.get('clusters.default_resource_group', '') + ) + self.set('clusters.default_resource_group', resource_group) + + cluster_name = Prompt.ask( + "Default cluster 
name", + default=self.get('clusters.default_cluster', '') + ) + self.set('clusters.default_cluster', cluster_name) + + console.print("\nšŸ”§ Advanced Settings") + + # Features + auto_read = Confirm.ask( + "Auto-approve read-only operations?", + default=self.get('features.auto_approve_read_operations', True) + ) + self.set('features.auto_approve_read_operations', auto_read) + + auto_diag = Confirm.ask( + "Auto-approve diagnostic commands?", + default=self.get('features.auto_approve_diagnostics', True) + ) + self.set('features.auto_approve_diagnostics', auto_diag) + + # Save configuration + try: + self.save_config() + console.print(f"\nāœ… Configuration saved to: {self.config_path}", style="green") + except Exception as e: + console.print(f"\nāŒ Failed to save configuration: {e}", style="red") + raise \ No newline at end of file diff --git a/cli-agent/src/aks_agent/diagnostics.py b/cli-agent/src/aks_agent/diagnostics.py new file mode 100644 index 000000000..9929cab88 --- /dev/null +++ b/cli-agent/src/aks_agent/diagnostics.py @@ -0,0 +1,329 @@ +""" +Diagnostics engine for AKS CLI Agent. 
"""Diagnostics engine for AKS CLI Agent."""

import json
import subprocess
from typing import TYPE_CHECKING, Any, Dict, List, Optional

if TYPE_CHECKING:
    # Config is referenced only in annotations, so it is imported for type
    # checkers alone and never at runtime.
    from .config import Config


class DiagnosticsEngine:
    """Engine for running diagnostics and collecting telemetry via ``kubectl``.

    The four public ``run_*`` methods each bundle several probes into one
    dictionary.  Every probe is best-effort: it never raises.  A probe that
    cannot produce data returns ``{'error': <message>}`` instead, whether the
    cause is a missing ``kubectl`` binary, a timeout, unparsable JSON, or a
    non-zero ``kubectl`` exit status.
    """

    def __init__(self, config: "Config"):
        """Keep *config* on the instance; the probes below do not consult it."""
        self.config = config

    # ------------------------------------------------------------------
    # Public entry points
    # ------------------------------------------------------------------

    def run_node_diagnostics(self, node_name: Optional[str] = None) -> Dict[str, Any]:
        """Run comprehensive node diagnostics.

        Args:
            node_name: Restrict node status, resource usage and events to this
                node.  ``None`` (or an empty string) covers all nodes.

        Returns:
            Dict with keys ``node_status``, ``system_pods``,
            ``resource_usage`` and ``events``.
        """
        return {
            'node_status': self._get_node_status(node_name),
            'system_pods': self._get_system_pod_status(),
            'resource_usage': self._get_node_resource_usage(node_name),
            'events': self._get_node_events(node_name),
        }

    def run_dns_diagnostics(self) -> Dict[str, Any]:
        """Run DNS-specific diagnostics.

        Returns:
            Dict with keys ``coredns_status``, ``dns_config``, ``dns_test``
            and ``network_policies``.
        """
        return {
            'coredns_status': self._get_coredns_status(),
            'dns_config': self._get_dns_configuration(),
            'dns_test': self._test_dns_resolution(),
            'network_policies': self._check_network_policies(),
        }

    def run_pod_scheduling_diagnostics(self, namespace: str = 'default') -> Dict[str, Any]:
        """Run pod scheduling diagnostics for *namespace*.

        Returns:
            Dict with keys ``pending_pods``, ``resource_quotas``,
            ``node_capacity`` and ``scheduler_events``.
        """
        return {
            'pending_pods': self._get_pending_pods(namespace),
            'resource_quotas': self._get_resource_quotas(namespace),
            'node_capacity': self._get_node_capacity(),
            'scheduler_events': self._get_scheduler_events(),
        }

    def run_cluster_health_check(self) -> Dict[str, Any]:
        """Run comprehensive cluster health check.

        Returns:
            Dict with keys ``api_server``, ``etcd``, ``cluster_info``,
            ``critical_pods`` and ``certificates``.
        """
        return {
            'api_server': self._check_api_server_health(),
            'etcd': self._check_etcd_health(),
            'cluster_info': self._get_cluster_info(),
            'critical_pods': self._check_critical_pods(),
            'certificates': self._check_certificate_expiry(),
        }

    # ------------------------------------------------------------------
    # kubectl plumbing shared by every probe
    # ------------------------------------------------------------------

    def _kubectl(self, args: List[str], timeout: int = 30) -> subprocess.CompletedProcess:
        """Run ``kubectl <args>`` capturing stdout/stderr as text."""
        return subprocess.run(
            ['kubectl', *args], capture_output=True, text=True, timeout=timeout
        )

    @staticmethod
    def _json_result(proc: subprocess.CompletedProcess) -> Dict[str, Any]:
        """Turn a finished kubectl call into parsed JSON or an error dict.

        A non-zero exit status yields ``{'error': <stderr>}`` so the caller
        can see why the probe produced nothing.  Invalid JSON on a zero exit
        status raises ``json.JSONDecodeError``, which ``_kubectl_json`` turns
        into an error dict.
        """
        if proc.returncode != 0:
            return {'error': proc.stderr or f'kubectl exited with status {proc.returncode}'}
        return json.loads(proc.stdout)

    @staticmethod
    def _text_result(proc: subprocess.CompletedProcess, output_key: str = 'output') -> Dict[str, Any]:
        """Wrap raw kubectl output as ``{'success', <output_key>, 'error'}``."""
        return {
            'success': proc.returncode == 0,
            output_key: proc.stdout,
            'error': proc.stderr,
        }

    def _kubectl_json(self, args: List[str], timeout: int = 30) -> Dict[str, Any]:
        """Run kubectl and parse its JSON output; never raises."""
        try:
            return self._json_result(self._kubectl(args, timeout))
        except Exception as e:
            # Deliberate catch-all: diagnostics must degrade to an error entry
            # (kubectl missing, timeout, bad JSON, ...) rather than abort.
            return {'error': str(e)}

    def _kubectl_text(self, args: List[str], timeout: int = 30,
                      output_key: str = 'output') -> Dict[str, Any]:
        """Run kubectl and return its raw text output; never raises."""
        try:
            return self._text_result(self._kubectl(args, timeout), output_key)
        except Exception as e:
            return {'error': str(e)}

    def _kubectl_health(self, path: str) -> Dict[str, Any]:
        """Query a raw health endpoint such as ``/healthz``; never raises.

        ``healthy`` is True when the response body contains ``ok``
        (case-insensitive).
        """
        try:
            proc = self._kubectl(['get', '--raw', path])
        except Exception as e:
            return {'error': str(e)}
        return {
            'success': proc.returncode == 0,
            'status': proc.stdout,
            'healthy': 'ok' in proc.stdout.lower(),
        }

    @staticmethod
    def _node_event_selector(node_name: Optional[str] = None) -> str:
        """Build the field selector for node events.

        The clauses are chained with a comma into ONE selector.  Passing
        ``--field-selector`` twice would make kubectl keep only the last
        value and silently drop the ``kind=Node`` restriction.
        """
        clauses = ['involvedObject.kind=Node']
        if node_name:
            clauses.append(f'involvedObject.name={node_name}')
        return ','.join(clauses)

    # ------------------------------------------------------------------
    # Node probes
    # ------------------------------------------------------------------

    def _get_node_status(self, node_name: Optional[str] = None) -> Dict[str, Any]:
        """Get node status information as parsed ``kubectl get nodes`` JSON."""
        args = ['get', 'nodes', '-o', 'json']
        if node_name:
            args.append(node_name)
        return self._kubectl_json(args)

    def _get_system_pod_status(self) -> Dict[str, Any]:
        """Get status of the pods in the ``kube-system`` namespace."""
        return self._kubectl_json(['get', 'pods', '-n', 'kube-system', '-o', 'json'])

    def _get_node_resource_usage(self, node_name: Optional[str] = None) -> Dict[str, Any]:
        """Get node resource usage as the raw text of ``kubectl top nodes``."""
        args = ['top', 'nodes']
        if node_name:
            args.append(node_name)
        return self._kubectl_text(args)

    def _get_node_events(self, node_name: Optional[str] = None) -> Dict[str, Any]:
        """Get events whose involved object is a node (optionally one node)."""
        return self._kubectl_json([
            'get', 'events',
            '--field-selector', self._node_event_selector(node_name),
            '-o', 'json',
        ])

    # ------------------------------------------------------------------
    # DNS probes
    # ------------------------------------------------------------------

    def _get_coredns_status(self) -> Dict[str, Any]:
        """Get the ``kube-system`` pods labelled ``k8s-app=kube-dns``."""
        return self._kubectl_json([
            'get', 'pods', '-n', 'kube-system',
            '-l', 'k8s-app=kube-dns', '-o', 'json',
        ])

    def _get_dns_configuration(self) -> Dict[str, Any]:
        """Get the ``coredns`` ConfigMap from ``kube-system``."""
        return self._kubectl_json([
            'get', 'configmap', 'coredns', '-n', 'kube-system', '-o', 'json',
        ])

    def _test_dns_resolution(self) -> Dict[str, Any]:
        """Test DNS resolution using a throwaway busybox pod.

        Unlike the other probes this one creates a pod (named ``dns-test``)
        that runs ``nslookup kubernetes.default``; ``--rm`` asks kubectl to
        delete it afterwards.  The call is given 60 seconds instead of 30.
        """
        # NOTE(review): if the 60 s timeout fires, kubectl is killed and the
        # fixed-name pod may be left behind, making the next run fail - confirm.
        return self._kubectl_text(
            [
                'run', 'dns-test', '--rm', '-i', '--restart=Never',
                '--image=busybox', '--', 'nslookup', 'kubernetes.default',
            ],
            timeout=60,
        )

    def _check_network_policies(self) -> Dict[str, Any]:
        """List network policies in all namespaces (they might affect DNS)."""
        return self._kubectl_json([
            'get', 'networkpolicies', '--all-namespaces', '-o', 'json',
        ])

    # ------------------------------------------------------------------
    # Scheduling probes
    # ------------------------------------------------------------------

    def _get_pending_pods(self, namespace: str = 'default') -> Dict[str, Any]:
        """Get pods in *namespace* whose phase is ``Pending``."""
        return self._kubectl_json([
            'get', 'pods', '-n', namespace,
            '--field-selector=status.phase=Pending', '-o', 'json',
        ])

    def _get_resource_quotas(self, namespace: str = 'default') -> Dict[str, Any]:
        """Get resource quotas for *namespace*."""
        return self._kubectl_json([
            'get', 'resourcequotas', '-n', namespace, '-o', 'json',
        ])

    def _get_node_capacity(self) -> Dict[str, Any]:
        """Get the raw text of ``kubectl describe nodes``."""
        return self._kubectl_text(['describe', 'nodes'])

    def _get_scheduler_events(self) -> Dict[str, Any]:
        """Get events selected with ``source=default-scheduler``."""
        return self._kubectl_json([
            'get', 'events', '--field-selector', 'source=default-scheduler',
            '-o', 'json',
        ])

    # ------------------------------------------------------------------
    # Cluster health probes
    # ------------------------------------------------------------------

    def _check_api_server_health(self) -> Dict[str, Any]:
        """Check API server health through the raw ``/healthz`` endpoint."""
        return self._kubectl_health('/healthz')

    def _check_etcd_health(self) -> Dict[str, Any]:
        """Check etcd health through ``/healthz/etcd`` (if accessible)."""
        return self._kubectl_health('/healthz/etcd')

    def _get_cluster_info(self) -> Dict[str, Any]:
        """Get basic cluster information (output is under the ``info`` key)."""
        return self._kubectl_text(['cluster-info'], output_key='info')

    def _check_critical_pods(self) -> Dict[str, Any]:
        """Check status of critical system pods.

        Returns:
            One entry per component label, each holding either the parsed pod
            list or ``{'error': ...}``.
        """
        critical_components = [
            'kube-dns',
            'coredns',
            'metrics-server',
            'kube-proxy',
        ]
        return {
            component: self._kubectl_json([
                'get', 'pods', '-n', 'kube-system',
                '-l', f'k8s-app={component}', '-o', 'json',
            ])
            for component in critical_components
        }

    def _check_certificate_expiry(self) -> Dict[str, Any]:
        """Check certificate expiry (basic check).

        This is a simplified check: it only lists certificate signing
        requests and does not inspect actual expiration dates.
        """
        return self._kubectl_text(['get', 'csr'])
"""Kubernetes client for AKS CLI Agent."""

import subprocess
import json
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any

from kubernetes import client, config
from kubernetes.client.rest import ApiException


class KubernetesClient:
    """Kubernetes client wrapper for cluster operations.

    Read operations go through the official Python client (``CoreV1Api``);
    ``execute_command`` shells out to ``kubectl`` instead.
    """

    def __init__(self):
        """Initialize Kubernetes client using default kubeconfig.

        Raises:
            RuntimeError: If the kubeconfig cannot be loaded or the API
                objects cannot be created.  The original exception is
                attached as ``__cause__``.
        """
        try:
            config.load_kube_config()
            self.v1 = client.CoreV1Api()
            self.apps_v1 = client.AppsV1Api()
        except Exception as e:
            raise RuntimeError(f"Failed to initialize Kubernetes client: {str(e)}") from e

    def get_nodes(self) -> List[Dict[str, Any]]:
        """Get information about cluster nodes.

        Returns:
            One dict per node with ``name``, ``status`` (``'Ready'`` or
            ``'NotReady'``), ``version``, ``os``, ``arch``, ``conditions``
            (condition type -> status), ``capacity`` and ``allocatable``.

        Raises:
            RuntimeError: If the API call fails.
        """
        try:
            nodes = self.v1.list_node()
        except ApiException as e:
            raise RuntimeError(f"Failed to get nodes: {str(e)}") from e

        node_info = []
        for node in nodes.items:
            status = node.status
            conditions = {c.type: c.status for c in status.conditions or []}
            node_info.append({
                'name': node.metadata.name,
                'status': 'Ready' if conditions.get('Ready') == 'True' else 'NotReady',
                'version': status.node_info.kubelet_version,
                'os': status.node_info.operating_system,
                'arch': status.node_info.architecture,
                'conditions': conditions,
                'capacity': dict(status.capacity) if status.capacity else {},
                'allocatable': dict(status.allocatable) if status.allocatable else {},
            })
        return node_info

    def get_pods(self, namespace: Optional[str] = None,
                 label_selector: Optional[str] = None) -> List[Dict[str, Any]]:
        """Get information about pods.

        Args:
            namespace: Namespace to list; all namespaces when omitted.
            label_selector: Optional label selector passed to the API.

        Returns:
            One dict per pod with ``name``, ``namespace``, ``phase``,
            ``node_name``, ``restart_count`` (summed over containers),
            ``ready`` and ``age``.

        Raises:
            RuntimeError: If the API call fails.
        """
        try:
            if namespace:
                pods = self.v1.list_namespaced_pod(
                    namespace=namespace,
                    label_selector=label_selector,
                )
            else:
                pods = self.v1.list_pod_for_all_namespaces(
                    label_selector=label_selector,
                )
        except ApiException as e:
            raise RuntimeError(f"Failed to get pods: {str(e)}") from e

        return [
            {
                'name': pod.metadata.name,
                'namespace': pod.metadata.namespace,
                'phase': pod.status.phase,
                'node_name': pod.spec.node_name,
                'restart_count': sum(
                    c.restart_count for c in (pod.status.container_statuses or [])
                ),
                'ready': self._is_pod_ready(pod),
                'age': self._calculate_age(pod.metadata.creation_timestamp),
            }
            for pod in pods.items
        ]

    def get_events(self, namespace: Optional[str] = None,
                   field_selector: Optional[str] = None) -> List[Dict[str, Any]]:
        """Get cluster events.

        Args:
            namespace: Namespace to list; all namespaces when omitted.
            field_selector: Optional field selector passed to the API.

        Returns:
            One dict per event with ``type``, ``reason``, ``message``,
            ``object`` (``Kind/name``), ``namespace``, ``count``,
            ``first_time`` and ``last_time``.

        Raises:
            RuntimeError: If the API call fails.
        """
        try:
            if namespace:
                events = self.v1.list_namespaced_event(
                    namespace=namespace,
                    field_selector=field_selector,
                )
            else:
                events = self.v1.list_event_for_all_namespaces(
                    field_selector=field_selector,
                )
        except ApiException as e:
            raise RuntimeError(f"Failed to get events: {str(e)}") from e

        return [
            {
                'type': event.type,
                'reason': event.reason,
                'message': event.message,
                'object': f"{event.involved_object.kind}/{event.involved_object.name}",
                # The event model has no top-level ``namespace`` attribute;
                # the namespace is carried on the object metadata.
                'namespace': event.metadata.namespace,
                'count': event.count,
                'first_time': event.first_timestamp,
                'last_time': event.last_timestamp,
            }
            for event in events.items
        ]

    def get_configmap(self, name: str, namespace: str = 'default') -> Optional[Dict[str, Any]]:
        """Get a specific ConfigMap.

        Returns:
            Dict with ``name``, ``namespace`` and ``data``, or ``None`` when
            the API answers 404.

        Raises:
            RuntimeError: For any API failure other than 404.
        """
        try:
            cm = self.v1.read_namespaced_config_map(name=name, namespace=namespace)
        except ApiException as e:
            if e.status == 404:
                return None
            raise RuntimeError(f"Failed to get ConfigMap {name}: {str(e)}") from e

        return {
            'name': cm.metadata.name,
            'namespace': cm.metadata.namespace,
            'data': dict(cm.data) if cm.data else {},
        }

    def get_cluster_status(self) -> Dict[str, Any]:
        """Get overall cluster status information.

        Both lookups are best-effort: if either one fails, its entry is left
        empty rather than failing the whole call.

        Returns:
            ``{'version': {...}, 'components': [...]}``.
        """
        # Get version info
        version_info = {}
        try:
            version = client.VersionApi().get_code()
            version_info = {
                'server_version': f"{version.major}.{version.minor}",
                'git_version': version.git_version,
            }
        except Exception:
            # Deliberately ignored: version info is optional in the summary.
            pass

        # Get component status
        component_status = []
        try:
            components = self.v1.list_component_status()
            for comp in components.items:
                component_status.append({
                    'name': comp.metadata.name,
                    'conditions': [
                        {'type': c.type, 'status': c.status, 'message': c.message}
                        for c in comp.conditions or []
                    ],
                })
        except Exception:
            # Deliberately ignored: component status is optional as well.
            pass

        return {
            'version': version_info,
            'components': component_status,
        }

    def execute_command(self, command: List[str], timeout: int = 30) -> Dict[str, Any]:
        """Execute a kubectl command and return the result.

        Args:
            command: Arguments placed after ``kubectl``.
            timeout: Seconds to wait before giving up (default 30).

        Returns:
            Dict with ``returncode``, ``stdout``, ``stderr`` and ``success``.
            This method never raises; a timeout or any other failure is
            reported with ``returncode`` -1 and ``success`` False.
        """
        try:
            result = subprocess.run(
                ['kubectl'] + command,
                capture_output=True,
                text=True,
                timeout=timeout,
            )
        except subprocess.TimeoutExpired:
            return {
                'returncode': -1,
                'stdout': '',
                'stderr': 'Command timed out',
                'success': False,
            }
        except Exception as e:
            return {
                'returncode': -1,
                'stdout': '',
                'stderr': str(e),
                'success': False,
            }

        return {
            'returncode': result.returncode,
            'stdout': result.stdout,
            'stderr': result.stderr,
            'success': result.returncode == 0,
        }

    def _is_pod_ready(self, pod) -> bool:
        """Check if a pod is ready.

        Returns the status of the first ``Ready`` condition, or False when
        the pod has no conditions or no ``Ready`` condition.
        """
        return next(
            (
                condition.status == 'True'
                for condition in pod.status.conditions or []
                if condition.type == 'Ready'
            ),
            False,
        )

    def _calculate_age(self, creation_timestamp) -> str:
        """Calculate age of a resource as a short string.

        Returns:
            ``"Unknown"`` when no timestamp is given, otherwise ``"<d>d<h>h"``,
            ``"<h>h<m>m"`` or ``"<m>m"`` depending on magnitude.
        """
        if not creation_timestamp:
            return "Unknown"

        if creation_timestamp.tzinfo is None:
            # A naive timestamp cannot be subtracted from an aware "now";
            # it is assumed to be UTC.
            creation_timestamp = creation_timestamp.replace(tzinfo=timezone.utc)

        age = datetime.now(timezone.utc) - creation_timestamp
        # Clamp at zero so clock skew cannot produce negative components.
        total_seconds = max(int(age.total_seconds()), 0)

        days, remainder = divmod(total_seconds, 86400)
        hours, remainder = divmod(remainder, 3600)
        minutes = remainder // 60

        if days > 0:
            return f"{days}d{hours}h"
        elif hours > 0:
            return f"{hours}h{minutes}m"
        else:
            return f"{minutes}m"
"""
Unit tests for AKS CLI Agent core functionality.
"""

import pytest
import json
from unittest.mock import Mock, patch, MagicMock

# Make the package under test importable without installing it
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))

from aks_agent.agent import AKSAgent
from aks_agent.config import Config
from aks_agent.auth import AzureAuth


class TestAKSAgent:
    """Test cases for the main AKS Agent functionality."""

    @pytest.fixture
    def mock_config(self):
        """Config stand-in whose ``get`` answers ``None`` for every key."""
        cfg = Mock(spec=Config)
        cfg.get.return_value = None
        return cfg

    @pytest.fixture
    def mock_auth(self):
        """Authenticated AzureAuth stand-in with a fixed subscription id."""
        fake_auth = Mock(spec=AzureAuth)
        fake_auth.get_subscription_id.return_value = "test-sub-id"
        fake_auth.is_authenticated.return_value = True
        return fake_auth

    @pytest.fixture
    def agent(self, mock_config, mock_auth):
        """Agent built with its three collaborators patched out.

        The patches stay active for the whole test that uses this fixture.
        """
        with patch('aks_agent.agent.AIProviderFactory'), \
             patch('aks_agent.agent.AzureClient'), \
             patch('aks_agent.agent.KubernetesClient'):
            yield AKSAgent(mock_config, mock_auth)

    @patch('aks_agent.agent.AIProviderFactory')
    @patch('aks_agent.agent.AzureClient')
    @patch('aks_agent.agent.KubernetesClient')
    def test_agent_initialization(self, mock_k8s, mock_azure, mock_ai_factory, mock_config, mock_auth):
        """Test AKS Agent initialization."""
        # Arrange
        settings = {
            'ai_provider.type': 'azure_openai',
            'clusters.default_cluster': 'test-cluster',
            'clusters.default_resource_group': 'test-rg'
        }
        mock_config.get.side_effect = lambda key, default=None: settings.get(key, default)

        # Act
        created = AKSAgent(mock_config, mock_auth)

        # Assert
        assert created.cluster_name == 'test-cluster'
        assert created.resource_group == 'test-rg'
        assert created.subscription_id == 'test-sub-id'

    def test_query_analysis_node_health(self, agent):
        """Test query analysis for node health queries."""
        for query in (
            "why is my node not ready?",
            "kubelet issues",
            "NotReady nodes",
        ):
            assert agent._analyze_query(query) == 'node_health'

    def test_query_analysis_dns(self, agent):
        """Test query analysis for DNS queries."""
        for query in (
            "DNS lookup failures",
            "coredns problems",
            "name resolution issues",
        ):
            assert agent._analyze_query(query) == 'dns_troubleshooting'

    def test_query_analysis_pod_scheduling(self, agent):
        """Test query analysis for pod scheduling queries."""
        for query in (
            "pod stuck pending",
            "scheduling failures",
            "evicted pods",
        ):
            assert agent._analyze_query(query) == 'pod_troubleshooting'

    @patch('aks_agent.agent.KubernetesClient')
    @patch('aks_agent.agent.AzureClient')
    def test_execute_basic_query_node_health(self, mock_azure_client, mock_k8s_client, mock_config, mock_auth):
        """Test basic query execution for node health."""
        # Arrange
        mock_config.get.return_value = None  # No AI provider

        with patch('aks_agent.agent.AIProviderFactory'):
            basic_agent = AKSAgent(mock_config, mock_auth)

            # One ready and one not-ready node as context data
            context = {
                'nodes': [
                    {'name': 'node1', 'status': 'Ready'},
                    {'name': 'node2', 'status': 'NotReady'}
                ],
                'cluster_name': 'test-cluster'
            }

            # Act
            report = basic_agent._execute_basic_query("node health check", context, 'node_health')

            # Assert
            assert "Node Health Analysis" in report
            assert "1 node(s) not ready" in report

    def test_set_context(self, agent):
        """Test setting cluster context."""
        # Act
        agent.set_context(
            cluster_name='new-cluster',
            resource_group='new-rg',
            subscription_id='new-sub'
        )

        # Assert
        assert agent.cluster_name == 'new-cluster'
        assert agent.resource_group == 'new-rg'
        assert agent.subscription_id == 'new-sub'


class TestConfig:
    """Test cases for configuration management."""

    def test_default_config(self):
        """Test default configuration values."""
        with patch('aks_agent.config.Path') as mock_path:
            fake_path = mock_path.return_value
            fake_path.exists.return_value = False
            fake_path.parent.mkdir = Mock()

            defaults = Config().get_default_config()

            assert defaults['ai_provider']['type'] == 'azure_openai'
            assert defaults['logging']['level'] == 'INFO'
            assert defaults['features']['auto_approve_read_operations'] is True

    def test_config_get_with_dot_notation(self):
        """Test configuration get with dot notation."""
        with patch('aks_agent.config.Path') as mock_path:
            mock_path.return_value.exists.return_value = False

            cfg = Config()

            # Nested value addressed with a dotted key
            assert cfg.get('ai_provider.type') == 'azure_openai'

            # Missing key falls back to the supplied default
            assert cfg.get('non.existent.key', 'default_value') == 'default_value'


class TestAzureAuth:
    """Test cases for Azure authentication."""

    @patch('subprocess.run')
    def test_is_authenticated_success(self, mock_run):
        """Test successful authentication check."""
        # Arrange
        mock_run.return_value.returncode = 0

        # Act
        authenticated = AzureAuth().is_authenticated()

        # Assert
        assert authenticated is True
        mock_run.assert_called_once_with(
            ['az', 'account', 'show'],
            capture_output=True,
            text=True,
            timeout=10
        )

    @patch('subprocess.run')
    def test_is_authenticated_failure(self, mock_run):
        """Test authentication check failure."""
        # Arrange
        mock_run.return_value.returncode = 1

        # Act
        authenticated = AzureAuth().is_authenticated()

        # Assert
        assert authenticated is False

    @patch('subprocess.run')
    def test_get_account_info(self, mock_run):
        """Test getting account information."""
        # Arrange
        expected_account = {
            'id': 'test-subscription-id',
            'user': {'name': 'test@example.com'}
        }
        mock_run.return_value.returncode = 0
        mock_run.return_value.stdout = json.dumps(expected_account)

        # Act
        account = AzureAuth().get_account_info()

        # Assert
        assert account == expected_account
        assert account['id'] == 'test-subscription-id'


if __name__ == '__main__':
    pytest.main([__file__])