Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.PHONY: proto manager builder terraformer ansibler kubeEleven test database minio containerimgs crd crd-apply controller-gen kind-load-images kind-deploy

# Enforce same version of protoc
PROTOC_VERSION = "29.5"
Expand Down Expand Up @@ -84,12 +84,23 @@ containerimgs:
done
$(SED_INPLACE) "s/adapter:.*$$/adapter/" services/kuber/templates/cluster-autoscaler.goyaml

# Kind cluster name to load images into (override: make KIND_CLUSTER=my-cluster kind-load-images).
KIND_CLUSTER ?= kind
# Namespace where the Claudie deployments run.
KIND_NAMESPACE ?= claudie

# Load every locally built service image (tagged $(REV)) into the kind cluster.
# `|| exit 1` is required: a failing command inside a shell `for` loop does not
# otherwise fail the recipe unless it is the last iteration.
kind-load-images:
	for service in $(SERVICES) ; do \
		echo " --- loading $$service to kind cluster --- "; \
		kind load docker-image --name $(KIND_CLUSTER) ghcr.io/berops/claudie/$$service:$(REV) || exit 1; \
	done

# Roll the freshly loaded $(REV) images out to an existing Claudie install in kind.
# Depends on kind-load-images so the images are present in the cluster first.
kind-deploy: kind-load-images
	@echo " --- updating deployments in $(KIND_NAMESPACE) namespace --- "
	@for svc in ansibler builder claudie-operator kube-eleven kuber manager terraformer; do \
		echo " --- updating $$svc deployment --- "; \
		kubectl set image deployment/$$svc $$svc=ghcr.io/berops/claudie/$$svc:$(REV) -n $(KIND_NAMESPACE) || exit 1; \
	done
	@echo " --- waiting for rollouts to complete --- "
	@kubectl rollout status deployment -n $(KIND_NAMESPACE) --timeout=300s

# Generate CustomResourceDefinition objects.
crd:
go tool controller-gen rbac:roleName=manager-role crd paths="./..." output:crd:artifacts:config=manifests/claudie/crd
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ Before you begin, please make sure you have the following prerequisites installe
| --------------------------------------------------------------------------------- | ------------------ | ------------------ |------------------ | ------------------ |
| [AWS](https://docs.claudie.io/latest/input-manifest/providers/aws/) | :heavy_check_mark: | :heavy_check_mark: |:heavy_check_mark: | :heavy_check_mark: |
| [Azure](https://docs.claudie.io/latest/input-manifest/providers/azure/) | :heavy_check_mark: | :heavy_check_mark: |:heavy_check_mark: | :heavy_check_mark: |
| [GCP](https://docs.claudie.io/latest/input-manifest/providers/gcp/) | :heavy_check_mark: | :heavy_check_mark: |:heavy_check_mark: | :heavy_check_mark: |
| [OCI](https://docs.claudie.io/latest/input-manifest/providers/oci/) | :heavy_check_mark: | :heavy_check_mark: |:heavy_check_mark: | :heavy_check_mark: |
| [Hetzner](https://docs.claudie.io/latest/input-manifest/providers/hetzner/) | :heavy_check_mark: | :heavy_check_mark: | N/A | :heavy_check_mark: |
| [Cloudflare](https://docs.claudie.io/latest/input-manifest/providers/cloudflare/) | N/A | :heavy_check_mark: |:heavy_check_mark: | N/A |
Expand Down
4 changes: 3 additions & 1 deletion docs/input-manifest/api-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,9 @@ Dynamic nodepools are defined for cloud provider machines that Claudie is expect

- `cpuCount`: specifies the number of cpus used by the `serverType`
- `memory`: specifies the memory in GBs used by the `serverType`
- `nvidiaGpuCount`: specifies the number of NVIDIA GPUs used by the `serverType`
- `nvidiaGpuType`: specifies the NVIDIA GPU accelerator type (required for GCP when using GPUs). Examples: `nvidia-tesla-t4`, `nvidia-tesla-v100`, `nvidia-tesla-a100`, `nvidia-l4`
- `nvidiaGpu`: (deprecated) use `nvidiaGpuCount` instead

- `image`

Expand Down
69 changes: 68 additions & 1 deletion docs/input-manifest/gpu-example.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ from [Nvidia](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/lates
to deploy the `gpu-operator` into a Claudie-built Kubernetes cluster. Make sure you fulfill the necessary listed
requirements in prerequisites before continuing, if you decide to use a different cloud provider.

## AWS GPU Example

In this example we will be using [AWS](providers/aws.md) as our provider. AWS GPU instances (like `g4dn.xlarge`) come with GPUs attached, so no additional `machineSpec` configuration is needed:

```yaml
apiVersion: claudie.io/v1beta1
Expand Down Expand Up @@ -57,6 +59,71 @@ spec:
- gpu-aws
```

## GCP GPU Example

For [GCP](providers/gcp.md), you must explicitly specify the GPU type and count using the `machineSpec` block. GCP requires both `nvidiaGpuCount` and `nvidiaGpuType` to attach GPUs to instances:

```yaml
apiVersion: claudie.io/v1beta1
kind: InputManifest
metadata:
name: gcp-gpu-example
labels:
app.kubernetes.io/part-of: claudie
spec:
providers:
- name: gcp-1
providerType: gcp
secretRef:
name: gcp-secret
namespace: secrets

nodePools:
dynamic:
- name: control-gcp
providerSpec:
name: gcp-1
region: us-central1
zone: us-central1-a
count: 1
serverType: e2-medium
image: ubuntu-2404-noble-amd64-v20251001

- name: gpu-gcp
providerSpec:
name: gcp-1
region: us-central1
zone: us-central1-a
count: 2
# Use n1-standard machine types for GPU attachment
serverType: n1-standard-4
image: ubuntu-2404-noble-amd64-v20251001
storageDiskSize: 50
# GPU configuration required for GCP
machineSpec:
nvidiaGpuCount: 1
nvidiaGpuType: nvidia-tesla-t4

kubernetes:
clusters:
- name: gpu-example
version: v1.31.0
network: 172.16.2.0/24
pools:
control:
- control-gcp
compute:
- gpu-gcp
```

!!! note "GCP GPU Requirements"
- The `nvidiaGpuType` field is required when `nvidiaGpuCount > 0` for GCP providers
- Available GPU types vary by zone. Check [GCP GPU regions and zones](https://cloud.google.com/compute/docs/gpus/gpu-regions-zones) for availability
- Common GPU types: `nvidia-tesla-t4`, `nvidia-tesla-v100`, `nvidia-tesla-a100`, `nvidia-l4`
- GPU instances cannot be live migrated, so they will be terminated during maintenance events

## Deploying the GPU Operator

After the `InputManifest` has been successfully built by Claudie, deploy the `gpu-operator` to the `gpu-example` Kubernetes cluster.

1. Create a namespace for the gpu-operator.
Expand Down
47 changes: 47 additions & 0 deletions docs/input-manifest/providers/gcp.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,53 @@ If you wish to use GCP as your DNS provider where Claudie creates DNS records po
!!! warning "GCP is not my domain registrar"
If you haven't acquired a domain via GCP and wish to utilize GCP for hosting your zone, you can refer to [this guide](https://cloud.google.com/dns/docs/update-name-servers) on GCP nameservers. However, if you prefer not to use the entire domain, an alternative option is to delegate a subdomain to GCP.

## GPU Support

GCP requires explicit configuration to attach GPUs to compute instances. Unlike some other providers where GPU-enabled instance types automatically include GPUs, GCP uses a separate `guest_accelerator` mechanism that requires both GPU count and GPU type to be specified.

### Configuration

To use GPUs with GCP nodepools, you must specify both `nvidiaGpuCount` and `nvidiaGpuType` in the `machineSpec` block:

```yaml
nodePools:
dynamic:
- name: gpu-nodepool
providerSpec:
name: gcp-1
region: us-central1
zone: us-central1-a
count: 2
serverType: n1-standard-4
image: ubuntu-2404-noble-amd64-v20251001
machineSpec:
nvidiaGpuCount: 1
nvidiaGpuType: nvidia-tesla-t4
```

### Available GPU Types

Common NVIDIA GPU accelerator types available on GCP:

| GPU Type | Description |
|----------|-------------|
| `nvidia-tesla-t4` | NVIDIA Tesla T4 (cost-effective for inference) |
| `nvidia-tesla-v100` | NVIDIA Tesla V100 (high performance training) |
| `nvidia-tesla-a100` | NVIDIA A100 (latest generation) |
| `nvidia-l4` | NVIDIA L4 (successor to T4) |
| `nvidia-tesla-p100` | NVIDIA Tesla P100 |
| `nvidia-tesla-k80` | NVIDIA Tesla K80 (legacy) |

!!! note "GPU Availability"
GPU availability varies by zone. Check [GCP GPU regions and zones](https://cloud.google.com/compute/docs/gpus/gpu-regions-zones) for current availability in your desired region.

!!! warning "GPU Instance Limitations"
- GPU instances cannot be live migrated and will be terminated during maintenance events
- Use `n1-standard-*` or `n1-highmem-*` machine types with GPUs (not `e2-*` types)
- Some GPU types have minimum vCPU and memory requirements

For a complete GPU deployment example including the NVIDIA GPU Operator installation, see the [GPU Example](../gpu-example.md).

## Input manifest examples
### Single provider, multi region cluster example

Expand Down
12 changes: 10 additions & 2 deletions internal/api/manifest/manifest.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,9 +135,17 @@ type MachineSpec struct {
// Memory specifies the memory the provided instance type will have.
// +optional
Memory int `validate:"required_with=CpuCount,gte=0" yaml:"memory" json:"memory"`
// Nvidia specifies the number of NVIDIA GPUs the provided instance type will have.
// NvidiaGpuCount specifies the number of NVIDIA GPUs the provided instance type will have.
// +optional
NvidiaGpu int `validate:"gte=0" yaml:"nvidiaGpu" json:"nvidiaGpu"`
NvidiaGpuCount int `validate:"gte=0" yaml:"nvidiaGpuCount" json:"nvidiaGpuCount"`
// NvidiaGpuType specifies the NVIDIA GPU accelerator type (required for GCP when using GPUs).
// Examples: nvidia-tesla-k80, nvidia-tesla-v100, nvidia-tesla-a100, nvidia-l4
// +optional
NvidiaGpuType string `validate:"omitempty" yaml:"nvidiaGpuType" json:"nvidiaGpuType,omitempty"`
// NvidiaGpu is deprecated, use NvidiaGpuCount instead. Kept for backward compatibility.
// +optional
// Deprecated: Use NvidiaGpuCount instead.
NvidiaGpu int `validate:"gte=0" yaml:"nvidiaGpu" json:"nvidiaGpu,omitempty"`
}

// DynamicNodePool List of dynamically to-be-created nodepools of not yet existing machines, used for Kubernetes or loadbalancer clusters.
Expand Down
12 changes: 9 additions & 3 deletions internal/api/manifest/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,10 +293,16 @@ func (ds *Manifest) CreateNodepools(pools []string, isControl bool) ([]*spec.Nod

var machineSpec *spec.MachineSpec
if nodePool.MachineSpec != nil {
// Use NvidiaGpuCount as primary, fall back to deprecated NvidiaGpu for backward compatibility
gpuCount := int32(nodePool.MachineSpec.NvidiaGpuCount)
if gpuCount == 0 && nodePool.MachineSpec.NvidiaGpu > 0 {
gpuCount = int32(nodePool.MachineSpec.NvidiaGpu)
}
machineSpec = &spec.MachineSpec{
CpuCount: int32(nodePool.MachineSpec.CpuCount),
Memory: int32(nodePool.MachineSpec.Memory),
NvidiaGpu: int32(nodePool.MachineSpec.NvidiaGpu),
CpuCount: int32(nodePool.MachineSpec.CpuCount),
Memory: int32(nodePool.MachineSpec.Memory),
NvidiaGpuCount: gpuCount,
NvidiaGpuType: nodePool.MachineSpec.NvidiaGpuType,
}
}

Expand Down
35 changes: 35 additions & 0 deletions internal/api/manifest/validate_node_pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,11 @@ func (d *DynamicNodePool) Validate(m *Manifest) error {
return fmt.Errorf("max available count for a nodepool is 255")
}

// Validate GCP-specific GPU configuration
if err := d.validateGCPGpuConfig(m); err != nil {
return err
}

validate := validator.New()

if err := validate.RegisterValidation("external_net", validateExternalNet); err != nil {
Expand All @@ -137,6 +142,36 @@ func (d *DynamicNodePool) Validate(m *Manifest) error {
return nil
}

// validateGCPGpuConfig enforces the GCP-only requirement that a GPU accelerator
// type accompanies a non-zero GPU count: GCP attaches GPUs via a
// guest_accelerator block that needs both type and count.
// Nodepools without a MachineSpec, nodepools on other providers, and nodepools
// with an unresolvable provider name (reported by provider validation) pass through.
func (d *DynamicNodePool) validateGCPGpuConfig(m *Manifest) error {
	spec := d.MachineSpec
	if spec == nil {
		return nil
	}

	providerType, err := m.GetProviderType(d.ProviderSpec.Name)
	if err != nil || providerType != "gcp" {
		// Unknown provider names are flagged elsewhere; non-GCP providers
		// have no GPU-type requirement.
		return nil
	}

	// Effective GPU count: prefer the new field, falling back to the
	// deprecated NvidiaGpu for backward compatibility.
	count := spec.NvidiaGpuCount
	if count == 0 {
		count = spec.NvidiaGpu
	}

	if count > 0 && spec.NvidiaGpuType == "" {
		return fmt.Errorf("nvidiaGpuType is required for GCP when nvidiaGpuCount > 0")
	}

	return nil
}

func (s *StaticNodePool) Validate() error {
if err := validator.New().Struct(s); err != nil {
return prettyPrintValidationError(err)
Expand Down
109 changes: 109 additions & 0 deletions internal/api/manifest/validate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -329,3 +329,112 @@ func TestOptionalZone(t *testing.T) {
// Nodepools with zone should still pass validation
r.NoError(testNodepoolWithZone.Validate(&Manifest{}))
}

// TestGCPGpuValidation tests that GCP nodepools with GPUs require nvidiaGpuType to be specified.
func TestGCPGpuValidation(t *testing.T) {
	r := require.New(t)

	// Manifest with a GCP provider — GPU type is mandatory here when count > 0.
	gcpManifest := &Manifest{
		Providers: Provider{
			GCP: []GCP{{
				Name:        "gcp-1",
				Credentials: "fake-credentials",
				GCPProject:  "fake-project",
			}},
		},
	}

	// Manifest with a Hetzner provider — no GPU-type requirement applies.
	hetznerManifest := &Manifest{
		Providers: Provider{
			Hetzner: []Hetzner{{
				Name:        "hetzner-1",
				Credentials: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
			}},
		},
	}

	// mkPool builds a one-node dynamic nodepool with the given identity and machine spec.
	mkPool := func(name, serverType, image, provider, region, zone string, ms *MachineSpec) *DynamicNodePool {
		return &DynamicNodePool{
			Name:       name,
			ServerType: serverType,
			Image:      image,
			Count:      1,
			ProviderSpec: ProviderSpec{
				Name:   provider,
				Region: region,
				Zone:   zone,
			},
			MachineSpec: ms,
		}
	}

	cases := []struct {
		np      *DynamicNodePool
		m       *Manifest
		wantErr bool
		msg     string
	}{
		{
			// GCP + GPU count but no type: must fail.
			np:      mkPool("gpu-np", "n1-standard-4", "ubuntu-2204", "gcp-1", "us-central1", "us-central1-a", &MachineSpec{NvidiaGpuCount: 1}),
			m:       gcpManifest,
			wantErr: true,
			msg:     "GCP nodepool with GPU count but no type should fail validation",
		},
		{
			// GCP + GPU count and type: must pass.
			np:      mkPool("gpu-np", "n1-standard-4", "ubuntu-2204", "gcp-1", "us-central1", "us-central1-a", &MachineSpec{NvidiaGpuCount: 1, NvidiaGpuType: "nvidia-tesla-t4"}),
			m:       gcpManifest,
			wantErr: false,
			msg:     "GCP nodepool with GPU count and type should pass validation",
		},
		{
			// GCP without any GPU spec: must pass.
			np:      mkPool("regular-np", "e2-medium", "ubuntu-2204", "gcp-1", "us-central1", "us-central1-a", nil),
			m:       gcpManifest,
			wantErr: false,
			msg:     "GCP nodepool without GPU should pass validation",
		},
		{
			// Non-GCP + GPU count, no type: only GCP requires the type.
			np:      mkPool("gpu-np", "cx21", "ubuntu-22.04", "hetzner-1", "fsn1", "fsn1-dc14", &MachineSpec{NvidiaGpuCount: 1}),
			m:       hetznerManifest,
			wantErr: false,
			msg:     "Non-GCP nodepool with GPU count but no type should pass validation",
		},
		{
			// Non-GCP using the deprecated nvidiaGpu field: backward compatibility.
			np:      mkPool("gpu-np-dep", "cx21", "ubuntu-22.04", "hetzner-1", "fsn1", "fsn1-dc14", &MachineSpec{NvidiaGpu: 1}),
			m:       hetznerManifest,
			wantErr: false,
			msg:     "Non-GCP nodepool with deprecated nvidiaGpu but no type should pass validation",
		},
	}

	for _, tc := range cases {
		err := tc.np.Validate(tc.m)
		if tc.wantErr {
			r.Error(err, tc.msg)
		} else {
			r.NoError(err, tc.msg)
		}
	}
}
Loading