Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.PHONY: proto manager builder terraformer ansibler kubeEleven test database minio containerimgs crd crd-apply controller-gen kind-load-images kind-deploy

# Enforce same version of protoc
PROTOC_VERSION = "29.5"
Expand Down Expand Up @@ -84,12 +84,23 @@ containerimgs:
done
$(SED_INPLACE) "s/adapter:.*$$/adapter/" services/kuber/templates/cluster-autoscaler.goyaml

# Kind cluster name to load images into (override: make KIND_CLUSTER=my-cluster kind-load-images).
KIND_CLUSTER ?= kind
# Namespace where the Claudie deployments run.
KIND_NAMESPACE ?= claudie

# Load every locally built service image (tagged $(REV)) into the kind cluster.
# `|| exit 1` is required: a failing command inside a shell `for` loop does not
# otherwise fail the recipe unless it is the last iteration.
kind-load-images:
	for service in $(SERVICES) ; do \
		echo " --- loading $$service to kind cluster --- "; \
		kind load docker-image --name $(KIND_CLUSTER) ghcr.io/berops/claudie/$$service:$(REV) || exit 1; \
	done

# Roll the freshly loaded $(REV) images out to an existing Claudie install in kind.
# Depends on kind-load-images so the images are present in the cluster first.
kind-deploy: kind-load-images
	@echo " --- updating deployments in $(KIND_NAMESPACE) namespace --- "
	@for svc in ansibler builder claudie-operator kube-eleven kuber manager terraformer; do \
		echo " --- updating $$svc deployment --- "; \
		kubectl set image deployment/$$svc $$svc=ghcr.io/berops/claudie/$$svc:$(REV) -n $(KIND_NAMESPACE) || exit 1; \
	done
	@echo " --- waiting for rollouts to complete --- "
	@kubectl rollout status deployment -n $(KIND_NAMESPACE) --timeout=300s

# Generate CustomResourceDefinition objects.
crd:
go tool controller-gen rbac:roleName=manager-role crd paths="./..." output:crd:artifacts:config=manifests/claudie/crd
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ Before you begin, please make sure you have the following prerequisites installe
| --------------------------------------------------------------------------------- | ------------------ | ------------------ |------------------ | ------------------ |
| [AWS](https://docs.claudie.io/latest/input-manifest/providers/aws/) | :heavy_check_mark: | :heavy_check_mark: |:heavy_check_mark: | :heavy_check_mark: |
| [Azure](https://docs.claudie.io/latest/input-manifest/providers/azure/) | :heavy_check_mark: | :heavy_check_mark: |:heavy_check_mark: | :heavy_check_mark: |
| [GCP](https://docs.claudie.io/latest/input-manifest/providers/gcp/) | :heavy_check_mark: | :heavy_check_mark: |:heavy_check_mark: | :heavy_check_mark: |
| [OCI](https://docs.claudie.io/latest/input-manifest/providers/oci/) | :heavy_check_mark: | :heavy_check_mark: |:heavy_check_mark: | :heavy_check_mark: |
| [Hetzner](https://docs.claudie.io/latest/input-manifest/providers/hetzner/) | :heavy_check_mark: | :heavy_check_mark: | N/A | :heavy_check_mark: |
| [Cloudflare](https://docs.claudie.io/latest/input-manifest/providers/cloudflare/) | N/A | :heavy_check_mark: |:heavy_check_mark: | N/A |
Expand Down
4 changes: 3 additions & 1 deletion docs/input-manifest/api-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,9 @@ Dynamic nodepools are defined for cloud provider machines that Claudie is expect

- `cpuCount`: specifies the number of cpus used by the `serverType`
- `memory`: specifies the memory in GBs used by the `serverType`
- `nvidiaGpuCount`: specifies the number of NVIDIA GPUs used by the `serverType`
- `nvidiaGpuType`: specifies the NVIDIA GPU accelerator type (required for GCP when using GPUs). Examples: `nvidia-tesla-t4`, `nvidia-tesla-v100`, `nvidia-tesla-a100`, `nvidia-l4`
- `nvidiaGpu`: (deprecated) use `nvidiaGpuCount` instead

- `image`

Expand Down
69 changes: 68 additions & 1 deletion docs/input-manifest/gpu-example.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ from [Nvidia](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/lates
to deploy the `gpu-operator` into a Claudie-built Kubernetes cluster. Make sure you fulfill the necessary listed
requirements in prerequisites before continuing, if you decide to use a different cloud provider.

## AWS GPU Example

In this example we will be using [AWS](providers/aws.md) as our provider. AWS GPU instances (like `g4dn.xlarge`) come with GPUs attached, so no additional `machineSpec` configuration is needed:

```yaml
apiVersion: claudie.io/v1beta1
Expand Down Expand Up @@ -57,6 +59,71 @@ spec:
- gpu-aws
```

## GCP GPU Example

For [GCP](providers/gcp.md), you must explicitly specify the GPU type and count using the `machineSpec` block. GCP requires both `nvidiaGpuCount` and `nvidiaGpuType` to attach GPUs to instances:

```yaml
apiVersion: claudie.io/v1beta1
kind: InputManifest
metadata:
name: gcp-gpu-example
labels:
app.kubernetes.io/part-of: claudie
spec:
providers:
- name: gcp-1
providerType: gcp
secretRef:
name: gcp-secret
namespace: secrets

nodePools:
dynamic:
- name: control-gcp
providerSpec:
name: gcp-1
region: us-central1
zone: us-central1-a
count: 1
serverType: e2-medium
image: ubuntu-2404-noble-amd64-v20251001

- name: gpu-gcp
providerSpec:
name: gcp-1
region: us-central1
zone: us-central1-a
count: 2
# Use n1-standard machine types for GPU attachment
serverType: n1-standard-4
image: ubuntu-2404-noble-amd64-v20251001
storageDiskSize: 50
# GPU configuration required for GCP
machineSpec:
nvidiaGpuCount: 1
nvidiaGpuType: nvidia-tesla-t4

kubernetes:
clusters:
- name: gpu-example
version: v1.31.0
network: 172.16.2.0/24
pools:
control:
- control-gcp
compute:
- gpu-gcp
```

!!! note "GCP GPU Requirements"
- The `nvidiaGpuType` field is required when `nvidiaGpuCount > 0` for GCP providers
- Available GPU types vary by zone. Check [GCP GPU regions and zones](https://cloud.google.com/compute/docs/gpus/gpu-regions-zones) for availability
- Common GPU types: `nvidia-tesla-t4`, `nvidia-tesla-v100`, `nvidia-tesla-a100`, `nvidia-l4`
- GPU instances cannot be live migrated, so they will be terminated during maintenance events

## Deploying the GPU Operator

After the `InputManifest` has been successfully built by Claudie, deploy the `gpu-operator` to the `gpu-example` Kubernetes cluster.

1. Create a namespace for the gpu-operator.
Expand Down
47 changes: 47 additions & 0 deletions docs/input-manifest/providers/gcp.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,53 @@ If you wish to use GCP as your DNS provider where Claudie creates DNS records po
!!! warning "GCP is not my domain registrar"
If you haven't acquired a domain via GCP and wish to utilize GCP for hosting your zone, you can refer to [this guide](https://cloud.google.com/dns/docs/update-name-servers) on GCP nameservers. However, if you prefer not to use the entire domain, an alternative option is to delegate a subdomain to GCP.

## GPU Support

GCP requires explicit configuration to attach GPUs to compute instances. Unlike some other providers where GPU-enabled instance types automatically include GPUs, GCP uses a separate `guest_accelerator` mechanism that requires both GPU count and GPU type to be specified.

### Configuration

To use GPUs with GCP nodepools, you must specify both `nvidiaGpuCount` and `nvidiaGpuType` in the `machineSpec` block:

```yaml
nodePools:
dynamic:
- name: gpu-nodepool
providerSpec:
name: gcp-1
region: us-central1
zone: us-central1-a
count: 2
serverType: n1-standard-4
image: ubuntu-2404-noble-amd64-v20251001
machineSpec:
nvidiaGpuCount: 1
nvidiaGpuType: nvidia-tesla-t4
```

### Available GPU Types

Common NVIDIA GPU accelerator types available on GCP:

| GPU Type | Description |
|----------|-------------|
| `nvidia-tesla-t4` | NVIDIA Tesla T4 (cost-effective for inference) |
| `nvidia-tesla-v100` | NVIDIA Tesla V100 (high performance training) |
| `nvidia-tesla-a100` | NVIDIA A100 (latest generation) |
| `nvidia-l4` | NVIDIA L4 (successor to T4) |
| `nvidia-tesla-p100` | NVIDIA Tesla P100 |
| `nvidia-tesla-k80` | NVIDIA Tesla K80 (legacy) |

!!! note "GPU Availability"
GPU availability varies by zone. Check [GCP GPU regions and zones](https://cloud.google.com/compute/docs/gpus/gpu-regions-zones) for current availability in your desired region.

!!! warning "GPU Instance Limitations"
- GPU instances cannot be live migrated and will be terminated during maintenance events
- Use `n1-standard-*` or `n1-highmem-*` machine types with GPUs (not `e2-*` types)
- Some GPU types have minimum vCPU and memory requirements

For a complete GPU deployment example including the NVIDIA GPU Operator installation, see the [GPU Example](../gpu-example.md).

## Input manifest examples
### Single provider, multi region cluster example

Expand Down
12 changes: 10 additions & 2 deletions internal/api/manifest/manifest.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,9 +135,17 @@ type MachineSpec struct {
// Memory specifies the memory the provided instance type will have.
// +optional
Memory int `validate:"required_with=CpuCount,gte=0" yaml:"memory" json:"memory"`
// Nvidia specifies the number of NVIDIA GPUs the provided instance type will have.
// NvidiaGpuCount specifies the number of NVIDIA GPUs the provided instance type will have.
// +optional
NvidiaGpu int `validate:"gte=0" yaml:"nvidiaGpu" json:"nvidiaGpu"`
NvidiaGpuCount int `validate:"gte=0" yaml:"nvidiaGpuCount" json:"nvidiaGpuCount"`
// NvidiaGpuType specifies the NVIDIA GPU accelerator type (required for GCP when using GPUs).
// Examples: nvidia-tesla-k80, nvidia-tesla-v100, nvidia-tesla-a100, nvidia-l4
// +optional
NvidiaGpuType string `validate:"omitempty" yaml:"nvidiaGpuType" json:"nvidiaGpuType,omitempty"`
// NvidiaGpu is deprecated, use NvidiaGpuCount instead. Kept for backward compatibility.
// +optional
// Deprecated: Use NvidiaGpuCount instead.
NvidiaGpu int `validate:"gte=0" yaml:"nvidiaGpu" json:"nvidiaGpu,omitempty"`
}

// DynamicNodePool List of dynamically to-be-created nodepools of not yet existing machines, used for Kubernetes or loadbalancer clusters.
Expand Down
12 changes: 9 additions & 3 deletions internal/api/manifest/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,10 +293,16 @@ func (ds *Manifest) CreateNodepools(pools []string, isControl bool) ([]*spec.Nod

var machineSpec *spec.MachineSpec
if nodePool.MachineSpec != nil {
// Use NvidiaGpuCount as primary, fall back to deprecated NvidiaGpu for backward compatibility
gpuCount := int32(nodePool.MachineSpec.NvidiaGpuCount)
if gpuCount == 0 && nodePool.MachineSpec.NvidiaGpu > 0 {
gpuCount = int32(nodePool.MachineSpec.NvidiaGpu)
}
machineSpec = &spec.MachineSpec{
CpuCount: int32(nodePool.MachineSpec.CpuCount),
Memory: int32(nodePool.MachineSpec.Memory),
NvidiaGpu: int32(nodePool.MachineSpec.NvidiaGpu),
CpuCount: int32(nodePool.MachineSpec.CpuCount),
Memory: int32(nodePool.MachineSpec.Memory),
NvidiaGpuCount: gpuCount,
NvidiaGpuType: nodePool.MachineSpec.NvidiaGpuType,
}
}

Expand Down
35 changes: 35 additions & 0 deletions internal/api/manifest/validate_node_pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,11 @@ func (d *DynamicNodePool) Validate(m *Manifest) error {
return fmt.Errorf("max available count for a nodepool is 255")
}

// Validate GCP-specific GPU configuration
if err := d.validateGCPGpuConfig(m); err != nil {
return err
}

validate := validator.New()

if err := validate.RegisterValidation("external_net", validateExternalNet); err != nil {
Expand All @@ -137,6 +142,36 @@ func (d *DynamicNodePool) Validate(m *Manifest) error {
return nil
}

// validateGCPGpuConfig enforces the GCP-only requirement that a GPU accelerator
// type accompanies a non-zero GPU count: GCP attaches GPUs via a
// guest_accelerator block that needs both type and count.
// Nodepools without a MachineSpec, nodepools on other providers, and nodepools
// with an unresolvable provider name (reported by provider validation) pass through.
func (d *DynamicNodePool) validateGCPGpuConfig(m *Manifest) error {
	spec := d.MachineSpec
	if spec == nil {
		return nil
	}

	providerType, err := m.GetProviderType(d.ProviderSpec.Name)
	if err != nil || providerType != "gcp" {
		// Unknown provider names are flagged elsewhere; non-GCP providers
		// have no GPU-type requirement.
		return nil
	}

	// Effective GPU count: prefer the new field, falling back to the
	// deprecated NvidiaGpu for backward compatibility.
	count := spec.NvidiaGpuCount
	if count == 0 {
		count = spec.NvidiaGpu
	}

	if count > 0 && spec.NvidiaGpuType == "" {
		return fmt.Errorf("nvidiaGpuType is required for GCP when nvidiaGpuCount > 0")
	}

	return nil
}

func (s *StaticNodePool) Validate() error {
if err := validator.New().Struct(s); err != nil {
return prettyPrintValidationError(err)
Expand Down
109 changes: 109 additions & 0 deletions internal/api/manifest/validate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -329,3 +329,112 @@ func TestOptionalZone(t *testing.T) {
// Nodepools with zone should still pass validation
r.NoError(testNodepoolWithZone.Validate(&Manifest{}))
}

// TestGCPGpuValidation tests that GCP nodepools with GPUs require nvidiaGpuType to be specified.
func TestGCPGpuValidation(t *testing.T) {
	r := require.New(t)

	// Manifest with a GCP provider — GPU type is mandatory here when count > 0.
	gcpManifest := &Manifest{
		Providers: Provider{
			GCP: []GCP{{
				Name:        "gcp-1",
				Credentials: "fake-credentials",
				GCPProject:  "fake-project",
			}},
		},
	}

	// Manifest with a Hetzner provider — no GPU-type requirement applies.
	hetznerManifest := &Manifest{
		Providers: Provider{
			Hetzner: []Hetzner{{
				Name:        "hetzner-1",
				Credentials: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
			}},
		},
	}

	// mkPool builds a one-node dynamic nodepool with the given identity and machine spec.
	mkPool := func(name, serverType, image, provider, region, zone string, ms *MachineSpec) *DynamicNodePool {
		return &DynamicNodePool{
			Name:       name,
			ServerType: serverType,
			Image:      image,
			Count:      1,
			ProviderSpec: ProviderSpec{
				Name:   provider,
				Region: region,
				Zone:   zone,
			},
			MachineSpec: ms,
		}
	}

	cases := []struct {
		np      *DynamicNodePool
		m       *Manifest
		wantErr bool
		msg     string
	}{
		{
			// GCP + GPU count but no type: must fail.
			np:      mkPool("gpu-np", "n1-standard-4", "ubuntu-2204", "gcp-1", "us-central1", "us-central1-a", &MachineSpec{NvidiaGpuCount: 1}),
			m:       gcpManifest,
			wantErr: true,
			msg:     "GCP nodepool with GPU count but no type should fail validation",
		},
		{
			// GCP + GPU count and type: must pass.
			np:      mkPool("gpu-np", "n1-standard-4", "ubuntu-2204", "gcp-1", "us-central1", "us-central1-a", &MachineSpec{NvidiaGpuCount: 1, NvidiaGpuType: "nvidia-tesla-t4"}),
			m:       gcpManifest,
			wantErr: false,
			msg:     "GCP nodepool with GPU count and type should pass validation",
		},
		{
			// GCP without any GPU spec: must pass.
			np:      mkPool("regular-np", "e2-medium", "ubuntu-2204", "gcp-1", "us-central1", "us-central1-a", nil),
			m:       gcpManifest,
			wantErr: false,
			msg:     "GCP nodepool without GPU should pass validation",
		},
		{
			// Non-GCP + GPU count, no type: only GCP requires the type.
			np:      mkPool("gpu-np", "cx21", "ubuntu-22.04", "hetzner-1", "fsn1", "fsn1-dc14", &MachineSpec{NvidiaGpuCount: 1}),
			m:       hetznerManifest,
			wantErr: false,
			msg:     "Non-GCP nodepool with GPU count but no type should pass validation",
		},
		{
			// Non-GCP using the deprecated nvidiaGpu field: backward compatibility.
			np:      mkPool("gpu-np-dep", "cx21", "ubuntu-22.04", "hetzner-1", "fsn1", "fsn1-dc14", &MachineSpec{NvidiaGpu: 1}),
			m:       hetznerManifest,
			wantErr: false,
			msg:     "Non-GCP nodepool with deprecated nvidiaGpu but no type should pass validation",
		},
	}

	for _, tc := range cases {
		err := tc.np.Validate(tc.m)
		if tc.wantErr {
			r.Error(err, tc.msg)
		} else {
			r.NoError(err, tc.msg)
		}
	}
}
Loading