Open
Labels: bug (Something isn't working), needs triage (New item requiring triage)
Description
Component(s)
OpenTelemetry Collector (Kubernetes)
What happened?
Hi,
I noticed that some metrics from one of my exporters are being dropped before they reach Prometheus. I use the Prometheus receiver to discover exporters and a Prometheus remote write exporter to ship the metrics.
The problem: of the metrics exposed by Envoy Gateway, roughly 99% arrive in Prometheus fine, but a few never show up and are dropped somewhere before reaching it.
Here is the error I noticed:
{"level":"error","ts":"2026-01-02T14:03:10.488Z","caller":"internal/queue_sender.go:50","msg":"Exporting failed. Dropping data.","resource":{"service.instance.id":"a3e479c0-add8-4836-9fa4-2021c6786f6f","service.name":"otelcol-contrib","service.version":"0.142.0"},"otelcol.component.id":"otlp/dp","otelcol.component.kind":"exporter","otelcol.signal":"logs","error":"not retryable error: Permanent error: rpc error: code = Internal desc = server closed the stream without sending trailers","dropped_items":1,"stacktrace":"go.opentelemetry.io/collector/exporter/exporterhelper/internal.NewQueueSender.func1\n\tgo.opentelemetry.io/collector/exporter/[email protected]/internal/queue_sender.go:50\ngo.opentelemetry.io/collector/exporter/exporterhelper/internal/queuebatch.(*disabledBatcher[...]).Consume\n\tgo.opentelemetry.io/collector/exporter/[email protected]/internal/queuebatch/disabled_batcher.go:23\ngo.opentelemetry.io/collector/exporter/exporterhelper/internal/queue.(*asyncQueue[...]).Start.func1\n\tgo.opentelemetry.io/collector/exporter/[email protected]/internal/queue/async_queue.go:49"}
Here is my collector configuration (Helm chart values):
mode: deployment
image:
repository: docker.io/otel/opentelemetry-collector-contrib
# We only want one of these collectors - any more and we'd produce duplicate data
replicaCount: 1
ports:
metrics:
enabled: true
presets:
kubernetesAttributes:
enabled: true
podAnnotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8888"
prometheus.io/path: "/metrics"
# Service Account configuration for k8sobjects receiver
serviceAccount:
create: true
name: "otel-collector-events"
annotations: {}
# RBAC configuration for k8sobjects and k8s_cluster receivers
clusterRole:
create: true
rules:
# For k8sobjects receiver (events)
- apiGroups: [""]
resources: ["events", "pods"]
verbs: ["get", "list", "watch"]
- apiGroups: ["events.k8s.io"]
resources: ["events"]
verbs: ["get", "list", "watch"]
# For k8s_cluster receiver (cluster state metrics)
- apiGroups: [""]
resources:
- "nodes"
- "nodes/stats"
- "namespaces"
- "pods"
- "replicationcontrollers"
- "resourcequotas"
- "services"
- "persistentvolumes"
- "persistentvolumeclaims"
verbs: ["get", "list", "watch"]
- apiGroups: ["apps"]
resources:
- "deployments"
- "daemonsets"
- "replicasets"
- "statefulsets"
verbs: ["get", "list", "watch"]
- apiGroups: ["batch"]
resources:
- "jobs"
- "cronjobs"
verbs: ["get", "list", "watch"]
- apiGroups: ["autoscaling"]
resources:
- "horizontalpodautoscalers"
verbs: ["get", "list", "watch"]
config:
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
cors:
allowed_origins:
- "*"
# K8s cluster receiver for cluster-level state metrics (replaces kube-state-metrics)
k8s_cluster:
auth_type: serviceAccount
collection_interval: 30s
node_conditions_to_report:
- Ready
- MemoryPressure
- DiskPressure
- PIDPressure
- NetworkUnavailable
allocatable_types_to_report:
- cpu
- memory
- storage
- ephemeral-storage
metrics:
k8s.container.cpu_limit:
enabled: true
k8s.container.cpu_request:
enabled: true
k8s.container.memory_limit:
enabled: true
k8s.container.memory_request:
enabled: true
k8s.container.storage_limit:
enabled: true
k8s.container.storage_request:
enabled: true
k8s.container.ephemeralstorage_limit:
enabled: true
k8s.container.ephemeralstorage_request:
enabled: true
k8s.deployment.available:
enabled: true
k8s.deployment.desired:
enabled: true
k8s.pod.phase:
enabled: true
k8s.replicaset.available:
enabled: true
k8s.replicaset.desired:
enabled: true
k8s.namespace.phase:
enabled: true
k8s.replication_controller.available:
enabled: true
k8s.replication_controller.desired:
enabled: true
k8s.resource_quota.hard_limit:
enabled: true
k8s.resource_quota.used:
enabled: true
k8s.statefulset.current_pods:
enabled: true
k8s.statefulset.desired_pods:
enabled: true
k8s.statefulset.ready_pods:
enabled: true
k8s.statefulset.updated_pods:
enabled: true
k8s.daemonset.current_scheduled_nodes:
enabled: true
k8s.daemonset.desired_scheduled_nodes:
enabled: true
k8s.daemonset.misscheduled_nodes:
enabled: true
k8s.daemonset.ready_nodes:
enabled: true
k8s.hpa.current_replicas:
enabled: true
k8s.hpa.desired_replicas:
enabled: true
k8s.hpa.max_replicas:
enabled: true
k8s.hpa.min_replicas:
enabled: true
k8s.job.active_pods:
enabled: true
k8s.job.desired_successful_pods:
enabled: true
k8s.job.failed_pods:
enabled: true
k8s.job.max_parallel_pods:
enabled: true
k8s.job.successful_pods:
enabled: true
k8s.cronjob.active_jobs:
enabled: true
# Explicitly configured - what the kubernetesEvents preset creates
k8sobjects:
auth_type: serviceAccount
objects:
- name: events
mode: watch
group: events.k8s.io
# Prometheus receiver to scrape metrics from all the pods exposing Prometheus metrics
prometheus:
config:
scrape_configs:
- job_name: k8s-pods-annotated
scrape_interval: 10s
kubernetes_sd_configs:
- role: pod
relabel_configs:
# prometheus.io/scrape = true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: "true"
# prometheus.io/port
- source_labels:
- __meta_kubernetes_pod_ip
- __meta_kubernetes_pod_annotation_prometheus_io_port
action: replace
regex: (.+);(.+)
replacement: $1:$2
target_label: __address__
# prometheus.io/path
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
regex: (.+)
target_label: __metrics_path__
# Add namespace label
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: namespace
processors:
batch:
timeout: 10s
send_batch_size: 512
send_batch_max_size: 1024
memory_limiter:
check_interval: 2s
limit_percentage: 75
spike_limit_percentage: 20
k8sattributes:
passthrough: true
resource:
attributes:
- key: app
value: xxxxx
action: upsert
- key: environment
value: dev
action: upsert
- key: cluster
value: xxx
action: upsert
- key: cluster.name
value: xxxx
action: upsert
- key: node.name
value: ${KUBE_NODE_NAME}
action: upsert
- key: node
value: ${KUBE_NODE_NAME}
action: upsert
- key: compartment.name
value: xxxx
action: upsert
- key: type
value: logs
action: upsert
- key: job
value: xxxx
action: upsert
exporters:
# Debug exporter for troubleshooting
debug:
verbosity: detailed
sampling_initial: 5
sampling_thereafter: 200
# Logs + Traces → Data Prepper (OTLP gRPC)
otlp/dp:
xxxxx
tls:
insecure: false
insecure_skip_verify: true
sending_queue:
enabled: true
queue_size: 5000
num_consumers: 10
retry_on_failure:
enabled: true
initial_interval: 5s
max_interval: 30s
max_elapsed_time: 300s
otlp/traces:
xxxx
tls:
insecure: false
insecure_skip_verify: true
sending_queue:
enabled: true
queue_size: 5000
num_consumers: 10
retry_on_failure:
enabled: true
initial_interval: 5s
max_interval: 30s
max_elapsed_time: 300s
# Metrics → Prometheus Remote Write
prometheusremotewrite/prom:
endpoint: https://xxxxx/api/v1/write
tls:
insecure: false
resource_to_telemetry_conversion:
enabled: true
service:
telemetry:
logs:
level: "DEBUG"
development: true
encoding: "json"
pipelines:
logs:
receivers: [k8sobjects]
processors: [resource, memory_limiter, batch]
exporters: [otlp/dp]
traces:
receivers: [otlp]
processors: [resource, memory_limiter, batch]
exporters: [debug, otlp/traces]
metrics:
receivers: [otlp, k8s_cluster, prometheus]
processors: [resource, memory_limiter, batch]
exporters: [prometheusremotewrite/prom]
extraEnvs:
- name: KUBE_NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: K8S_CLUSTER_NAME
value: "xxxx"
- name: ENVIRONMENT
value: "dev"
- name: COMPARTMENT
value: "xxxx"
- name: APP
value: "xxxxt"
- name: TYPE
value: "logs"
Collector version
0.142.0
Environment information
Environment
OS: (e.g., "Ubuntu 20.04")
Compiler (if manually compiled): (e.g., "go 14.2")
OpenTelemetry Collector configuration
Log output
Additional context
No response