Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions backends/cadence/aot/tests/test_replace_ops_passes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1242,14 +1242,16 @@ def test_replace_conv1d_with_linear(self) -> None:

# Conv and linear compute the same dot product but accumulate fp32
# terms in different order, so non-associativity of floating-point
# addition produces diffs up to ~1.2e-05. Use rtol=2e-05.
# addition produces diffs that can slightly exceed ~1.2e-05 on some
# runners. Use slightly looser tolerances (match conv2d test).
inputs = [x, weights, bias]
validate(
gm_before,
graph_after_passes,
inputs,
"ReplaceTrivialConvWithLinear",
rtol=2e-5,
rtol=3e-5,
atol=2e-6,
)

# Assert that conv1d is trivially converted to linear
Expand Down Expand Up @@ -1286,14 +1288,16 @@ def test_replace_conv2d_with_linear(self) -> None:

# Conv and linear compute the same dot product but accumulate fp32
# terms in different order, so non-associativity of floating-point
# addition produces diffs up to ~1.2e-05. Use rtol=2e-05.
# addition produces diffs that can slightly exceed ~1.2e-05 on some
# runners (e.g. ~1.53e-05). Use slightly looser tolerances.
inputs = [x, weights, bias]
validate(
gm_before,
graph_after_passes,
inputs,
"ReplaceTrivialConvWithLinear",
rtol=2e-5,
rtol=3e-5,
atol=2e-6,
)

# Assert that conv2d is trivially converted to linear
Expand Down
4 changes: 3 additions & 1 deletion export/tests/test_target_recipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,9 @@ def _get_model_test_configs(
"android-arm64-snapdragon-fp16": (1e-2, 5e-2, None),
},
"mv3": {
"ios-arm64-coreml-fp16": (2e-1, 2e-1, 20),
# CoreML fp16 vs reference can marginally exceed 0.2 atol on some
# elements (e.g. ~0.228); loosen slightly for CI stability.
"ios-arm64-coreml-fp16": (2.5e-1, 2.5e-1, 20),
"ios-arm64-coreml-int8": (None, None, None),
"android-arm64-snapdragon-fp16": (None, None, None),
},
Expand Down
42 changes: 16 additions & 26 deletions runtime/core/exec_aten/util/tensor_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -1280,37 +1280,34 @@ bool tensor_is_default_dim_order(executorch::aten::Tensor t);
bool tensor_is_channels_last_dim_order(executorch::aten::Tensor t);

/**
* Asserts that four tensors have the same dim_order
* Returns true if all tensors are in a compatible layout for portable kernels.
*
* Note that this macro only tests dim order, but not others like actual data,
* sizes, etc.
* First, the legacy rule: either every tensor is contiguous-order
* (`is_contiguous_dim_order`) or every tensor is channels-last-order
* (`is_channels_last_dim_order`). That matches mixed-rank argument lists
* (e.g. batch norm with reduced outputs), broadcast shapes, and typical
* elementwise ops.
*
* If that fails, falls back to semantic equivalence for tensors with the same
* rank as the first tensor: matching dim_order labels, or matching strides on
* non-size-1 dimensions (degenerate-shape / ambiguous dim_order cases).
* Tensors with a different rank than the first must match the first tensor's
* format family (both contiguous-order, or both channels-last-order).
*
* Does not validate sizes, dtypes, or data.
*/
bool tensors_have_same_dim_order(
const executorch::aten::ArrayRef<executorch::aten::Tensor> tensor_list);

/**
* Asserts that two tensors have the same dim_order
*
* Note that this macro only tests dim order, but not others like actual data,
* sizes, etc.
*/

/** @see tensors_have_same_dim_order(ArrayRef) */
inline bool tensors_have_same_dim_order(
const executorch::aten::Tensor& a,
const executorch::aten::Tensor& b) {
executorch::aten::Tensor tensor_list[2] = {a, b};
return tensors_have_same_dim_order(tensor_list);
}

/**
* Asserts that three tensors have the same dim_order
*
* Note that this macro only tests dim order, but not others like actual data,
* sizes, etc.
*
*/

/** @see tensors_have_same_dim_order(ArrayRef) */
inline bool tensors_have_same_dim_order(
const executorch::aten::Tensor& a,
const executorch::aten::Tensor& b,
Expand All @@ -1319,14 +1316,7 @@ inline bool tensors_have_same_dim_order(
return tensors_have_same_dim_order(tensor_list);
}

/**
* Asserts that four tensors have the same dim_order
*
* Note that this macro only tests dim order, but not others like actual data,
* sizes, etc.
*
*/

/** @see tensors_have_same_dim_order(ArrayRef) */
inline bool tensors_have_same_dim_order(
const executorch::aten::Tensor& a,
const executorch::aten::Tensor& b,
Expand Down
88 changes: 83 additions & 5 deletions runtime/core/exec_aten/util/tensor_util_aten.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,46 @@ inline bool tensor_is_default_or_channels_last_dim_order(at::Tensor t) {
return ret_val;
}

namespace {

// Decides whether two same-rank tensors share a semantically equivalent
// memory layout. First compares dim_order labels exactly; if the labels
// differ, compares strides while ignoring dimensions where both sizes are 1
// (size-1 dims carry arbitrary strides and do not affect traversal order).
// Invoked only after the legacy all-contiguous / all-channels-last check
// has failed. Returns false if a dim_order cannot be obtained.
bool two_tensors_semantic_same_layout(
    const executorch::aten::Tensor& a,
    const executorch::aten::Tensor& b) {
  if (a.dim() != b.dim()) {
    return false;
  }
  const int ndim = static_cast<int>(a.dim());

  executorch::aten::DimOrderType dim_order_lhs[kTensorDimensionLimit];
  executorch::aten::DimOrderType dim_order_rhs[kTensorDimensionLimit];
  const bool orders_ok =
      get_dim_order(a, dim_order_lhs, a.dim()) == Error::Ok &&
      get_dim_order(b, dim_order_rhs, b.dim()) == Error::Ok;
  if (!orders_ok) {
    return false;
  }

  // Fast path: walk the labels; an early exit means they diverge.
  int d = 0;
  while (d < ndim && dim_order_lhs[d] == dim_order_rhs[d]) {
    ++d;
  }
  if (d == ndim) {
    return true;
  }

  // Fallback: strides must agree on every dimension that is non-trivial
  // in at least one of the two tensors.
  for (int i = 0; i < ndim; ++i) {
    const bool both_trivial = (a.size(i) == 1) && (b.size(i) == 1);
    if (!both_trivial && a.stride(i) != b.stride(i)) {
      return false;
    }
  }
  return true;
}

} // namespace

bool tensors_have_same_dim_order(
const executorch::aten::ArrayRef<executorch::aten::Tensor> tensor_list) {
if (tensor_list.size() < 2) {
Expand Down Expand Up @@ -110,12 +150,50 @@ bool tensors_have_same_dim_order(
is_channels_last_dim_order(other_dim_order, tensor_list[i].dim());
}

ET_CHECK_OR_RETURN_FALSE(
all_contiguous || all_channels_last,
"%zd input tensors have different dim orders",
tensor_list.size());
if (all_contiguous || all_channels_last) {
return true;
}

const executorch::aten::Tensor& ref = tensor_list[0];
const bool ref_contiguous =
is_contiguous_dim_order(first_dim_order, ref.dim());
const bool ref_channels_last =
is_channels_last_dim_order(first_dim_order, ref.dim());

return all_contiguous || all_channels_last;
for (size_t i = 1; i < tensor_list.size(); ++i) {
const executorch::aten::Tensor& t = tensor_list[i];
if (t.dim() == ref.dim()) {
if (!two_tensors_semantic_same_layout(ref, t)) {
ET_LOG(
Error,
"%zd input tensors have different dim orders",
tensor_list.size());
return false;
}
} else {
if (get_dim_order(t, other_dim_order, t.dim()) != Error::Ok) {
ET_LOG(
Error,
"%zd input tensors have different dim orders",
tensor_list.size());
return false;
}
const bool t_contiguous =
is_contiguous_dim_order(other_dim_order, t.dim());
const bool t_channels_last =
is_channels_last_dim_order(other_dim_order, t.dim());
const bool ok = (ref_contiguous && t_contiguous) ||
(ref_channels_last && t_channels_last);
if (!ok) {
ET_LOG(
Error,
"%zd input tensors have different dim orders",
tensor_list.size());
return false;
}
}
}
return true;
}

namespace internal {
Expand Down
100 changes: 93 additions & 7 deletions runtime/core/exec_aten/util/tensor_util_portable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,11 +109,56 @@ bool tensor_is_channels_last_dim_order(torch::executor::Tensor t) {
return ret_val;
}

bool tensors_have_same_dim_order(
const executorch::aten::ArrayRef<executorch::aten::Tensor> tensor_list) {
if (tensor_list.size() < 2) {
namespace {

// Helper: check if two tensors have semantically equivalent memory layouts.
// First tries exact dim_order label match; if labels differ, falls back to
// stride comparison that ignores size-1 dimensions (PyTorch semantics).
// In ExecuTorch, strides are derived from dim_order + sizes at tensor
// construction (TensorImpl), so this comparison is equivalent to comparing
// the actual memory layout.
bool two_tensors_same_dim_order(
    const executorch::aten::Tensor& a,
    const executorch::aten::Tensor& b) {
  if (a.dim() != b.dim()) {
    return false;
  }
  const int ndim = static_cast<int>(a.dim());

  // Fast path: check if dim_order labels match exactly.
  bool labels_match = true;
  for (int i = 0; i < ndim; ++i) {
    if (a.dim_order()[i] != b.dim_order()[i]) {
      labels_match = false;
      break;
    }
  }
  if (labels_match) {
    return true;
  }

  // Semantic equivalence: compare strides, ignoring size-1 dimensions.
  // Two tensors are equivalent if their strides match for all dimensions
  // where both tensors have size > 1. Size-1 dims don't affect memory
  // traversal order (PyTorch's is_contiguous uses this logic).
  for (int i = 0; i < ndim; ++i) {
    // Skip dimensions where both tensors have size 1.
    if (a.sizes()[i] == 1 && b.sizes()[i] == 1) {
      continue;
    }
    // For non-trivial dimensions, strides must match.
    if (a.strides()[i] != b.strides()[i]) {
      return false;
    }
  }
  return true;
}

// Tier A: every tensor is contiguous-order or every tensor is channels-last
// (original portable contract). Handles mixed rank, broadcast shapes, and
// reduced aux outputs (e.g. batch norm mean tensors).
bool tensors_share_legacy_format_family(
const executorch::aten::ArrayRef<executorch::aten::Tensor> tensor_list) {
bool all_contiguous = true;
bool all_channels_last = true;
for (const auto i : c10::irange(tensor_list.size())) {
Expand All @@ -126,12 +171,53 @@ bool tensors_have_same_dim_order(
tensor_list[i].dim_order().data(),
tensor_list[i].dim_order().size());
}
return all_contiguous || all_channels_last;
}

ET_CHECK_OR_RETURN_FALSE(
all_contiguous || all_channels_last,
"%zd input tensors have different dim orders",
tensor_list.size());
} // namespace

/**
 * Returns true when every tensor in `tensor_list` has a compatible memory
 * layout. Zero or one tensor is trivially compatible. Otherwise, Tier A
 * accepts the list when all tensors share a legacy format family
 * (all contiguous-order or all channels-last-order); Tier B compares each
 * remaining tensor against the first: same-rank tensors must be
 * semantically layout-equivalent, and different-rank tensors must fall in
 * the same format family as the first. Logs once and returns false on the
 * first incompatible tensor.
 */
bool tensors_have_same_dim_order(
    const executorch::aten::ArrayRef<executorch::aten::Tensor> tensor_list) {
  if (tensor_list.size() < 2) {
    return true;
  }

  // Tier A: the original portable contract.
  if (tensors_share_legacy_format_family(tensor_list)) {
    return true;
  }

  // Tier B: pairwise checks against the first tensor.
  const executorch::aten::Tensor& first = tensor_list[0];
  const bool first_contiguous = is_contiguous_dim_order(
      first.dim_order().data(), first.dim_order().size());
  const bool first_channels_last = is_channels_last_dim_order(
      first.dim_order().data(), first.dim_order().size());

  for (size_t idx = 1; idx < tensor_list.size(); ++idx) {
    const executorch::aten::Tensor& other = tensor_list[idx];
    bool compatible;
    if (other.dim() == first.dim()) {
      // Same rank: require semantically equivalent layouts.
      compatible = two_tensors_same_dim_order(first, other);
    } else {
      // Different rank: both must belong to the same format family.
      const bool other_contiguous = is_contiguous_dim_order(
          other.dim_order().data(), other.dim_order().size());
      const bool other_channels_last = is_channels_last_dim_order(
          other.dim_order().data(), other.dim_order().size());
      compatible = (first_contiguous && other_contiguous) ||
          (first_channels_last && other_channels_last);
    }
    if (!compatible) {
      ET_LOG(
          Error,
          "%zd input tensors have different dim orders",
          tensor_list.size());
      return false;
    }
  }
  return true;
}

Expand Down
Loading
Loading