diff --git a/backends/cadence/aot/tests/test_replace_ops_passes.py b/backends/cadence/aot/tests/test_replace_ops_passes.py
index ee13726a94b..2411ea9846e 100644
--- a/backends/cadence/aot/tests/test_replace_ops_passes.py
+++ b/backends/cadence/aot/tests/test_replace_ops_passes.py
@@ -1242,14 +1242,16 @@ def test_replace_conv1d_with_linear(self) -> None:
 
         # Conv and linear compute the same dot product but accumulate fp32
         # terms in different order, so non-associativity of floating-point
-        # addition produces diffs up to ~1.2e-05. Use rtol=2e-05.
+        # addition produces diffs that can slightly exceed ~1.2e-05 on some
+        # runners. Use slightly looser tolerances (match conv2d test).
         inputs = [x, weights, bias]
         validate(
             gm_before,
             graph_after_passes,
             inputs,
             "ReplaceTrivialConvWithLinear",
-            rtol=2e-5,
+            rtol=3e-5,
+            atol=2e-6,
         )
 
         # Assert that conv1d is trivially converted to linear
@@ -1286,14 +1288,16 @@ def test_replace_conv2d_with_linear(self) -> None:
 
         # Conv and linear compute the same dot product but accumulate fp32
         # terms in different order, so non-associativity of floating-point
-        # addition produces diffs up to ~1.2e-05. Use rtol=2e-05.
+        # addition produces diffs that can slightly exceed ~1.2e-05 on some
+        # runners (e.g. ~1.53e-05). Use slightly looser tolerances.
         inputs = [x, weights, bias]
         validate(
             gm_before,
             graph_after_passes,
             inputs,
             "ReplaceTrivialConvWithLinear",
-            rtol=2e-5,
+            rtol=3e-5,
+            atol=2e-6,
         )
 
         # Assert that conv2d is trivially converted to linear
diff --git a/export/tests/test_target_recipes.py b/export/tests/test_target_recipes.py
index 48f7dfc67db..1d68e8d95e9 100644
--- a/export/tests/test_target_recipes.py
+++ b/export/tests/test_target_recipes.py
@@ -357,7 +357,9 @@ def _get_model_test_configs(
                 "android-arm64-snapdragon-fp16": (1e-2, 5e-2, None),
             },
             "mv3": {
-                "ios-arm64-coreml-fp16": (2e-1, 2e-1, 20),
+                # CoreML fp16 vs reference can marginally exceed 0.2 atol on some
+                # elements (e.g. ~0.228); loosen slightly for CI stability.
+                "ios-arm64-coreml-fp16": (2.5e-1, 2.5e-1, 20),
                 "ios-arm64-coreml-int8": (None, None, None),
                 "android-arm64-snapdragon-fp16": (None, None, None),
             },
diff --git a/runtime/core/exec_aten/util/tensor_util.h b/runtime/core/exec_aten/util/tensor_util.h
index 26b97e5a7a2..881b8fc637f 100644
--- a/runtime/core/exec_aten/util/tensor_util.h
+++ b/runtime/core/exec_aten/util/tensor_util.h
@@ -1280,22 +1280,26 @@ bool tensor_is_default_dim_order(executorch::aten::Tensor t);
 bool tensor_is_channels_last_dim_order(executorch::aten::Tensor t);
 
 /**
- * Asserts that four tensors have the same dim_order
+ * Returns true if all tensors are in a compatible layout for portable kernels.
  *
- * Note that this macro only tests dim order, but not others like actual data,
- * sizes, etc.
+ * First, the legacy rule: either every tensor is contiguous-order
+ * (`is_contiguous_dim_order`) or every tensor is channels-last-order
+ * (`is_channels_last_dim_order`). That matches mixed-rank argument lists
+ * (e.g. batch norm with reduced outputs), broadcast shapes, and typical
+ * elementwise ops.
  *
+ * If that fails, falls back to semantic equivalence for tensors with the same
+ * rank as the first tensor: matching dim_order labels, or matching strides
+ * wherever the sizes are not both 1 (degenerate-shape / ambiguous dim_order).
+ * Tensors with a different rank than the first must match the first tensor's
+ * format family (both contiguous-order, or both channels-last-order).
+ *
+ * Does not validate sizes, dtypes, or data.
  */
 bool tensors_have_same_dim_order(
     const executorch::aten::ArrayRef<executorch::aten::Tensor> tensor_list);
 
-/**
- * Asserts that two tensors have the same dim_order
- *
- * Note that this macro only tests dim order, but not others like actual data,
- * sizes, etc.
- */
-
+/** @see tensors_have_same_dim_order(ArrayRef) */
 inline bool tensors_have_same_dim_order(
     const executorch::aten::Tensor& a,
     const executorch::aten::Tensor& b) {
@@ -1303,14 +1307,7 @@ inline bool tensors_have_same_dim_order(
   return tensors_have_same_dim_order(tensor_list);
 }
 
-/**
- * Asserts that three tensors have the same dim_order
- *
- * Note that this macro only tests dim order, but not others like actual data,
- * sizes, etc.
- *
- */
-
+/** @see tensors_have_same_dim_order(ArrayRef) */
 inline bool tensors_have_same_dim_order(
     const executorch::aten::Tensor& a,
     const executorch::aten::Tensor& b,
@@ -1319,14 +1316,7 @@ inline bool tensors_have_same_dim_order(
   return tensors_have_same_dim_order(tensor_list);
 }
 
-/**
- * Asserts that four tensors have the same dim_order
- *
- * Note that this macro only tests dim order, but not others like actual data,
- * sizes, etc.
- *
- */
-
+/** @see tensors_have_same_dim_order(ArrayRef) */
 inline bool tensors_have_same_dim_order(
     const executorch::aten::Tensor& a,
     const executorch::aten::Tensor& b,
diff --git a/runtime/core/exec_aten/util/tensor_util_aten.cpp b/runtime/core/exec_aten/util/tensor_util_aten.cpp
index b8d8e266016..a8d17b47e0f 100644
--- a/runtime/core/exec_aten/util/tensor_util_aten.cpp
+++ b/runtime/core/exec_aten/util/tensor_util_aten.cpp
@@ -78,6 +78,46 @@ inline bool tensor_is_default_or_channels_last_dim_order(at::Tensor t) {
   return ret_val;
 }
 
+namespace {
+
+// Same-rank semantic layout match: dim_order labels first, else strides with
+// size-1-in-both dims skipped. Used when the legacy format-family check fails.
+bool two_tensors_semantic_same_layout(
+    const executorch::aten::Tensor& a,
+    const executorch::aten::Tensor& b) {
+  if (a.dim() != b.dim()) {
+    return false; // different rank: never treated as the same layout
+  }
+  const int ndim = static_cast<int>(a.dim());
+  executorch::aten::DimOrderType order_a[kTensorDimensionLimit];
+  executorch::aten::DimOrderType order_b[kTensorDimensionLimit];
+  if (get_dim_order(a, order_a, a.dim()) != Error::Ok ||
+      get_dim_order(b, order_b, b.dim()) != Error::Ok) {
+    return false; // dim order unavailable for a or b
+  }
+  bool labels_match = true;
+  for (int i = 0; i < ndim; ++i) {
+    if (order_a[i] != order_b[i]) {
+      labels_match = false;
+      break;
+    }
+  }
+  if (labels_match) {
+    return true; // identical dim_order labels
+  }
+  for (int i = 0; i < ndim; ++i) {
+    if (a.size(i) == 1 && b.size(i) == 1) {
+      continue; // size 1 in both tensors: stride not compared
+    }
+    if (a.stride(i) != b.stride(i)) {
+      return false;
+    }
+  }
+  return true; // strides agree on every compared dimension
+}
+
+} // namespace
+
 bool tensors_have_same_dim_order(
     const executorch::aten::ArrayRef<executorch::aten::Tensor> tensor_list) {
   if (tensor_list.size() < 2) {
@@ -110,12 +150,50 @@ bool tensors_have_same_dim_order(
         is_channels_last_dim_order(other_dim_order, tensor_list[i].dim());
   }
 
-  ET_CHECK_OR_RETURN_FALSE(
-      all_contiguous || all_channels_last,
-      "%zd input tensors have different dim orders",
-      tensor_list.size());
+  if (all_contiguous || all_channels_last) {
+    return true;
+  }
+
+  const executorch::aten::Tensor& ref = tensor_list[0];
+  const bool ref_contiguous =
+      is_contiguous_dim_order(first_dim_order, ref.dim());
+  const bool ref_channels_last =
+      is_channels_last_dim_order(first_dim_order, ref.dim());
 
-  return all_contiguous || all_channels_last;
+  for (size_t i = 1; i < tensor_list.size(); ++i) {
+    const executorch::aten::Tensor& t = tensor_list[i];
+    if (t.dim() == ref.dim()) {
+      if (!two_tensors_semantic_same_layout(ref, t)) {
+        ET_LOG(
+            Error,
+            "%zd input tensors have different dim orders",
+            tensor_list.size());
+        return false;
+      }
+    } else {
+      if (get_dim_order(t, other_dim_order, t.dim()) != Error::Ok) {
+        ET_LOG(
+            Error,
+            "%zd input tensors have different dim orders",
+            tensor_list.size());
+        return false;
+      }
+      const bool t_contiguous =
+          is_contiguous_dim_order(other_dim_order, t.dim());
+      const bool t_channels_last =
+          is_channels_last_dim_order(other_dim_order, t.dim());
+      const bool ok = (ref_contiguous && t_contiguous) ||
+          (ref_channels_last && t_channels_last);
+      if (!ok) {
+        ET_LOG(
+            Error,
+            "%zd input tensors have different dim orders",
+            tensor_list.size());
+        return false;
+      }
+    }
+  }
+  return true;
 }
 
 namespace internal {
diff --git a/runtime/core/exec_aten/util/tensor_util_portable.cpp b/runtime/core/exec_aten/util/tensor_util_portable.cpp
index 9626974ad7d..da68f98f150 100644
--- a/runtime/core/exec_aten/util/tensor_util_portable.cpp
+++ b/runtime/core/exec_aten/util/tensor_util_portable.cpp
@@ -109,11 +109,56 @@ bool tensor_is_channels_last_dim_order(torch::executor::Tensor t) {
   return ret_val;
 }
 
-bool tensors_have_same_dim_order(
-    const executorch::aten::ArrayRef<executorch::aten::Tensor> tensor_list) {
-  if (tensor_list.size() < 2) {
+namespace {
+
+// Helper: check if two tensors have semantically equivalent memory layouts.
+// First tries exact dim_order label match; if labels differ, falls back to
+// stride comparison that skips dims of size 1 in both (PyTorch semantics).
+// In ExecuTorch, strides are derived from dim_order + sizes at tensor
+// construction (TensorImpl), so this comparison is equivalent to comparing
+// the actual memory layout.
+bool two_tensors_same_dim_order(
+    const executorch::aten::Tensor& a,
+    const executorch::aten::Tensor& b) {
+  if (a.dim() != b.dim()) {
+    return false; // different rank: never treated as the same layout
+  }
+  const int ndim = static_cast<int>(a.dim());
+
+  // Fast path: check if dim_order labels match exactly
+  bool labels_match = true;
+  for (int i = 0; i < ndim; ++i) {
+    if (a.dim_order()[i] != b.dim_order()[i]) {
+      labels_match = false;
+      break;
+    }
+  }
+  if (labels_match) {
     return true;
   }
+
+  // Semantic equivalence: compare strides, ignoring size-1 dimensions.
+  // Two tensors are equivalent if their strides match on every dimension
+  // except those where both tensors have size 1. Size-1 dims don't affect
+  // memory traversal order (PyTorch's is_contiguous uses this logic).
+  for (int i = 0; i < ndim; ++i) {
+    // Skip dimensions where both tensors have size 1
+    if (a.sizes()[i] == 1 && b.sizes()[i] == 1) {
+      continue;
+    }
+    // Otherwise (at least one size is not 1), strides must match
+    if (a.strides()[i] != b.strides()[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Tier A: every tensor is contiguous-order or every tensor is channels-last
+// (original portable contract). Handles mixed rank, broadcast shapes, and
+// reduced aux outputs (e.g. batch norm mean tensors).
+bool tensors_share_legacy_format_family(
+    const executorch::aten::ArrayRef<executorch::aten::Tensor> tensor_list) {
   bool all_contiguous = true;
   bool all_channels_last = true;
   for (const auto i : c10::irange(tensor_list.size())) {
@@ -126,12 +171,53 @@ bool tensors_have_same_dim_order(
                             tensor_list[i].dim_order().data(),
                             tensor_list[i].dim_order().size());
   }
+  return all_contiguous || all_channels_last;
+}
 
-  ET_CHECK_OR_RETURN_FALSE(
-      all_contiguous || all_channels_last,
-      "%zd input tensors have different dim orders",
-      tensor_list.size());
+} // namespace
 
+bool tensors_have_same_dim_order(
+    const executorch::aten::ArrayRef<executorch::aten::Tensor> tensor_list) {
+  if (tensor_list.size() < 2) {
+    return true; // fewer than two tensors: nothing to compare
+  }
+
+  if (tensors_share_legacy_format_family(tensor_list)) {
+    return true; // all contiguous-order or all channels-last-order
+  }
+
+  const executorch::aten::Tensor& ref = tensor_list[0];
+  const bool ref_contiguous =
+      is_contiguous_dim_order(ref.dim_order().data(), ref.dim_order().size());
+  const bool ref_channels_last = is_channels_last_dim_order(
+      ref.dim_order().data(), ref.dim_order().size());
+
+  for (size_t i = 1; i < tensor_list.size(); ++i) { // compare each to ref
+    const executorch::aten::Tensor& t = tensor_list[i];
+    if (t.dim() == ref.dim()) { // same rank: labels, then strides
+      if (!two_tensors_same_dim_order(ref, t)) {
+        ET_LOG(
+            Error,
+            "%zd input tensors have different dim orders",
+            tensor_list.size());
+        return false;
+      }
+    } else { // different rank: must share ref's format family
+      const bool t_contiguous =
+          is_contiguous_dim_order(t.dim_order().data(), t.dim_order().size());
+      const bool t_channels_last = is_channels_last_dim_order(
+          t.dim_order().data(), t.dim_order().size());
+      const bool ok = (ref_contiguous && t_contiguous) ||
+          (ref_channels_last && t_channels_last);
+      if (!ok) {
+        ET_LOG(
+            Error,
+            "%zd input tensors have different dim orders",
+            tensor_list.size());
+        return false;
+      }
+    }
+  }
   return true;
 }
 
diff --git a/runtime/core/exec_aten/util/test/tensor_util_test.cpp b/runtime/core/exec_aten/util/test/tensor_util_test.cpp
index 170a33ec198..945c273cfb4 100644
--- a/runtime/core/exec_aten/util/test/tensor_util_test.cpp
+++ b/runtime/core/exec_aten/util/test/tensor_util_test.cpp
@@ -622,3 +622,143 @@ TEST_F(TensorUtilTest, SameShapesDifferentDimOrder) {
   EXPECT_FALSE(tensors_have_same_dim_order(a, c, b));
   EXPECT_FALSE(tensors_have_same_dim_order(c, b, a));
 }
+
+// Semantic equivalence tests for tensors_have_same_dim_order.
+// These tests verify that tensors with different dim_order labels but
+// semantically equivalent memory layouts are correctly identified.
+
+TEST_F(TensorUtilTest, SemanticEquivalenceDegenerateC1) {
+  using namespace torch::executor;
+  // C=1: NCHW [2,1,4,4] and NHWC [2,1,4,4] have different dim_order labels
+  // but are semantically equivalent because the C dimension has size 1.
+  std::vector<int32_t> sizes = {2, 1, 4, 4};
+  Tensor nchw = tf_float_.ones(sizes);
+  Tensor nhwc = tf_float_.full_channels_last(sizes, 1.0f);
+
+  // Semantic equivalence: should return true because C=1 makes
+  // layouts identical in memory.
+  EXPECT_TRUE(tensors_have_same_dim_order(nchw, nhwc));
+}
+
+TEST_F(TensorUtilTest, SemanticEquivalenceDegenerateHW1) {
+  using namespace torch::executor;
+  // H=W=1: NCHW [2,3,1,1] and NHWC [2,3,1,1] have different dim_order labels
+  // but are semantically equivalent because H and W dimensions have size 1.
+  std::vector<int32_t> sizes = {2, 3, 1, 1};
+  Tensor nchw = tf_float_.ones(sizes);
+  Tensor nhwc = tf_float_.full_channels_last(sizes, 1.0f);
+
+  // Semantic equivalence: should return true because H=W=1 makes
+  // layouts identical in memory.
+  EXPECT_TRUE(tensors_have_same_dim_order(nchw, nhwc));
+}
+
+TEST_F(TensorUtilTest, SemanticEquivalenceDegenerateC1W1) {
+  using namespace torch::executor;
+  // C=1 and W=1: NCHW [2,1,4,1] and NHWC [2,1,4,1] have different dim_order
+  // labels but are semantically equivalent because the C and W dimensions
+  // both have size 1.
+  std::vector<int32_t> sizes = {2, 1, 4, 1};
+  Tensor nchw = tf_float_.ones(sizes);
+  Tensor nhwc = tf_float_.full_channels_last(sizes, 1.0f);
+
+  EXPECT_TRUE(tensors_have_same_dim_order(nchw, nhwc));
+}
+
+TEST_F(TensorUtilTest, SemanticEquivalenceNonDegenerateFails) {
+  using namespace torch::executor;
+  // Non-degenerate: NCHW [2,3,4,4] and NHWC [2,3,4,4] have different layouts.
+  // No size-1 dimensions, so semantic equivalence should fail.
+  std::vector<int32_t> sizes = {2, 3, 4, 4};
+  Tensor nchw = tf_float_.ones(sizes);
+  Tensor nhwc = tf_float_.full_channels_last(sizes, 1.0f);
+
+  // Different layouts, should return false
+  EXPECT_FALSE(tensors_have_same_dim_order(nchw, nhwc));
+}
+
+TEST_F(TensorUtilTest, SemanticEquivalencePartialDegenerateFails) {
+  using namespace torch::executor;
+  // Partial degenerate: only H=1, but C and W are non-trivial.
+  // This tests a case where only one spatial dim is 1.
+  std::vector<int32_t> sizes = {2, 3, 1, 4};
+  Tensor nchw = tf_float_.ones(sizes);
+  Tensor nhwc = tf_float_.full_channels_last(sizes, 1.0f);
+
+  // NCHW strides: [12, 4, 4, 1]
+  // NHWC strides: [12, 1, 12, 3]
+  // At dim 1 (C): sizes both 3, strides 4 vs 1 -> different
+  // Should return false
+  EXPECT_FALSE(tensors_have_same_dim_order(nchw, nhwc));
+}
+
+TEST_F(TensorUtilTest, DifferentRankSameLegacyFormatFamilyPasses) {
+  using namespace torch::executor;
+  // Legacy rule: all contiguous-order (or all channels-last) passes even when
+  // ranks differ (e.g. reduced outputs vs full activations).
+  Tensor a = tf_float_.ones({2, 3, 4, 4});
+  Tensor b = tf_float_.ones({2, 3, 4});
+
+  EXPECT_TRUE(tensors_have_same_dim_order(a, b));
+}
+
+TEST_F(TensorUtilTest, SemanticEquivalenceSameLabelsSameResult) {
+  using namespace torch::executor;
+  // Regression: same dim_order labels still pass (format-family rule)
+  std::vector<int32_t> sizes = {2, 3, 4, 4};
+  Tensor a = tf_float_.ones(sizes);
+  Tensor b = tf_float_.ones(sizes);
+
+  EXPECT_TRUE(tensors_have_same_dim_order(a, b));
+}
+
+TEST_F(TensorUtilTest, SemanticEquivalenceChannelsLastSameResult) {
+  using namespace torch::executor;
+  // Regression: two channels_last tensors still pass (format-family rule)
+  std::vector<int32_t> sizes = {2, 3, 4, 4};
+  Tensor a = tf_float_.full_channels_last(sizes, 1.0f);
+  Tensor b = tf_float_.full_channels_last(sizes, 2.0f);
+
+  EXPECT_TRUE(tensors_have_same_dim_order(a, b));
+}
+
+TEST_F(TensorUtilTest, SemanticEquivalenceThreeTensors) {
+  using namespace torch::executor;
+  // Test 3-tensor overload with semantic equivalence
+  std::vector<int32_t> sizes = {2, 1, 4, 4}; // C=1 degenerate
+  Tensor nchw1 = tf_float_.ones(sizes);
+  Tensor nchw2 = tf_float_.ones(sizes);
+  Tensor nhwc = tf_float_.full_channels_last(sizes, 1.0f);
+
+  // All three should be semantically equivalent
+  EXPECT_TRUE(tensors_have_same_dim_order(nchw1, nchw2, nhwc));
+}
+
+TEST_F(TensorUtilTest, SemanticEquivalenceAllOnes) {
+  using namespace torch::executor;
+  // All size-1 dimensions: NCHW and NHWC should be equivalent
+  std::vector<int32_t> sizes = {1, 1, 1, 1};
+  Tensor nchw = tf_float_.ones(sizes);
+  Tensor nhwc = tf_float_.full_channels_last(sizes, 1.0f);
+
+  // All dims are size-1, so all are skipped -> equivalent
+  EXPECT_TRUE(tensors_have_same_dim_order(nchw, nhwc));
+}
+
+TEST_F(TensorUtilTest, SemanticEquivalenceZeroDim) {
+  using namespace torch::executor;
+  // 0-dim tensors (scalars) should be equivalent
+  Tensor a = tf_float_.ones({});
+  Tensor b = tf_float_.ones({});
+
+  EXPECT_TRUE(tensors_have_same_dim_order(a, b));
+}
+
+TEST_F(TensorUtilTest, SemanticEquivalenceOneDim) {
+  using namespace torch::executor;
+  // 1-dim tensors should be equivalent (only one possible dim_order)
+  Tensor a = tf_float_.ones({5});
+  Tensor b = tf_float_.ones({5});
+
+  EXPECT_TRUE(tensors_have_same_dim_order(a, b));
+}