Skip to content

Commit 5b210dd

Browse files
authored
Handle swizzle1d in isResharding (#6028)
1 parent 54d48ae commit 5b210dd

File tree

5 files changed

+99
-10
lines changed

5 files changed

+99
-10
lines changed

csrc/multidevice/device_mesh.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -175,9 +175,9 @@ std::vector<DeviceIdxType> DeviceMesh::getSlice(
175175
indices.reserve(rank());
176176
for (int64_t i : arange(rank())) {
177177
if (i == axis) {
178-
indices.push_back(at::indexing::Slice());
178+
indices.emplace_back(at::indexing::Slice());
179179
} else {
180-
indices.push_back(index[i]);
180+
indices.emplace_back(index[i]);
181181
}
182182
}
183183
at::Tensor slice = devices_.index(indices);

csrc/multidevice/device_mesh.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -96,8 +96,9 @@ class DeviceMesh final {
9696
}
9797

9898
// Returns the rank (number of dimensions) of the mesh.
99+
// Returns -1 if the mesh is empty.
99100
int64_t rank() const {
100-
return devices_.dim();
101+
return size() > 0 ? devices_.dim() : -1;
101102
}
102103

103104
bool operator==(const DeviceMesh& other) const {

csrc/multidevice/resharding.cpp

Lines changed: 46 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,8 @@ const std::vector<IterDomain*>& getDomainOf(
5959
std::pair<Val*, bool> computeLoopIndex(
6060
IterDomain* id,
6161
const std::vector<IterDomain*>& sources,
62-
std::unordered_map<IterDomain*, std::pair<Val*, bool>>& id_to_index) {
62+
std::unordered_map<IterDomain*, std::pair<Val*, bool>>& id_to_index,
63+
const std::unordered_map<ParallelType, Val*>& pt_to_index) {
6364
if (id == nullptr) {
6465
return {nullptr, false};
6566
}
@@ -86,7 +87,9 @@ std::pair<Val*, bool> computeLoopIndex(
8687
div(in_info.first, inner->extent()), in_info.second};
8788
id_to_index[inner] = {
8889
mod(in_info.first, inner->extent()), in_info.second};
89-
} else if (auto* merge = dynamic_cast<Merge*>(transform)) {
90+
continue;
91+
}
92+
if (auto* merge = dynamic_cast<Merge*>(transform)) {
9093
auto* outer = merge->outer()->as<IterDomain>();
9194
auto* inner = merge->inner()->as<IterDomain>();
9295
auto* out = merge->out()->as<IterDomain>();
@@ -96,9 +99,22 @@ std::pair<Val*, bool> computeLoopIndex(
9699
id_to_index[out] = {
97100
add(mul(outer_info.first, inner->extent()), inner_info.first),
98101
outer_info.second || inner_info.second};
99-
} else {
100-
NVF_THROW("Unexpected transform: ", transform);
102+
continue;
101103
}
104+
if (auto* swizzle = dynamic_cast<Swizzle1D*>(transform)) {
105+
auto* in = swizzle->in()->as<IterDomain>();
106+
auto* out = swizzle->out()->as<IterDomain>();
107+
108+
const auto& in_info = id_to_index.at(in);
109+
Val* extent = out->extent();
110+
Val* pt_val = pt_to_index.at(swizzle->parallelType());
111+
// Inverse of the swizzle formula in_idx = (out_idx + pt_val) % extent:
112+
// out_idx = (in_idx - pt_val + extent) % extent
113+
id_to_index[out] = {
114+
mod(add(sub(in_info.first, pt_val), extent), extent), in_info.second};
115+
continue;
116+
}
117+
NVF_THROW("Unexpected transform: ", transform);
102118
}
103119

104120
return id_to_index.at(id);
@@ -241,9 +257,26 @@ bool haveDifferentShardings(
241257
std::vector<Val*> assumptions;
242258
assumptions.reserve(
243259
(producer->getLogicalDomain().size() +
244-
consumer->getMaybeRootDomain().size()) *
260+
consumer->getMaybeRootDomain().size() + kParallelTypeDIDs.size()) *
245261
2);
246262

263+
// Create symbolic Vals for each device parallel type present in the mesh,
264+
// representing the device's index within the team for that type. These are
265+
// used by computeLoopIndex to symbolically compute Swizzle1D outputs.
266+
std::unordered_map<ParallelType, Val*> pt_to_index;
267+
const DeviceMesh& mesh = producer->getDeviceMesh();
268+
for (ParallelType pt : kParallelTypeDIDs) {
269+
if (!mesh.hasParallelType(pt)) {
270+
continue;
271+
}
272+
Val* device_idx = IrBuilder::create<Val>(DataType::Index);
273+
pt_to_index[pt] = device_idx;
274+
Val* team_size = IrBuilder::create<Val>(mesh.size(pt), DataType::Index);
275+
assumptions.push_back(
276+
SimplifyingIrBuilder::leExpr(fusion->zeroVal(), device_idx));
277+
assumptions.push_back(SimplifyingIrBuilder::ltExpr(device_idx, team_size));
278+
}
279+
247280
auto create_index = [&](IterDomain* id, bool mapped) {
248281
auto* index = IrBuilder::create<Val>(DataType::Index);
249282
NVF_ERROR(id_to_index.emplace(id, std::make_pair(index, mapped)).second);
@@ -311,7 +344,10 @@ bool haveDifferentShardings(
311344
Val* p_index = nullptr;
312345
bool p_mapped = false;
313346
std::tie(p_index, p_mapped) = computeLoopIndex(
314-
p_id, getDomainOf(producer, DomainType::kLogical), id_to_index);
347+
p_id,
348+
getDomainOf(producer, DomainType::kLogical),
349+
id_to_index,
350+
pt_to_index);
315351
if (!p_mapped) {
316352
p_index = nullptr;
317353
}
@@ -320,7 +356,10 @@ bool haveDifferentShardings(
320356
Val* c_index = nullptr;
321357
bool c_mapped = false;
322358
std::tie(c_index, c_mapped) = computeLoopIndex(
323-
c_id, getDomainOf(consumer, DomainType::kRoot), id_to_index);
359+
c_id,
360+
getDomainOf(consumer, DomainType::kRoot),
361+
id_to_index,
362+
pt_to_index);
324363
if (!c_mapped) {
325364
c_index = nullptr;
326365
}

tests/cpp/test_multidevice_host_ir.cpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -455,8 +455,11 @@ TEST_F(MultiDeviceHostIrTest, SymmetricContiguousView) {
455455
FusionGuard::setCurFusion(hic.get());
456456

457457
// Create input and output TensorViews
458+
DeviceMesh mesh = DeviceMesh::createForNumDevices(communicator_size);
459+
458460
TensorView* input_tv = makeContigConcreteTensor(sharded_sizes);
459461
input_tv->setMemoryType(MemoryType::Symmetric);
462+
input_tv->setDeviceMesh(mesh);
460463
input_tv->axis(0)->parallelize(ParallelType::DIDx);
461464

462465
TensorView* output_tv = makeContigConcreteTensor(unsharded_sizes);

tests/cpp/test_resharding.cpp

Lines changed: 46 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -631,4 +631,50 @@ TEST_F(ReshardingSelectOpTest, ReshardingSelectIntoNonDeviceDim) {
631631
EXPECT_TRUE(isResharding(tv1->definition()));
632632
}
633633

634+
TEST_F(ReshardingTest, Swizzle1D_DIDToStream) {
635+
Fusion fusion;
636+
FusionGuard fg(&fusion);
637+
const int d = 2;
638+
auto mesh = DeviceMesh::createForNumDevices(d);
639+
640+
TensorView* in = makeContigTensor(1);
641+
in->setDeviceMesh(mesh);
642+
in->outer_split(0, d);
643+
in->axis(0)->parallelize(ParallelType::DIDx);
644+
645+
TensorView* out = set(in);
646+
out->setDeviceMesh(mesh);
647+
out->outer_split(0, d);
648+
out->swizzle1d(0, ParallelType::DIDx);
649+
out->axis(0)->parallelize(ParallelType::Stream);
650+
651+
EXPECT_TRUE(haveDifferentShardings(
652+
in, DomainType::kLoop, out, DomainType::kLoop, {ParallelType::Stream}));
653+
654+
EXPECT_TRUE(haveDifferentShardings(
655+
in, DomainType::kLoop, out, DomainType::kLoop, {ParallelType::DIDx}));
656+
}
657+
658+
TEST_F(ReshardingTest, Swizzle1D_ConsistentSwizzle) {
659+
Fusion fusion;
660+
FusionGuard fg(&fusion);
661+
const int d = 2;
662+
auto mesh = DeviceMesh::createForNumDevices(d);
663+
664+
TensorView* in = makeContigTensor(1);
665+
in->setDeviceMesh(mesh);
666+
in->outer_split(0, d);
667+
in->swizzle1d(0, ParallelType::DIDx);
668+
in->axis(0)->parallelize(ParallelType::Stream);
669+
670+
TensorView* out = set(in);
671+
out->setDeviceMesh(mesh);
672+
out->outer_split(0, d);
673+
out->swizzle1d(0, ParallelType::DIDx);
674+
out->axis(0)->parallelize(ParallelType::Stream);
675+
676+
EXPECT_FALSE(haveDifferentShardings(
677+
in, DomainType::kLoop, out, DomainType::kLoop, {ParallelType::Stream}));
678+
}
679+
634680
} // namespace nvfuser

0 commit comments

Comments (0)