[Mlir-commits] [mlir] [mlir][mesh] Add resharding spmdization on a 1D device mesh (PR #76179)

Boian Petkantchin llvmlistbot at llvm.org
Tue Jan 2 10:45:15 PST 2024


https://github.com/sogartar updated https://github.com/llvm/llvm-project/pull/76179

From 28111bed8dbc8bf057070baa56edfb530728dcb1 Mon Sep 17 00:00:00 2001
From: Boian Petkantchin <boian.petkantchin at amd.com>
Date: Wed, 6 Dec 2023 07:58:53 -0800
Subject: [PATCH 1/6] [mlir][mesh] Add resharding spmdization on a 1D device
 mesh

The current implementation supports only sharding of tensor axes that have size
divisible by the mesh axis size.
---
 mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td |  11 +-
 mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h   |   3 +
 mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td  |  48 +-
 .../Mesh/Transforms/ReshardingSpmdizationDoc  | 621 +++++++++++++++++
 .../Dialect/Mesh/Transforms/Spmdization.h     |  35 +
 mlir/include/mlir/Support/MathExtras.h        |  11 +
 mlir/lib/Dialect/Mesh/IR/MeshOps.cpp          | 213 ++++--
 .../Dialect/Mesh/Transforms/CMakeLists.txt    |   3 +
 .../Dialect/Mesh/Transforms/Spmdization.cpp   | 639 ++++++++++++++++++
 mlir/test/Dialect/Mesh/invalid.mlir           |  96 +++
 mlir/test/Dialect/Mesh/ops.mlir               |  49 ++
 .../Dialect/Mesh/resharding-spmdization.mlir  | 154 +++++
 mlir/test/lib/Dialect/Mesh/CMakeLists.txt     |   1 +
 .../Mesh/TestReshardingSpmdization.cpp        | 124 ++++
 mlir/tools/mlir-opt/mlir-opt.cpp              |   2 +
 15 files changed, 1955 insertions(+), 55 deletions(-)
 create mode 100644 mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc
 create mode 100644 mlir/include/mlir/Dialect/Mesh/Transforms/Spmdization.h
 create mode 100644 mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp
 create mode 100644 mlir/test/Dialect/Mesh/resharding-spmdization.mlir
 create mode 100644 mlir/test/lib/Dialect/Mesh/TestReshardingSpmdization.cpp

diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td b/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td
index 9d39b1b3329fb4..a9d30dfbb9a76e 100644
--- a/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td
+++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td
@@ -33,6 +33,10 @@ def Mesh_Dialect : Dialect {
   let useDefaultAttributePrinterParser = 1;
   let hasConstantMaterializer = 1;
 }
+
+def Mesh_MeshAxis : I<16>;
+def Mesh_MeshAxesAttr : DenseArrayAttrBase<"DenseI16ArrayAttr", "int16_t", "i16">;
+
 //===----------------------------------------------------------------------===//
 // Mesh Enums.
 //===----------------------------------------------------------------------===//
@@ -125,7 +129,7 @@ def MeshSharding : AttrDef<Mesh_Dialect, "MeshSharding"> {
 
     // The tensor is sharded on the first dimension along axis 0 of @mesh0 and
     // it is also a partial_sum along mesh axis 1.
-    tensor<4x8xf32, #mesh.shard<@mesh0, [[0], [], [1]]>
+    tensor<4x8xf32, #mesh.shard<@mesh0, [[0], []], partial = sum[1]>
 
     // The tensor is sharded on the first dimension along axis 0 of @mesh0 and
     // it is also a partial_max along mesh axis 1.
@@ -158,6 +162,11 @@ def MeshSharding : AttrDef<Mesh_Dialect, "MeshSharding"> {
     }]>
   ];
 
+  let extraClassDeclaration = [{
+    bool operator==(::mlir::Attribute rhs) const;
+    bool operator==(::mlir::mesh::MeshShardingAttr rhs) const;
+  }];
+
   let genVerifyDecl = 1;
 }
 
diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h
index 9077d2eb0189b7..ce7d5d045122d9 100644
--- a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h
+++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h
@@ -30,6 +30,9 @@
 namespace mlir {
 namespace mesh {
 
+using MeshAxis = int16_t;
+using MeshAxesAttr = DenseI16ArrayAttr;
+
 bool isReductionLoop(IteratorType iType);
 
 bool areReductionAndPartialMatch(IteratorType iType, Partial partial);
diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td
index 784f3eb97763ad..1ed54b6519e4d8 100644
--- a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td
+++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td
@@ -10,6 +10,7 @@
 #define MLIR_DIALECT_MESH_IR_MESHOPS_TD
 
 include "mlir/Dialect/Mesh/IR/MeshBase.td"
+include "mlir/Dialect/Shape/IR/ShapeBase.td"
 include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/IR/BuiltinTypes.td"
@@ -95,6 +96,28 @@ def Mesh_ClusterOp : Mesh_Op<"cluster", [Symbol]> {
   let hasVerifier = 1;
 }
 
+def Mesh_ClusterShapeOp : Mesh_Op<"cluster_shape", [Pure, DeclareOpInterfaceMethods<SymbolUserOpInterface>]> {
+  let summary = "Get the shape of the cluster.";
+  let arguments = (ins
+    FlatSymbolRefAttr:$mesh,
+    DefaultValuedAttr<Mesh_MeshAxesAttr, "{}">:$axes
+  );
+
+  let results = (outs
+    Variadic<Index>:$result
+  );
+
+  let assemblyFormat = [{
+    $mesh (`axes` `=` $axes^)?
+    attr-dict `:` type($result)
+  }];
+
+  let builders = [
+    OpBuilder<(ins "::mlir::mesh::ClusterOp":$mesh)>,
+    OpBuilder<(ins "StringRef":$mesh, "ArrayRef<int16_t>":$axes)>
+  ];
+}
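+// Example textual form of mesh.cluster_shape (a sketch, assuming a 2-D mesh
+// named @mesh0 declared elsewhere):
+//   %shape:2 = mesh.cluster_shape @mesh0 axes = [0, 1] : index, index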
+
 def Mesh_ShardOp : Mesh_Op<"shard", [Pure, SameOperandsAndResultType]> {
   let summary = "Annotate on how a tensor is sharded across a mesh cluster.";
   let description = [{
@@ -186,6 +209,29 @@ def Mesh_ShardOp : Mesh_Op<"shard", [Pure, SameOperandsAndResultType]> {
   }];
 }
 
+def Mesh_ProcessIndexOp : Mesh_Op<"process_index", [Pure, DeclareOpInterfaceMethods<SymbolUserOpInterface>]> {
+  let summary = "Get the index of current device along specified mesh axis.";
+  let description = [{
+    This operation is used in the SPMD form of the IR.
+    The `axes` must be non-negative and less than the total number of mesh axes.
+  }];
+  let arguments = (ins
+    FlatSymbolRefAttr:$mesh,
+    DefaultValuedAttr<Mesh_MeshAxesAttr, "{}">:$axes
+  );
+  let results = (outs
+    Variadic<Index>:$result
+  );
+  let assemblyFormat = [{
+    `on` $mesh (`axes` `=` $axes^)?
+    attr-dict `:` type($result)
+  }];
+  let builders = [
+    OpBuilder<(ins "::mlir::mesh::ClusterOp":$mesh)>,
+    OpBuilder<(ins "StringRef":$mesh, "ArrayRef<int16_t>":$axes)>
+  ];
+}
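+// Example textual form of mesh.process_index (a sketch, assuming a 2-D mesh
+// named @mesh0 declared elsewhere):
+//   %idx:2 = mesh.process_index on @mesh0 axes = [0, 1] : index, index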
+
 //===----------------------------------------------------------------------===//
 // collective communication ops
 //===----------------------------------------------------------------------===//
@@ -197,7 +243,7 @@ class Mesh_CollectiveCommunicationOpBase<
       [DeclareOpInterfaceMethods<SymbolUserOpInterface>])> {
   dag commonArgs = (ins
     FlatSymbolRefAttr:$mesh,
-    DefaultValuedAttr<DenseI16ArrayAttr, "{}">:$mesh_axes
+    DefaultValuedAttr<Mesh_MeshAxesAttr, "{}">:$mesh_axes
   );
 }
 
diff --git a/mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc b/mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc
new file mode 100644
index 00000000000000..181f07177e0af9
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc
@@ -0,0 +1,621 @@
+Resharding Spmdization Examples
+
+--------------------------------------------------------------
+
+Reshard 2x3 tensor from sharding [[0, 1]] to sharding [[0, 1]] on a 2x3 mesh.
+
+unsharded 2x3 tensor
+11 12 13
+21 22 23
+
+sharded on a 2x3 mesh
+
+sharding = [[0, 1]]
+
+mesh contents:
+
+mesh axis 1
+----------->
++----+----+----+ mesh axis 0 |
+| 11 | 12 | 13 |             |
++----+----+----+             |
+| 21 | 22 | 23 |             |
++----+----+----+             ↓
+
+Transform into
+sharding = [[1, 0]]
+
+mesh axis 1
+----------->
++----+----+----+ mesh axis 0 |
+| 11 | 13 | 22 |             |
++----+----+----+             |
+| 12 | 21 | 23 |             |
++----+----+----+             ↓
+
+Swap contents on devices that have the same linear index in the 2 shardings.
+
+--------------------------------------------------------------
+
+Reshard 2x3 tensor from sharding [[0, 1]] to sharding [[1]] on a 2x3 mesh.
+
+unsharded 2x3 tensor
+11 12 13
+21 22 23
+
+sharded on a 2x3 mesh
+
+sharding = [[0, 1]]
+
+mesh contents:
+
+mesh axis 1
+----------->
++----+----+----+ mesh axis 0 |
+| 11 | 12 | 13 |             |
++----+----+----+             |
+| 21 | 22 | 23 |             |
++----+----+----+             ↓
+
+Transform into
+sharding = [[1]]
+
+mesh axis 1
+----------->
++----+----+----+ mesh axis 0 |
+| 11 | 12 | 13 |             |
+| 21 | 22 | 23 |             |
++----+----+----+             |
+| 11 | 12 | 13 |             |
+| 21 | 22 | 23 |             |
++----+----+----+             ↓
+
+All-gather along mesh axis 0.
+
+--------------------------------------------------------------
+
+Reshard 2x6 tensor from sharding [[], [0, 1]] to sharding [[], [0]] on a 2x3 mesh.
+
+unsharded 2x6 tensor
+11 12 13 14 15 16
+21 22 23 24 25 26
+
+sharded on a 2x3 mesh
+
+sharding = [[], [0, 1]]
+
+mesh contents:
+
+mesh axis 1
+----------->
++----+----+----+ mesh axis 0 |
+| 11 | 12 | 13 |             |
+| 21 | 22 | 23 |             |
++----+----+----+             |
+| 14 | 15 | 16 |             |
+| 24 | 25 | 26 |             |
++----+----+----+             ↓
+
+Transform into
+sharding = [[], [0]]
+
+mesh axis 1
+----------->
++----------+----------+ mesh axis 0 |
+| 11 12 13 | 11 12 13 |             |
+| 21 22 23 | 21 22 23 |             |
++----------+----------+             |
+| 14 15 16 | 14 15 16 |             |
+| 24 25 26 | 24 25 26 |             |
++----------+----------+             ↓
+
+all-gather along mesh axis 1
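+
+A sketch of this step as an all_gather op, in the informal notation used for
+all_to_all later in this document (i8 elements assumed; each per-device shard
+goes from 2x1 to 2x3):
+
+%1 = all_gather %0 on @mesh mesh_axes = [1] gather_axis = 1 : tensor<2x1xi8> -> tensor<2x3xi8>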
+
+--------------------------------------------------------------
+
+Reshard 4x8 tensor from sharding [[0], [1, 2]] to sharding [[0], [2]] on a 2x2x2 mesh.
+
+unsharded 4x8 tensor
+11 12 13 14 15 16 17 18
+21 22 23 24 25 26 27 28
+31 32 33 34 35 36 37 38
+41 42 43 44 45 46 47 48
+
+sharded on a 2x2x2 mesh
+
+sharding = [[0], [1, 2]]
+
+mesh contents:
+
+mesh axis 2
+----------->
++-------+-------+ mesh axis 1 | mesh axis 0 |
+| 11 12 | 13 14 |             |             |
+| 21 22 | 23 24 |             |             |
++-------+-------+             |             |
+| 15 16 | 17 18 |             |             |
+| 25 26 | 27 28 |             |             |
++-------+-------+             ↓             |
++-------+-------+                           |
+| 31 32 | 33 34 |                           |
+| 41 42 | 43 44 |                           |
++-------+-------+                           |
+| 35 36 | 37 38 |                           |
+| 45 46 | 47 48 |                           |
++-------+-------+                           ↓
+
+Transform into
+sharding = [[0], [2]]
+
+mesh axis 2
+----------->
++-------------+-------------+ mesh axis 1 | mesh axis 0 |
+| 11 12 13 14 | 15 16 17 18 |             |             |
+| 21 22 23 24 | 25 26 27 28 |             |             |
++-------------+-------------+             |             |
+| 11 12 13 14 | 15 16 17 18 |             |             |
+| 21 22 23 24 | 25 26 27 28 |             |             |
++-------------+-------------+             ↓             |
++-------------+-------------+                           |
+| 31 32 33 34 | 35 36 37 38 |                           |
+| 41 42 43 44 | 45 46 47 48 |                           |
++-------------+-------------+                           |
+| 31 32 33 34 | 35 36 37 38 |                           |
+| 41 42 43 44 | 45 46 47 48 |                           |
++-------------+-------------+                           ↓
+
+Can't be done with just an all-gather along mesh axis 1.
+Can be handled by multiple resharding transformations
+[[0], [1, 2]] -> [[0], [2, 1]] -> [[0], [2]]
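+
+The first step is the mesh-axes swap described earlier (device-to-device copies
+only). A sketch of the second step ([[0], [2, 1]] -> [[0], [2]]) in the informal
+op notation used elsewhere in this document, assuming i8 elements (each
+per-device shard goes from 2x2 to 2x4):
+
+%2 = all_gather %1 on @mesh mesh_axes = [1] gather_axis = 1 : tensor<2x2xi8> -> tensor<2x4xi8>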
+
+--------------------------------------------------------------
+
+Reshard 6x6 tensor from sharding [[0], [1]] to sharding [[1], [0]] on a 2x3 mesh.
+
+unsharded 6x6 tensor
+11 12 13 14 15 16
+21 22 23 24 25 26
+31 32 33 34 35 36
+41 42 43 44 45 46
+51 52 53 54 55 56
+61 62 63 64 65 66
+
+sharded on a 2x3 mesh
+
+sharding = [[0], [1]]
+
+mesh axis 1
+----------->
++-------+-------+-------+ mesh axis 0 |
+| 11 12 | 13 14 | 15 16 |             |
+| 21 22 | 23 24 | 25 26 |             |
+| 31 32 | 33 34 | 35 36 |             |
++-------+-------+-------+             |
+| 41 42 | 43 44 | 45 46 |             |
+| 51 52 | 53 54 | 55 56 |             |
+| 61 62 | 63 64 | 65 66 |             |
++-------+-------+-------+             ↓
+
+transform to
+sharding = [[1], [0]]
+
+mesh axis 1
+----------->
++----------+----------+----------+ mesh axis 0 |
+| 11 12 13 | 31 32 33 | 51 52 53 |             |
+| 21 22 23 | 41 42 43 | 61 62 63 |             |
++----------+----------+----------+             |
+| 14 15 16 | 34 35 36 | 54 55 56 |             |
+| 24 25 26 | 44 45 46 | 64 65 66 |             |
++----------+----------+----------+             ↓
+
+mesh axis 0
+----------->
++----------+----------+ mesh axis 1 |
+| 11 12 13 | 14 15 16 |             |
+| 21 22 23 | 24 25 26 |             |
++----------+----------+             |
+| 31 32 33 | 34 35 36 |             |
+| 41 42 43 | 44 45 46 |             |
++----------+----------+             |
+| 51 52 53 | 54 55 56 |             |
+| 61 62 63 | 64 65 66 |             |
++----------+----------+             ↓
+
+TODO
+
+--------------------------------------------------------------
+
+Reshard 6x6 tensor from sharding [[0], [1]] to sharding [[1], [0]] on a 2x6 mesh.
+
+unsharded 6x6 tensor
+11 12 13 14 15 16
+21 22 23 24 25 26
+31 32 33 34 35 36
+41 42 43 44 45 46
+51 52 53 54 55 56
+61 62 63 64 65 66
+
+shard on 2x6 mesh
+
+sharding = [[0], [1]]
+
+mesh axis 1
+----------->
++----+----+----+----+----+----+ mesh axis 0 |
+| 11 | 12 | 13 ‖ 14 | 15 | 16 |             |
+| 21 | 22 | 23 ‖ 24 | 25 | 26 |             |
+| 31 | 32 | 33 ‖ 34 | 35 | 36 |             |
++----+----+----+----+----+----+             |
+| 41 | 42 | 43 ‖ 44 | 45 | 46 |             |
+| 51 | 52 | 53 ‖ 54 | 55 | 56 |             |
+| 61 | 62 | 63 ‖ 64 | 65 | 66 |             |
++----+----+----+----+----+----+             ↓
+
+
+transform to
+sharding = [[1], [0]]
+
+mesh axis 0
+----------->
++----------+----------+ mesh axis 1 |
+| 11 12 13 | 14 15 16 |             |
++----------+----------+             |
+| 21 22 23 | 24 25 26 |             |
++----------+----------+             |
+| 31 32 33 | 34 35 36 |             |
++==========+==========+             |
+| 41 42 43 | 44 45 46 |             |
++----------+----------+             |
+| 51 52 53 | 54 55 56 |             |
++----------+----------+             |
+| 61 62 63 | 64 65 66 |             |
++----------+----------+             ↓
+
+TODO
+
+--------------------------------------------------------------
+
+Reshard KxL tensor from [[0], [1]] to [[1], [0]] on MxN mesh.
+
+M x N mesh.
+K x L tensor t.
+d(m, n) the tensor on device (m, n).
+
+sharding = [[0], [1]]
+Tensor shard s on each device has size (K ceildiv M, L ceildiv N).
+d(m, n)[k, l] -> t[m * (K ceildiv M) + k, n * (L ceildiv N) + l]
+
+substitute
+i <- m * (K ceildiv M) + k
+j <- n * (L ceildiv N) + l
+
+m -> i floordiv (K ceildiv M)
+n -> j floordiv (L ceildiv N)
+k -> i % (K ceildiv M)
+l -> j % (L ceildiv N)
+
+For the inverse map we get
+t[i, j] -> d(
+    i floordiv (K ceildiv M), j floordiv (L ceildiv N)
+)[
+    i % (K ceildiv M), j % (L ceildiv N)
+]
+
+Check:
+i = 13, j = 17, M = 3, N = 4, K = 16, L = 23
+t[13, 17] = d(
+    13 floordiv (16 ceildiv 3),
+    17 floordiv (23 ceildiv 4)
+)[
+    13 % (16 ceildiv 3),
+    17 % (23 ceildiv 4)
+]
+= d(
+    13 floordiv 6,
+    17 floordiv 6
+)[
+    13 % 6,
+    17 % 6
+]
+= d(2, 2)[1, 5]
+= t[
+    2 * (16 ceildiv 3) + 1,
+    2 * (23 ceildiv 4) + 5
+]
+= t[
+    2 * 6 + 1,
+    2 * 6 + 5
+]
+= t[13, 17]
+
+
+sharding = [[1], [0]]
+Tensor shard s on each device has size (K ceildiv N, L ceildiv M).
+d(m, n)[k, l] -> t[n * (K ceildiv N) + k, m * (L ceildiv M) + l]
+
+substitute
+i <- n * (K ceildiv N) + k
+j <- m * (L ceildiv M) + l
+
+m -> j floordiv (L ceildiv M)
+n -> i floordiv (K ceildiv N)
+k -> i % (K ceildiv N)
+l -> j % (L ceildiv M)
+
+For the inverse map we get
+t[i, j] -> d(
+    j floordiv (L ceildiv M), i floordiv (K ceildiv N)
+)[
+    i % (K ceildiv N), j % (L ceildiv M)
+]
+
+Check:
+i = 9, j = 19, M = 5, N = 2, K = 27, L = 14
+t[9, 19] = d(
+    19 floordiv (14 ceildiv 5),
+    9 floordiv (27 ceildiv 2)
+)[
+    9 % (27 ceildiv 2),
+    19 % (14 ceildiv 5)
+]
+= d(
+    19 floordiv 3,
+    9 floordiv 14
+)[
+    9 % 14,
+    19 % 3
+]
+= d(6, 0)[9, 1]
+= t[
+    0 * (27 ceildiv 2) + 9,
+    6 * (14 ceildiv 5) + 1
+]
+= t[
+    0 * 14 + 9,
+    6 * 3 + 1
+]
+= t[9, 19]
+
+sharding = [[0], [1]]
+d(m, n)[k, l] -> t[m * (K ceildiv M) + k, n * (L ceildiv N) + l]
+t[i, j] -> d(i floordiv (K ceildiv M), j floordiv (L ceildiv N))[i % (K ceildiv M), j % (L ceildiv N)]
+
+sharding = [[1], [0]]
+d(m, n)[k, l] -> t[n * (K ceildiv N) + k, m * (L ceildiv M) + l]
+t[i, j] -> d(j floordiv (L ceildiv M), i floordiv (K ceildiv N))[i % (K ceildiv N), j % (L ceildiv M)]
+
+sharding [[0], [1]] -> [[1], [0]]
+d1(m, n) the tensor on device (m, n) for sharding sharding [[0], [1]].
+d2(m, n) the tensor on device (m, n) for sharding sharding [[1], [0]].
+d1(m, n)[k, l] ->
+t[m * (K ceildiv M) + k, n * (L ceildiv N) + l] ->
+d2(
+    (n * (L ceildiv N) + l) floordiv (L ceildiv M),
+    (m * (K ceildiv M) + k) floordiv (K ceildiv N)
+)[
+    (m * (K ceildiv M) + k) % (K ceildiv N),
+    (n * (L ceildiv N) + l) % (L ceildiv M)
+]
+= d2(p, q)[u, v]
+
+We want to copy the data between devices in slices/tiles.
+What are the source/target tile coordinates?
+For a fixed (m, n, p, q), what is the range of (k, l, u, v)?
+TODO
+
+--------------------------------------------------------------
+
+Reshard KxL tensor from sharding [[0], [1]] to sharding [[1], [0]] on a 2x3 mesh.
+
+Device placement on a 2x3 mesh
+11 12 13  <- devices
+21 22 23
+
+sharding [[0], [1]]
+tensor axis 1
+----------->
++----+----+----+ tensor axis 0 |
+| 11 | 12 | 13 |               |
++----+----+----+               |
+| 21 | 22 | 23 |               |
++----+----+----+               ↓
+
+transform to
+sharding [[1], [0]]
+tensor axis 1
+----------->
++----+----+ tensor axis 0 |
+| 11 | 21 |               |
++----+----+               |
+| 12 | 22 |               |
++----+----+               |
+| 13 | 23 |               |
++----+----+               ↓
+
++-----------------+--------+--------+-----------------+
+|                 |                 |                 |
++                 +                 +                 +
+|       11        |        12       |        13       |
++                 +                 +                 +
+|                 |                 |                 |
++-----------------+--------+--------+-----------------+
+|                 |                 |                 |
++                 +                 +                 +
+|       21        |        22       |        23       |
++                 +                 +                 +
+|                 |                 |                 |
++-----------------+--------+--------+-----------------+
+
++-----------------+--------+--------+-----------------+
+|                          |                          |
++          11              +               21         +
+|                          |                          |
++-----------------+--------+--------+-----------------+
+|                          |                          |
++          12              +               22         +
+|                          |                          |
++-----------------+--------+--------+-----------------+
+|                          |                          |
++          13              +               23         +
+|                          |                          |
++-----------------+--------+--------+-----------------+
+
++-----------------+--------+--------+-----------------+
+|                 |        |        |                 |
++     11  11      + 12  11 + 12  21 +     13  21      +
+|                 |        |        |                 |
++-----------------+--------+--------+-----------------+
+|     11  12      | 12  12 | 12  22 |     13  22      |
++-----------------+--------+--------+-----------------+
+|     21  12      | 22  12 | 22  22 |     23  22      |
++-----------------+--------+--------+-----------------+
+|                 |        |        |                 |
++     21  13      + 22  13 + 22  23 +     23  23      +
+|                 |        |        |                 |
++-----------------+--------+--------+-----------------+
+
+If S and T are the source and target shard sizes along some tensor axis,
+then the cut pattern repeats with a period of (S*T)/gcd(S, T) = lcm(S, T).
+TODO
+
+--------------------------------------------------------------
+
+Reshard 6x6 tensor from sharding [[0], []] to sharding [[], [0]] on a 3 mesh.
+
+unsharded 6x6 tensor
+11 12 13 14 15 16
+21 22 23 24 25 26
+31 32 33 34 35 36
+41 42 43 44 45 46
+51 52 53 54 55 56
+61 62 63 64 65 66
+
+sharded on a 3 mesh
+
+sharding = [[0], []]
+
++-------------------+ mesh axis 0 |
+| 11 12 13 14 15 16 |             |
+| 21 22 23 24 25 26 |             |
++-------------------+             |
+| 31 32 33 34 35 36 |             |
+| 41 42 43 44 45 46 |             |
++-------------------+             |
+| 51 52 53 54 55 56 |             |
+| 61 62 63 64 65 66 |             |
++-------------------+             ↓
+
+transform to
+sharding = [[], [0]]
+
+mesh axis 0
+----------->
++-------+-------+-------+
+| 11 12 | 13 14 | 15 16 |
+| 21 22 | 23 24 | 25 26 |
+| 31 32 | 33 34 | 35 36 |
+| 41 42 | 43 44 | 45 46 |
+| 51 52 | 53 54 | 55 56 |
+| 61 62 | 63 64 | 65 66 |
++-------+-------+-------+
+
+%1 = all_to_all %0 on @mesh mesh_axes = [0] split_axis = 1 concat_axis = 0 : tensor<2x6xi8> -> tensor<6x2xi8>
+
+--------------------------------------------------------------
+
+Reshard 4x4 tensor from sharding [[0], [1, 2]] to sharding [[0, 1], [2]] on a 2x2x2 mesh.
+
+unsharded 4x4 tensor
+11 12 13 14
+21 22 23 24
+31 32 33 34
+41 42 43 44
+
+sharded on a 2x2x2 mesh
+
+sharding = [[0], [1, 2]]
+
+mesh axis 2
+----------->
++----+----+ mesh axis 1 | mesh axis 0 |
+| 11 | 12 |             |             |
+| 21 | 22 |             |             |
++----+----+             |             |
+| 13 | 14 |             |             |
+| 23 | 24 |             |             |
++----+----+             ↓             |
++----+----+                           |
+| 31 | 32 |                           |
+| 41 | 42 |                           |
++----+----+                           |
+| 33 | 34 |                           |
+| 43 | 44 |                           |
++----+----+                           ↓
+
+transform to
+sharding = [[0, 1], [2]]
+
+mesh axis 2
+----------->
++-------+-------+ mesh axis 1 | mesh axis 0 |
+| 11 12 | 13 14 |             |             |
++-------+-------+             |             |
+| 21 22 | 23 24 |             |             |
++-------+-------+             ↓             |
++-------+-------+                           |
+| 31 32 | 33 34 |                           |
++-------+-------+                           |
+| 41 42 | 43 44 |                           |
++-------+-------+                           ↓
+
+%1 = all_to_all %0 on @mesh mesh_axes = [2] split_axis = 1 concat_axis = 0 : tensor<2x1xi8> -> tensor<1x2xi8>
+is not enough.
+
+Can be decomposed into
+[[0], [1, 2]] -> [[0], [2, 1]] -> [[0, 1], [2]]
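+
+The first step is a mesh-axes swap (device-to-device copies only). A sketch of
+the final step ([[0], [2, 1]] -> [[0, 1], [2]]) in the same informal notation,
+assuming i8 elements:
+
+%2 = all_to_all %1 on @mesh mesh_axes = [1] split_axis = 0 concat_axis = 1 : tensor<2x1xi8> -> tensor<1x2xi8>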
+
+--------------------------------------------------------------
+
+We can decompose each resharding into a sequence of basis reshardings.
+This decomposition is not communication efficient in terms of minimizing the
+data communicated between devices.
+An efficient approach would be more complicated to implement.
+Each device has to receive at most as much data as the size of its target
+shard.
+
+Basis:
+
+From replicate to split.
+[[]] -> [[1]]
+Extract slices without communication.
+
+From split to replicate.
+[[0]] -> [[]]
+[[0, 1]] -> [[1]]
+All-gather along mesh axis 0.
+
+Swap mesh axes order when assigned to the same tensor axis.
+[[0, 1]] -> [[1, 0]]
+Swap contents on devices with the same linear index.
+
+Move mesh axis to different tensor dimension.
+[[0], []] -> [[], [0]]
+All-to-all.
+
+Example decomposition of
+[[0], [1]] -> [[1], [0]]
+
+[[0], [1]] -> all-gather along mesh axis 1    ->
+[[0], []]  -> all-to-all along mesh axis 0    ->
+[[], [0]]  -> extract slice along mesh axis 1 ->
+[[1], [0]]
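+
+A sketch of this decomposition in the informal op notation used above, assuming
+a 6x6xi8 tensor on the 2x3 mesh from the earlier examples:
+
+%1 = all_gather %0 on @mesh mesh_axes = [1] gather_axis = 1 : tensor<3x2xi8> -> tensor<3x6xi8>
+%2 = all_to_all %1 on @mesh mesh_axes = [0] split_axis = 1 concat_axis = 0 : tensor<3x6xi8> -> tensor<6x3xi8>
+followed by extracting the local 2x3 slice along tensor axis 0 based on this
+device's index along mesh axis 1 (no communication).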
+
+Example decomposition of
+[[3, 2], [], [0, 1]] -> [[0], [1, 2], []]
+
+[[3, 2], [], [0, 1]] -> all-to-all along mesh axis 1 ->
+[[3, 2], [1], [0]]   -> all-to-all along mesh axis 2 ->
+[[3], [1, 2], [0]]   -> all-gather along mesh axis 3 ->
+[[], [1, 2], [0]]    -> all-to-all along mesh axis 0 ->
+[[0], [1, 2], []]
diff --git a/mlir/include/mlir/Dialect/Mesh/Transforms/Spmdization.h b/mlir/include/mlir/Dialect/Mesh/Transforms/Spmdization.h
new file mode 100644
index 00000000000000..f71bb9b262a380
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Mesh/Transforms/Spmdization.h
@@ -0,0 +1,35 @@
+//===- Spmdization.h - Mesh Spmdization --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_MESH_TRANSFORMS_SPMDIZATION_H
+#define MLIR_DIALECT_MESH_TRANSFORMS_SPMDIZATION_H
+
+#include "mlir/Dialect/Mesh/IR/MeshOps.h"
+#include "mlir/IR/DialectRegistry.h"
+#include "mlir/Support/LogicalResult.h"
+
+namespace mlir {
+namespace mesh {
+
+// Return the sharded shape of `shape` according to sharding `sharding`.
+ShapedType shardShapedType(ShapedType shape, ClusterOp mesh,
+                           MeshShardingAttr sharding);
+
+// Insert resharding spmdization of the value `sourceShardValue`
+// from sharding `source` to sharding `target`.
+// `sourceShardValue` is the value already sharded according to `source`.
+TypedValue<ShapedType> reshard(OpBuilder &builder, ClusterOp mesh,
+                               ShardOp source, ShardOp target,
+                               TypedValue<ShapedType> sourceShardValue);
+
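+// Register in `registry` all dialects from which the resharding
+// transformation may create operations.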
+void reshardingRegisterDependentDialects(DialectRegistry &registry);
+
+} // namespace mesh
+} // namespace mlir
+
+#endif // MLIR_DIALECT_MESH_TRANSFORMS_SPMDIZATION_H
diff --git a/mlir/include/mlir/Support/MathExtras.h b/mlir/include/mlir/Support/MathExtras.h
index 17a747393d26eb..22493c11a370d1 100644
--- a/mlir/include/mlir/Support/MathExtras.h
+++ b/mlir/include/mlir/Support/MathExtras.h
@@ -15,9 +15,20 @@
 
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/APInt.h"
+#include <type_traits>
 
 namespace mlir {
 
+// ceilDiv for unsigned integral types.
+template <typename T, std::enable_if_t<std::is_integral_v<T> &&
+                                           std::is_unsigned_v<T>,
+                                       bool> = true>
+T ceilDiv(T lhs, T rhs) {
+  assert(rhs != static_cast<T>(0));
+  T q = lhs / rhs;
+  T r = lhs % rhs;
+  return r == static_cast<T>(0) ? q : q + static_cast<T>(1);
+}
+
 /// Returns the result of MLIR's ceildiv operation on constants. The RHS is
 /// expected to be non-zero.
 inline int64_t ceilDiv(int64_t lhs, int64_t rhs) {
diff --git a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
index d27675564c6464..de4f58d54e8ca5 100644
--- a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
+++ b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
@@ -9,8 +9,10 @@
 #include "mlir/Dialect/Mesh/IR/MeshOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinTypeInterfaces.h"
+#include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Diagnostics.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/Location.h"
@@ -59,8 +61,6 @@ static SmallVector<T> &canonicalizeSetAsVector(SmallVector<T> &vec) {
   return vec;
 }
 
-using MeshAxis = int16_t;
-
 namespace {
 
 struct DimensionSize {
@@ -114,6 +114,56 @@ Operation *MeshDialect::materializeConstant(OpBuilder &builder, Attribute value,
 // Mesh utilities
 //===----------------------------------------------------------------------===//
 
+static FailureOr<ClusterOp> getMesh(Operation *op, FlatSymbolRefAttr meshSymbol,
+                                    SymbolTableCollection &symbolTable) {
+  mesh::ClusterOp mesh =
+      symbolTable.lookupNearestSymbolFrom<mesh::ClusterOp>(op, meshSymbol);
+  if (!mesh) {
+    return op->emitError() << "Undefined required mesh symbol \""
+                           << meshSymbol.getValue() << "\".";
+  }
+
+  return mesh;
+}
+
+template <typename It>
+bool isUnique(It begin, It end) {
+  if (begin == end) {
+    return true;
+  }
+  It next = std::next(begin);
+  if (next == end) {
+    return true;
+  }
+  for (; next != end; ++next, ++begin) {
+    if (*begin == *next) {
+      return false;
+    }
+  }
+  return true;
+}
+
+static LogicalResult verifyMeshAxes(Location loc, ArrayRef<MeshAxis> axes,
+                                    ClusterOp mesh) {
+  SmallVector<MeshAxis> sorted = llvm::to_vector(axes);
+  llvm::sort(sorted);
+  if (!isUnique(sorted.begin(), sorted.end())) {
+    return emitError(loc) << "Mesh axes contains duplicate elements.";
+  }
+
+  MeshAxis rank = mesh.getRank();
+  for (auto axis : axes) {
+    if (axis >= rank || axis < 0) {
+      return emitError(loc)
+             << "0-based mesh axis index " << axis
+             << " is out of bounds. The referenced mesh \"" << mesh.getSymName()
+             << "\" is of rank " << rank << ".";
+    }
+  }
+
+  return success();
+}
+
 bool mesh::isReductionLoop(IteratorType iType) {
   return iType != IteratorType::Parallel && iType != IteratorType::Invalid;
 }
@@ -173,7 +223,45 @@ SmallVector<int64_t> ClusterOp::canonicalDimSizes() {
 }
 
 //===----------------------------------------------------------------------===//
-// mesh.shard op
+// mesh.cluster_shape op
+//===----------------------------------------------------------------------===//
+
+LogicalResult
+ClusterShapeOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
+  auto mesh = ::getMesh(getOperation(), getMeshAttr(), symbolTable);
+  if (failed(mesh)) {
+    return failure();
+  }
+  if (failed(verifyMeshAxes(getLoc(), getAxes(), mesh.value()))) {
+    return failure();
+  }
+
+  size_t expectedResultsCount =
+      getAxes().empty() ? mesh->getRank() : getAxes().size();
+  if (getResult().size() != expectedResultsCount) {
+    return emitError() << "Unexpected number of results " << getResult().size()
+                       << ". Expected " << expectedResultsCount << ".";
+  }
+
+  return success();
+}
+
+void ClusterShapeOp::build(OpBuilder &odsBuilder, OperationState &odsState,
+                           ClusterOp mesh) {
+  build(odsBuilder, odsState,
+        SmallVector<Type>(mesh.getRank(), odsBuilder.getIndexType()),
+        mesh.getSymName(), MeshAxesAttr());
+}
+
+void ClusterShapeOp::build(OpBuilder &odsBuilder, OperationState &odsState,
+                           StringRef mesh, ArrayRef<MeshAxis> axes) {
+  build(odsBuilder, odsState,
+        SmallVector<Type>(axes.size(), odsBuilder.getIndexType()), mesh,
+        MeshAxesAttr::get(odsBuilder.getContext(), axes));
+}
+
+//===----------------------------------------------------------------------===//
+// mesh.shard attr
 //===----------------------------------------------------------------------===//
 
 LogicalResult
@@ -205,6 +293,75 @@ MeshShardingAttr::verify(function_ref<InFlightDiagnostic()> emitError,
   return success();
 }
 
+bool MeshShardingAttr::operator==(Attribute rhs) const {
+  MeshShardingAttr rhsAsMeshShardingAttr = rhs.dyn_cast<MeshShardingAttr>();
+  return rhsAsMeshShardingAttr && *this == rhsAsMeshShardingAttr;
+}
+
+bool MeshShardingAttr::operator==(MeshShardingAttr rhs) const {
+  if (getCluster() != rhs.getCluster() ||
+      getPartialAxes() != rhs.getPartialAxes()) {
+    return false;
+  }
+
+  if (!getPartialAxes().empty() && getPartialType() != rhs.getPartialType()) {
+    return false;
+  }
+
+  auto minSize = std::min(getSplitAxes().size(), rhs.getSplitAxes().size());
+  if (!llvm::equal(llvm::make_range(getSplitAxes().begin(),
+                                    getSplitAxes().begin() + minSize),
+                   llvm::make_range(rhs.getSplitAxes().begin(),
+                                    rhs.getSplitAxes().begin() + minSize))) {
+    return false;
+  }
+
+  return llvm::all_of(llvm::make_range(getSplitAxes().begin() + minSize,
+                                       getSplitAxes().end()),
+                      std::mem_fn(&DenseI32ArrayAttr::empty)) &&
+         llvm::all_of(llvm::make_range(rhs.getSplitAxes().begin() + minSize,
+                                       rhs.getSplitAxes().end()),
+                      std::mem_fn(&DenseI32ArrayAttr::empty));
+}
+
+//===----------------------------------------------------------------------===//
+// mesh.process_index op
+//===----------------------------------------------------------------------===//
+
+LogicalResult
+ProcessIndexOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
+  auto mesh = ::getMesh(getOperation(), getMeshAttr(), symbolTable);
+  if (failed(mesh)) {
+    return failure();
+  }
+  if (failed(verifyMeshAxes(getLoc(), getAxes(), mesh.value()))) {
+    return failure();
+  }
+
+  size_t expectedResultsCount =
+      getAxes().empty() ? mesh->getRank() : getAxes().size();
+  if (getResult().size() != expectedResultsCount) {
+    return emitError() << "Unexpected number of results " << getResult().size()
+                       << ". Expected " << expectedResultsCount << ".";
+  }
+
+  return success();
+}
+
+void ProcessIndexOp::build(OpBuilder &odsBuilder, OperationState &odsState,
+                           ClusterOp mesh) {
+  build(odsBuilder, odsState,
+        SmallVector<Type>(mesh.getRank(), odsBuilder.getIndexType()),
+        mesh.getSymName(), MeshAxesAttr());
+}
+
+void ProcessIndexOp::build(OpBuilder &odsBuilder, OperationState &odsState,
+                           StringRef mesh, ArrayRef<MeshAxis> axes) {
+  build(odsBuilder, odsState,
+        SmallVector<Type>(axes.size(), odsBuilder.getIndexType()), mesh,
+        MeshAxesAttr::get(odsBuilder.getContext(), axes));
+}
+
 //===----------------------------------------------------------------------===//
 // collective communication ops
 //===----------------------------------------------------------------------===//
@@ -258,56 +415,6 @@ static LogicalResult verifyInGroupDevice(Location loc, StringRef deviceName,
   return success();
 }
 
-static FailureOr<ClusterOp> getMesh(Operation *op, FlatSymbolRefAttr meshSymbol,
-                                    SymbolTableCollection &symbolTable) {
-  mesh::ClusterOp mesh =
-      symbolTable.lookupNearestSymbolFrom<mesh::ClusterOp>(op, meshSymbol);
-  if (!mesh) {
-    return op->emitError() << "Undefined required mesh symbol \""
-                           << meshSymbol.getValue() << "\".";
-  }
-
-  return mesh;
-}
-
-template <typename It>
-bool isUnique(It begin, It end) {
-  if (begin == end) {
-    return true;
-  }
-  It next = std::next(begin);
-  if (next == end) {
-    return true;
-  }
-  for (; next != end; ++next, ++begin) {
-    if (*begin == *next) {
-      return false;
-    }
-  }
-  return true;
-}
-
-static LogicalResult verifyMeshAxes(Location loc, ArrayRef<MeshAxis> axes,
-                                    ClusterOp mesh) {
-  SmallVector<MeshAxis> sorted = llvm::to_vector(axes);
-  llvm::sort(sorted);
-  if (!isUnique(sorted.begin(), sorted.end())) {
-    return emitError(loc) << "Mesh axes contains duplicate elements.";
-  }
-
-  MeshAxis rank = mesh.getRank();
-  for (auto axis : axes) {
-    if (axis >= rank || axis < 0) {
-      return emitError(loc)
-             << "0-based mesh axis index " << axis
-             << " is out of bounds. The referenced mesh \"" << mesh.getSymName()
-             << "\" is of rank " << rank << ".";
-    }
-  }
-
-  return success();
-}
-
 template <typename Op>
 static FailureOr<ClusterOp>
 getMeshAndVerifyAxes(Op op, SymbolTableCollection &symbolTable) {
diff --git a/mlir/lib/Dialect/Mesh/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Mesh/Transforms/CMakeLists.txt
index 044b8672c8c60c..7a70c047ec9dce 100644
--- a/mlir/lib/Dialect/Mesh/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Mesh/Transforms/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_mlir_dialect_library(MLIRMeshTransforms
   Simplifications.cpp
   ShardingPropagation.cpp
+  Spmdization.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Mesh
@@ -11,11 +12,13 @@ add_mlir_dialect_library(MLIRMeshTransforms
 
   LINK_LIBS PUBLIC
   MLIRArithDialect
+  MLIRControlFlowDialect
   MLIRFuncDialect
   MLIRIR
   MLIRMeshDialect
   MLIRPass
   MLIRShardingInterface
   MLIRSupport
+  MLIRTensorDialect
   MLIRTosaShardingInterfaceImpl
 )
diff --git a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp
new file mode 100644
index 00000000000000..de8b3a98df998a
--- /dev/null
+++ b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp
@@ -0,0 +1,639 @@
+//===- Spmdization.cpp -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Mesh/Transforms/Spmdization.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
+#include "mlir/Dialect/Mesh/IR/MeshOps.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypeInterfaces.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/ImplicitLocOpBuilder.h"
+#include "mlir/IR/Location.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Support/MathExtras.h"
+#include "llvm/ADT/ADL.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include <algorithm>
+#include <iterator>
+#include <numeric>
+#include <optional>
+#include <tuple>
+#include <type_traits>
+
+namespace mlir {
+namespace mesh {
+
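+// Size of a tensor dimension after sharding it into `shardCount` shards.
+// Dynamic sizes stay dynamic. The dimension is expected to be exactly
+// divisible by the shard count.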
+int64_t shardDimension(int64_t dim, int64_t shardCount) {
+  if (ShapedType::isDynamic(dim) || ShapedType::isDynamic(shardCount))
+    return ShapedType::kDynamic;
+
+  assert(dim % shardCount == 0);
+  return ceilDiv(dim, shardCount);
+}
+
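+// Size of a tensor dimension after undoing a sharding into `shardCount`
+// shards. Dynamic sizes stay dynamic.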
+int64_t unshardDimension(int64_t dim, int64_t shardCount) {
+  if (ShapedType::isDynamic(dim) || ShapedType::isDynamic(shardCount))
+    return ShapedType::kDynamic;
+
+  return dim * shardCount;
+}
+
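+// Number of shards produced by splitting over `splitAxes`: the product of the
+// mesh dimension sizes along those axes, or dynamic if any of them is dynamic.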
+template <typename MeshShape, typename SplitAxes>
+int64_t shardCount(const MeshShape &meshShape, const SplitAxes &splitAxes) {
+  int64_t res = 1;
+  for (auto splitAxis : splitAxes) {
+    int64_t meshDimSize = meshShape[splitAxis];
+    if (ShapedType::isDynamic(meshDimSize)) {
+      return ShapedType::kDynamic;
+    }
+    res *= meshDimSize;
+  }
+  return res;
+}
+
+// Compute the shape for the tensor on each device in the mesh.
+// Example:
+// On a 2x4x? mesh with split axes = [[0], [1], [2]] the shape ?x5x1
+// would result in a shape for each shard of ?x2x?.
+template <typename InShape, typename MeshShape, typename SplitAxes,
+          typename OutShape>
+static void shardShape(const InShape &inShape, const MeshShape &meshShape,
+                       const SplitAxes &splitAxes, OutShape &outShape) {
+  std::copy(llvm::adl_begin(inShape), llvm::adl_end(inShape),
+            llvm::adl_begin(outShape));
+  for (auto [tensorAxis, innerSplitAxes] : llvm::enumerate(splitAxes)) {
+    outShape[tensorAxis] =
+        shardDimension(inShape[tensorAxis],
+                       shardCount(meshShape, innerSplitAxes.asArrayRef()));
+  }
+}
+
+ShapedType shardShapedType(ShapedType shape, ClusterOp mesh,
+                           MeshShardingAttr sharding) {
+  using Dim = std::decay_t<decltype(shape.getDimSize(0))>;
+  SmallVector<Dim> resShapeArr(shape.getShape().size());
+  shardShape(shape.getShape(), mesh.canonicalDimSizes(),
+             sharding.getSplitAxes(), resShapeArr);
+  return shape.clone(resShapeArr);
+}
+
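+// True if every target partial axis is also a source partial axis.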
+template <typename SourceAxes, typename TargetAxes>
+static bool arePartialAxesCompatible(const SourceAxes &sourceAxes,
+                                     const TargetAxes &targetAxes) {
+  return llvm::all_of(targetAxes, [&sourceAxes](auto &targetAxis) {
+    return sourceAxes.contains(targetAxis);
+  });
+}
+
+// Return the reduced value and its corresponding sharding.
+// Example:
+// sourceSharding = <@mesh_1d, [[0]], partial = sum[0]>
+// targetSharding = <@mesh_1d, [[]]>
+// Then an all-reduce over mesh axis 0 is applied to the source value
+// and the result is returned with the sharding <@mesh_1d, [[0]]>.
+static std::tuple<TypedValue<ShapedType>, MeshShardingAttr>
+handlePartialAxesDuringResharding(OpBuilder &builder,
+                                  MeshShardingAttr sourceSharding,
+                                  MeshShardingAttr targetSharding,
+                                  TypedValue<ShapedType> sourceShard) {
+  if (sourceSharding.getPartialAxes().empty() &&
+      targetSharding.getPartialAxes().empty()) {
+    return {sourceShard, sourceSharding};
+  }
+  assert(targetSharding.getPartialAxes().empty() ||
+         (!sourceSharding.getPartialAxes().empty() &&
+          sourceSharding.getPartialType() == targetSharding.getPartialType()));
+  using Axis = std::decay_t<decltype(sourceSharding.getPartialAxes().front())>;
+  using AxisSet = llvm::SmallDenseSet<Axis>;
+  AxisSet sourceShardingPartialAxesSet(sourceSharding.getPartialAxes().begin(),
+                                       sourceSharding.getPartialAxes().end());
+  AxisSet targetShardingPartialAxesSet(targetSharding.getPartialAxes().begin(),
+                                       targetSharding.getPartialAxes().end());
+  assert(arePartialAxesCompatible(sourceShardingPartialAxesSet,
+                                  targetShardingPartialAxesSet));
+  llvm::SmallVector<MeshAxis> allReduceMeshAxes;
+  llvm::copy_if(sourceShardingPartialAxesSet,
+                std::back_inserter(allReduceMeshAxes),
+                [&targetShardingPartialAxesSet](Axis a) {
+                  return !targetShardingPartialAxesSet.contains(a);
+                });
+  if (allReduceMeshAxes.empty()) {
+    return {sourceShard, sourceSharding};
+  }
+
+  builder.setInsertionPointAfterValue(sourceShard);
+  TypedValue<ShapedType> resultValue =
+      builder
+          .create<AllReduceOp>(sourceShard.getLoc(), sourceShard.getType(),
+                               sourceSharding.getCluster().getLeafReference(),
+                               allReduceMeshAxes, sourceShard,
+                               sourceSharding.getPartialType())
+          .getResult()
+          .cast<TypedValue<ShapedType>>();
+
+  llvm::SmallVector<int32_t> remainingPartialAxes;
+  llvm::copy_if(sourceShardingPartialAxesSet,
+                std::back_inserter(remainingPartialAxes),
+                [&targetShardingPartialAxesSet](Axis a) {
+                  return targetShardingPartialAxesSet.contains(a);
+                });
+  MeshShardingAttr resultSharding =
+      MeshShardingAttr::get(builder.getContext(), sourceSharding.getCluster(),
+                            sourceSharding.getSplitAxes(), remainingPartialAxes,
+                            sourceSharding.getPartialType());
+  return {resultValue, resultSharding};
+}
+
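+// Sharding derived from `sourceSharding` by appending `splitMeshAxis` to the
+// split axes of tensor dimension `splitTensorAxis`, padding preceding tensor
+// dimensions with empty split-axes entries as needed.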
+static MeshShardingAttr
+targetShardingInSplitLastAxis(MLIRContext *ctx, MeshShardingAttr sourceSharding,
+                              int64_t splitTensorAxis, MeshAxis splitMeshAxis) {
+  SmallVector<DenseI32ArrayAttr> targetShardingSplitAxes =
+      llvm::to_vector(sourceSharding.getSplitAxes());
+  while (static_cast<int64_t>(targetShardingSplitAxes.size()) <=
+         splitTensorAxis) {
+    targetShardingSplitAxes.push_back(DenseI32ArrayAttr::get(ctx, {}));
+  }
+  auto targetSplitAxes =
+      llvm::to_vector(targetShardingSplitAxes[splitTensorAxis].asArrayRef());
+  targetSplitAxes.push_back(splitMeshAxis);
+  targetShardingSplitAxes[splitTensorAxis] =
+      DenseI32ArrayAttr::get(ctx, targetSplitAxes);
+  return MeshShardingAttr::get(
+      ctx, sourceSharding.getCluster(), targetShardingSplitAxes,
+      sourceSharding.getPartialAxes(), sourceSharding.getPartialType());
+}
+
+static ShapedType targetShapeInSplitLastAxis(ShapedType sourceShape,
+                                             int64_t splitTensorAxis,
+                                             int64_t splitCount) {
+  SmallVector<int64_t> targetShape = llvm::to_vector(sourceShape.getShape());
+  targetShape[splitTensorAxis] =
+      shardDimension(targetShape[splitTensorAxis], splitCount);
+  return sourceShape.cloneWith(targetShape, sourceShape.getElementType());
+}
+
+// Split a tensor axis along an additional mesh axis,
+// e.g. [[0, 1]] -> [[0, 1, 2]].
+// Returns the spmdized target value with its sharding.
+//
+// The implementation extracts the tensor slice corresponding
+// to the current device.
+static std::tuple<TypedValue<ShapedType>, MeshShardingAttr>
+splitLastAxisInResharding(ImplicitLocOpBuilder &builder,
+                          MeshShardingAttr sourceSharding,
+                          TypedValue<ShapedType> sourceShard, ClusterOp mesh,
+                          int64_t splitTensorAxis, MeshAxis splitMeshAxis) {
+  MLIRContext *ctx = builder.getContext();
+  builder.setInsertionPointAfterValue(sourceShard);
+
+  Value zero = builder.create<arith::ConstantOp>(builder.getIndexAttr(0));
+
+  Value processIndexAlongAxis =
+      builder
+          .create<ProcessIndexOp>(mesh.getSymName(),
+                                  SmallVector<MeshAxis>({splitMeshAxis}))
+          .getResult()[0];
+
+  MeshShardingAttr targetSharding = targetShardingInSplitLastAxis(
+      ctx, sourceSharding, splitTensorAxis, splitMeshAxis);
+  ShapedType targetShape =
+      targetShapeInSplitLastAxis(sourceShard.getType(), splitTensorAxis,
+                                 mesh.canonicalDimSizes()[splitMeshAxis]);
+
+  Value meshAxisSize =
+      builder
+          .create<ClusterShapeOp>(mesh.getSymName(),
+                                  SmallVector<MeshAxis>({splitMeshAxis}))
+          .getResult()[0];
+
+  Value sourceAxisSize =
+      builder.create<tensor::DimOp>(sourceShard, splitTensorAxis);
+  Value sourceAxisSizeModMeshAxisSize =
+      builder.create<arith::RemUIOp>(sourceAxisSize, meshAxisSize);
+  Value isTargetShapeExactlyDivisible = builder.create<arith::CmpIOp>(
+      arith::CmpIPredicate::eq, sourceAxisSizeModMeshAxisSize, zero);
+  builder.create<cf::AssertOp>(
+      isTargetShapeExactlyDivisible,
+      "Sharding a tensor with axis size that is not exactly divisible by the "
+      "mesh axis size is not supported.");
+  Value targetAxisSize =
+      builder.create<arith::DivUIOp>(sourceAxisSize, meshAxisSize);
+  Value axisOffset =
+      builder.create<arith::MulIOp>(targetAxisSize, processIndexAlongAxis);
+  SmallVector<int64_t> staticOffsets(targetShape.getRank(), 0);
+  staticOffsets[splitTensorAxis] = ShapedType::kDynamic;
+  DenseI64ArrayAttr staticOffsetsAttr =
+      DenseI64ArrayAttr::get(ctx, staticOffsets);
+  SmallVector<Value> dynamicOffsets(1, axisOffset);
+
+  DenseI64ArrayAttr staticSizesAttr =
+      DenseI64ArrayAttr::get(ctx, targetShape.getShape());
+  SmallVector<Value> dynamicSizes;
+  for (int64_t i = 0; i < targetShape.getRank(); ++i) {
+    if (ShapedType::isDynamic(staticSizesAttr.asArrayRef()[i])) {
+      if (i == splitTensorAxis) {
+        dynamicSizes.push_back(targetAxisSize);
+      } else {
+        Value dimSize = builder.create<tensor::DimOp>(sourceShard, i);
+        dynamicSizes.push_back(dimSize);
+      }
+    }
+  }
+
+  DenseI64ArrayAttr staticStridesAttr = DenseI64ArrayAttr::get(
+      ctx, SmallVector<int64_t>(targetShape.getRank(), 1));
+  TypedValue<RankedTensorType> targetShard =
+      builder
+          .create<tensor::ExtractSliceOp>(
+              targetShape, sourceShard, dynamicOffsets, dynamicSizes,
+              SmallVector<Value>({}), staticOffsetsAttr, staticSizesAttr,
+              staticStridesAttr)
+          .getResult();
+  return {targetShard.cast<TypedValue<ShapedType>>(), targetSharding};
+}
+
+// Detect if the resharding is of type e.g.
+// [[0, 1]] -> [[0, 1, 2]].
+// If detected, returns the corresponding (tensor axis, mesh axis) pair.
+// Does not detect insertions like
+// [[0, 1]] -> [[0, 2, 1]].
+static std::optional<std::tuple<int64_t, MeshAxis>>
+detectSplitLastAxisInResharding(MeshShardingAttr sourceSharding,
+                                MeshShardingAttr targetSharding) {
+  for (size_t tensorAxis = 0; tensorAxis < targetSharding.getSplitAxes().size();
+       ++tensorAxis) {
+    if (sourceSharding.getSplitAxes().size() > tensorAxis) {
+      if (sourceSharding.getSplitAxes()[tensorAxis].size() + 1 !=
+          targetSharding.getSplitAxes()[tensorAxis].size()) {
+        continue;
+      }
+      if (!llvm::equal(
+              sourceSharding.getSplitAxes()[tensorAxis].asArrayRef(),
+              llvm::make_range(
+                  targetSharding.getSplitAxes()[tensorAxis]
+                      .asArrayRef()
+                      .begin(),
+                  targetSharding.getSplitAxes()[tensorAxis].asArrayRef().end() -
+                      1))) {
+        continue;
+      }
+    } else {
+      if (targetSharding.getSplitAxes()[tensorAxis].size() != 1) {
+        continue;
+      }
+    }
+    return std::make_tuple(
+        tensorAxis,
+        targetSharding.getSplitAxes()[tensorAxis].asArrayRef().back());
+  }
+  return std::nullopt;
+}
+
+static std::optional<std::tuple<TypedValue<ShapedType>, MeshShardingAttr>>
+trySplitLastAxisInResharding(ImplicitLocOpBuilder &builder, ClusterOp mesh,
+                             MeshShardingAttr sourceSharding,
+                             MeshShardingAttr targetSharding,
+                             TypedValue<ShapedType> sourceShard) {
+  if (auto detectRes =
+          detectSplitLastAxisInResharding(sourceSharding, targetSharding)) {
+    auto [tensorAxis, meshAxis] = detectRes.value();
+    return splitLastAxisInResharding(builder, sourceSharding, sourceShard, mesh,
+                                     tensorAxis, meshAxis);
+  }
+
+  return std::nullopt;
+}
+
+// Detect if the resharding is of type e.g.
+// [[0, 1, 2]] -> [[0, 1]].
+// If detected, returns the corresponding (tensor axis, mesh axis) pair.
+static std::optional<std::tuple<int64_t, MeshAxis>>
+detectUnsplitLastAxisInResharding(MeshShardingAttr sourceSharding,
+                                  MeshShardingAttr targetSharding) {
+  for (size_t tensorAxis = 0; tensorAxis < sourceSharding.getSplitAxes().size();
+       ++tensorAxis) {
+    if (targetSharding.getSplitAxes().size() > tensorAxis) {
+      if (sourceSharding.getSplitAxes()[tensorAxis].size() !=
+          targetSharding.getSplitAxes()[tensorAxis].size() + 1)
+        continue;
+      if (!llvm::equal(
+              llvm::make_range(
+                  sourceSharding.getSplitAxes()[tensorAxis]
+                      .asArrayRef()
+                      .begin(),
+                  sourceSharding.getSplitAxes()[tensorAxis].asArrayRef().end() -
+                      1),
+              targetSharding.getSplitAxes()[tensorAxis].asArrayRef()))
+        continue;
+    } else {
+      if (sourceSharding.getSplitAxes()[tensorAxis].size() != 1)
+        continue;
+    }
+    return std::make_tuple(
+        tensorAxis,
+        sourceSharding.getSplitAxes()[tensorAxis].asArrayRef().back());
+  }
+  return std::nullopt;
+}
+
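+// Sharding derived from `sourceSharding` by dropping the last mesh axis from
+// the split axes of tensor dimension `splitTensorAxis`.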
+static MeshShardingAttr
+targetShardingInUnsplitLastAxis(MLIRContext *ctx,
+                                MeshShardingAttr sourceSharding,
+                                int64_t splitTensorAxis) {
+  SmallVector<DenseI32ArrayAttr> targetShardingSplitAxes =
+      llvm::to_vector(sourceSharding.getSplitAxes());
+  assert(static_cast<int64_t>(targetShardingSplitAxes.size()) >
+         splitTensorAxis);
+  auto targetSplitAxes =
+      llvm::to_vector(targetShardingSplitAxes[splitTensorAxis].asArrayRef());
+
+  targetSplitAxes.pop_back();
+  targetShardingSplitAxes[splitTensorAxis] =
+      DenseI32ArrayAttr::get(ctx, targetSplitAxes);
+  return MeshShardingAttr::get(
+      ctx, sourceSharding.getCluster(), targetShardingSplitAxes,
+      sourceSharding.getPartialAxes(), sourceSharding.getPartialType());
+}
+
+static ShapedType allGatherResultShapeInUnsplitLastAxis(
+    ShapedType sourceShape, int64_t splitCount, int64_t splitTensorAxis) {
+  SmallVector<int64_t> targetShape = llvm::to_vector(sourceShape.getShape());
+  targetShape[splitTensorAxis] =
+      unshardDimension(targetShape[splitTensorAxis], splitCount);
+  return sourceShape.cloneWith(targetShape, sourceShape.getElementType());
+}
+
+static std::tuple<TypedValue<ShapedType>, MeshShardingAttr>
+unsplitLastAxisInResharding(ImplicitLocOpBuilder &builder,
+                            MeshShardingAttr sourceSharding,
+                            ShapedType sourceUnshardedShape,
+                            TypedValue<ShapedType> sourceShard, ClusterOp mesh,
+                            int64_t splitTensorAxis, MeshAxis splitMeshAxis) {
+  MLIRContext *ctx = builder.getContext();
+  builder.setInsertionPointAfterValue(sourceShard);
+
+  MeshShardingAttr targetSharding =
+      targetShardingInUnsplitLastAxis(ctx, sourceSharding, splitTensorAxis);
+  ShapedType allGatherResultShape = allGatherResultShapeInUnsplitLastAxis(
+      sourceShard.getType(), mesh.canonicalDimSizes()[splitMeshAxis],
+      splitTensorAxis);
+  Value allGatherResult = builder.create<AllGatherOp>(
+      RankedTensorType::get(allGatherResultShape.getShape(),
+                            allGatherResultShape.getElementType()),
+      mesh.getSymName(), SmallVector<MeshAxis>({splitMeshAxis}), sourceShard,
+      APInt(64, splitTensorAxis));
+  ShapedType targetShape =
+      shardShapedType(sourceUnshardedShape, mesh, targetSharding);
+  TypedValue<ShapedType> targetShard =
+      builder.create<tensor::CastOp>(targetShape, allGatherResult)
+          .getResult()
+          .cast<TypedValue<ShapedType>>();
+  return {targetShard, targetSharding};
+}
+
+static std::optional<std::tuple<TypedValue<ShapedType>, MeshShardingAttr>>
+tryUnsplitLastAxisInResharding(ImplicitLocOpBuilder &builder, ClusterOp mesh,
+                               MeshShardingAttr sourceSharding,
+                               MeshShardingAttr targetSharding,
+                               ShapedType sourceUnshardedShape,
+                               TypedValue<ShapedType> sourceShard) {
+  if (auto detectRes =
+          detectUnsplitLastAxisInResharding(sourceSharding, targetSharding)) {
+    auto [tensorAxis, meshAxis] = detectRes.value();
+    return unsplitLastAxisInResharding(builder, sourceSharding,
+                                       sourceUnshardedShape, sourceShard, mesh,
+                                       tensorAxis, meshAxis);
+  }
+
+  return std::nullopt;
+}
+
+// Detect if the resharding is of type e.g.
+// [[0, 1], [2]] -> [[0], [1, 2]].
+// Only moving the last axis counts.
+// If detected, returns the corresponding (source_tensor_axis,
+// target_tensor_axis, mesh_axis) tuple.
+static std::optional<std::tuple<int64_t, int64_t, MeshAxis>>
+detectMoveLastSplitAxisInResharding(MeshShardingAttr sourceSharding,
+                                    MeshShardingAttr targetSharding) {
+  for (size_t sourceTensorAxis = 0;
+       sourceTensorAxis < sourceSharding.getSplitAxes().size();
+       ++sourceTensorAxis) {
+    for (size_t targetTensorAxis = 0;
+         targetTensorAxis < targetSharding.getSplitAxes().size();
+         ++targetTensorAxis) {
+      if (sourceTensorAxis == targetTensorAxis)
+        continue;
+      if (sourceSharding.getSplitAxes()[sourceTensorAxis].empty() ||
+          targetSharding.getSplitAxes()[targetTensorAxis].empty() ||
+          sourceSharding.getSplitAxes()[sourceTensorAxis].asArrayRef().back() !=
+              targetSharding.getSplitAxes()[targetTensorAxis]
+                  .asArrayRef()
+                  .back())
+        continue;
+      if (!llvm::equal(
+              llvm::make_range(sourceSharding.getSplitAxes()[sourceTensorAxis]
+                                   .asArrayRef()
+                                   .begin(),
+                               sourceSharding.getSplitAxes()[sourceTensorAxis]
+                                       .asArrayRef()
+                                       .end() -
+                                   1),
+              llvm::make_range(targetSharding.getSplitAxes()[targetTensorAxis]
+                                   .asArrayRef()
+                                   .begin(),
+                               targetSharding.getSplitAxes()[targetTensorAxis]
+                                       .asArrayRef()
+                                       .end() -
+                                   1)))
+        continue;
+      return std::make_tuple(
+          sourceTensorAxis, targetTensorAxis,
+          sourceSharding.getSplitAxes()[sourceTensorAxis].asArrayRef().back());
+    }
+  }
+  return std::nullopt;
+}
+
+static MeshShardingAttr
+targetShardingInMoveLastAxis(MLIRContext *ctx, MeshShardingAttr sourceSharding,
+                             int64_t sourceTensorAxis,
+                             int64_t targetTensorAxis) {
+  SmallVector<DenseI32ArrayAttr> targetShardingSplitAxes =
+      llvm::to_vector(sourceSharding.getSplitAxes());
+  while (static_cast<int64_t>(targetShardingSplitAxes.size()) <=
+         targetTensorAxis) {
+    targetShardingSplitAxes.push_back(DenseI32ArrayAttr::get(ctx, {}));
+  }
+
+  auto sourceSplitAxes =
+      llvm::to_vector(targetShardingSplitAxes[sourceTensorAxis].asArrayRef());
+  assert(!sourceSplitAxes.empty());
+  auto meshAxis = sourceSplitAxes.back();
+  sourceSplitAxes.pop_back();
+  targetShardingSplitAxes[sourceTensorAxis] =
+      DenseI32ArrayAttr::get(ctx, sourceSplitAxes);
+
+  auto targetSplitAxes =
+      llvm::to_vector(targetShardingSplitAxes[targetTensorAxis].asArrayRef());
+  targetSplitAxes.push_back(meshAxis);
+  targetShardingSplitAxes[targetTensorAxis] =
+      DenseI32ArrayAttr::get(ctx, targetSplitAxes);
+
+  return MeshShardingAttr::get(
+      ctx, sourceSharding.getCluster(), targetShardingSplitAxes,
+      sourceSharding.getPartialAxes(), sourceSharding.getPartialType());
+}
+
+static ShapedType allToAllResultShapeInMoveLastAxis(ShapedType sourceShape,
+                                                    int64_t splitCount,
+                                                    int64_t sourceTensorAxis,
+                                                    int64_t targetTensorAxis) {
+  SmallVector<int64_t> targetShape = llvm::to_vector(sourceShape.getShape());
+  targetShape[sourceTensorAxis] =
+      unshardDimension(targetShape[sourceTensorAxis], splitCount);
+  targetShape[targetTensorAxis] =
+      shardDimension(targetShape[targetTensorAxis], splitCount);
+  return sourceShape.cloneWith(targetShape, sourceShape.getElementType());
+}
+
+static std::tuple<TypedValue<ShapedType>, MeshShardingAttr>
+moveLastSplitAxisInResharding(ImplicitLocOpBuilder &builder, ClusterOp mesh,
+                              MeshShardingAttr sourceSharding,
+                              ShapedType sourceUnshardedShape,
+                              TypedValue<ShapedType> sourceShard,
+                              int64_t sourceTensorAxis,
+                              int64_t targetTensorAxis, MeshAxis meshAxis) {
+  MLIRContext *ctx = builder.getContext();
+  builder.setInsertionPointAfterValue(sourceShard);
+
+  MeshShardingAttr targetSharding = targetShardingInMoveLastAxis(
+      ctx, sourceSharding, sourceTensorAxis, targetTensorAxis);
+  ShapedType allToAllResultShape = allToAllResultShapeInMoveLastAxis(
+      sourceShard.getType(), mesh.canonicalDimSizes()[meshAxis],
+      sourceTensorAxis, targetTensorAxis);
+  Value allToAllResult = builder.create<AllToAllOp>(
+      RankedTensorType::get(allToAllResultShape.getShape(),
+                            allToAllResultShape.getElementType()),
+      mesh.getSymName(), SmallVector<MeshAxis>({meshAxis}), sourceShard,
+      APInt(64, targetTensorAxis), APInt(64, sourceTensorAxis));
+  ShapedType targetShape =
+      shardShapedType(sourceUnshardedShape, mesh, targetSharding);
+  TypedValue<ShapedType> targetShard =
+      builder.create<tensor::CastOp>(targetShape, allToAllResult)
+          .getResult()
+          .cast<TypedValue<ShapedType>>();
+  return {targetShard, targetSharding};
+}
+
+static std::optional<std::tuple<TypedValue<ShapedType>, MeshShardingAttr>>
+tryMoveLastSplitAxisInResharding(ImplicitLocOpBuilder &builder, ClusterOp mesh,
+                                 MeshShardingAttr sourceSharding,
+                                 MeshShardingAttr targetSharding,
+                                 ShapedType sourceUnshardedShape,
+                                 TypedValue<ShapedType> sourceShard) {
+  if (auto detectRes =
+          detectMoveLastSplitAxisInResharding(sourceSharding, targetSharding)) {
+    auto [sourceTensorAxis, targetTensorAxis, meshAxis] = detectRes.value();
+    return moveLastSplitAxisInResharding(
+        builder, mesh, sourceSharding, sourceUnshardedShape, sourceShard,
+        sourceTensorAxis, targetTensorAxis, meshAxis);
+  }
+
+  return std::nullopt;
+}
+
+// Handles only resharding on a 1D mesh.
+// Currently, the sizes of the sharded tensor axes must be exactly divisible
+// by the size of the single mesh axis.
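+// After any partial axes are reduced, at most one of the patterns below is
+// applied: moving the last split axis between tensor dimensions (all_to_all),
+// splitting a tensor axis along a mesh axis (extract_slice), or unsplitting a
+// sharded tensor axis (all_gather).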
+static TypedValue<ShapedType>
+reshardOn1DMesh(ImplicitLocOpBuilder &builder, ClusterOp mesh,
+                MeshShardingAttr sourceSharding,
+                MeshShardingAttr targetSharding,
+                TypedValue<ShapedType> sourceUnshardedValue,
+                TypedValue<ShapedType> sourceShard) {
+  assert(sourceShard.getType() ==
+         shardShapedType(sourceUnshardedValue.getType(), mesh, sourceSharding));
+  ShapedType targetShardType =
+      shardShapedType(sourceUnshardedValue.getType(), mesh, targetSharding);
+  assert(sourceShard.getType().getRank() == targetShardType.getRank());
+  assert(mesh.getRank() == 1 && "Only 1D meshes are currently supported.");
+
+  auto [reducedSourceShard, reducedSourceSharding] =
+      handlePartialAxesDuringResharding(builder, sourceSharding, targetSharding,
+                                        sourceShard);
+
+  if (reducedSourceSharding == targetSharding) {
+    return reducedSourceShard;
+  }
+
+  TypedValue<ShapedType> targetShard;
+  MeshShardingAttr actualTargetSharding;
+  if (auto tryRes = tryMoveLastSplitAxisInResharding(
+          builder, mesh, reducedSourceSharding, targetSharding,
+          sourceUnshardedValue.getType(), reducedSourceShard)) {
+    std::tie(targetShard, actualTargetSharding) = tryRes.value();
+  } else if (auto tryRes = trySplitLastAxisInResharding(
+                 builder, mesh, reducedSourceSharding, targetSharding,
+                 reducedSourceShard)) {
+    std::tie(targetShard, actualTargetSharding) = tryRes.value();
+  } else if (auto tryRes = tryUnsplitLastAxisInResharding(
+                 builder, mesh, reducedSourceSharding, targetSharding,
+                 sourceUnshardedValue.getType(), reducedSourceShard)) {
+    std::tie(targetShard, actualTargetSharding) = tryRes.value();
+  } else {
+    assert(false && "Did not find any pattern to apply.");
+  }
+
+  assert(actualTargetSharding == targetSharding);
+  assert(targetShard.getType() == targetShardType);
+  return targetShard;
+}
+
+TypedValue<ShapedType> reshard(ImplicitLocOpBuilder &builder, ClusterOp mesh,
+                               MeshShardingAttr sourceSharding,
+                               MeshShardingAttr targetSharding,
+                               TypedValue<ShapedType> sourceUnshardedValue,
+                               TypedValue<ShapedType> sourceShard) {
+  // Resort to handling only 1D meshes, since the general case is complicated
+  // if it is to be communication-efficient, i.e. minimize the data transferred
+  // between devices.
+  return reshardOn1DMesh(builder, mesh, sourceSharding, targetSharding,
+                         sourceUnshardedValue, sourceShard);
+}
+
+TypedValue<ShapedType> reshard(OpBuilder &builder, ClusterOp mesh,
+                               ShardOp source, ShardOp target,
+                               TypedValue<ShapedType> sourceShardValue) {
+  assert(!source.getAnnotateForUsers());
+  assert(target.getAnnotateForUsers());
+  assert(source.getResult() == target.getOperand());
+  ImplicitLocOpBuilder implicitLocOpBuilder(target->getLoc(), builder);
+  return reshard(
+      implicitLocOpBuilder, mesh, source.getShard(), target.getShard(),
+      source.getSrc().cast<TypedValue<ShapedType>>(), sourceShardValue);
+}
+
+void reshardingRegisterDependentDialects(DialectRegistry &registry) {
+  registry.insert<arith::ArithDialect, mesh::MeshDialect, tensor::TensorDialect,
+                  cf::ControlFlowDialect>();
+}
+
+} // namespace mesh
+} // namespace mlir
diff --git a/mlir/test/Dialect/Mesh/invalid.mlir b/mlir/test/Dialect/Mesh/invalid.mlir
index 03994f8f011e1f..3ee578a37235e2 100644
--- a/mlir/test/Dialect/Mesh/invalid.mlir
+++ b/mlir/test/Dialect/Mesh/invalid.mlir
@@ -70,6 +70,102 @@ func.func @mesh_axis_negtive_in_partial(
 
 // -----
 
+mesh.cluster @mesh0(rank = 2, dim_sizes = 2x4)
+
+func.func @cluster_shape_mesh_axis_out_of_bounds() -> (index, index) {
+  // expected-error at +1 {{0-based mesh axis index 2 is out of bounds. The referenced mesh "mesh0" is of rank 2.}}
+  %0:2 = mesh.cluster_shape @mesh0 axes = [0, 2] : index, index
+  return %0#0, %0#1 : index, index
+}
+
+// -----
+
+mesh.cluster @mesh0(rank = 3, dim_sizes = 1x2x3)
+
+func.func @cluster_shape_duplicate_mesh_axis() -> (index, index, index) {
+  // expected-error at +1 {{Mesh axes contains duplicate elements.}}
+  %0:3 = mesh.cluster_shape @mesh0 axes = [0, 2, 0] : index, index, index
+  return %0#0, %0#1, %0#2 : index, index, index
+}
+
+// -----
+
+mesh.cluster @mesh0(rank = 2, dim_sizes = 2x4)
+
+func.func @cluster_shape_wrong_number_of_results() -> (index, index) {
+  // expected-error at +1 {{Unexpected number of results 2. Expected 1.}}
+  %0:2 = mesh.cluster_shape @mesh0 axes = [0] : index, index
+  return %0#0, %0#1 : index, index
+}
+
+// -----
+
+mesh.cluster @mesh0(rank = 3, dim_sizes = 1x2x3)
+
+func.func @cluster_shape_wrong_number_of_results_empty_mesh_axes() -> (index, index) {
+  // expected-error at +1 {{Unexpected number of results 2. Expected 3.}}
+  %0:2 = mesh.cluster_shape @mesh0 : index, index
+  return %0#0, %0#1 : index, index
+}
+
+// -----
+
+func.func @cluster_shape_invalid_mesh_name() -> (index) {
+  // expected-error at +1 {{Undefined required mesh symbol "this_mesh_symbol_does_not_exist".}}
+  %0 = mesh.cluster_shape @this_mesh_symbol_does_not_exist : index
+  return %0#0 : index
+}
+
+// -----
+
+mesh.cluster @mesh0(rank = 2, dim_sizes = 2x4)
+
+func.func @process_index_mesh_axis_out_of_bounds() -> (index, index) {
+  // expected-error at +1 {{0-based mesh axis index 2 is out of bounds. The referenced mesh "mesh0" is of rank 2.}}
+  %0:2 = mesh.process_index on @mesh0 axes = [0, 2] : index, index
+  return %0#0, %0#1 : index, index
+}
+
+// -----
+
+mesh.cluster @mesh0(rank = 3, dim_sizes = 1x2x3)
+
+func.func @process_index_duplicate_mesh_axis() -> (index, index, index) {
+  // expected-error at +1 {{Mesh axes contains duplicate elements.}}
+  %0:3 = mesh.process_index on @mesh0 axes = [0, 2, 0] : index, index, index
+  return %0#0, %0#1, %0#2 : index, index, index
+}
+
+// -----
+
+mesh.cluster @mesh0(rank = 2, dim_sizes = 2x4)
+
+func.func @process_index_wrong_number_of_results() -> (index, index) {
+  // expected-error at +1 {{Unexpected number of results 2. Expected 1.}}
+  %0:2 = mesh.process_index on @mesh0 axes = [0] : index, index
+  return %0#0, %0#1 : index, index
+}
+
+// -----
+
+mesh.cluster @mesh0(rank = 3, dim_sizes = 1x2x3)
+
+func.func @process_index_wrong_number_of_results_empty_mesh_axes() -> (index, index) {
+  // expected-error at +1 {{Unexpected number of results 2. Expected 3.}}
+  %0:2 = mesh.process_index on @mesh0 : index, index
+  return %0#0, %0#1 : index, index
+}
+
+// -----
+
+func.func @process_index_invalid_mesh_name() -> (index) {
+  // expected-error at +1 {{Undefined required mesh symbol "this_mesh_symbol_does_not_exist".}}
+  %0 = mesh.process_index on @this_mesh_symbol_does_not_exist : index
+  return %0#0 : index
+}
+
+// -----
+
 func.func @all_reduce_invalid_mesh_symbol(
     %arg0 : tensor<4xf32>) -> tensor<4xf64> {
   // expected-error at +1 {{Undefined required mesh symbol "this_mesh_symbol_does_not_exist".}}
diff --git a/mlir/test/Dialect/Mesh/ops.mlir b/mlir/test/Dialect/Mesh/ops.mlir
index 8f8e309d18f156..a7c3b3dbab9c13 100644
--- a/mlir/test/Dialect/Mesh/ops.mlir
+++ b/mlir/test/Dialect/Mesh/ops.mlir
@@ -132,6 +132,55 @@ func.func @mesh_shard_op_two_users(%arg0 : tensor<4x8xf32>) ->
   return %1, %2 : tensor<4x8xf32>, tensor<4x8xf32>
 }
 
+// CHECK-LABEL: func @cluster_shape
+func.func @cluster_shape() -> (index, index) {
+  // CHECK: %[[RES:.*]]:2 = mesh.cluster_shape @mesh0 axes = [0, 1] : index, index
+  %0:2 = mesh.cluster_shape @mesh0 axes = [0, 1] : index, index
+  // CHECK: return %[[RES]]#0, %[[RES]]#1 : index, index
+  return %0#0, %0#1 : index, index
+}
+
+// CHECK-LABEL: func @cluster_shape_default_axes
+func.func @cluster_shape_default_axes() -> (index, index, index) {
+  // CHECK: %[[RES:.*]]:3 = mesh.cluster_shape @mesh0 : index, index, index
+  %0:3 = mesh.cluster_shape @mesh0 : index, index, index
+  // CHECK: return %[[RES]]#0, %[[RES]]#1, %[[RES]]#2 : index, index, index
+  return %0#0, %0#1, %0#2 : index, index, index
+}
+
+// CHECK-LABEL: func @cluster_shape_empty_axes
+func.func @cluster_shape_empty_axes() -> (index, index, index) {
+  // CHECK: %[[RES:.*]]:3 = mesh.cluster_shape @mesh0 : index, index, index
+  %0:3 = mesh.cluster_shape @mesh0 axes = [] : index, index, index
+  // CHECK: return %[[RES]]#0, %[[RES]]#1, %[[RES]]#2 : index, index, index
+  return %0#0, %0#1, %0#2 : index, index, index
+}
+
+// CHECK-LABEL: func @process_index
+func.func @process_index() -> (index, index) {
+  // CHECK: %[[RES:.*]]:2 = mesh.process_index on @mesh0 axes = [0, 1] : index, index
+  %0:2 = mesh.process_index on @mesh0 axes = [0, 1] : index, index
+  // CHECK: return %[[RES]]#0, %[[RES]]#1 : index, index
+  return %0#0, %0#1 : index, index
+}
+
+// CHECK-LABEL: func @process_index_default_axes
+func.func @process_index_default_axes() -> (index, index, index) {
+  // CHECK: %[[RES:.*]]:3 = mesh.process_index on @mesh0 : index, index, index
+  %0:3 = mesh.process_index on @mesh0 : index, index, index
+  // CHECK: return %[[RES]]#0, %[[RES]]#1, %[[RES]]#2 : index, index, index
+  return %0#0, %0#1, %0#2 : index, index, index
+}
+
+// CHECK-LABEL: func @process_index_empty_axes
+func.func @process_index_empty_axes() -> (index, index, index) {
+  // CHECK: %[[RES:.*]]:3 = mesh.process_index on @mesh0 : index, index, index
+  %0:3 = mesh.process_index on @mesh0 axes = [] : index, index, index
+  // CHECK: return %[[RES]]#0, %[[RES]]#1, %[[RES]]#2 : index, index, index
+  return %0#0, %0#1, %0#2 : index, index, index
+}
+
+
 // CHECK-LABEL: func @all_reduce
 func.func @all_reduce(
     // CHECK-SAME: %[[ARG:.*]]: tensor<3x4xf32>
diff --git a/mlir/test/Dialect/Mesh/resharding-spmdization.mlir b/mlir/test/Dialect/Mesh/resharding-spmdization.mlir
new file mode 100644
index 00000000000000..0ba0d76c09a74d
--- /dev/null
+++ b/mlir/test/Dialect/Mesh/resharding-spmdization.mlir
@@ -0,0 +1,154 @@
+// RUN: mlir-opt -test-mesh-resharding-spmdization %s | FileCheck %s
+
+mesh.cluster @mesh_1d(rank = 1, dim_sizes = 2)
+mesh.cluster @mesh_1d_dynamic(rank = 1, dim_sizes = ?)
+
+// CHECK-LABEL: func @same_source_and_target_sharding
+func.func @same_source_and_target_sharding(
+  // CHECK-SAME: %[[ARG:.*]]: tensor<2xf32>
+  %arg0: tensor<2xf32>
+) -> tensor<2xf32> {
+  %0 = mesh.shard %arg0 to <@mesh_1d, [[]]> : tensor<2xf32>
+  %1 = mesh.shard %0 to <@mesh_1d, [[]]> annotate_for_users : tensor<2xf32>
+  // CHECK: return %[[ARG]]
+  return %1 : tensor<2xf32>
+}
+
+// CHECK-LABEL: func @split_replicated_tensor_axis
+func.func @split_replicated_tensor_axis(
+  // CHECK-SAME: %[[ARG:.*]]: tensor<3x14xf32>
+  %arg0: tensor<3x14xf32>
+) -> tensor<3x14xf32> {
+  // CHECK: %[[ZERO:.*]] = arith.constant 0 : index
+  // CHECK: %[[TENSOR_SPLIT_AXIS_SIZE:.*]] = arith.constant 14 : index
+  // CHECK: %[[PROCESS_INDEX:.*]] = mesh.process_index on @mesh_1d axes = [0] : index
+  // CHECK: %[[MESH_AXIS_SIZE:.*]] = mesh.cluster_shape @mesh_1d axes = [0] : index
+  // CHECK: %[[TENSOR_SPLIT_AXIS_SIZE_MOD_MESH_AXIS_SIZE:.*]] = arith.remui %[[TENSOR_SPLIT_AXIS_SIZE]], %[[MESH_AXIS_SIZE]] : index
+  // CHECK: %[[RESULT_TENSOR_AXIS_SIZE_CHECK:.*]] = arith.cmpi eq, %[[TENSOR_SPLIT_AXIS_SIZE_MOD_MESH_AXIS_SIZE]], %[[ZERO]] : index
+  // CHECK: cf.assert %[[RESULT_TENSOR_AXIS_SIZE_CHECK]]
+  // CHECK: %[[RESULT_TENSOR_AXIS_SIZE:.*]] = arith.divui %[[TENSOR_SPLIT_AXIS_SIZE]], %[[MESH_AXIS_SIZE]] : index
+  // CHECK: %[[RESULT_TENSOR_AXIS_OFFSET:.*]] = arith.muli %[[RESULT_TENSOR_AXIS_SIZE]], %[[PROCESS_INDEX]] : index
+  // CHECK: %[[RESULT_TENSOR_SLICE:.*]] = tensor.extract_slice %[[ARG]][0, %[[RESULT_TENSOR_AXIS_OFFSET]]] [3, 7] [1, 1] : tensor<3x14xf32> to tensor<3x7xf32>
+  // CHECK: %[[RESULT:.*]] = builtin.unrealized_conversion_cast %[[RESULT_TENSOR_SLICE]] : tensor<3x7xf32> to tensor<3x14xf32>
+  %0 = mesh.shard %arg0 to <@mesh_1d, [[]]> : tensor<3x14xf32>
+  %1 = mesh.shard %0 to <@mesh_1d, [[], [0]]> annotate_for_users : tensor<3x14xf32>
+  // CHECK: return %[[RESULT]] : tensor<3x14xf32>
+  return %1 : tensor<3x14xf32>
+}
+
+// CHECK-LABEL: func @split_replicated_tensor_axis_dynamic
+func.func @split_replicated_tensor_axis_dynamic(
+  // CHECK-SAME: %[[ARG:.*]]: tensor<?x3x?xf32>
+  %arg0: tensor<?x3x?xf32>
+) -> tensor<?x3x?xf32> {
+  // CHECK: %[[ZERO:.*]] = arith.constant 0 : index
+  // CHECK: %[[TWO:.*]] = arith.constant 2 : index
+  // CHECK: %[[PROCESS_INDEX:.*]] = mesh.process_index on @mesh_1d_dynamic axes = [0] : index
+  // CHECK: %[[MESH_AXIS_SIZE:.*]] = mesh.cluster_shape @mesh_1d_dynamic axes = [0] : index
+  // CHECK: %[[TENSOR_SPLIT_AXIS_SIZE:.*]] = tensor.dim %[[ARG]], %[[ZERO]] : tensor<?x3x?xf32>
+  // CHECK: %[[TENSOR_SPLIT_AXIS_SIZE_MOD_MESH_AXIS_SIZE:.*]] = arith.remui %[[TENSOR_SPLIT_AXIS_SIZE]], %[[MESH_AXIS_SIZE]] : index
+  // CHECK: %[[RESULT_TENSOR_AXIS_SIZE_CHECK:.*]] = arith.cmpi eq, %[[TENSOR_SPLIT_AXIS_SIZE_MOD_MESH_AXIS_SIZE]], %[[ZERO]] : index
+  // CHECK: cf.assert %[[RESULT_TENSOR_AXIS_SIZE_CHECK]]
+  // CHECK: %[[RESULT_TENSOR_SPLIT_AXIS_SIZE:.*]] = arith.divui %[[TENSOR_SPLIT_AXIS_SIZE]], %[[MESH_AXIS_SIZE]] : index
+  // CHECK: %[[RESULT_TENSOR_SPLIT_AXIS_OFFSET:.*]] = arith.muli %[[RESULT_TENSOR_SPLIT_AXIS_SIZE]], %[[PROCESS_INDEX]] : index
+  // CHECK: %[[TENSOR_AXIS_2_SIZE:.*]] = tensor.dim %[[ARG]], %[[TWO]] : tensor<?x3x?xf32>
+  // CHECK: %[[RESULT_TENSOR_SLICE:.*]] = tensor.extract_slice %[[ARG]][%[[RESULT_TENSOR_SPLIT_AXIS_OFFSET]], 0, 0]
+  // CHECK-SAME: [%[[RESULT_TENSOR_SPLIT_AXIS_SIZE]], 3, %[[TENSOR_AXIS_2_SIZE]]] [1, 1, 1] : tensor<?x3x?xf32> to tensor<?x3x?xf32>
+  %0 = mesh.shard %arg0 to <@mesh_1d_dynamic, [[], [], []]> : tensor<?x3x?xf32>
+  %1 = mesh.shard %0 to <@mesh_1d_dynamic, [[0]]> annotate_for_users : tensor<?x3x?xf32>
+  // CHECK: return %[[RESULT_TENSOR_SLICE]] : tensor<?x3x?xf32>
+  return %1 : tensor<?x3x?xf32>
+}
+
+// CHECK-LABEL: func @move_split_axis
+func.func @move_split_axis(
+  // CHECK-SAME: %[[ARG:.*]]: tensor<10x14xf32>
+  %arg0: tensor<10x14xf32>
+) -> tensor<10x14xf32> {
+  // CHECK: %[[SOURCE_SHARD:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : tensor<10x14xf32> to tensor<5x14xf32>
+  // CHECK: %[[TARGET_SHARD:.*]] = mesh.all_to_all %[[SOURCE_SHARD]] on @mesh_1d mesh_axes = [0] split_axis = 1 concat_axis = 0 : tensor<5x14xf32> -> tensor<10x7xf32>
+  // CHECK: %[[RES:.*]] = builtin.unrealized_conversion_cast %[[TARGET_SHARD]] : tensor<10x7xf32> to tensor<10x14xf32>
+  %0 = mesh.shard %arg0 to <@mesh_1d, [[0]]> : tensor<10x14xf32>
+  %1 = mesh.shard %0 to <@mesh_1d, [[], [0]]> annotate_for_users : tensor<10x14xf32>
+  // CHECK: return %[[RES]] : tensor<10x14xf32>
+  return %1 : tensor<10x14xf32>
+}
+
+// CHECK-LABEL: func @move_split_axis_dynamic_mesh
+func.func @move_split_axis_dynamic_mesh(
+  // CHECK-SAME: %[[ARG:.*]]: tensor<10x14xf32>
+  %arg0: tensor<10x14xf32>
+) -> tensor<10x14xf32> {
+  // CHECK: %[[SOURCE_SHARD:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : tensor<10x14xf32> to tensor<?x14xf32>
+  // CHECK: %[[ALL_TO_ALL:.*]] = mesh.all_to_all %[[SOURCE_SHARD]] on @mesh_1d_dynamic mesh_axes = [0] split_axis = 1 concat_axis = 0 : tensor<?x14xf32> -> tensor<?x?xf32>
+  // CHECK: %[[TARGET_SHARD:.*]] = tensor.cast %[[ALL_TO_ALL]] : tensor<?x?xf32> to tensor<10x?xf32>
+  // CHECK: %[[RES:.*]] = builtin.unrealized_conversion_cast %[[TARGET_SHARD]] : tensor<10x?xf32> to tensor<10x14xf32>
+  %0 = mesh.shard %arg0 to <@mesh_1d_dynamic, [[0]]> : tensor<10x14xf32>
+  %1 = mesh.shard %0 to <@mesh_1d_dynamic, [[], [0]]> annotate_for_users : tensor<10x14xf32>
+  // CHECK: return %[[RES]] : tensor<10x14xf32>
+  return %1 : tensor<10x14xf32>
+}
+
+// CHECK-LABEL: func @move_split_dynamic_axis
+func.func @move_split_dynamic_axis(
+  // CHECK-SAME: %[[ARG:.*]]: tensor<?x14xf32>
+  %arg0: tensor<?x14xf32>
+) -> tensor<?x14xf32> {
+  // CHECK: %[[TARGET_SHARD:.*]] = mesh.all_to_all %[[ARG]] on @mesh_1d mesh_axes = [0] split_axis = 1 concat_axis = 0 : tensor<?x14xf32> -> tensor<?x7xf32>
+  // CHECK: %[[RES:.*]] = builtin.unrealized_conversion_cast %[[TARGET_SHARD]] : tensor<?x7xf32> to tensor<?x14xf32>
+  %0 = mesh.shard %arg0 to <@mesh_1d, [[0]]> : tensor<?x14xf32>
+  %1 = mesh.shard %0 to <@mesh_1d, [[], [0]]> annotate_for_users : tensor<?x14xf32>
+  // CHECK: return %[[RES]] : tensor<?x14xf32>
+  return %1 : tensor<?x14xf32>
+}
+
+// CHECK-LABEL: func @unshard_static_axis
+func.func @unshard_static_axis(
+  // CHECK-SAME: %[[ARG:.*]]: tensor<10x14xf32>
+  %arg0: tensor<10x14xf32>
+) -> tensor<10x14xf32> {
+  // CHECK: %[[SOURCE_SHARD:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : tensor<10x14xf32> to tensor<5x14xf32>
+  // CHECK: %[[ALL_GATHER:.*]] = mesh.all_gather %[[SOURCE_SHARD]] on @mesh_1d mesh_axes = [0] gather_axis = 0 : tensor<5x14xf32> -> tensor<10x14xf32>
+  %0 = mesh.shard %arg0 to <@mesh_1d, [[0]]> : tensor<10x14xf32>
+  %1 = mesh.shard %0 to <@mesh_1d, [[]]> annotate_for_users : tensor<10x14xf32>
+  // CHECK: return %[[ALL_GATHER]] : tensor<10x14xf32>
+  return %1 : tensor<10x14xf32>
+}
+
+// CHECK-LABEL: func @unshard_dynamic_axis
+func.func @unshard_dynamic_axis(
+  // CHECK-SAME: %[[ARG:.*]]: tensor<?x14xf32>
+  %arg0: tensor<?x14xf32>
+) -> tensor<?x14xf32> {
+  // CHECK: %[[ALL_GATHER:.*]] = mesh.all_gather %[[ARG]] on @mesh_1d mesh_axes = [0] gather_axis = 0 : tensor<?x14xf32> -> tensor<?x14xf32>
+  %0 = mesh.shard %arg0 to <@mesh_1d, [[0]]> : tensor<?x14xf32>
+  %1 = mesh.shard %0 to <@mesh_1d, [[]]> annotate_for_users : tensor<?x14xf32>
+  // CHECK: return %[[ALL_GATHER]] : tensor<?x14xf32>
+  return %1 : tensor<?x14xf32>
+}
+
+// CHECK-LABEL: func @unshard_static_axis_on_dynamic_mesh_axis
+func.func @unshard_static_axis_on_dynamic_mesh_axis(
+  // CHECK-SAME: %[[ARG:.*]]: tensor<10x14xf32>
+  %arg0: tensor<10x14xf32>
+) -> tensor<10x14xf32> {
+  // CHECK: %[[SOURCE_SHARD:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : tensor<10x14xf32> to tensor<?x14xf32>
+  // CHECK: %[[ALL_GATHER:.*]] = mesh.all_gather %[[SOURCE_SHARD]] on @mesh_1d_dynamic mesh_axes = [0] gather_axis = 0 : tensor<?x14xf32> -> tensor<?x14xf32>
+  // CHECK: %[[RES:.*]] = tensor.cast %[[ALL_GATHER]] : tensor<?x14xf32> to tensor<10x14xf32>
+  %0 = mesh.shard %arg0 to <@mesh_1d_dynamic, [[0]]> : tensor<10x14xf32>
+  %1 = mesh.shard %0 to <@mesh_1d_dynamic, [[]]> annotate_for_users : tensor<10x14xf32>
+  // CHECK: return %[[RES]] : tensor<10x14xf32>
+  return %1 : tensor<10x14xf32>
+}
+
+// CHECK-LABEL: func @partial_axis
+func.func @partial_axis(
+  // CHECK-SAME: %[[ARG:.*]]: tensor<10x14xf32>
+  %arg0: tensor<10x14xf32>
+) -> tensor<10x14xf32> {
+  // CHECK: %[[ALL_REDUCE:.*]] = mesh.all_reduce %[[ARG]] on @mesh_1d mesh_axes = [0] : tensor<10x14xf32> -> tensor<10x14xf32>
+  %0 = mesh.shard %arg0 to <@mesh_1d, [[]], partial = sum[0]> : tensor<10x14xf32>
+  %1 = mesh.shard %0 to <@mesh_1d, [[]]> annotate_for_users : tensor<10x14xf32>
+  // CHECK: return %[[ALL_REDUCE]] : tensor<10x14xf32>
+  return %1 : tensor<10x14xf32>
+}
diff --git a/mlir/test/lib/Dialect/Mesh/CMakeLists.txt b/mlir/test/lib/Dialect/Mesh/CMakeLists.txt
index 16b50bb878a074..f14d282857a1e0 100644
--- a/mlir/test/lib/Dialect/Mesh/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/Mesh/CMakeLists.txt
@@ -1,5 +1,6 @@
 # Exclude tests from libMLIR.so
 add_mlir_library(MLIRMeshTestSimplifications
+  TestReshardingSpmdization.cpp
   TestSimplifications.cpp
 
   EXCLUDE_FROM_LIBMLIR
diff --git a/mlir/test/lib/Dialect/Mesh/TestReshardingSpmdization.cpp b/mlir/test/lib/Dialect/Mesh/TestReshardingSpmdization.cpp
new file mode 100644
index 00000000000000..940010fd94888d
--- /dev/null
+++ b/mlir/test/lib/Dialect/Mesh/TestReshardingSpmdization.cpp
@@ -0,0 +1,124 @@
+//===- TestReshardingSpmdization.cpp - Test resharding spmdization --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Mesh/IR/MeshOps.h"
+#include "mlir/Dialect/Mesh/Transforms/Spmdization.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
+#include "mlir/IR/BuiltinDialect.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypeInterfaces.h"
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/IR/ImplicitLocOpBuilder.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+using namespace mlir::mesh;
+
+namespace {
+
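+// Rewrites a mesh.shard op that has a matching annotate_for_users mesh.shard
+// user on the same mesh: the resharding between the two shardings is
+// materialized with reshard() and the uses of the target shard op are replaced
+// with the resharded value.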
+struct TestMeshReshardingRewritePattern : OpRewritePattern<ShardOp> {
+  using OpRewritePattern<ShardOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(ShardOp op,
+                                PatternRewriter &rewriter) const override {
+    if (op.getAnnotateForUsers()) {
+      return failure();
+    }
+
+    mesh::ClusterOp mesh = symbolTable.lookupNearestSymbolFrom<mesh::ClusterOp>(
+        op, op.getShard().getCluster());
+
+    bool foundUser = false;
+    for (auto user : op->getUsers()) {
+      if (auto targetShardOp = llvm::dyn_cast<ShardOp>(user)) {
+        if (targetShardOp.getAnnotateForUsers() &&
+            mesh == symbolTable.lookupNearestSymbolFrom<mesh::ClusterOp>(
+                        targetShardOp, targetShardOp.getShard().getCluster())) {
+          foundUser = true;
+          break;
+        }
+      }
+    }
+
+    if (!foundUser) {
+      return failure();
+    }
+
+    for (auto user : op->getUsers()) {
+      auto targetShardOp = llvm::dyn_cast<ShardOp>(user);
+      if (!targetShardOp || !targetShardOp.getAnnotateForUsers() ||
+          symbolTable.lookupNearestSymbolFrom<mesh::ClusterOp>(
+              targetShardOp, targetShardOp.getShard().getCluster()) != mesh) {
+        continue;
+      }
+
+      ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
+      ShapedType sourceShardShape =
+          shardShapedType(op.getResult().getType(), mesh, op.getShard());
+      TypedValue<ShapedType> sourceShard =
+          builder
+              .create<UnrealizedConversionCastOp>(sourceShardShape,
+                                                  op.getOperand())
+              ->getResult(0)
+              .cast<TypedValue<ShapedType>>();
+      TypedValue<ShapedType> targetShard =
+          reshard(builder, mesh, op, targetShardOp, sourceShard);
+      Value newTargetUnsharded =
+          builder
+              .create<UnrealizedConversionCastOp>(
+                  targetShardOp.getResult().getType(), targetShard)
+              ->getResult(0);
+      rewriter.replaceAllUsesWith(targetShardOp.getResult(),
+                                  newTargetUnsharded);
+    }
+
+    return success();
+  }
+
+private:
+  mutable SymbolTableCollection symbolTable;
+};
+
+struct TestMeshReshardingPass
+    : public PassWrapper<TestMeshReshardingPass, OperationPass<ModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestMeshReshardingPass)
+
+  void runOnOperation() override {
+    RewritePatternSet patterns(&getContext());
+    patterns.insert<TestMeshReshardingRewritePattern>(&getContext());
+    if (failed(applyPatternsAndFoldGreedily(getOperation().getOperation(),
+                                            std::move(patterns)))) {
+      return signalPassFailure();
+    }
+  }
+  void getDependentDialects(DialectRegistry &registry) const override {
+    reshardingRegisterDependentDialects(registry);
+    registry.insert<BuiltinDialect>();
+  }
+  StringRef getArgument() const final {
+    return "test-mesh-resharding-spmdization";
+  }
+  StringRef getDescription() const final {
+    return "Test Mesh dialect resharding spmdization.";
+  }
+};
+} // namespace
+
+namespace mlir {
+namespace test {
+void registerTestMeshReshardingSpmdizationPass() {
+  PassRegistration<TestMeshReshardingPass>();
+}
+} // namespace test
+} // namespace mlir
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index eedade691c6c39..bea2449f44dcde 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -119,6 +119,7 @@ void registerTestMathPolynomialApproximationPass();
 void registerTestMemRefDependenceCheck();
 void registerTestMemRefStrideCalculation();
 void registerTestMeshSimplificationsPass();
+void registerTestMeshReshardingSpmdizationPass();
 void registerTestNextAccessPass();
 void registerTestOneToNTypeConversionPass();
 void registerTestOpaqueLoc();
@@ -237,6 +238,7 @@ void registerTestPasses() {
   mlir::test::registerTestMemRefDependenceCheck();
   mlir::test::registerTestMemRefStrideCalculation();
   mlir::test::registerTestMeshSimplificationsPass();
+  mlir::test::registerTestMeshReshardingSpmdizationPass();
   mlir::test::registerTestNextAccessPass();
   mlir::test::registerTestOneToNTypeConversionPass();
   mlir::test::registerTestOpaqueLoc();

>From 1e0f1eb0d12f60d12654401b1e6304e58c8d964a Mon Sep 17 00:00:00 2001
From: Boian Petkantchin <boian.petkantchin at amd.com>
Date: Thu, 21 Dec 2023 13:47:33 -0800
Subject: [PATCH 2/6] Remove unused ceilDiv function

---
 mlir/include/mlir/Support/MathExtras.h | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/mlir/include/mlir/Support/MathExtras.h b/mlir/include/mlir/Support/MathExtras.h
index 22493c11a370d1..17a747393d26eb 100644
--- a/mlir/include/mlir/Support/MathExtras.h
+++ b/mlir/include/mlir/Support/MathExtras.h
@@ -15,20 +15,9 @@
 
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/APInt.h"
-#include <type_traits>
 
 namespace mlir {
 
-// ceilDiv for unsigned integral.
-template <typename T, std::enable_if_t<std::is_integral_v<T> &&
-                                       !std::is_unsigned_v<T>> = true>
-T ceilDiv(T lhs, T rhs) {
-  assert(rhs != static_cast<T>(0));
-  T q = lhs / rhs;
-  T r = lhs % rhs;
-  return r == static_cast<T>(0) ? q : q + static_cast<T>(1);
-}
-
 /// Returns the result of MLIR's ceildiv operation on constants. The RHS is
 /// expected to be non-zero.
 inline int64_t ceilDiv(int64_t lhs, int64_t rhs) {

>From 95c5e52a5271222cb462c137edcc3419840ac6a2 Mon Sep 17 00:00:00 2001
From: Boian Petkantchin <boian.petkantchin at amd.com>
Date: Thu, 21 Dec 2023 15:10:18 -0800
Subject: [PATCH 3/6] Remove mutable state from
 TestMeshReshardingRewritePattern

---
 mlir/test/lib/Dialect/Mesh/TestReshardingSpmdization.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/mlir/test/lib/Dialect/Mesh/TestReshardingSpmdization.cpp b/mlir/test/lib/Dialect/Mesh/TestReshardingSpmdization.cpp
index 940010fd94888d..6fecbd48f15387 100644
--- a/mlir/test/lib/Dialect/Mesh/TestReshardingSpmdization.cpp
+++ b/mlir/test/lib/Dialect/Mesh/TestReshardingSpmdization.cpp
@@ -36,6 +36,7 @@ struct TestMeshReshardingRewritePattern : OpRewritePattern<ShardOp> {
       return failure();
     }
 
+    SymbolTableCollection symbolTable;
     mesh::ClusterOp mesh = symbolTable.lookupNearestSymbolFrom<mesh::ClusterOp>(
         op, op.getShard().getCluster());
 
@@ -85,9 +86,6 @@ struct TestMeshReshardingRewritePattern : OpRewritePattern<ShardOp> {
 
     return success();
   }
-
-private:
-  mutable SymbolTableCollection symbolTable;
 };
 
 struct TestMeshReshardingPass

>From 9e72161754aace136cf4f99fe340a8cee45d004e Mon Sep 17 00:00:00 2001
From: Boian Petkantchin <sogartary at yahoo.com>
Date: Tue, 2 Jan 2024 09:28:23 -0800
Subject: [PATCH 4/6] Update
 mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc

Co-authored-by: Chengji Yao <yaochengji at hotmail.com>
---
 .../mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc b/mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc
index 181f07177e0af9..ba31e7e7ab28a1 100644
--- a/mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc
+++ b/mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc
@@ -1,4 +1,4 @@
-Reshadring Spmdization Examples
+Resharding Spmdization Examples
 
 --------------------------------------------------------------
 

>From 517b50123acfa1ea502acea050a3b50ad87c8f94 Mon Sep 17 00:00:00 2001
From: Boian Petkantchin <sogartary at yahoo.com>
Date: Tue, 2 Jan 2024 09:29:15 -0800
Subject: [PATCH 5/6] Update
 mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc

Co-authored-by: Chengji Yao <yaochengji at hotmail.com>
---
 .../mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc b/mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc
index ba31e7e7ab28a1..9299bc68d75338 100644
--- a/mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc
+++ b/mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc
@@ -401,7 +401,7 @@ d2(
 
 We want to copy the the data between devices in slices/tiles.
 What are the source/target tile coordinates?
-Fro a fixed (m, n, p, q) what is the range of (k, l, u, v)?
+For a fixed (m, n, p, q) what is the range of (k, l, u, v)?
 TODO
 
 --------------------------------------------------------------

>From 8b5fceb928400d5ca861991f20af16e312916b46 Mon Sep 17 00:00:00 2001
From: Boian Petkantchin <boian.petkantchin at amd.com>
Date: Tue, 2 Jan 2024 10:44:47 -0800
Subject: [PATCH 6/6] Make resharding doc markdown

---
 .../Mesh/Transforms/ReshardingSpmdizationDoc  | 621 ------------------
 1 file changed, 621 deletions(-)
 delete mode 100644 mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc

diff --git a/mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc b/mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc
deleted file mode 100644
index 9299bc68d75338..00000000000000
--- a/mlir/include/mlir/Dialect/Mesh/Transforms/ReshardingSpmdizationDoc
+++ /dev/null
@@ -1,621 +0,0 @@
-Resharding Spmdization Examples
-
---------------------------------------------------------------
-
-Reshard 2x3 tensor from sharding [[0, 1]] to sharding [[0, 1]] on a 2x3 mesh.
-
-unsharded 2x3 tensor
-11 12 13
-21 22 23
-
-sharded on a 2x3 mesh
-
-sharding = [[0, 1]]
-
-mesh contents:
-
-mesh axis 1
------------>
-+----+----+----+ mesh axis 0 |
-| 11 | 12 | 13 |             |
-+----+----+----+             |
-| 21 | 22 | 23 |             |
-+----+----+----+             ↓
-
-Transform into
-sharding = [[1, 0]]
-
-mesh axis 1
------------>
-+----+----+----+ mesh axis 0 |
-| 11 | 13 | 22 |             |
-+----+----+----+             |
-| 12 | 21 | 23 |             |
-+----+----+----+             ↓
-
-Swap contents on devices that have the same linear index in the 2 shardings.
-
---------------------------------------------------------------
-
-Reshard 2x3 tensor from sharding [[0, 1]] to sharding [[1]] on a 2x3 mesh.
-
-unsharded 2x3 tensor
-11 12 13
-21 22 23
-
-sharded on a 2x3 mesh
-
-sharding = [[0, 1]]
-
-mesh contents:
-
-mesh axis 1
------------>
-+----+----+----+ mesh axis 0 |
-| 11 | 12 | 13 |             |
-+----+----+----+             |
-| 21 | 22 | 23 |             |
-+----+----+----+             ↓
-
-Transform into
-sharding = [[1]]
-
-mesh axis 1
------------>
-+----+----+----+ mesh axis 0 |
-| 11 | 12 | 13 |             |
-| 21 | 22 | 23 |             |
-+----+----+----+             |
-| 11 | 12 | 13 |             |
-| 21 | 22 | 23 |             |
-+----+----+----+             ↓
-
-All-gather along mesh axis 0.
-
---------------------------------------------------------------
-
-Reshard 4x6 tensor from sharding [[], [0, 1]] to sharding [[], [0]] on a 2x3 mesh.
-
-unsharded 4x6 tensor
-11 12 13 14 15 16
-21 22 23 24 25 26
-
-sharded on a 2x3 mesh
-
-sharding = [[], [0, 1]]
-
-mesh contents:
-
-mesh axis 1
------------>
-+----+----+----+ mesh axis 0 |
-| 11 | 12 | 13 |             |
-| 21 | 22 | 23 |             |
-+----+----+----+             |
-| 14 | 15 | 16 |             |
-| 24 | 25 | 26 |             |
-+----+----+----+             ↓
-
-Transform into
-sharding = [[], [0]]
-
-mesh axis 1
------------>
-+----------+----------+ mesh axis 0 |
-| 11 12 13 | 11 12 13 |             |
-| 21 22 23 | 21 22 23 |             |
-+----------+----------+             |
-| 14 15 16 | 14 15 16 |             |
-| 24 25 26 | 24 25 26 |             |
-+----------+----------+             ↓
-
-all-gather along mesh axis 1
-
---------------------------------------------------------------
-
-Reshard 4x8 tensor from sharding [[0], [1, 2]] to sharding [[0], [2]] on a 2x2x2 mesh.
-
-unsharded 4x8 tensor
-11 12 13 14 15 16 17 18
-21 22 23 24 25 26 27 28
-31 32 33 34 35 36 37 38
-41 42 43 44 45 46 47 48
-
-sharded on a 2x2x2 mesh
-
-sharding = [[0], [1, 2]]
-
-mesh contents:
-
-mesh axis 2
------------>
-+-------+-------+ mesh axis 1 | mesh axis 0 |
-| 11 12 | 13 14 |             |             |
-| 21 22 | 23 24 |             |             |
-+-------+-------+             |             |
-| 15 16 | 17 18 |             |             |
-| 25 26 | 27 28 |             |             |
-+-------+-------+             ↓             |
-+-------+-------+                           |
-| 31 32 | 33 34 |                           |
-| 41 42 | 43 44 |                           |
-+-------+-------+                           |
-| 35 36 | 37 38 |                           |
-| 45 46 | 47 48 |                           |
-+-------+-------+                           ↓
-
-Transform into
-sharding = [[0], [2]]
-
-mesh axis 2
------------>
-+-------------+-------------+ mesh axis 1 | mesh axis 0 |
-| 11 12 13 14 | 15 16 17 18 |             |             |
-| 21 22 23 24 | 25 26 27 28 |             |             |
-+-------------+-------------+             |             |
-| 11 12 13 14 | 15 16 17 18 |             |             |
-| 21 22 23 24 | 25 26 27 28 |             |             |
-+-------------+-------------+             ↓             |
-+-------------+-------------+                           |
-| 31 32 33 34 | 35 36 37 38 |                           |
-| 41 42 43 44 | 45 46 47 48 |                           |
-+-------------+-------------+                           |
-| 31 32 33 34 | 35 36 37 38 |                           |
-| 41 42 43 44 | 45 46 47 48 |                           |
-+-------------+-------------+                           ↓
-
-Can't be done with just an all-gather along mesh axis 1.
-Can be handled by multiple resharding transformations
-[[0], [1, 2]] -> [[0], [2, 1]] -> [[0], [2]]
-
---------------------------------------------------------------
-
-Reshard 6x6 tensor from sharding [[0], [1]] to sharding [[1], [0]] on a 2x3 mesh.
-
-unsharded 6x6 tensor
-11 12 13 14 15 16
-21 22 23 24 25 26
-31 32 33 34 35 36
-41 42 43 44 45 46
-51 52 53 54 55 56
-61 62 63 64 65 66
-
-sharded on a 2x3 mesh
-
-sharding = [[0], [1]]
-
-mesh axis 1
------------>
-+-------+-------+-------+ mesh axis 0 |
-| 11 12 | 13 14 | 15 16 |             |
-| 21 22 | 23 24 | 25 26 |             |
-| 31 32 | 33 34 | 35 36 |             |
-+-------+-------+-------+             |
-| 41 42 | 43 44 | 45 46 |             |
-| 51 52 | 53 54 | 55 56 |             |
-| 61 62 | 63 64 | 65 66 |             |
-+-------+-------+-------+             ↓
-
-transform to
-sharding = [[1], [0]]
-
-mesh axis 1
------------>
-+----------+----------+----------+ mesh axis 0 |
-| 11 12 13 | 31 32 33 | 51 52 53 |             |
-| 21 22 23 | 41 42 43 | 61 62 63 |             |
-+----------+----------+----------+             |
-| 14 15 16 | 34 35 36 | 54 55 56 |             |
-| 24 25 26 | 44 45 46 | 64 65 66 |             |
-+----------+----------+----------+             ↓
-
-mesh axis 0
------------>
-+----------+----------+ mesh axis 1 |
-| 11 12 13 | 14 15 16 |             |
-| 21 22 23 | 24 25 26 |             |
-+----------+----------+             |
-| 31 32 33 | 34 35 36 |             |
-| 41 42 43 | 44 45 46 |             |
-+----------+----------+             |
-| 51 52 53 | 54 55 56 |             |
-| 61 62 63 | 64 65 66 |             |
-+----------+----------+             ↓
-
-TODO
-
---------------------------------------------------------------
-
-Reshard 6x6 tensor from sharding [[0], [1]] to sharding [[1], [0]] on a 2x6 mesh.
-
-unsharded 6x6 tensor
-11 12 13 14 15 16
-21 22 23 24 25 26
-31 32 33 34 35 36
-41 42 43 44 45 46
-51 52 53 54 55 56
-61 62 63 64 65 66
-
-shard on 2x6 mesh
-
-sharding = [[0], [1]]
-
-mesh axis 1
------------>
-+----+----+----+----+----+----+ mesh axis 0 |
-| 11 | 12 | 13 ‖ 14 | 15 | 16 |             |
-| 21 | 22 | 23 ‖ 24 | 23 | 26 |             |
-| 31 | 32 | 33 ‖ 34 | 35 | 36 |             |
-+----+----+----+----+----+----+             |
-| 41 | 42 | 43 ‖ 44 | 45 | 46 |             |
-| 51 | 52 | 53 ‖ 54 | 55 | 56 |             |
-| 61 | 62 | 63 ‖ 64 | 65 | 66 |             |
-+----+----+----+----+----+----+             ↓
-
-
-transform to
-sharding = [[1], [0]]
-
-mesh axis 0
------------>
-+----------+----------+ mesh axis 1 |
-| 11 12 13 | 14 15 16 |             |
-+----------+----------+             |
-| 21 22 23 | 24 25 26 |             |
-+----------+----------+             |
-| 31 32 33 | 34 35 36 |             |
-+==========+==========+             |
-| 41 42 43 | 44 45 46 |             |
-+----------+----------+             |
-| 51 52 53 | 54 55 56 |             |
-+----------+----------+             |
-| 61 62 63 | 64 65 66 |             |
-+----------+----------+             ↓
-
-TODO
-
---------------------------------------------------------------
-
-Reshard KxL tensor from [[0], [1]] to [[1], [0]] on MxN mesh.
-
-M x N mesh.
-K x L tensor t.
-d(m, n) the tensor on device (m, n).
-
-sharding = [[0], [1]]
-Tensor shard s on each device has size (K ceildiv M, L ceildiv N).
-d(m, n)[k, l] -> t[m * (K ceildiv M) + k, n * (L ceildiv N) + l]
-
-substitute
-i <- m * (K ceildiv M) + k
-j <- n * (L ceildiv N) + l
-
-m -> i floordiv (K ceildiv M)
-n -> j floordiv (L ceildiv N)
-k -> i % (K ceildiv M)
-l -> j % (L ceildiv N)
-
-For the inverse map we get
-t[i, j] -> d(
-    i floordiv (K ceildiv M), j floordiv (L ceildiv N)
-)[
-    i % (K ceildiv M), j % (L ceildiv N)
-]
-
-Check:
-i = 13, j = 17, M = 3, N = 4, K = 16, L = 23
-t[13, 17] = d(
-    13 floordiv (16 ceildiv 3),
-    17 floordiv (23 ceilvid 4)
-)[
-    13 % (16 ceildiv 3),
-    17 % (23 ceilvid 4)
-]
-= d(
-    13 floordiv 6,
-    17 floordiv 6
-)[
-    13 % 6,
-    17 % 6
-]
-= d(2, 2)[1, 5]
-= t[
-    2 * (16 ceildiv 3) + 1,
-    2 * (23 ceildiv 4) + 5
-]
-= t[
-    2 * 6 + 1,
-    2 * 6 + 5
-]
-= t[13, 17]
-
-
-sharding = [[1], [0]]
-Tensor shard s on each device has size (K ceildiv N, L ceildiv M).
-d(m, n)[k, l] -> t[n * (K ceildiv N) + k, m * (L ceildiv M) + l]
-
-substitute
-i <- n * (K ceildiv N) + k
-j <- m * (L ceildiv M) + l
-
-m -> j floordiv (L ceildiv M)
-n -> i floordiv (K ceildiv N)
-k -> i % (K ceildiv N)
-l -> j % (L ceildiv M)
-
-For the inverse map we get
-t[i, j] -> d(
-    j floordiv (L ceildiv M), i floordiv (K ceildiv N)
-)[
-    i % (K ceildiv N), j % (L ceildiv M)
-]
-
-Check:
-i = 9, j = 19, M = 5, N = 2, K = 27, L = 14
-t[9, 19] = d(
-    19 floordiv (14 ceildiv 5),
-    9 floordiv (27 ceildiv 2)
-)[
-    9 % (27 ceildiv 2),
-    19 % (14 ceildiv 5)
-]
-= d(
-    19 floordiv 3,
-    9 floordiv 14
-)[
-    9 % 14
-    19 % 3
-]
-= d(6, 0)[9, 1]
-= t[
-    0 * (27 ceildiv 2) + 9,
-    6 * (14 ceildiv 5) + 1
-]
-= t[
-    0 * 14 + 9,
-    6 * 3 + 1
-]
-= t[9, 19]
-
-sharding = [[0], [1]]
-d(m, n)[k, l] -> t[m * (K ceildiv M) + k, n * (L ceildiv N) + l]
-t[i, j] -> d(i floordiv (K ceildiv M), j floordiv (L ceildiv N))[i % (K ceildiv M), j % (L ceildiv N)]
-
-sharding = [[1], [0]]
-d(m, n)[k, l] -> t[n * (K ceildiv N) + k, m * (L ceildiv M) + l]
-t[i, j] -> d(j floordiv (L ceildiv M), i floordiv (K ceildiv N))[i % (K ceildiv N), j % (L ceildiv M)]
-
-sharding [[0], [1]] -> [[1], [0]]
-d1(m, n) the tensor on device (m, n) for sharding sharding [[0], [1]].
-d2(m, n) the tensor on device (m, n) for sharding sharding [[1], [0]].
-d1(m, n)[k, l] ->
-t[m * (K ceildiv M) + k, n * (L ceildiv N) + l] ->
-d2(
-    (m * (L ceildiv M) + l) floordiv (L ceildiv M),
-    (n * (K ceildiv N) + k) floordiv (K ceildiv N)
-)[
-    (n * (K ceildiv N) + k) % (K ceildiv N),
-    (m * (L ceildiv M) + l) % (L ceildiv M)
-]
-= d2(p, q)[u, v]
-
-We want to copy the the data between devices in slices/tiles.
-What are the source/target tile coordinates?
-For a fixed (m, n, p, q) what is the range of (k, l, u, v)?
-TODO
-
---------------------------------------------------------------
-
-Reshard KxL tensor from sharding [[0], [1]] to sharding [[1], [0]] on a 2x3 mesh.
-
-Device placement on a 2x3 mesh
-11 12 13  <- devices
-21 22 23
-
-sharding [[0], [1]]
-tensor axis 1
------------>
-+----+----+----+ tensor axis 0 |
-| 11 | 12 | 13 |               |
-+----+----+----+               |
-| 21 | 22 | 23 |               |
-+----+----+----+               ↓
-
-transform to
-sharding [[1], [0]]
-tensor axis 1
------------>
-+----+----+ tensor axis 0 |
-| 11 | 21 |               |
-+----+----+               |
-| 12 | 22 |               |
-+----+----+               |
-| 13 | 23 |               |
-+----+----+               ↓
-
-+-----------------+--------+--------+-----------------+
-|                 |                 |                 |
-+                 +                 +                 +
-|       11        |        12       |        13       |
-+                 +                 +                 +
-|                 |                 |                 |
-+-----------------+--------+--------+-----------------+
-|                 |                 |                 |
-+                 +                 +                 +
-|       21        |        22       |        23       |
-+                 +                 +                 +
-|                 |                 |                 |
-+-----------------+--------+--------+-----------------+
-
-+-----------------+--------+--------+-----------------+
-|                          |                          |
-+          11              +               21         +
-|                          |                          |
-+-----------------+--------+--------+-----------------+
-|                          |                          |
-+          12              +               22         +
-|                          |                          |
-+-----------------+--------+--------+-----------------+
-|                          |                          |
-+          13              +               23         +
-|                          |                          |
-+-----------------+--------+--------+-----------------+
-
-+-----------------+--------+--------+-----------------+
-|                 |        |        |                 |
-+     11  11      + 12  11 + 12  21 +     13  21      +
-|                 |        |        |                 |
-+-----------------+--------+--------+-----------------+
-|     11  12      | 12  12 | 12  22 |     13  22      |
-+-----------------+--------+--------+-----------------+
-|     21  12      | 22  12 | 22  22 |     23  22      |
-+-----------------+--------+--------+-----------------+
-|                 |        |        |                 |
-+     21  13      + 22  13 + 22  23 +     23  23      +
-|                 |        |        |                 |
-+-----------------+--------+--------+-----------------+
-
-If S and T are the source and target shard sizes along some tensor axis.
-Then we have a period of (S*T)/gcd(S, T). Then the cut pattern repeats.
-TODO
-
---------------------------------------------------------------
-
-Reshard 6x6 tensor from sharding [[0], []] to sharding [[], [0]] on a 3 mesh.
-
-unsharded 6x6 tensor
-11 12 13 14 15 16
-21 22 23 24 25 26
-31 32 33 34 35 36
-41 42 43 44 45 46
-51 52 53 54 55 56
-61 62 63 64 65 66
-
-sharded on a 3 mesh
-
-sharding = [[0], []]
-
-+-------------------+ mesh axis 0 |
-| 11 12 13 14 15 16 |             |
-| 21 22 23 24 25 26 |             |
-+-------------------+             |
-| 31 32 33 34 35 36 |             |
-| 41 42 43 44 45 46 |             |
-+-------------------+             |
-| 51 52 53 54 55 56 |             |
-| 61 62 63 64 65 66 |             |
-+-------------------+             ↓
-
-transform to
-sharding = [[], [0]]
-
-mesh axis 0
------------>
-+-------+-------+-------+
-| 11 12 | 13 14 | 15 16 |
-| 21 22 | 23 24 | 25 26 |
-| 31 32 | 33 34 | 35 36 |
-| 41 42 | 43 44 | 45 46 |
-| 51 52 | 53 54 | 55 56 |
-| 61 62 | 63 64 | 65 66 |
-+-------+-------+-------+
-
-%1 = all_to_all %0 on @mesh mesh_axes = [0] split_axis = 1 concat_axis = 0 : tensor<2x6xi8> -> tensor<6x2xi8>
-
---------------------------------------------------------------
-
-Reshard 4x4 tensor from sharding [[0], [1, 2]] to sharding [[0, 1], [2]] on a 2x2x2 mesh.
-
-unsharded 4x4 tensor
-11 12 13 14
-21 22 23 24
-31 32 33 34
-41 42 43 44
-
-sharded on a 2x2x2 mesh
-
-sharding = [[0], [1, 2]]
-
-mesh axis 2
------------>
-+----+----+ mesh axis 1 | mesh axis 0 |
-| 11 | 12 |             |             |
-| 21 | 22 |             |             |
-+----+----+             |             |
-| 13 | 14 |             |             |
-| 23 | 24 |             |             |
-+----+----+             ↓             |
-+----+----+                           |
-| 31 | 32 |                           |
-| 41 | 42 |                           |
-+----+----+                           |
-| 33 | 34 |                           |
-| 43 | 44 |                           |
-+----+----+                           ↓
-
-transform to
-sharding = [[0, 1], [2]]
-
-mesh axis 2
------------>
-+-------+-------+ mesh axis 1 | mesh axis 0 |
-| 11 12 | 13 41 |             |             |
-+-------+-------+             |             |
-| 21 22 | 23 24 |             |             |
-+-------+-------+             ↓             |
-+-------+-------+                           |
-| 31 32 | 33 34 |                           |
-+-------+-------+                           |
-| 41 42 | 43 44 |                           |
-+-------+-------+                           ↓
-
-%1 = all_to_all %0 on @mesh mesh_axes = [2] split_axis = 1 concat_axis = 0 : tensor<2x1xi8> -> tensor<1x2xi8>
-is not enough.
-
-Can be decomposed into
-[[0], [1, 2]] -> [[0], [2, 1]] -> [[0, 1], [2]]
-
---------------------------------------------------------------
-
-We can decompose each resharding into a sequence of basis reshardings.
-It is not communication efficient in terms of minimizing the data communicated
-between devices.
-An efficient approach would be more complicated to implement.
-Each device has to receive at most as much data as the size of its target
-sharding tensor.
-
-Basis:
-
-From replicate to split.
-[[]] -> [[1]]
-Extract slices without communication.
-
-From split to replicate.
-[[0]] -> [[]]
-[[0, 1]] -> [[1]]
-All-gather along mesh axis 0.
-
-Swap mesh axes order when assigned to the same tensor axis.
-[[0, 1]] -> [[1, 0]]
-Swap contents on devices with the same linear index.
-
-Move mesh axis to different tensor dimension.
-[[0], []] -> [[], [0]]
-All-to-all.
-
-Example decomposition of
-[[0], [1]] -> [[1], [0]]
-
-[[0], [1]] -> all-gather along mesh axis 1    ->
-[[0], []]  -> all-to-all along mesh axis 0    ->
-[[], [0]]  -> extract slice along mesh axis 1 ->
-[[1], [0]]
-
-Example decomposition of
-[[3, 2], [], [0, 1]] -> [[0], [1, 2], []]
-
-[[3, 2], [], [0, 1]] -> all-to-all along mesh axis 1 ->
-[[3, 2], [1], [0]]   -> all-to-all along mesh axis 2 ->
-[[3], [1, 2], [0]]   -> all-gather along mesh axis 3 ->
-[[], [1, 2], [0]]    -> all-to-all along mesh axis 0 ->
-[[0], [1, 2], []]


