[Mlir-commits] [lldb] [compiler-rt] [mlir] [llvm] Changes to support running tests for Windows arm64 asan (PR #66973)

Farzon Lotfi llvmlistbot at llvm.org
Mon Nov 13 11:04:08 PST 2023


https://github.com/farzonl updated https://github.com/llvm/llvm-project/pull/66973

>From 85e5f76f0e9ccf4c3ed77eac0aaa3de944091c2c Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzon at farzon.org>
Date: Wed, 20 Sep 2023 22:58:08 -0400
Subject: [PATCH 01/13] Changes to support running tests for Windows arm64 asan
 1. Differentiate SANITIZER_WINDOWS64 for x64 and arm64. 2. Fix a warning where
 asserts need messages. 3. Turn off interception tests that expect x86 assembly.

---
 compiler-rt/lib/interception/interception_win.cpp         | 6 +++---
 .../lib/interception/tests/interception_win_test.cpp      | 4 +++-
 compiler-rt/lib/sanitizer_common/sanitizer_platform.h     | 8 ++++++++
 3 files changed, 14 insertions(+), 4 deletions(-)
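
For context, a minimal sketch (not part of the patch) of how the new
SANITIZER_WINDOWS_ARM64 / SANITIZER_WINDOWS_x64 macros introduced below are
meant to gate architecture-specific code. The helper name and include path
here are hypothetical, chosen only to illustrate the intended usage:

  // Hypothetical helper; include path assumed.
  #include "sanitizer_common/sanitizer_platform.h"

  static size_t GetUnconditionalJumpSize() {
  #if SANITIZER_WINDOWS_ARM64
    // AArch64 "B <imm26>" is a single fixed-width 4-byte instruction.
    return 4;
  #elif SANITIZER_WINDOWS_x64
    // x86_64 "jmp rel32" (E9 XX XX XX XX) is 5 bytes.
    return 5;
  #else
    // 32-bit x86 uses the same 5-byte rel32 jump.
    return 5;
  #endif
  }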

diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp
index 1b681ada37b170d..0a0e03ba63e5b59 100644
--- a/compiler-rt/lib/interception/interception_win.cpp
+++ b/compiler-rt/lib/interception/interception_win.cpp
@@ -1,4 +1,4 @@
-//===-- interception_win.cpp ------------------------------------*- C++ -*-===//
+//===-- interception_win.cpp ------------------------------------*- C++ -*-===// 
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -462,7 +462,7 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) {
   return 4;
 #endif
 
-#if SANITIZER_WINDOWS64
+#if SANITIZER_WINDOWS_x64
   if (memcmp((u8*)address, kPrologueWithShortJump1,
              sizeof(kPrologueWithShortJump1)) == 0 ||
       memcmp((u8*)address, kPrologueWithShortJump2,
@@ -544,7 +544,7 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) {
       return 7;
   }
 
-#if SANITIZER_WINDOWS64
+#if SANITIZER_WINDOWS_x64
   switch (*(u8*)address) {
     case 0xA1:  // A1 XX XX XX XX XX XX XX XX :
                 //   movabs eax, dword ptr ds:[XXXXXXXX]
diff --git a/compiler-rt/lib/interception/tests/interception_win_test.cpp b/compiler-rt/lib/interception/tests/interception_win_test.cpp
index 9159ce405f2dc49..7dca93556527e7b 100644
--- a/compiler-rt/lib/interception/tests/interception_win_test.cpp
+++ b/compiler-rt/lib/interception/tests/interception_win_test.cpp
@@ -1,4 +1,4 @@
-//===-- interception_win_test.cpp -----------------------------------------===//
+//===-- interception_win_test.cpp -----------------------------------------===// 
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -17,6 +17,7 @@
 // Too slow for debug build
 #if !SANITIZER_DEBUG
 #if SANITIZER_WINDOWS
+#if !SANITIZER_WINDOWS_ARM64
 
 #include <stdarg.h>
 
@@ -793,5 +794,6 @@ TEST(Interception, EmptyExportTable) {
 
 }  // namespace __interception
 
+#endif   // !SANITIZER_WINDOWS_ARM64
 #endif  // SANITIZER_WINDOWS
 #endif  // #if !SANITIZER_DEBUG
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
index 3e1b078a0212f5e..6af3051ac5aff3f 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
@@ -260,6 +260,14 @@
 #  define SANITIZER_ARM64 0
 #endif
 
+#if SANITIZER_WINDOWS64 && SANITIZER_ARM64 
+#  define SANITIZER_WINDOWS_ARM64 1
+#  define SANITIZER_WINDOWS_x64 0
+#else
+#  define SANITIZER_WINDOWS_ARM64 0
+#  define SANITIZER_WINDOWS_x64 1
+#endif
+
 #if SANITIZER_SOLARIS && SANITIZER_WORDSIZE == 32
 #  define SANITIZER_SOLARIS32 1
 #else

>From fee3661104d31f4ff426397f1cb183b7aeaa1f27 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzon at farzon.org>
Date: Mon, 13 Nov 2023 12:38:39 -0500
Subject: [PATCH 02/13] add comments, run git clang-format.

---
 compiler-rt/lib/interception/interception_win.cpp        | 9 +++++++--
 .../lib/interception/tests/interception_win_test.cpp     | 4 +++-
 compiler-rt/lib/sanitizer_common/sanitizer_platform.h    | 2 +-
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp
index 0a0e03ba63e5b59..8c002c06539b7f1 100644
--- a/compiler-rt/lib/interception/interception_win.cpp
+++ b/compiler-rt/lib/interception/interception_win.cpp
@@ -1,4 +1,4 @@
-//===-- interception_win.cpp ------------------------------------*- C++ -*-===// 
+//===-- interception_win.cpp ------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -431,7 +431,8 @@ static uptr AllocateMemoryForTrampoline(uptr image_address, size_t size) {
 // The following prologues cannot be patched because of the short jump
 // jumping to the patching region.
 
-#if SANITIZER_WINDOWS64
+// Note: The jump byte array below is x86 assembly
+#if SANITIZER_WINDOWS_x64
 // ntdll!wcslen in Win11
 //   488bc1          mov     rax,rcx
 //   0fb710          movzx   edx,word ptr [rax]
@@ -442,6 +443,7 @@ static const u8 kPrologueWithShortJump1[] = {
     0x48, 0x8b, 0xc1, 0x0f, 0xb7, 0x10, 0x48, 0x83,
     0xc0, 0x02, 0x66, 0x85, 0xd2, 0x75, 0xf4,
 };
+#endif
 
 // ntdll!strrchr in Win11
 //   4c8bc1          mov     r8,rcx
@@ -462,6 +464,9 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) {
   return 4;
 #endif
 
+// Note: kPrologueWithShortJump1 and kPrologueWithShortJump2 are x86_64 assembly
+//  Adding the preprocessor check here because the variable declaration
+//  are not enabled if building for arm64.
 #if SANITIZER_WINDOWS_x64
   if (memcmp((u8*)address, kPrologueWithShortJump1,
              sizeof(kPrologueWithShortJump1)) == 0 ||
diff --git a/compiler-rt/lib/interception/tests/interception_win_test.cpp b/compiler-rt/lib/interception/tests/interception_win_test.cpp
index 7dca93556527e7b..a4b74da08a7cf4d 100644
--- a/compiler-rt/lib/interception/tests/interception_win_test.cpp
+++ b/compiler-rt/lib/interception/tests/interception_win_test.cpp
@@ -1,4 +1,4 @@
-//===-- interception_win_test.cpp -----------------------------------------===// 
+//===-- interception_win_test.cpp -----------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -17,6 +17,8 @@
 // Too slow for debug build
 #if !SANITIZER_DEBUG
 #if SANITIZER_WINDOWS
+// Note: Disabling these tests for arm64 since
+// the only assembly assumed is x86 and x86_64.
 #if !SANITIZER_WINDOWS_ARM64
 
 #include <stdarg.h>
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
index 6af3051ac5aff3f..49d8a67cc12db3f 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
@@ -260,7 +260,7 @@
 #  define SANITIZER_ARM64 0
 #endif
 
-#if SANITIZER_WINDOWS64 && SANITIZER_ARM64 
+#if SANITIZER_WINDOWS64 && SANITIZER_ARM64
 #  define SANITIZER_WINDOWS_ARM64 1
 #  define SANITIZER_WINDOWS_x64 0
 #else

>From 6c44ae03241c72657d77d8d12ca535df9887c521 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzon at farzon.com>
Date: Mon, 13 Nov 2023 13:20:53 -0500
Subject: [PATCH 03/13] fix build

---
 compiler-rt/lib/interception/interception_win.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp
index 8c002c06539b7f1..377811cce2e0f40 100644
--- a/compiler-rt/lib/interception/interception_win.cpp
+++ b/compiler-rt/lib/interception/interception_win.cpp
@@ -443,7 +443,6 @@ static const u8 kPrologueWithShortJump1[] = {
     0x48, 0x8b, 0xc1, 0x0f, 0xb7, 0x10, 0x48, 0x83,
     0xc0, 0x02, 0x66, 0x85, 0xd2, 0x75, 0xf4,
 };
-#endif
 
 // ntdll!strrchr in Win11
 //   4c8bc1          mov     r8,rcx

>From 4ab4bb88b11103815aa499e9e4a762739bf12b67 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzon at farzon.com>
Date: Mon, 13 Nov 2023 13:41:49 -0500
Subject: [PATCH 04/13] fix formatting

---
 compiler-rt/lib/interception/interception_win.cpp      | 10 ++++++----
 .../lib/interception/tests/interception_win_test.cpp   |  2 ++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp
index 377811cce2e0f40..60bef3e8dfb644b 100644
--- a/compiler-rt/lib/interception/interception_win.cpp
+++ b/compiler-rt/lib/interception/interception_win.cpp
@@ -431,7 +431,8 @@ static uptr AllocateMemoryForTrampoline(uptr image_address, size_t size) {
 // The following prologues cannot be patched because of the short jump
 // jumping to the patching region.
 
-// Note: The jump byte array below is x86 assembly
+// Note: The jump byte array below is x86_64 assembly.
+
 #if SANITIZER_WINDOWS_x64
 // ntdll!wcslen in Win11
 //   488bc1          mov     rax,rcx
@@ -463,9 +464,10 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) {
   return 4;
 #endif
 
-// Note: kPrologueWithShortJump1 and kPrologueWithShortJump2 are x86_64 assembly
-//  Adding the preprocessor check here because the variable declaration
-//  are not enabled if building for arm64.
+  // Note: kPrologueWithShortJump1 and kPrologueWithShortJump2 are
+  // x86_64 assembly. Adding the preprocessor check here because the
+  // variable declaration are not enabled if building for arm64.
+
 #if SANITIZER_WINDOWS_x64
   if (memcmp((u8*)address, kPrologueWithShortJump1,
              sizeof(kPrologueWithShortJump1)) == 0 ||
diff --git a/compiler-rt/lib/interception/tests/interception_win_test.cpp b/compiler-rt/lib/interception/tests/interception_win_test.cpp
index a4b74da08a7cf4d..9d93587690542b8 100644
--- a/compiler-rt/lib/interception/tests/interception_win_test.cpp
+++ b/compiler-rt/lib/interception/tests/interception_win_test.cpp
@@ -17,8 +17,10 @@
 // Too slow for debug build
 #if !SANITIZER_DEBUG
 #if SANITIZER_WINDOWS
+
 // Note: Disabling these tests for arm64 since
 // the only assembly assumed is x86 and x86_64.
+
 #if !SANITIZER_WINDOWS_ARM64
 
 #include <stdarg.h>

>From 77450d8637fe5abb01658f6b910cb33153bd0505 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzon at farzon.com>
Date: Mon, 13 Nov 2023 13:53:37 -0500
Subject: [PATCH 05/13] add indent for preprocessor

---
 compiler-rt/lib/interception/interception_win.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp
index 60bef3e8dfb644b..4c6ba12f6c93d8f 100644
--- a/compiler-rt/lib/interception/interception_win.cpp
+++ b/compiler-rt/lib/interception/interception_win.cpp
@@ -433,7 +433,7 @@ static uptr AllocateMemoryForTrampoline(uptr image_address, size_t size) {
 
 // Note: The jump byte array below is x86_64 assembly.
 
-#if SANITIZER_WINDOWS_x64
+#  if SANITIZER_WINDOWS_x64
 // ntdll!wcslen in Win11
 //   488bc1          mov     rax,rcx
 //   0fb710          movzx   edx,word ptr [rax]
@@ -468,7 +468,7 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) {
   // x86_64 assembly. Adding the preprocessor check here because the
   // variable declaration are not enabled if building for arm64.
 
-#if SANITIZER_WINDOWS_x64
+#  if SANITIZER_WINDOWS_x64
   if (memcmp((u8*)address, kPrologueWithShortJump1,
              sizeof(kPrologueWithShortJump1)) == 0 ||
       memcmp((u8*)address, kPrologueWithShortJump2,

>From 14af38c0d0930fd344f9fa4f835bbb120fd15b04 Mon Sep 17 00:00:00 2001
From: Aart Bik <39774503+aartbik at users.noreply.github.com>
Date: Mon, 13 Nov 2023 10:05:00 -0800
Subject: [PATCH 06/13] [mlir][sparse] unify support of (dis)assemble between
 direct IR/lib path (#71880)

Note that the (dis)assemble operations still make some simplifying
assumptions (e.g. trailing 2-D COO in AoS format), but now at least the
direct IR and support library paths behave exactly the same.

Generalizing the ops is still TBD.
---
 .../SparseTensor/IR/SparseTensorType.h        |   6 +
 .../ExecutionEngine/SparseTensor/Storage.h    |  65 +++--
 .../SparseTensor/Transforms/CodegenUtils.cpp  |  11 +
 .../SparseTensor/Transforms/CodegenUtils.h    |   4 +
 .../Transforms/SparseTensorCodegen.cpp        |  13 -
 .../Transforms/SparseTensorConversion.cpp     | 222 +++++++++++++-----
 .../Transforms/SparseTensorPasses.cpp         |   3 +-
 .../Dialect/SparseTensor/CPU/sparse_pack.mlir | 116 ++++++---
 .../SparseTensor/CPU/sparse_pack_libgen.mlir  | 165 -------------
 9 files changed, 310 insertions(+), 295 deletions(-)
 delete mode 100644 mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_libgen.mlir
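
The central data-layout detail behind both the runtime change in Storage.h and
the new disassemble lowering is the AoS-vs-SoA mismatch for trailing COO
coordinates: clients pass the two trailing coordinate levels interleaved as
[x0, y0, x1, y1, ...], while SparseTensorStorage keeps one coordinate vector
per level. A minimal standalone sketch of that unpacking step (the function
name and element types are illustrative, not the library API):

  #include <cstdint>
  #include <vector>

  // Split an AoS coordinate buffer for a trailing 2-D COO region,
  // [x0, y0, x1, y1, ...], into the two per-level (SoA) vectors.
  void unpackAoSCoordinates(const uint64_t *aos, uint64_t nnz,
                            std::vector<uint64_t> &crd0,
                            std::vector<uint64_t> &crd1) {
    crd0.resize(nnz);
    crd1.resize(nnz);
    for (uint64_t n = 0; n < nnz; n++) {
      crd0[n] = aos[2 * n];     // level-0 coordinate of entry n
      crd1[n] = aos[2 * n + 1]; // level-1 coordinate of entry n
    }
  }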

diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h
index e808057cf6b0a67..a97c185c12e67d3 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h
@@ -295,9 +295,15 @@ class SparseTensorType {
   // `getLvlType` method instead of STEA's.
   bool isDenseLvl(Level l) const { return isDenseDLT(getLvlType(l)); }
   bool isCompressedLvl(Level l) const { return isCompressedDLT(getLvlType(l)); }
+  bool isLooseCompressedLvl(Level l) const {
+    return isLooseCompressedDLT(getLvlType(l));
+  }
   bool isSingletonLvl(Level l) const { return isSingletonDLT(getLvlType(l)); }
+  bool is2OutOf4Lvl(Level l) const { return is2OutOf4DLT(getLvlType(l)); }
   bool isOrderedLvl(Level l) const { return isOrderedDLT(getLvlType(l)); }
   bool isUniqueLvl(Level l) const { return isUniqueDLT(getLvlType(l)); }
+  bool isWithPos(Level l) const { return isDLTWithPos(getLvlType(l)); }
+  bool isWithCrd(Level l) const { return isDLTWithCrd(getLvlType(l)); }
 
   /// Returns the coordinate-overhead bitwidth, defaulting to zero.
   unsigned getCrdWidth() const { return enc ? enc.getCrdWidth() : 0; }
diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
index 460549726356370..1ee5d025f6426f2 100644
--- a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
@@ -301,8 +301,8 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
       uint64_t lvlRank = getLvlRank();
       uint64_t valIdx = 0;
       // Linearize the address.
-      for (uint64_t lvl = 0; lvl < lvlRank; lvl++)
-        valIdx = valIdx * getLvlSize(lvl) + lvlCoords[lvl];
+      for (uint64_t l = 0; l < lvlRank; l++)
+        valIdx = valIdx * getLvlSize(l) + lvlCoords[l];
       values[valIdx] = val;
       return;
     }
@@ -472,9 +472,10 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
   uint64_t assembledSize(uint64_t parentSz, uint64_t l) const {
     if (isCompressedLvl(l))
       return positions[l][parentSz];
-    if (isSingletonLvl(l))
-      return parentSz; // New size is same as the parent.
-    // TODO: support levels assignment for loose/2:4?
+    if (isLooseCompressedLvl(l))
+      return positions[l][2 * parentSz - 1];
+    if (isSingletonLvl(l) || is2OutOf4Lvl(l))
+      return parentSz; // new size same as the parent
     assert(isDenseLvl(l));
     return parentSz * getLvlSize(l);
   }
@@ -766,40 +767,59 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
     const uint64_t *dim2lvl, const uint64_t *lvl2dim, const intptr_t *lvlBufs)
     : SparseTensorStorage(dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes,
                           dim2lvl, lvl2dim) {
+  // Note that none of the buffers can be reused because ownership
+  // of the memory passed from clients is not necessarily transferred.
+  // Therefore, all data is copied over into a new SparseTensorStorage.
+  //
+  // TODO: this needs to be generalized to all formats AND
+  //       we need a proper audit of e.g. double compressed
+  //       levels where some are not filled
+  //
   uint64_t trailCOOLen = 0, parentSz = 1, bufIdx = 0;
   for (uint64_t l = 0; l < lvlRank; l++) {
-    if (!isUniqueLvl(l) && isCompressedLvl(l)) {
-      // A `compressed_nu` level marks the start of trailing COO start level.
-      // Since the coordinate buffer used for trailing COO are passed in as AoS
-      // scheme, and SparseTensorStorage uses a SoA scheme, we can not simply
-      // copy the value from the provided buffers.
+    if (!isUniqueLvl(l) && (isCompressedLvl(l) || isLooseCompressedLvl(l))) {
+      // A `(loose)compressed_nu` level marks the start of trailing COO
+      // start level. Since the coordinate buffer used for trailing COO
+      // is passed in as AoS scheme and SparseTensorStorage uses a SoA
+      // scheme, we cannot simply copy the value from the provided buffers.
       trailCOOLen = lvlRank - l;
       break;
     }
-    assert(!isSingletonLvl(l) &&
-           "Singleton level not following a compressed_nu level");
-    if (isCompressedLvl(l)) {
+    if (isCompressedLvl(l) || isLooseCompressedLvl(l)) {
       P *posPtr = reinterpret_cast<P *>(lvlBufs[bufIdx++]);
       C *crdPtr = reinterpret_cast<C *>(lvlBufs[bufIdx++]);
-      // Copies the lvlBuf into the vectors. The buffer can not be simply reused
-      // because the memory passed from users is not necessarily allocated on
-      // heap.
-      positions[l].assign(posPtr, posPtr + parentSz + 1);
-      coordinates[l].assign(crdPtr, crdPtr + positions[l][parentSz]);
+      if (isLooseCompressedLvl(l)) {
+        positions[l].assign(posPtr, posPtr + 2 * parentSz);
+        coordinates[l].assign(crdPtr, crdPtr + positions[l][2 * parentSz - 1]);
+      } else {
+        positions[l].assign(posPtr, posPtr + parentSz + 1);
+        coordinates[l].assign(crdPtr, crdPtr + positions[l][parentSz]);
+      }
+    } else if (isSingletonLvl(l)) {
+      assert(0 && "general singleton not supported yet");
+    } else if (is2OutOf4Lvl(l)) {
+      assert(0 && "2Out4 not supported yet");
     } else {
-      // TODO: support levels assignment for loose/2:4?
       assert(isDenseLvl(l));
     }
     parentSz = assembledSize(parentSz, l);
   }
 
+  // Handle Aos vs. SoA mismatch for COO.
   if (trailCOOLen != 0) {
     uint64_t cooStartLvl = lvlRank - trailCOOLen;
-    assert(!isUniqueLvl(cooStartLvl) && isCompressedLvl(cooStartLvl));
+    assert(!isUniqueLvl(cooStartLvl) &&
+           (isCompressedLvl(cooStartLvl) || isLooseCompressedLvl(cooStartLvl)));
     P *posPtr = reinterpret_cast<P *>(lvlBufs[bufIdx++]);
     C *aosCrdPtr = reinterpret_cast<C *>(lvlBufs[bufIdx++]);
-    positions[cooStartLvl].assign(posPtr, posPtr + parentSz + 1);
-    P crdLen = positions[cooStartLvl][parentSz];
+    P crdLen;
+    if (isLooseCompressedLvl(cooStartLvl)) {
+      positions[cooStartLvl].assign(posPtr, posPtr + 2 * parentSz);
+      crdLen = positions[cooStartLvl][2 * parentSz - 1];
+    } else {
+      positions[cooStartLvl].assign(posPtr, posPtr + parentSz + 1);
+      crdLen = positions[cooStartLvl][parentSz];
+    }
     for (uint64_t l = cooStartLvl; l < lvlRank; l++) {
       coordinates[l].resize(crdLen);
       for (uint64_t n = 0; n < crdLen; n++) {
@@ -809,6 +829,7 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
     parentSz = assembledSize(parentSz, cooStartLvl);
   }
 
+  // Copy the values buffer.
   V *valPtr = reinterpret_cast<V *>(lvlBufs[bufIdx]);
   values.assign(valPtr, valPtr + parentSz);
 }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
index d5c9ee41215ae97..8e2c2cd6dad7b19 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
@@ -163,6 +163,17 @@ Value sparse_tensor::genCast(OpBuilder &builder, Location loc, Value value,
   return mlir::convertScalarToDtype(builder, loc, value, dstTp, isUnsignedCast);
 }
 
+Value sparse_tensor::genScalarToTensor(OpBuilder &builder, Location loc,
+                                       Value elem, Type dstTp) {
+  if (auto rtp = dstTp.dyn_cast<RankedTensorType>()) {
+    // Scalars can only be converted to 0-ranked tensors.
+    assert(rtp.getRank() == 0);
+    elem = sparse_tensor::genCast(builder, loc, elem, rtp.getElementType());
+    return builder.create<tensor::FromElementsOp>(loc, rtp, elem);
+  }
+  return sparse_tensor::genCast(builder, loc, elem, dstTp);
+}
+
 Value sparse_tensor::genIndexLoad(OpBuilder &builder, Location loc, Value mem,
                                   Value s) {
   Value load = builder.create<memref::LoadOp>(loc, mem, s);
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
index 1f53f3525203c70..d3b0889b71b514c 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
@@ -142,6 +142,10 @@ class FuncCallOrInlineGenerator {
 /// Add type casting between arith and index types when needed.
 Value genCast(OpBuilder &builder, Location loc, Value value, Type dstTy);
 
+/// Add conversion from scalar to given type (possibly a 0-rank tensor).
+Value genScalarToTensor(OpBuilder &builder, Location loc, Value elem,
+                        Type dstTp);
+
 /// Generates a pointer/index load from the sparse storage scheme. Narrower
 /// data types need to be zero extended before casting the value into the
 /// index type used for looping and indexing.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
index 08c38394a46343a..888f513be2e4dc7 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
@@ -435,19 +435,6 @@ static ReassociationIndices getReassociationForFlattening(ShapedType srcTp) {
   return reassociation;
 }
 
-/// Generates scalar to tensor cast.
-static Value genScalarToTensor(OpBuilder &builder, Location loc, Value elem,
-                               Type dstTp) {
-  if (auto rtp = dstTp.dyn_cast<RankedTensorType>()) {
-    // Scalars can only be converted to 0-ranked tensors.
-    if (rtp.getRank() != 0)
-      return nullptr;
-    elem = genCast(builder, loc, elem, rtp.getElementType());
-    return builder.create<tensor::FromElementsOp>(loc, rtp, elem);
-  }
-  return genCast(builder, loc, elem, dstTp);
-}
-
 //===----------------------------------------------------------------------===//
 // Codegen rules.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
index 4fe9c59d8c320a7..e629133171e15dc 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
@@ -46,17 +46,6 @@ static std::optional<Type> convertSparseTensorTypes(Type type) {
   return std::nullopt;
 }
 
-/// Replaces the `op` with a `CallOp` to the `getFunc()` function reference.
-static func::CallOp replaceOpWithFuncCall(RewriterBase &rewriter, Operation *op,
-                                          StringRef name, TypeRange resultType,
-                                          ValueRange operands,
-                                          EmitCInterface emitCInterface) {
-  auto fn = getFunc(op->getParentOfType<ModuleOp>(), name, resultType, operands,
-                    emitCInterface);
-  return rewriter.replaceOpWithNewOp<func::CallOp>(op, resultType, fn,
-                                                   operands);
-}
-
 /// Generates call to lookup a level-size.  N.B., this only generates
 /// the raw function call, and therefore (intentionally) does not perform
 /// any dim<->lvl conversion or other logic.
@@ -264,11 +253,36 @@ class NewCallParams final {
 };
 
 /// Generates a call to obtain the values array.
-static Value genValuesCall(OpBuilder &builder, Location loc, ShapedType tp,
-                           ValueRange ptr) {
-  SmallString<15> name{"sparseValues",
-                       primaryTypeFunctionSuffix(tp.getElementType())};
-  return createFuncCall(builder, loc, name, tp, ptr, EmitCInterface::On)
+static Value genValuesCall(OpBuilder &builder, Location loc,
+                           SparseTensorType stt, Value ptr) {
+  auto eltTp = stt.getElementType();
+  auto resTp = MemRefType::get({ShapedType::kDynamic}, eltTp);
+  SmallString<15> name{"sparseValues", primaryTypeFunctionSuffix(eltTp)};
+  return createFuncCall(builder, loc, name, resTp, {ptr}, EmitCInterface::On)
+      .getResult(0);
+}
+
+/// Generates a call to obtain the positions array.
+static Value genPositionsCall(OpBuilder &builder, Location loc,
+                              SparseTensorType stt, Value ptr, Level l) {
+  Type posTp = stt.getPosType();
+  auto resTp = MemRefType::get({ShapedType::kDynamic}, posTp);
+  Value lvl = constantIndex(builder, loc, l);
+  SmallString<17> name{"sparsePositions", overheadTypeFunctionSuffix(posTp)};
+  return createFuncCall(builder, loc, name, resTp, {ptr, lvl},
+                        EmitCInterface::On)
+      .getResult(0);
+}
+
+/// Generates a call to obtain the coordindates array.
+static Value genCoordinatesCall(OpBuilder &builder, Location loc,
+                                SparseTensorType stt, Value ptr, Level l) {
+  Type crdTp = stt.getCrdType();
+  auto resTp = MemRefType::get({ShapedType::kDynamic}, crdTp);
+  Value lvl = constantIndex(builder, loc, l);
+  SmallString<19> name{"sparseCoordinates", overheadTypeFunctionSuffix(crdTp)};
+  return createFuncCall(builder, loc, name, resTp, {ptr, lvl},
+                        EmitCInterface::On)
       .getResult(0);
 }
 
@@ -391,7 +405,7 @@ class SparseTensorAllocConverter
     SmallVector<Value> dimSizes;
     dimSizes.reserve(dimRank);
     unsigned operandCtr = 0;
-    for (Dimension d = 0; d < dimRank; ++d) {
+    for (Dimension d = 0; d < dimRank; d++) {
       dimSizes.push_back(
           stt.isDynamicDim(d)
               ? adaptor.getOperands()[operandCtr++]
@@ -423,7 +437,7 @@ class SparseTensorEmptyConverter : public OpConversionPattern<tensor::EmptyOp> {
     dimSizes.reserve(dimRank);
     auto shape = op.getType().getShape();
     unsigned operandCtr = 0;
-    for (Dimension d = 0; d < dimRank; ++d) {
+    for (Dimension d = 0; d < dimRank; d++) {
       dimSizes.push_back(stt.isDynamicDim(d)
                              ? adaptor.getOperands()[operandCtr++]
                              : constantIndex(rewriter, loc, shape[d]));
@@ -487,12 +501,10 @@ class SparseTensorToPositionsConverter
   LogicalResult
   matchAndRewrite(ToPositionsOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    Type resTp = op.getType();
-    Type posTp = cast<ShapedType>(resTp).getElementType();
-    SmallString<17> name{"sparsePositions", overheadTypeFunctionSuffix(posTp)};
-    Value lvl = constantIndex(rewriter, op->getLoc(), op.getLevel());
-    replaceOpWithFuncCall(rewriter, op, name, resTp, {adaptor.getTensor(), lvl},
-                          EmitCInterface::On);
+    auto stt = getSparseTensorType(op.getTensor());
+    auto poss = genPositionsCall(rewriter, op.getLoc(), stt,
+                                 adaptor.getTensor(), op.getLevel());
+    rewriter.replaceOp(op, poss);
     return success();
   }
 };
@@ -505,29 +517,14 @@ class SparseTensorToCoordinatesConverter
   LogicalResult
   matchAndRewrite(ToCoordinatesOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    // TODO: use `SparseTensorType::getCrdType` instead.
-    Type resType = op.getType();
-    const Type crdTp = cast<ShapedType>(resType).getElementType();
-    SmallString<19> name{"sparseCoordinates",
-                         overheadTypeFunctionSuffix(crdTp)};
-    Location loc = op->getLoc();
-    Value lvl = constantIndex(rewriter, loc, op.getLevel());
-
-    // The function returns a MemRef without a layout.
-    MemRefType callRetType = get1DMemRefType(crdTp, false);
-    SmallVector<Value> operands{adaptor.getTensor(), lvl};
-    auto fn = getFunc(op->getParentOfType<ModuleOp>(), name, callRetType,
-                      operands, EmitCInterface::On);
-    Value callRet =
-        rewriter.create<func::CallOp>(loc, callRetType, fn, operands)
-            .getResult(0);
-
+    auto stt = getSparseTensorType(op.getTensor());
+    auto crds = genCoordinatesCall(rewriter, op.getLoc(), stt,
+                                   adaptor.getTensor(), op.getLevel());
     // Cast the MemRef type to the type expected by the users, though these
     // two types should be compatible at runtime.
-    if (resType != callRetType)
-      callRet = rewriter.create<memref::CastOp>(loc, resType, callRet);
-    rewriter.replaceOp(op, callRet);
-
+    if (op.getType() != crds.getType())
+      crds = rewriter.create<memref::CastOp>(op.getLoc(), op.getType(), crds);
+    rewriter.replaceOp(op, crds);
     return success();
   }
 };
@@ -539,9 +536,9 @@ class SparseTensorToValuesConverter : public OpConversionPattern<ToValuesOp> {
   LogicalResult
   matchAndRewrite(ToValuesOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    auto resType = cast<ShapedType>(op.getType());
-    rewriter.replaceOp(op, genValuesCall(rewriter, op.getLoc(), resType,
-                                         adaptor.getOperands()));
+    auto stt = getSparseTensorType(op.getTensor());
+    auto vals = genValuesCall(rewriter, op.getLoc(), stt, adaptor.getTensor());
+    rewriter.replaceOp(op, vals);
     return success();
   }
 };
@@ -554,13 +551,11 @@ class SparseNumberOfEntriesConverter
   LogicalResult
   matchAndRewrite(NumberOfEntriesOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    Location loc = op.getLoc();
     // Query values array size for the actually stored values size.
-    Type eltType = cast<ShapedType>(op.getTensor().getType()).getElementType();
-    auto resTp = MemRefType::get({ShapedType::kDynamic}, eltType);
-    Value values = genValuesCall(rewriter, loc, resTp, adaptor.getOperands());
-    rewriter.replaceOpWithNewOp<memref::DimOp>(op, values,
-                                               constantIndex(rewriter, loc, 0));
+    auto stt = getSparseTensorType(op.getTensor());
+    auto vals = genValuesCall(rewriter, op.getLoc(), stt, adaptor.getTensor());
+    auto zero = constantIndex(rewriter, op.getLoc(), 0);
+    rewriter.replaceOpWithNewOp<memref::DimOp>(op, vals, zero);
     return success();
   }
 };
@@ -701,7 +696,7 @@ class SparseTensorCompressConverter : public OpConversionPattern<CompressOp> {
   }
 };
 
-/// Sparse conversion rule for the sparse_tensor.pack operator.
+/// Sparse conversion rule for the sparse_tensor.assemble operator.
 class SparseTensorAssembleConverter : public OpConversionPattern<AssembleOp> {
 public:
   using OpConversionPattern::OpConversionPattern;
@@ -710,9 +705,12 @@ class SparseTensorAssembleConverter : public OpConversionPattern<AssembleOp> {
                   ConversionPatternRewriter &rewriter) const override {
     const Location loc = op->getLoc();
     const auto dstTp = getSparseTensorType(op.getResult());
-    // AssembleOps always returns a static shaped tensor result.
     assert(dstTp.hasStaticDimShape());
     SmallVector<Value> dimSizes = getDimSizes(rewriter, loc, dstTp);
+    // Use a library method to transfer the external buffers from
+    // clients to the internal SparseTensorStorage. Since we cannot
+    // assume clients transfer ownership of the buffers, this method
+    // will copy all data over into a new SparseTensorStorage.
     Value dst =
         NewCallParams(rewriter, loc)
             .genBuffers(dstTp.withoutDimToLvl(), dimSizes)
@@ -724,6 +722,115 @@ class SparseTensorAssembleConverter : public OpConversionPattern<AssembleOp> {
   }
 };
 
+/// Sparse conversion rule for the sparse_tensor.disassemble operator.
+class SparseTensorDisassembleConverter
+    : public OpConversionPattern<DisassembleOp> {
+public:
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(DisassembleOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // We simply expose the buffers to the external client. This
+    // assumes the client only reads the buffers (usually copying it
+    // to the external data structures, such as numpy arrays).
+    Location loc = op->getLoc();
+    auto stt = getSparseTensorType(op.getTensor());
+    SmallVector<Value> retVal;
+    SmallVector<Value> retLen;
+    // Get the values buffer first.
+    auto vals = genValuesCall(rewriter, loc, stt, adaptor.getTensor());
+    auto valLenTp = op.getValLen().getType();
+    auto valLen = linalg::createOrFoldDimOp(rewriter, loc, vals, 0);
+    retVal.push_back(vals);
+    retLen.push_back(genScalarToTensor(rewriter, loc, valLen, valLenTp));
+    // Then get the positions and coordinates buffers.
+    const Level lvlRank = stt.getLvlRank();
+    Level trailCOOLen = 0;
+    for (Level l = 0; l < lvlRank; l++) {
+      if (!stt.isUniqueLvl(l) &&
+          (stt.isCompressedLvl(l) || stt.isLooseCompressedLvl(l))) {
+        // A `(loose)compressed_nu` level marks the start of trailing COO
+        // start level. Since the target coordinate buffer used for trailing
+        // COO is passed in as AoS scheme and SparseTensorStorage uses a SoA
+        // scheme, we cannot simply use the internal buffers.
+        trailCOOLen = lvlRank - l;
+        break;
+      }
+      if (stt.isWithPos(l)) {
+        auto poss =
+            genPositionsCall(rewriter, loc, stt, adaptor.getTensor(), l);
+        auto posLen = linalg::createOrFoldDimOp(rewriter, loc, poss, 0);
+        auto posLenTp = op.getLvlLens().getTypes()[retLen.size() - 1];
+        retVal.push_back(poss);
+        retLen.push_back(genScalarToTensor(rewriter, loc, posLen, posLenTp));
+      }
+      if (stt.isWithCrd(l)) {
+        auto crds =
+            genCoordinatesCall(rewriter, loc, stt, adaptor.getTensor(), l);
+        auto crdLen = linalg::createOrFoldDimOp(rewriter, loc, crds, 0);
+        auto crdLenTp = op.getLvlLens().getTypes()[retLen.size() - 1];
+        retVal.push_back(crds);
+        retLen.push_back(genScalarToTensor(rewriter, loc, crdLen, crdLenTp));
+      }
+    }
+    // Handle AoS vs. SoA mismatch for COO.
+    if (trailCOOLen != 0) {
+      uint64_t cooStartLvl = lvlRank - trailCOOLen;
+      assert(!stt.isUniqueLvl(cooStartLvl) &&
+             (stt.isCompressedLvl(cooStartLvl) ||
+              stt.isLooseCompressedLvl(cooStartLvl)));
+      // Positions.
+      auto poss = genPositionsCall(rewriter, loc, stt, adaptor.getTensor(),
+                                   cooStartLvl);
+      auto posLen = linalg::createOrFoldDimOp(rewriter, loc, poss, 0);
+      auto posLenTp = op.getLvlLens().getTypes()[retLen.size() - 1];
+      retVal.push_back(poss);
+      retLen.push_back(genScalarToTensor(rewriter, loc, posLen, posLenTp));
+      // Coordinates, copied over with:
+      //    for (i = 0; i < crdLen; i++)
+      //       buf[i][0] = crd0[i]; buf[i][1] = crd1[i];
+      auto buf =
+          genToMemref(rewriter, loc, op.getOutLevels()[retLen.size() - 1]);
+      auto crds0 = genCoordinatesCall(rewriter, loc, stt, adaptor.getTensor(),
+                                      cooStartLvl);
+      auto crds1 = genCoordinatesCall(rewriter, loc, stt, adaptor.getTensor(),
+                                      cooStartLvl + 1);
+      auto crdLen = linalg::createOrFoldDimOp(rewriter, loc, crds0, 0);
+      auto two = constantIndex(rewriter, loc, 2);
+      auto bufLen = rewriter.create<arith::MulIOp>(loc, crdLen, two);
+      Type indexType = rewriter.getIndexType();
+      auto zero = constantZero(rewriter, loc, indexType);
+      auto one = constantOne(rewriter, loc, indexType);
+      scf::ForOp forOp = rewriter.create<scf::ForOp>(loc, zero, crdLen, one);
+      auto idx = forOp.getInductionVar();
+      rewriter.setInsertionPointToStart(forOp.getBody());
+      auto c0 = rewriter.create<memref::LoadOp>(loc, crds0, idx);
+      auto c1 = rewriter.create<memref::LoadOp>(loc, crds1, idx);
+      SmallVector<Value> args;
+      args.push_back(idx);
+      args.push_back(zero);
+      rewriter.create<memref::StoreOp>(loc, c0, buf, args);
+      args[1] = one;
+      rewriter.create<memref::StoreOp>(loc, c1, buf, args);
+      rewriter.setInsertionPointAfter(forOp);
+      auto bufLenTp = op.getLvlLens().getTypes()[retLen.size() - 1];
+      retVal.push_back(buf);
+      retLen.push_back(genScalarToTensor(rewriter, loc, bufLen, bufLenTp));
+    }
+    // Converts MemRefs back to Tensors.
+    assert(retVal.size() + retLen.size() == op.getNumResults());
+    for (unsigned i = 0, sz = retVal.size(); i < sz; i++) {
+      auto tensor = rewriter.create<bufferization::ToTensorOp>(loc, retVal[i]);
+      retVal[i] =
+          rewriter.create<tensor::CastOp>(loc, op.getResultTypes()[i], tensor);
+    }
+    // Appends the actual memory length used in each buffer returned.
+    retVal.append(retLen.begin(), retLen.end());
+    rewriter.replaceOp(op, retVal);
+    return success();
+  }
+};
+
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -752,5 +859,6 @@ void mlir::populateSparseTensorConversionPatterns(TypeConverter &typeConverter,
            SparseTensorToValuesConverter, SparseNumberOfEntriesConverter,
            SparseTensorLoadConverter, SparseTensorInsertConverter,
            SparseTensorExpandConverter, SparseTensorCompressConverter,
-           SparseTensorAssembleConverter>(typeConverter, patterns.getContext());
+           SparseTensorAssembleConverter, SparseTensorDisassembleConverter>(
+          typeConverter, patterns.getContext());
 }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
index e1cbf3482708ad0..10ebfa7922088a4 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
@@ -198,7 +198,8 @@ struct SparseTensorConversionPass
     // The following operations and dialects may be introduced by the
     // rewriting rules, and are therefore marked as legal.
     target.addLegalOp<complex::ConstantOp, complex::NotEqualOp, linalg::FillOp,
-                      linalg::YieldOp, tensor::ExtractOp>();
+                      linalg::YieldOp, tensor::ExtractOp,
+                      tensor::FromElementsOp>();
     target.addLegalDialect<
         arith::ArithDialect, bufferization::BufferizationDialect,
         LLVM::LLVMDialect, memref::MemRefDialect, scf::SCFDialect>();
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack.mlir
index a2f93614590f106..840e3c97ae28843 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack.mlir
@@ -3,7 +3,7 @@
 //
 // Set-up that's shared across all tests in this directory. In principle, this
 // config could be moved to lit.local.cfg. However, there are downstream users that
-//  do not use these LIT config files. Hence why this is kept inline.
+// do not use these LIT config files. Hence why this is kept inline.
 //
 // DEFINE: %{sparse_compiler_opts} = enable-runtime-library=true
 // DEFINE: %{sparse_compiler_opts_sve} = enable-arm-sve=true %{sparse_compiler_opts}
@@ -17,15 +17,19 @@
 // DEFINE: %{env} =
 //--------------------------------------------------------------------------------------------------
 
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation.
 // REDEFINE: %{sparse_compiler_opts} = enable-runtime-library=false
 // RUN: %{compile} | %{run} | FileCheck %s
 //
-// Do the same run, but now with VLA vectorization.
-// REDEFINE: %{sparse_compiler_opts} = enable-runtime-library=false vl=4
+// Do the same run, but now with direct IR generation and vectorization.
+// REDEFINE: %{sparse_compiler_opts} = enable-runtime-library=false vl=2 reassociate-fp-reductions=true enable-index-optimizations=true
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation and VLA vectorization.
 // RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %}
 
-// TODO: support sparse_tensor.disassemble on libgen path.
-
 #SortedCOO = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton)
 }>
@@ -54,11 +58,13 @@ module {
     %c0 = arith.constant 0 : index
     %f0 = arith.constant 0.0 : f64
     %i0 = arith.constant 0 : i32
+
     //
-    // Initialize a 3-dim dense tensor.
+    // Setup COO.
     //
+
     %data = arith.constant dense<
-       [  1.0,  2.0,  3.0]
+       [ 1.0,  2.0,  3.0 ]
     > : tensor<3xf64>
 
     %pos = arith.constant dense<
@@ -83,12 +89,16 @@ module {
 
     %s4 = sparse_tensor.assemble %data, %pos, %index : tensor<3xf64>, tensor<2xindex>, tensor<3x2xindex>
                                           to tensor<10x10xf64, #SortedCOO>
-    %s5= sparse_tensor.assemble %data, %pos32, %index32 : tensor<3xf64>, tensor<2xi32>, tensor<3x2xi32>
-                                           to tensor<10x10xf64, #SortedCOOI32>
+    %s5 = sparse_tensor.assemble %data, %pos32, %index32 : tensor<3xf64>, tensor<2xi32>, tensor<3x2xi32>
+                                          to tensor<10x10xf64, #SortedCOOI32>
+
+    //
+    // Setup CSR.
+    //
 
     %csr_data = arith.constant dense<
-       [  1.0,  2.0,  3.0,  4.0]
-    > : tensor<4xf64>
+       [ 1.0,  2.0,  3.0 ]
+    > : tensor<3xf64>
 
     %csr_pos32 = arith.constant dense<
        [0, 1, 3]
@@ -97,12 +107,16 @@ module {
     %csr_index32 = arith.constant dense<
        [1, 0, 1]
     > : tensor<3xi32>
-    %csr= sparse_tensor.assemble %csr_data, %csr_pos32, %csr_index32 : tensor<4xf64>, tensor<3xi32>, tensor<3xi32>
+    %csr = sparse_tensor.assemble %csr_data, %csr_pos32, %csr_index32 : tensor<3xf64>, tensor<3xi32>, tensor<3xi32>
                                            to tensor<2x2xf64, #CSR>
 
+    //
+    // Setup BCOO.
+    //
+
     %bdata = arith.constant dense<
-       [  1.0,  2.0,  3.0,  4.0,  5.0,  0.0]
-    > : tensor<6xf64>
+       [ 1.0,  2.0,  3.0,  4.0,  5.0 ]
+    > : tensor<5xf64>
 
     %bpos = arith.constant dense<
        [0, 3, 3, 5]
@@ -116,10 +130,15 @@ module {
        [  4,  2],
        [ 10, 10]]
     > : tensor<6x2xindex>
+
     %bs = sparse_tensor.assemble %bdata, %bpos, %bindex :
-          tensor<6xf64>, tensor<4xindex>,  tensor<6x2xindex> to tensor<2x10x10xf64, #BCOO>
+          tensor<5xf64>, tensor<4xindex>,  tensor<6x2xindex> to tensor<2x10x10xf64, #BCOO>
 
-    // CHECK:1
+    //
+    // Verify results.
+    //
+
+    // CHECK:     1
     // CHECK-NEXT:2
     // CHECK-NEXT:1
     //
@@ -135,7 +154,7 @@ module {
         vector.print %1: index
         vector.print %2: index
         vector.print %v: f64
-     }
+    }
 
     // CHECK-NEXT:1
     // CHECK-NEXT:2
@@ -153,7 +172,7 @@ module {
         vector.print %1: index
         vector.print %2: index
         vector.print %v: f64
-     }
+    }
 
     // CHECK-NEXT:0
     // CHECK-NEXT:1
@@ -171,32 +190,43 @@ module {
         vector.print %1: index
         vector.print %2: index
         vector.print %v: f64
-     }
-
-    %d_csr = tensor.empty() : tensor<4xf64>
-    %p_csr = tensor.empty() : tensor<3xi32>
-    %i_csr = tensor.empty() : tensor<3xi32>
-    %rd_csr, %rp_csr, %ri_csr, %ld_csr, %lp_csr, %li_csr = sparse_tensor.disassemble %csr : tensor<2x2xf64, #CSR>
-                 outs(%d_csr, %p_csr, %i_csr : tensor<4xf64>, tensor<3xi32>, tensor<3xi32>)
-                 -> tensor<4xf64>, (tensor<3xi32>, tensor<3xi32>), index, (i32, i64)
-
-    // CHECK-NEXT: ( 1, 2, 3, {{.*}} )
-    %vd_csr = vector.transfer_read %rd_csr[%c0], %f0 : tensor<4xf64>, vector<4xf64>
-    vector.print %vd_csr : vector<4xf64>
+    }
 
+    // CHECK-NEXT:0
+    // CHECK-NEXT:1
+    // CHECK-NEXT:2
+    // CHECK-NEXT:1
+    //
+    // CHECK-NEXT:0
+    // CHECK-NEXT:5
+    // CHECK-NEXT:6
+    // CHECK-NEXT:2
+    //
+    // CHECK-NEXT:0
+    // CHECK-NEXT:7
+    // CHECK-NEXT:8
+    // CHECK-NEXT:3
+    //
     // CHECK-NEXT:1
     // CHECK-NEXT:2
     // CHECK-NEXT:3
+    // CHECK-NEXT:4
     //
+    // CHECK-NEXT:1
     // CHECK-NEXT:4
+    // CHECK-NEXT:2
     // CHECK-NEXT:5
-    //
-    // Make sure the trailing zeros are not traversed.
-    // CHECK-NOT: 0
     sparse_tensor.foreach in %bs : tensor<2x10x10xf64, #BCOO> do {
       ^bb0(%0: index, %1: index, %2: index, %v: f64) :
+        vector.print %0: index
+        vector.print %1: index
+        vector.print %2: index
         vector.print %v: f64
-     }
+    }
+
+    //
+    // Verify disassemble operations.
+    //
 
     %od = tensor.empty() : tensor<3xf64>
     %op = tensor.empty() : tensor<2xi32>
@@ -213,6 +243,16 @@ module {
     %vi = vector.transfer_read %i[%c0, %c0], %i0 : tensor<3x2xi32>, vector<3x2xi32>
     vector.print %vi : vector<3x2xi32>
 
+    %d_csr = tensor.empty() : tensor<4xf64>
+    %p_csr = tensor.empty() : tensor<3xi32>
+    %i_csr = tensor.empty() : tensor<3xi32>
+    %rd_csr, %rp_csr, %ri_csr, %ld_csr, %lp_csr, %li_csr = sparse_tensor.disassemble %csr : tensor<2x2xf64, #CSR>
+                 outs(%d_csr, %p_csr, %i_csr : tensor<4xf64>, tensor<3xi32>, tensor<3xi32>)
+                 -> tensor<4xf64>, (tensor<3xi32>, tensor<3xi32>), index, (i32, i64)
+
+    // CHECK-NEXT: ( 1, 2, 3 )
+    %vd_csr = vector.transfer_read %rd_csr[%c0], %f0 : tensor<4xf64>, vector<3xf64>
+    vector.print %vd_csr : vector<3xf64>
 
     %bod = tensor.empty() : tensor<6xf64>
     %bop = tensor.empty() : tensor<4xindex>
@@ -221,15 +261,17 @@ module {
                     outs(%bod, %bop, %boi : tensor<6xf64>, tensor<4xindex>, tensor<6x2xindex>)
                     -> tensor<6xf64>, (tensor<4xindex>, tensor<6x2xindex>), index, (i32, tensor<i64>)
 
-    // CHECK-NEXT: ( 1, 2, 3, 4, 5, {{.*}} )
-    %vbd = vector.transfer_read %bd[%c0], %f0 : tensor<6xf64>, vector<6xf64>
-    vector.print %vbd : vector<6xf64>
+    // CHECK-NEXT: ( 1, 2, 3, 4, 5 )
+    %vbd = vector.transfer_read %bd[%c0], %f0 : tensor<6xf64>, vector<5xf64>
+    vector.print %vbd : vector<5xf64>
+
     // CHECK-NEXT: 5
     vector.print %ld : index
 
     // CHECK-NEXT: ( ( 1, 2 ), ( 5, 6 ), ( 7, 8 ), ( 2, 3 ), ( 4, 2 ), ( {{.*}}, {{.*}} ) )
     %vbi = vector.transfer_read %bi[%c0, %c0], %c0 : tensor<6x2xindex>, vector<6x2xindex>
     vector.print %vbi : vector<6x2xindex>
+
     // CHECK-NEXT: 10
     %si = tensor.extract %li[] : tensor<i64>
     vector.print %si : i64
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_libgen.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_libgen.mlir
deleted file mode 100644
index 6540c950ab675b0..000000000000000
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_libgen.mlir
+++ /dev/null
@@ -1,165 +0,0 @@
-//--------------------------------------------------------------------------------------------------
-// WHEN CREATING A NEW TEST, PLEASE JUST COPY & PASTE WITHOUT EDITS.
-//
-// Set-up that's shared across all tests in this directory. In principle, this
-// config could be moved to lit.local.cfg. However, there are downstream users that
-//  do not use these LIT config files. Hence why this is kept inline.
-//
-// DEFINE: %{sparse_compiler_opts} = enable-runtime-library=true
-// DEFINE: %{sparse_compiler_opts_sve} = enable-arm-sve=true %{sparse_compiler_opts}
-// DEFINE: %{compile} = mlir-opt %s --sparse-compiler="%{sparse_compiler_opts}"
-// DEFINE: %{compile_sve} = mlir-opt %s --sparse-compiler="%{sparse_compiler_opts_sve}"
-// DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
-// DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
-// DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
-//
-// DEFINE: %{env} =
-//--------------------------------------------------------------------------------------------------
-
-// RUN: %{compile} | %{run} | FileCheck %s
-//
-// Do the same run, but now with VLA vectorization.
-// REDEFINE: %{sparse_compiler_opts} = enable-runtime-library=true vl=4
-// RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %}
-
-// TODO: This is considered to be a short-living tests and should be merged with sparse_pack.mlir
-// after sparse_tensor.disassemble is supported on libgen path.
-
-#SortedCOO = #sparse_tensor.encoding<{
-  map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton)
-}>
-
-#SortedCOOI32 = #sparse_tensor.encoding<{
-  map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton),
-  posWidth = 32,
-  crdWidth = 32
-}>
-
-#CSR = #sparse_tensor.encoding<{
-  map = (d0, d1) -> (d0 : dense, d1 : compressed),
-  posWidth = 32,
-  crdWidth = 32
-}>
-
-// TODO: "loose_compressed" is not supported by libgen path.
-// #BCOO = #sparse_tensor.encoding<{
-//   map = (d0, d1, d2) -> (d0 : dense, d1 : compressed(nonunique, high), d2 : singleton)
-//}>
-
-module {
-  //
-  // Main driver.
-  //
-  func.func @entry() {
-    %c0 = arith.constant 0 : index
-    %f0 = arith.constant 0.0 : f64
-    %i0 = arith.constant 0 : i32
-    //
-    // Initialize a 3-dim dense tensor.
-    //
-    %data = arith.constant dense<
-       [  1.0,  2.0,  3.0]
-    > : tensor<3xf64>
-
-    %pos = arith.constant dense<
-       [0, 3]
-    > : tensor<2xindex>
-
-    %index = arith.constant dense<
-       [[  1,  2],
-        [  5,  6],
-        [  7,  8]]
-    > : tensor<3x2xindex>
-
-    %pos32 = arith.constant dense<
-       [0, 3]
-    > : tensor<2xi32>
-
-    %index32 = arith.constant dense<
-       [[  1,  2],
-        [  5,  6],
-        [  7,  8]]
-    > : tensor<3x2xi32>
-
-    %s4 = sparse_tensor.assemble %data, %pos, %index : tensor<3xf64>, tensor<2xindex>, tensor<3x2xindex>
-                                          to tensor<10x10xf64, #SortedCOO>
-    %s5= sparse_tensor.assemble %data, %pos32, %index32 : tensor<3xf64>, tensor<2xi32>, tensor<3x2xi32>
-                                           to tensor<10x10xf64, #SortedCOOI32>
-
-    %csr_data = arith.constant dense<
-       [  1.0,  2.0,  3.0,  4.0]
-    > : tensor<4xf64>
-
-    %csr_pos32 = arith.constant dense<
-       [0, 1, 3]
-    > : tensor<3xi32>
-
-    %csr_index32 = arith.constant dense<
-       [1, 0, 1]
-    > : tensor<3xi32>
-    %csr= sparse_tensor.assemble %csr_data, %csr_pos32, %csr_index32 : tensor<4xf64>, tensor<3xi32>, tensor<3xi32>
-                                           to tensor<2x2xf64, #CSR>
-
-    // CHECK:1
-    // CHECK-NEXT:2
-    // CHECK-NEXT:1
-    //
-    // CHECK-NEXT:5
-    // CHECK-NEXT:6
-    // CHECK-NEXT:2
-    //
-    // CHECK-NEXT:7
-    // CHECK-NEXT:8
-    // CHECK-NEXT:3
-    sparse_tensor.foreach in %s4 : tensor<10x10xf64, #SortedCOO> do {
-      ^bb0(%1: index, %2: index, %v: f64) :
-        vector.print %1: index
-        vector.print %2: index
-        vector.print %v: f64
-     }
-
-    // CHECK-NEXT:1
-    // CHECK-NEXT:2
-    // CHECK-NEXT:1
-    //
-    // CHECK-NEXT:5
-    // CHECK-NEXT:6
-    // CHECK-NEXT:2
-    //
-    // CHECK-NEXT:7
-    // CHECK-NEXT:8
-    // CHECK-NEXT:3
-    sparse_tensor.foreach in %s5 : tensor<10x10xf64, #SortedCOOI32> do {
-      ^bb0(%1: index, %2: index, %v: f64) :
-        vector.print %1: index
-        vector.print %2: index
-        vector.print %v: f64
-     }
-
-    // CHECK-NEXT:0
-    // CHECK-NEXT:1
-    // CHECK-NEXT:1
-    //
-    // CHECK-NEXT:1
-    // CHECK-NEXT:0
-    // CHECK-NEXT:2
-    //
-    // CHECK-NEXT:1
-    // CHECK-NEXT:1
-    // CHECK-NEXT:3
-    sparse_tensor.foreach in %csr : tensor<2x2xf64, #CSR> do {
-      ^bb0(%1: index, %2: index, %v: f64) :
-        vector.print %1: index
-        vector.print %2: index
-        vector.print %v: f64
-     }
-
-
-    bufferization.dealloc_tensor %s4  : tensor<10x10xf64, #SortedCOO>
-    bufferization.dealloc_tensor %s5  : tensor<10x10xf64, #SortedCOOI32>
-    bufferization.dealloc_tensor %csr  : tensor<2x2xf64, #CSR>
-
-    return
-  }
-}

>From 24fad83915d610810f6631484bb310ebbbd5e1fc Mon Sep 17 00:00:00 2001
From: Youngsuk Kim <joseph942010 at gmail.com>
Date: Mon, 13 Nov 2023 13:05:27 -0500
Subject: [PATCH 07/13] [llvm] Remove no-op ptr-to-ptr bitcasts (NFC) (#72133)

Opaque ptr cleanup effort (NFC).
---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp           |  2 +-
 llvm/lib/IR/Module.cpp                              | 13 -------------
 llvm/lib/LTO/LTO.cpp                                |  2 +-
 llvm/lib/Transforms/CFGuard/CFGuard.cpp             |  5 -----
 llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp |  4 +---
 5 files changed, 3 insertions(+), 23 deletions(-)
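
The cleanup relies on a property of opaque pointers: all pointers in a given
address space share one type, so a ptr-to-ptr bitcast is an identity operation
and can simply be dropped. A small sketch of the pattern being removed; the
helper is hypothetical and the folding behavior noted in the comment is an
assumption about ConstantExpr::getBitCast, not something this patch adds:

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Function.h"

  // Hypothetical helper: pick the constant to use as a call target.
  llvm::Constant *getCalleeConstant(llvm::Function *F) {
    // Pre-opaque-pointer code typically wrote
    //   ConstantExpr::getBitCast(F, PointerType::getUnqual(F->getContext()));
    // With opaque pointers the source and destination types are identical,
    // so the cast folds away and the function can be used directly.
    return F;
  }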

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 6be58d70648f4db..24d15267a65e933 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4494,7 +4494,7 @@ Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
                                                     StringRef EntryFnIDName) {
   if (Config.isTargetDevice()) {
     assert(OutlinedFn && "The outlined function must exist if embedded");
-    return ConstantExpr::getBitCast(OutlinedFn, Builder.getInt8PtrTy());
+    return OutlinedFn;
   }
 
   return new GlobalVariable(
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 17efe7956a21c5d..eeb90a6cb3c465a 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -156,12 +156,6 @@ FunctionCallee Module::getOrInsertFunction(StringRef Name, FunctionType *Ty,
     return {Ty, New}; // Return the new prototype.
   }
 
-  // If the function exists but has the wrong type, return a bitcast to the
-  // right type.
-  auto *PTy = PointerType::get(Ty, F->getAddressSpace());
-  if (F->getType() != PTy)
-    return {Ty, ConstantExpr::getBitCast(F, PTy)};
-
   // Otherwise, we just found the existing function or a prototype.
   return {Ty, F};
 }
@@ -212,13 +206,6 @@ Constant *Module::getOrInsertGlobal(
     GV = CreateGlobalCallback();
   assert(GV && "The CreateGlobalCallback is expected to create a global");
 
-  // If the variable exists but has the wrong type, return a bitcast to the
-  // right type.
-  Type *GVTy = GV->getType();
-  PointerType *PTy = PointerType::get(Ty, GVTy->getPointerAddressSpace());
-  if (GVTy != PTy)
-    return ConstantExpr::getBitCast(GV, PTy);
-
   // Otherwise, we just found the existing function or a prototype.
   return GV;
 }
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 214c2ef45de0664..e111e09681178e2 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1260,7 +1260,7 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
                                   ConstantAggregateZero::get(Ty), "");
     GV->setAlignment(I.second.Alignment);
     if (OldGV) {
-      OldGV->replaceAllUsesWith(ConstantExpr::getBitCast(GV, OldGV->getType()));
+      OldGV->replaceAllUsesWith(GV);
       GV->takeName(OldGV);
       OldGV->eraseFromParent();
     } else {
diff --git a/llvm/lib/Transforms/CFGuard/CFGuard.cpp b/llvm/lib/Transforms/CFGuard/CFGuard.cpp
index b043879359ac349..387734358775b38 100644
--- a/llvm/lib/Transforms/CFGuard/CFGuard.cpp
+++ b/llvm/lib/Transforms/CFGuard/CFGuard.cpp
@@ -195,11 +195,6 @@ void CFGuard::insertCFGuardDispatch(CallBase *CB) {
   Value *CalledOperand = CB->getCalledOperand();
   Type *CalledOperandType = CalledOperand->getType();
 
-  // Cast the guard dispatch global to the type of the called operand.
-  PointerType *PTy = PointerType::get(CalledOperandType, 0);
-  if (GuardFnGlobal->getType() != PTy)
-    GuardFnGlobal = ConstantExpr::getBitCast(GuardFnGlobal, PTy);
-
   // Load the global as a pointer to a function of the same type.
   LoadInst *GuardDispatchLoad = B.CreateLoad(CalledOperandType, GuardFnGlobal);
 
diff --git a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
index b6e39c6af9eec26..092f1799755d174 100644
--- a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
+++ b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
@@ -65,9 +65,7 @@ static void insertCall(Function &CurFn, StringRef Func,
         InsertionPt);
     RetAddr->setDebugLoc(DL);
 
-    Value *Args[] = {
-        ConstantExpr::getBitCast(&CurFn, PointerType::getUnqual(C)), RetAddr};
-
+    Value *Args[] = {&CurFn, RetAddr};
     CallInst *Call =
         CallInst::Create(Fn, ArrayRef<Value *>(Args), "", InsertionPt);
     Call->setDebugLoc(DL);

>From f63c5ba268c6b36ac7f9a093d8574dc5d40db8c6 Mon Sep 17 00:00:00 2001
From: Alex Langford <alangford at apple.com>
Date: Mon, 13 Nov 2023 10:12:36 -0800
Subject: [PATCH 08/13] [lldb] Remove
 StructuredData::Array::GetItemAtIndexAsArray (#71994)

This method is completely unused.
---
 lldb/include/lldb/Utility/StructuredData.h | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/lldb/include/lldb/Utility/StructuredData.h b/lldb/include/lldb/Utility/StructuredData.h
index 8d0ae372f43c6bf..e7ee12868512f4a 100644
--- a/lldb/include/lldb/Utility/StructuredData.h
+++ b/lldb/include/lldb/Utility/StructuredData.h
@@ -276,16 +276,6 @@ class StructuredData {
       return {};
     }
 
-    bool GetItemAtIndexAsArray(size_t idx, Array *&result) const {
-      result = nullptr;
-      ObjectSP value_sp = GetItemAtIndex(idx);
-      if (value_sp.get()) {
-        result = value_sp->GetAsArray();
-        return (result != nullptr);
-      }
-      return false;
-    }
-
     void Push(const ObjectSP &item) { m_items.push_back(item); }
 
     void AddItem(const ObjectSP &item) { m_items.push_back(item); }

>From 9eb8d8cd10da2df08dafa6faeb080dceed66fb1b Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar at google.com>
Date: Mon, 13 Nov 2023 10:21:21 -0800
Subject: [PATCH 09/13] [mlir][py] Overload print with state. (#72064)

Enables reusing the AsmState when printing from Python. Also moves the
fileObject and binary parameters to the end of the signature (a leading
pybind11::object parameter kept the new overload from being selected unless
`state=` was passed by keyword).
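
A minimal, self-contained pybind11 sketch of that overload-resolution pitfall
(the `State`, `print_op`, and `demo` names are hypothetical and only mirror
the shape of the bindings, not the actual MLIR code):

  #include <pybind11/pybind11.h>
  namespace py = pybind11;

  struct State {};

  // Generic printer: a leading py::object parameter accepts *any* Python
  // value, so a positional State argument matches here as well.
  void printGeneric(py::object file, bool binary) {}
  // State-first printer: the overload we actually want for print_op(state).
  void printWithState(State &state, py::object file, bool binary) {}

  PYBIND11_MODULE(demo, m) {
    py::class_<State>(m, "State").def(py::init<>());
    m.def("print_op", &printGeneric, py::arg("file") = py::none(),
          py::arg("binary") = false);
    m.def("print_op", &printWithState, py::arg("state"),
          py::arg("file") = py::none(), py::arg("binary") = false);
    // demo.print_op(demo.State()) resolves to printGeneric: pybind11 tries
    // overloads in registration order, and py::object binds the State without
    // conversion. Only demo.print_op(state=demo.State()) reaches
    // printWithState, which is why the patch moves the py::object/bool
    // parameters to the end of the real print() signature.
  }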

---------

Co-authored-by: Maksim Levental <maksim.levental at gmail.com>
---
 mlir/lib/Bindings/Python/IRCore.cpp | 47 +++++++++++++++++++++++------
 mlir/lib/Bindings/Python/IRModule.h |  9 ++++--
 mlir/test/python/ir/operation.py    |  7 ++++-
 3 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index 745aa64e63b67d4..3ddb750bbcabc78 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -110,6 +110,15 @@ static const char kOperationPrintDocstring[] =
     invalid, behavior is undefined.
 )";
 
+static const char kOperationPrintStateDocstring[] =
+    R"(Prints the assembly form of the operation to a file like object.
+
+Args:
+  file: The file like object to write to. Defaults to sys.stdout.
+  binary: Whether to write bytes (True) or str (False). Defaults to False.
+  state: AsmState capturing the operation numbering and flags.
+)";
+
 static const char kOperationGetAsmDocstring[] =
     R"(Gets the assembly form of the operation with all options available.
 
@@ -1169,11 +1178,11 @@ void PyOperation::checkValid() const {
   }
 }
 
-void PyOperationBase::print(py::object fileObject, bool binary,
-                            std::optional<int64_t> largeElementsLimit,
+void PyOperationBase::print(std::optional<int64_t> largeElementsLimit,
                             bool enableDebugInfo, bool prettyDebugInfo,
                             bool printGenericOpForm, bool useLocalScope,
-                            bool assumeVerified) {
+                            bool assumeVerified, py::object fileObject,
+                            bool binary) {
   PyOperation &operation = getOperation();
   operation.checkValid();
   if (fileObject.is_none())
@@ -1198,6 +1207,17 @@ void PyOperationBase::print(py::object fileObject, bool binary,
   mlirOpPrintingFlagsDestroy(flags);
 }
 
+void PyOperationBase::print(PyAsmState &state, py::object fileObject,
+                            bool binary) {
+  PyOperation &operation = getOperation();
+  operation.checkValid();
+  if (fileObject.is_none())
+    fileObject = py::module::import("sys").attr("stdout");
+  PyFileAccumulator accum(fileObject, binary);
+  mlirOperationPrintWithState(operation, state.get(), accum.getCallback(),
+                              accum.getUserData());
+}
+
 void PyOperationBase::writeBytecode(const py::object &fileObject,
                                     std::optional<int64_t> bytecodeVersion) {
   PyOperation &operation = getOperation();
@@ -1230,13 +1250,14 @@ py::object PyOperationBase::getAsm(bool binary,
   } else {
     fileObject = py::module::import("io").attr("StringIO")();
   }
-  print(fileObject, /*binary=*/binary,
-        /*largeElementsLimit=*/largeElementsLimit,
+  print(/*largeElementsLimit=*/largeElementsLimit,
         /*enableDebugInfo=*/enableDebugInfo,
         /*prettyDebugInfo=*/prettyDebugInfo,
         /*printGenericOpForm=*/printGenericOpForm,
         /*useLocalScope=*/useLocalScope,
-        /*assumeVerified=*/assumeVerified);
+        /*assumeVerified=*/assumeVerified,
+        /*fileObject=*/fileObject,
+        /*binary=*/binary);
 
   return fileObject.attr("getvalue")();
 }
@@ -2946,15 +2967,23 @@ void mlir::python::populateIRCore(py::module &m) {
                                /*assumeVerified=*/false);
           },
           "Returns the assembly form of the operation.")
-      .def("print", &PyOperationBase::print,
+      .def("print",
+           py::overload_cast<PyAsmState &, pybind11::object, bool>(
+               &PyOperationBase::print),
+           py::arg("state"), py::arg("file") = py::none(),
+           py::arg("binary") = false, kOperationPrintStateDocstring)
+      .def("print",
+           py::overload_cast<std::optional<int64_t>, bool, bool, bool, bool,
+                             bool, py::object, bool>(
+               &PyOperationBase::print),
            // Careful: Lots of arguments must match up with print method.
-           py::arg("file") = py::none(), py::arg("binary") = false,
            py::arg("large_elements_limit") = py::none(),
            py::arg("enable_debug_info") = false,
            py::arg("pretty_debug_info") = false,
            py::arg("print_generic_op_form") = false,
            py::arg("use_local_scope") = false,
-           py::arg("assume_verified") = false, kOperationPrintDocstring)
+           py::arg("assume_verified") = false, py::arg("file") = py::none(),
+           py::arg("binary") = false, kOperationPrintDocstring)
       .def("write_bytecode", &PyOperationBase::writeBytecode, py::arg("file"),
            py::arg("desired_version") = py::none(),
            kOperationPrintBytecodeDocstring)
diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h
index af55693f18fbbf9..d99b87d19bbea3c 100644
--- a/mlir/lib/Bindings/Python/IRModule.h
+++ b/mlir/lib/Bindings/Python/IRModule.h
@@ -550,16 +550,19 @@ class PyModule : public BaseContextObject {
   pybind11::handle handle;
 };
 
+class PyAsmState;
+
 /// Base class for PyOperation and PyOpView which exposes the primary, user
 /// visible methods for manipulating it.
 class PyOperationBase {
 public:
   virtual ~PyOperationBase() = default;
   /// Implements the bound 'print' method and helps with others.
-  void print(pybind11::object fileObject, bool binary,
-             std::optional<int64_t> largeElementsLimit, bool enableDebugInfo,
+  void print(std::optional<int64_t> largeElementsLimit, bool enableDebugInfo,
              bool prettyDebugInfo, bool printGenericOpForm, bool useLocalScope,
-             bool assumeVerified);
+             bool assumeVerified, py::object fileObject, bool binary);
+  void print(PyAsmState &state, py::object fileObject, bool binary);
+
   pybind11::object getAsm(bool binary,
                           std::optional<int64_t> largeElementsLimit,
                           bool enableDebugInfo, bool prettyDebugInfo,
diff --git a/mlir/test/python/ir/operation.py b/mlir/test/python/ir/operation.py
index 04239b048c1c641..04f8a9936e31f79 100644
--- a/mlir/test/python/ir/operation.py
+++ b/mlir/test/python/ir/operation.py
@@ -622,10 +622,15 @@ def testOperationPrint():
     print(bytes_value.__class__)
     print(bytes_value)
 
-    # Test get_asm local_scope.
+    # Test print local_scope.
     # CHECK: constant dense<[1, 2, 3, 4]> : tensor<4xi32> loc("nom")
     module.operation.print(enable_debug_info=True, use_local_scope=True)
 
+    # Test printing using state.
+    state = AsmState(module.operation)
+    # CHECK: constant dense<[1, 2, 3, 4]> : tensor<4xi32>
+    module.operation.print(state)
+
     # Test get_asm with options.
     # CHECK: value = dense_resource<__elided__> : tensor<4xi32>
     # CHECK: "func.return"(%arg0) : (i32) -> () -:4:7

>From a524676378247559659e68ab9b980ed0b92c459b Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Mon, 13 Nov 2023 10:26:32 -0800
Subject: [PATCH 10/13] [RISCV][GISel] Legalize G_CTPOP. (#72005)

The base ISA does not have an instruction for this, so we need to lower it.
Zbb support will come in a future patch.
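
As a rough illustration of what the generic lowering expands to, the snippet
below performs the same parallel (SWAR) bit count that the 32-bit CHECK lines
in the new tests encode (the 0x55555555 / 0x33333333 / 0x0f0f0f0f masks and
the final multiply-and-shift). It is only a sketch of the algorithm, not code
from this patch:

  #include <cstdint>

  // Count bit pairs, then nibbles, then bytes, then sum the byte counts via a
  // multiply by 0x01010101 and a shift by 24.
  static uint32_t ctpop32(uint32_t x) {
    x = x - ((x >> 1) & 0x55555555u);
    x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);
    x = (x + (x >> 4)) & 0x0f0f0f0fu;
    return (x * 0x01010101u) >> 24;
  }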
---
 .../Target/RISCV/GISel/RISCVLegalizerInfo.cpp |   5 +
 .../legalizer/rv32/legalize-ctpop.mir         | 198 ++++++++++++++++++
 .../legalizer/rv64/legalize-ctpop.mir         | 180 ++++++++++++++++
 3 files changed, 383 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv32/legalize-ctpop.mir
 create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-ctpop.mir

diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index 5acc933438f6bea..7accc6e1efc2756 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -89,6 +89,11 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) {
       .maxScalar(0, sXLen)
       .lower();
 
+  getActionDefinitionsBuilder(G_CTPOP)
+      .maxScalar(0, sXLen)
+      .scalarSameSizeAs(1, 0)
+      .lower();
+
   getActionDefinitionsBuilder({G_CONSTANT, G_IMPLICIT_DEF})
       .legalFor({s32, sXLen, p0})
       .widenScalarToNextPow2(0)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv32/legalize-ctpop.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv32/legalize-ctpop.mir
new file mode 100644
index 000000000000000..0ea924745d40944
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv32/legalize-ctpop.mir
@@ -0,0 +1,198 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mattr=+m -mtriple=riscv32 -run-pass=legalizer %s -o - \
+# RUN: | FileCheck %s
+
+---
+name:            ctpop_i8
+body:             |
+  bb.1:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: ctpop_i8
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C]](s32)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 85
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[AND1]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C4]]
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C3]](s32)
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 51
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C5]]
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C5]]
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND3]], [[AND4]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C6]](s32)
+    ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR2]], [[ADD]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C7]]
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND5]], [[C8]]
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C9]](s32)
+    ; CHECK-NEXT: $x10 = COPY [[LSHR3]](s32)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s32) = COPY $x10
+    %0:_(s8) = G_TRUNC %1(s32)
+    %2:_(s8) = G_CTPOP %0(s8)
+    %3:_(s32) = G_ANYEXT %2(s8)
+    $x10 = COPY %3(s32)
+    PseudoRET implicit $x10
+
+...
+---
+name:            ctpop_i16
+body:             |
+  bb.1:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: ctpop_i16
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C]](s32)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 21845
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[AND1]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C4]]
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C3]](s32)
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 13107
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C5]]
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C5]]
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND3]], [[AND4]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C6]](s32)
+    ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR2]], [[ADD]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 3855
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C7]]
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 257
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND5]], [[C8]]
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[MUL]], [[C10]]
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[C9]](s32)
+    ; CHECK-NEXT: $x10 = COPY [[LSHR3]](s32)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s32) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s32)
+    %2:_(s16) = G_CTPOP %0(s16)
+    %3:_(s32) = G_ANYEXT %2(s16)
+    $x10 = COPY %3(s32)
+    PseudoRET implicit $x10
+
+...
+---
+name:            ctpop_i32
+body:             |
+  bb.1:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: ctpop_i32
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1431655765
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[AND]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[SUB]], [[C2]](s32)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 858993459
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C3]]
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C3]]
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND1]], [[AND2]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C4]](s32)
+    ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR2]], [[ADD]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C5]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C6]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C7]](s32)
+    ; CHECK-NEXT: $x10 = COPY [[LSHR3]](s32)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %0:_(s32) = COPY $x10
+    %1:_(s32) = G_CTPOP %0(s32)
+    $x10 = COPY %1(s32)
+    PseudoRET implicit $x10
+
+...
+---
+name:            ctpop_i64
+body:             |
+  bb.1:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: ctpop_i64
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1431655765
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[AND]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[SUB]], [[C2]](s32)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 858993459
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C3]]
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C3]]
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND1]], [[AND2]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C4]](s32)
+    ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR2]], [[ADD]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C5]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C6]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C7]](s32)
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C8]](s32)
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1431655765
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR4]], [[C9]]
+    ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[COPY1]], [[AND4]]
+    ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[SUB1]], [[C10]](s32)
+    ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 858993459
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR5]], [[C11]]
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[SUB1]], [[C11]]
+    ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[AND5]], [[AND6]]
+    ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[ADD2]], [[C12]](s32)
+    ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[LSHR6]], [[ADD2]]
+    ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ADD3]], [[C13]]
+    ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009
+    ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[AND7]], [[C14]]
+    ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[MUL1]], [[C15]](s32)
+    ; CHECK-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[LSHR7]], [[LSHR3]]
+    ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: $x10 = COPY [[ADD4]](s32)
+    ; CHECK-NEXT: $x11 = COPY [[C16]](s32)
+    ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
+    %1:_(s32) = COPY $x10
+    %2:_(s32) = COPY $x11
+    %0:_(s64) = G_MERGE_VALUES %1(s32), %2(s32)
+    %3:_(s64) = G_CTPOP %0(s64)
+    %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %3(s64)
+    $x10 = COPY %4(s32)
+    $x11 = COPY %5(s32)
+    PseudoRET implicit $x10, implicit $x11
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-ctpop.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-ctpop.mir
new file mode 100644
index 000000000000000..16bbdb878d1e68e
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-ctpop.mir
@@ -0,0 +1,180 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mattr=+m -mtriple=riscv64 -run-pass=legalizer %s -o - \
+# RUN: | FileCheck %s
+
+---
+name:            ctpop_i8
+body:             |
+  bb.1:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: ctpop_i8
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C]](s32)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 85
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[TRUNC1]], [[AND1]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C4]]
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C3]](s32)
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 51
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C5]]
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C5]]
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND3]], [[AND4]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C6]](s32)
+    ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR2]], [[ADD]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C7]]
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND5]], [[C8]]
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C9]](s32)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LSHR3]](s32)
+    ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s64) = COPY $x10
+    %0:_(s8) = G_TRUNC %1(s64)
+    %2:_(s8) = G_CTPOP %0(s8)
+    %3:_(s64) = G_ANYEXT %2(s8)
+    $x10 = COPY %3(s64)
+    PseudoRET implicit $x10
+
+...
+---
+name:            ctpop_i16
+body:             |
+  bb.1:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: ctpop_i16
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C]](s32)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 21845
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[TRUNC1]], [[AND1]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C4]]
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C3]](s32)
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 13107
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C5]]
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C5]]
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND3]], [[AND4]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C6]](s32)
+    ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR2]], [[ADD]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 3855
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C7]]
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 257
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND5]], [[C8]]
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[MUL]], [[C10]]
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[C9]](s32)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LSHR3]](s32)
+    ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s64) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s64)
+    %2:_(s16) = G_CTPOP %0(s16)
+    %3:_(s64) = G_ANYEXT %2(s16)
+    $x10 = COPY %3(s64)
+    PseudoRET implicit $x10
+
+...
+---
+name:            ctpop_i32
+body:             |
+  bb.1:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: ctpop_i32
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC]], [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1431655765
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[TRUNC]], [[AND]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[SUB]], [[C2]](s32)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 858993459
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C3]]
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C3]]
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND1]], [[AND2]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C4]](s32)
+    ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR2]], [[ADD]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C5]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C6]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C7]](s32)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LSHR3]](s32)
+    ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s64) = COPY $x10
+    %0:_(s32) = G_TRUNC %1(s64)
+    %2:_(s32) = G_CTPOP %0(s32)
+    %3:_(s64) = G_ANYEXT %2(s32)
+    $x10 = COPY %3(s64)
+    PseudoRET implicit $x10
+
+...
+---
+name:            ctpop_i64
+body:             |
+  bb.1:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: ctpop_i64
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY]], [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 6148914691236517205
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[LSHR]], [[C1]]
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[AND]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[SUB]], [[C2]](s64)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3689348814741910323
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[LSHR1]], [[C3]]
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C3]]
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[AND1]], [[AND2]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[ADD]], [[C4]](s64)
+    ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[ADD]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 1085102592571150095
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[ADD1]], [[C5]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 72340172838076673
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND3]], [[C6]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 56
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s64) = G_LSHR [[MUL]], [[C7]](s64)
+    ; CHECK-NEXT: $x10 = COPY [[LSHR3]](s64)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %0:_(s64) = COPY $x10
+    %1:_(s64) = G_CTPOP %0(s64)
+    $x10 = COPY %1(s64)
+    PseudoRET implicit $x10
+
+...

>From 8c49cda34b13626820552b3a9254eb38a341efa0 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Mon, 13 Nov 2023 10:38:31 -0800
Subject: [PATCH 11/13] [RISCV][GISel] Update RV64 legalize-ctpop.mir to
 account for constant shift amounts being i64 now.

This changed while the ctpop patch was in review, and I forgot to update the
test.
---
 .../legalizer/rv64/legalize-ctpop.mir         | 68 +++++++++----------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-ctpop.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-ctpop.mir
index 16bbdb878d1e68e..1e98e3d8ff5726b 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-ctpop.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-ctpop.mir
@@ -12,32 +12,32 @@ body:             |
     ; CHECK: liveins: $x10
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]]
-    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C1]](s64)
     ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 85
     ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
     ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
     ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[TRUNC1]], [[AND1]]
-    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
-    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C4]]
-    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C3]](s32)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C3]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C4]](s64)
     ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 51
     ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C5]]
     ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C5]]
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND3]], [[AND4]]
-    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C6]](s32)
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C6]](s64)
     ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR2]], [[ADD]]
     ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
     ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C7]]
     ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND5]], [[C8]]
-    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C9]](s32)
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C9]](s64)
     ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LSHR3]](s32)
     ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10
@@ -59,34 +59,34 @@ body:             |
     ; CHECK: liveins: $x10
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]]
-    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C1]](s64)
     ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 21845
     ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
     ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
     ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[TRUNC1]], [[AND1]]
-    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C4]]
-    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C3]](s32)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C3]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C4]](s64)
     ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 13107
     ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C5]]
     ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C5]]
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND3]], [[AND4]]
-    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C6]](s32)
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C6]](s64)
     ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR2]], [[ADD]]
     ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 3855
     ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C7]]
     ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 257
     ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND5]], [[C8]]
-    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[MUL]], [[C10]]
-    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[C9]](s32)
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[MUL]], [[C9]]
+    ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[C10]](s64)
     ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LSHR3]](s32)
     ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10
@@ -109,26 +109,26 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC]], [[C]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC]], [[C]](s64)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1431655765
     ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
     ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[TRUNC]], [[AND]]
-    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[SUB]], [[C2]](s32)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[SUB]], [[C2]](s64)
     ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 858993459
     ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C3]]
     ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C3]]
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND1]], [[AND2]]
-    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C4]](s32)
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C4]](s64)
     ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR2]], [[ADD]]
     ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135
     ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C5]]
     ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009
     ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C6]]
-    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
-    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C7]](s32)
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C7]](s64)
     ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LSHR3]](s32)
     ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10

>From 78a896fef06dea0e5f2edc043ea8bb573fba7c75 Mon Sep 17 00:00:00 2001
From: Felix Schneider <fx.schn at gmail.com>
Date: Mon, 13 Nov 2023 19:54:01 +0100
Subject: [PATCH 12/13] [mlir] NFC: Clarify documentation on
 `Infer(Shaped)TypeOpInterface`  (#70350)

This patch clarifies the documentation of methods `inferReturnTypeComponents()`
and `inferReturnTypes()` regarding verified/valid arguments.
---
 mlir/include/mlir/Interfaces/InferTypeOpInterface.td | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Interfaces/InferTypeOpInterface.td b/mlir/include/mlir/Interfaces/InferTypeOpInterface.td
index c5eeeaf58a7b4f8..a009e21c61b49b2 100644
--- a/mlir/include/mlir/Interfaces/InferTypeOpInterface.td
+++ b/mlir/include/mlir/Interfaces/InferTypeOpInterface.td
@@ -67,8 +67,10 @@ def InferTypeOpInterface : OpInterface<"InferTypeOpInterface"> {
       The return types may be elided or specific elements be null for elements
       that should just be returned but not verified.
 
-      Be aware that this method is supposed to be called with valid arguments,
-      e.g., operands are verified, or it may result in an undefined behavior.
+      Because this method can be called from within different stages of IR
+      verification, implementations should not assume the arguments to
+      represent fully valid IR and are responsible for checking inputs for
+      validity to the degree necessary to perform the return type inference.
       }],
       /*retTy=*/"::mlir::LogicalResult",
       /*methodName=*/"refineReturnTypes",
@@ -142,6 +144,11 @@ def InferShapedTypeOpInterface : OpInterface<"InferShapedTypeOpInterface"> {
       Unknown (e.g., unranked) shape and nullptrs for element type and attribute
       may be returned by this function while returning success. E.g., partial
       population of components is not error condition.
+
+      Because this method can be called from within different stages of IR
+      verification, implementations should not assume the arguments to
+      represent fully valid IR and are responsible for checking inputs for
+      validity to the degree necessary to perform the return type inference.
       }],
       /*retTy=*/"::mlir::LogicalResult",
       /*methodName=*/"inferReturnTypeComponents",

>From 9407eec4c6e39f17b2a0a3c9881acbcb04f8c442 Mon Sep 17 00:00:00 2001
From: Matteo Franciolini <m_franciolini at apple.com>
Date: Mon, 13 Nov 2023 12:59:30 -0600
Subject: [PATCH 13/13] [mlir][bytecode] Add bytecode writer config API to skip
 serialization of resources (#71991)

When serializing to bytecode, users can now opt to elide resources from
the bytecode file. This instructs the bytecode writer to serialize only
the key and resource kind, skipping serialization of the data buffer.
When the bytecode is parsed back, the IR is built in memory with valid
(but empty) resource handles.
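
A minimal sketch of how the new flag might be used from C++; the helper name
emitWithoutResources is made up for illustration, and the default
BytecodeWriterConfig constructor is assumed, but setElideResourceDataFlag()
and writeBytecodeToFile() are the entry points touched by this patch:

  #include "mlir/Bytecode/BytecodeWriter.h"
  #include "mlir/IR/Operation.h"
  #include "mlir/Support/LogicalResult.h"
  #include "llvm/Support/raw_ostream.h"

  // Write `op` as bytecode, keeping resource keys/kinds but dropping the
  // resource payloads via the new setElideResourceDataFlag() option.
  static mlir::LogicalResult emitWithoutResources(mlir::Operation *op,
                                                  llvm::raw_ostream &os) {
    mlir::BytecodeWriterConfig config;
    config.setElideResourceDataFlag();
    return mlir::writeBytecodeToFile(op, os, config);
  }

From mlir-opt, the same behavior is exercised by the new
-elide-resource-data-from-bytecode flag together with -emit-bytecode, as the
added test shows.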
---
 mlir/include/mlir/Bytecode/BytecodeWriter.h   |  3 +++
 .../include/mlir/Tools/mlir-opt/MlirOptMain.h |  6 +++++
 mlir/lib/Bytecode/Writer/BytecodeWriter.cpp   | 26 ++++++++++++++-----
 mlir/lib/Tools/mlir-opt/MlirOptMain.cpp       |  7 +++++
 mlir/test/Bytecode/resources_elision.mlir     | 21 +++++++++++++++
 5 files changed, 57 insertions(+), 6 deletions(-)
 create mode 100644 mlir/test/Bytecode/resources_elision.mlir

diff --git a/mlir/include/mlir/Bytecode/BytecodeWriter.h b/mlir/include/mlir/Bytecode/BytecodeWriter.h
index b82d8ddad38ed1c..ea4b36832e0bac3 100644
--- a/mlir/include/mlir/Bytecode/BytecodeWriter.h
+++ b/mlir/include/mlir/Bytecode/BytecodeWriter.h
@@ -152,6 +152,9 @@ class BytecodeWriterConfig {
   // Resources
   //===--------------------------------------------------------------------===//
 
+  /// Set a boolean flag to skip emission of resources into the bytecode file.
+  void setElideResourceDataFlag(bool shouldElideResourceData = true);
+
   /// Attach the given resource printer to the writer configuration.
   void attachResourcePrinter(std::unique_ptr<AsmResourcePrinter> printer);
 
diff --git a/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h b/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h
index a1530936f55caee..e255d9fa70b6594 100644
--- a/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h
+++ b/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h
@@ -82,6 +82,9 @@ class MlirOptMainConfig {
     return *this;
   }
   bool shouldEmitBytecode() const { return emitBytecodeFlag; }
+  bool shouldElideResourceDataFromBytecode() const {
+    return elideResourceDataFromBytecodeFlag;
+  }
 
   /// Set the IRDL file to load before processing the input.
   MlirOptMainConfig &setIrdlFile(StringRef file) {
@@ -185,6 +188,9 @@ class MlirOptMainConfig {
   /// Emit bytecode instead of textual assembly when generating output.
   bool emitBytecodeFlag = false;
 
+  /// Elide resources when generating bytecode.
+  bool elideResourceDataFromBytecodeFlag = false;
+
   /// Enable the Debugger action hook: Debugger can intercept MLIR Actions.
   bool enableDebuggerActionHookFlag = false;
 
diff --git a/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp b/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp
index 01dcea1ca3848eb..6097f0eda469cd2 100644
--- a/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp
+++ b/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp
@@ -39,6 +39,10 @@ struct BytecodeWriterConfig::Impl {
   /// Note: This only differs from kVersion if a specific version is set.
   int64_t bytecodeVersion = bytecode::kVersion;
 
+  /// A flag specifying whether to elide emission of resources into the bytecode
+  /// file.
+  bool shouldElideResourceData = false;
+
   /// A map containing dialect version information for each dialect to emit.
   llvm::StringMap<std::unique_ptr<DialectVersion>> dialectVersionMap;
 
@@ -89,6 +93,11 @@ void BytecodeWriterConfig::attachResourcePrinter(
   impl->externalResourcePrinters.emplace_back(std::move(printer));
 }
 
+void BytecodeWriterConfig::setElideResourceDataFlag(
+    bool shouldElideResourceData) {
+  impl->shouldElideResourceData = shouldElideResourceData;
+}
+
 void BytecodeWriterConfig::setDesiredBytecodeVersion(int64_t bytecodeVersion) {
   impl->bytecodeVersion = bytecodeVersion;
 }
@@ -1170,22 +1179,25 @@ class ResourceBuilder : public AsmResourceBuilder {
   using PostProcessFn = function_ref<void(StringRef, AsmResourceEntryKind)>;
 
   ResourceBuilder(EncodingEmitter &emitter, StringSectionBuilder &stringSection,
-                  PostProcessFn postProcessFn)
+                  PostProcessFn postProcessFn, bool shouldElideData)
       : emitter(emitter), stringSection(stringSection),
-        postProcessFn(postProcessFn) {}
+        postProcessFn(postProcessFn), shouldElideData(shouldElideData) {}
   ~ResourceBuilder() override = default;
 
   void buildBlob(StringRef key, ArrayRef<char> data,
                  uint32_t dataAlignment) final {
-    emitter.emitOwnedBlobAndAlignment(data, dataAlignment);
+    if (!shouldElideData)
+      emitter.emitOwnedBlobAndAlignment(data, dataAlignment);
     postProcessFn(key, AsmResourceEntryKind::Blob);
   }
   void buildBool(StringRef key, bool data) final {
-    emitter.emitByte(data);
+    if (!shouldElideData)
+      emitter.emitByte(data);
     postProcessFn(key, AsmResourceEntryKind::Bool);
   }
   void buildString(StringRef key, StringRef data) final {
-    emitter.emitVarInt(stringSection.insert(data));
+    if (!shouldElideData)
+      emitter.emitVarInt(stringSection.insert(data));
     postProcessFn(key, AsmResourceEntryKind::String);
   }
 
@@ -1193,6 +1205,7 @@ class ResourceBuilder : public AsmResourceBuilder {
   EncodingEmitter &emitter;
   StringSectionBuilder &stringSection;
   PostProcessFn postProcessFn;
+  bool shouldElideData = false;
 };
 } // namespace
 
@@ -1225,7 +1238,8 @@ void BytecodeWriter::writeResourceSection(Operation *op,
 
   // Builder used to emit resources.
   ResourceBuilder entryBuilder(resourceEmitter, stringSection,
-                               appendResourceOffset);
+                               appendResourceOffset,
+                               config.shouldElideResourceData);
 
   // Emit the external resource entries.
   resourceOffsetEmitter.emitVarInt(config.externalResourcePrinters.size());
diff --git a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
index c36afae716b12c5..d7d47619ef4ac98 100644
--- a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
+++ b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
@@ -90,6 +90,11 @@ struct MlirOptMainConfigCLOptions : public MlirOptMainConfig {
         "emit-bytecode", cl::desc("Emit bytecode when generating output"),
         cl::location(emitBytecodeFlag), cl::init(false));
 
+    static cl::opt<bool, /*ExternalStorage=*/true> elideResourcesFromBytecode(
+        "elide-resource-data-from-bytecode",
+        cl::desc("Elide resources when generating bytecode"),
+        cl::location(elideResourceDataFromBytecodeFlag), cl::init(false));
+
     static cl::opt<std::optional<int64_t>, /*ExternalStorage=*/true,
                    BytecodeVersionParser>
         bytecodeVersion(
@@ -385,6 +390,8 @@ performActions(raw_ostream &os,
     BytecodeWriterConfig writerConfig(fallbackResourceMap);
     if (auto v = config.bytecodeVersionToEmit())
       writerConfig.setDesiredBytecodeVersion(*v);
+    if (config.shouldElideResourceDataFromBytecode())
+      writerConfig.setElideResourceDataFlag();
     return writeBytecodeToFile(op.get(), os, writerConfig);
   }
 
diff --git a/mlir/test/Bytecode/resources_elision.mlir b/mlir/test/Bytecode/resources_elision.mlir
new file mode 100644
index 000000000000000..bca70012f47a949
--- /dev/null
+++ b/mlir/test/Bytecode/resources_elision.mlir
@@ -0,0 +1,21 @@
+// RUN: mlir-opt -emit-bytecode -elide-resource-data-from-bytecode %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: @TestDialectResources
+module @TestDialectResources attributes {
+  // CHECK: bytecode.test = dense_resource<decl_resource> : tensor<2xui32>
+  // CHECK: bytecode.test2 = dense_resource<resource> : tensor<4xf64>
+  // CHECK: bytecode.test3 = dense_resource<resource_2> : tensor<4xf64>
+  bytecode.test = dense_resource<decl_resource> : tensor<2xui32>,
+  bytecode.test2 = dense_resource<resource> : tensor<4xf64>,
+  bytecode.test3 = dense_resource<resource_2> : tensor<4xf64>
+} {}
+
+// CHECK-NOT: dialect_resources
+{-#
+  dialect_resources: {
+    builtin: {
+      resource: "0x08000000010000000000000002000000000000000300000000000000",
+      resource_2: "0x08000000010000000000000002000000000000000300000000000000"
+    }
+  }
+#-}


