[llvm-branch-commits] [llvm] DAG: Avoid stack usage in bitcast operand promotion to legal vector (PR #125637)

Mon Feb 3 21:54:17 PST 2025

llvmbot wrote:




@llvm/pr-subscribers-llvm-selectiondag

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

Avoid introducing stack usage when the source operand of a bitcast is an
illegal integer type that is cast to a legal vector type. This handling should
be extended to cover more situations, but this is the first case I noticed.

---

Patch is 156.41 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125637.diff


12 Files Affected:

- (modified) llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp (+34-1) 
- (modified) llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll (-160) 
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll (-9) 
- (modified) llvm/test/CodeGen/AMDGPU/ctpop16.ll (+54-274) 
- (modified) llvm/test/CodeGen/AMDGPU/kernel-args.ll (+122-611) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+17-23) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+195-1105) 
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+34-45) 
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i8.ll (+16-32) 
- (modified) llvm/test/CodeGen/AMDGPU/min.ll (+75-231) 
- (modified) llvm/test/CodeGen/AMDGPU/shl.ll (+13-46) 
- (modified) llvm/test/CodeGen/AMDGPU/sra.ll (+14-53) 


``````````diff

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 95fb8b406e51bf..eb0c5faa7fe1eb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2202,9 +2202,42 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) {
+  EVT OutVT = N->getValueType(0);
+  SDValue InOp = N->getOperand(0);
+  EVT InVT = InOp.getValueType();
+  EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+  SDLoc dl(N);
+
+  switch (getTypeAction(InVT)) {
+  case TargetLowering::TypePromoteInteger: {
+    if (OutVT.isVector()) {
+      EVT EltVT = OutVT.getVectorElementType();
+      TypeSize EltSize = EltVT.getSizeInBits();
+      TypeSize NInSize = NInVT.getSizeInBits();
+
+      if (NInSize.hasKnownScalarFactor(EltSize)) {
+        unsigned NumEltsWithPadding = NInSize.getKnownScalarFactor(EltSize);
+        EVT WideVecVT =
+            EVT::getVectorVT(*DAG.getContext(), EltVT, NumEltsWithPadding);
+
+        if (isTypeLegal(WideVecVT)) {
+          SDValue Promoted = GetPromotedInteger(InOp);
+          SDValue Cast = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Promoted);
+          return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OutVT, Cast,
+                             DAG.getVectorIdxConstant(0, dl));
+        }
+      }
+    }
+
+    break;
+  }
+  default:
+    break;
+  }
+
   // This should only occur in unusual situations like bitcasting to an
   // x86_fp80, so just turn it into a store+load
-  return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
+  return CreateStackStoreLoad(InOp, OutVT);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo) {
diff --git a/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll b/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll
index ab89bb293f6e6e..2c6aabec763306 100644
--- a/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll
@@ -80,15 +80,6 @@ define <5 x i32> @bitcast_i160_to_v5i32(i160 %int) {
 ; GFX9-LABEL: bitcast_i160_to_v5i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s33
-; GFX9-NEXT:    s_add_i32 s33, s32, 0x7c0
-; GFX9-NEXT:    s_and_b32 s33, s33, 0xfffff800
-; GFX9-NEXT:    s_mov_b32 s5, s34
-; GFX9-NEXT:    s_mov_b32 s34, s32
-; GFX9-NEXT:    s_addk_i32 s32, 0x1000
-; GFX9-NEXT:    s_mov_b32 s32, s34
-; GFX9-NEXT:    s_mov_b32 s34, s5
-; GFX9-NEXT:    s_mov_b32 s33, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: bitcast_i160_to_v5i32:
@@ -98,23 +89,6 @@ define <5 x i32> @bitcast_i160_to_v5i32(i160 %int) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s0, s33
-; GFX12-NEXT:    s_add_co_i32 s33, s32, 31
-; GFX12-NEXT:    s_mov_b32 s1, s34
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_and_not1_b32 s33, s33, 31
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s33 offset:8
-; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s33
-; GFX12-NEXT:    scratch_load_b128 v[0:3], off, s33
-; GFX12-NEXT:    s_mov_b32 s34, s32
-; GFX12-NEXT:    s_add_co_i32 s32, s32, 64
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 s32, s34
-; GFX12-NEXT:    s_mov_b32 s34, s1
-; GFX12-NEXT:    s_mov_b32 s33, s0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %bitcast = bitcast i160 %int to <5 x i32>
   ret <5 x i32> %bitcast
@@ -124,15 +98,6 @@ define <6 x i32> @bitcast_i192_to_v6i32(i192 %int) {
 ; GFX9-LABEL: bitcast_i192_to_v6i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s33
-; GFX9-NEXT:    s_add_i32 s33, s32, 0x7c0
-; GFX9-NEXT:    s_and_b32 s33, s33, 0xfffff800
-; GFX9-NEXT:    s_mov_b32 s5, s34
-; GFX9-NEXT:    s_mov_b32 s34, s32
-; GFX9-NEXT:    s_addk_i32 s32, 0x1000
-; GFX9-NEXT:    s_mov_b32 s32, s34
-; GFX9-NEXT:    s_mov_b32 s34, s5
-; GFX9-NEXT:    s_mov_b32 s33, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: bitcast_i192_to_v6i32:
@@ -142,23 +107,6 @@ define <6 x i32> @bitcast_i192_to_v6i32(i192 %int) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s0, s33
-; GFX12-NEXT:    s_add_co_i32 s33, s32, 31
-; GFX12-NEXT:    s_mov_b32 s1, s34
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_and_not1_b32 s33, s33, 31
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s33 offset:8
-; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s33
-; GFX12-NEXT:    scratch_load_b128 v[0:3], off, s33
-; GFX12-NEXT:    s_mov_b32 s34, s32
-; GFX12-NEXT:    s_add_co_i32 s32, s32, 64
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 s32, s34
-; GFX12-NEXT:    s_mov_b32 s34, s1
-; GFX12-NEXT:    s_mov_b32 s33, s0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %bitcast = bitcast i192 %int to <6 x i32>
   ret <6 x i32> %bitcast
@@ -168,15 +116,6 @@ define <7 x i32> @bitcast_i224_to_v7i32(i224 %int) {
 ; GFX9-LABEL: bitcast_i224_to_v7i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s33
-; GFX9-NEXT:    s_add_i32 s33, s32, 0x7c0
-; GFX9-NEXT:    s_and_b32 s33, s33, 0xfffff800
-; GFX9-NEXT:    s_mov_b32 s5, s34
-; GFX9-NEXT:    s_mov_b32 s34, s32
-; GFX9-NEXT:    s_addk_i32 s32, 0x1000
-; GFX9-NEXT:    s_mov_b32 s32, s34
-; GFX9-NEXT:    s_mov_b32 s34, s5
-; GFX9-NEXT:    s_mov_b32 s33, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: bitcast_i224_to_v7i32:
@@ -186,27 +125,6 @@ define <7 x i32> @bitcast_i224_to_v7i32(i224 %int) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s0, s33
-; GFX12-NEXT:    s_add_co_i32 s33, s32, 31
-; GFX12-NEXT:    s_mov_b32 s1, s34
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_and_not1_b32 s33, s33, 31
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s33 offset:8
-; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s33
-; GFX12-NEXT:    scratch_load_b128 v[0:3], off, s33
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    scratch_store_b32 off, v6, s33 offset:24
-; GFX12-NEXT:    scratch_store_b64 off, v[4:5], s33 offset:16
-; GFX12-NEXT:    scratch_load_b96 v[4:6], off, s33 offset:16
-; GFX12-NEXT:    s_mov_b32 s34, s32
-; GFX12-NEXT:    s_add_co_i32 s32, s32, 64
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 s32, s34
-; GFX12-NEXT:    s_mov_b32 s34, s1
-; GFX12-NEXT:    s_mov_b32 s33, s0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %bitcast = bitcast i224 %int to <7 x i32>
   ret <7 x i32> %bitcast
@@ -252,15 +170,6 @@ define <3 x i64> @bitcast_i192_to_v3i64(i192 %int) {
 ; GFX9-LABEL: bitcast_i192_to_v3i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s33
-; GFX9-NEXT:    s_add_i32 s33, s32, 0x7c0
-; GFX9-NEXT:    s_and_b32 s33, s33, 0xfffff800
-; GFX9-NEXT:    s_mov_b32 s5, s34
-; GFX9-NEXT:    s_mov_b32 s34, s32
-; GFX9-NEXT:    s_addk_i32 s32, 0x1000
-; GFX9-NEXT:    s_mov_b32 s32, s34
-; GFX9-NEXT:    s_mov_b32 s34, s5
-; GFX9-NEXT:    s_mov_b32 s33, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: bitcast_i192_to_v3i64:
@@ -270,23 +179,6 @@ define <3 x i64> @bitcast_i192_to_v3i64(i192 %int) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s0, s33
-; GFX12-NEXT:    s_add_co_i32 s33, s32, 31
-; GFX12-NEXT:    s_mov_b32 s1, s34
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_and_not1_b32 s33, s33, 31
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s33 offset:8
-; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s33
-; GFX12-NEXT:    scratch_load_b128 v[0:3], off, s33
-; GFX12-NEXT:    s_mov_b32 s34, s32
-; GFX12-NEXT:    s_add_co_i32 s32, s32, 64
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 s32, s34
-; GFX12-NEXT:    s_mov_b32 s34, s1
-; GFX12-NEXT:    s_mov_b32 s33, s0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %bitcast = bitcast i192 %int to <3 x i64>
   ret <3 x i64> %bitcast
@@ -408,15 +300,6 @@ define <5 x float> @bitcast_i160_to_v5f32(i160 %int) {
 ; GFX9-LABEL: bitcast_i160_to_v5f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s33
-; GFX9-NEXT:    s_add_i32 s33, s32, 0x7c0
-; GFX9-NEXT:    s_and_b32 s33, s33, 0xfffff800
-; GFX9-NEXT:    s_mov_b32 s5, s34
-; GFX9-NEXT:    s_mov_b32 s34, s32
-; GFX9-NEXT:    s_addk_i32 s32, 0x1000
-; GFX9-NEXT:    s_mov_b32 s32, s34
-; GFX9-NEXT:    s_mov_b32 s34, s5
-; GFX9-NEXT:    s_mov_b32 s33, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: bitcast_i160_to_v5f32:
@@ -426,23 +309,6 @@ define <5 x float> @bitcast_i160_to_v5f32(i160 %int) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s0, s33
-; GFX12-NEXT:    s_add_co_i32 s33, s32, 31
-; GFX12-NEXT:    s_mov_b32 s1, s34
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_and_not1_b32 s33, s33, 31
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s33 offset:8
-; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s33
-; GFX12-NEXT:    scratch_load_b128 v[0:3], off, s33
-; GFX12-NEXT:    s_mov_b32 s34, s32
-; GFX12-NEXT:    s_add_co_i32 s32, s32, 64
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 s32, s34
-; GFX12-NEXT:    s_mov_b32 s34, s1
-; GFX12-NEXT:    s_mov_b32 s33, s0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %bitcast = bitcast i160 %int to <5 x float>
   ret <5 x float> %bitcast
@@ -452,15 +318,6 @@ define <6 x float> @bitcast_i192_to_v6f32(i192 %int) {
 ; GFX9-LABEL: bitcast_i192_to_v6f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s33
-; GFX9-NEXT:    s_add_i32 s33, s32, 0x7c0
-; GFX9-NEXT:    s_and_b32 s33, s33, 0xfffff800
-; GFX9-NEXT:    s_mov_b32 s5, s34
-; GFX9-NEXT:    s_mov_b32 s34, s32
-; GFX9-NEXT:    s_addk_i32 s32, 0x1000
-; GFX9-NEXT:    s_mov_b32 s32, s34
-; GFX9-NEXT:    s_mov_b32 s34, s5
-; GFX9-NEXT:    s_mov_b32 s33, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: bitcast_i192_to_v6f32:
@@ -470,23 +327,6 @@ define <6 x float> @bitcast_i192_to_v6f32(i192 %int) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s0, s33
-; GFX12-NEXT:    s_add_co_i32 s33, s32, 31
-; GFX12-NEXT:    s_mov_b32 s1, s34
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_and_not1_b32 s33, s33, 31
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s33 offset:8
-; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s33
-; GFX12-NEXT:    scratch_load_b128 v[0:3], off, s33
-; GFX12-NEXT:    s_mov_b32 s34, s32
-; GFX12-NEXT:    s_add_co_i32 s32, s32, 64
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 s32, s34
-; GFX12-NEXT:    s_mov_b32 s34, s1
-; GFX12-NEXT:    s_mov_b32 s33, s0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %bitcast = bitcast i192 %int to <6 x float>
   ret <6 x float> %bitcast
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index 5f49e69a58ed87..405058b24dcc21 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -3110,17 +3110,8 @@ define void @store_i160(i160 %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_i160:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, s33
-; SDAG-NEXT:    s_add_i32 s33, s32, 0x7c0
-; SDAG-NEXT:    s_and_b32 s33, s33, 0xfffff800
-; SDAG-NEXT:    s_mov_b32 s5, s34
-; SDAG-NEXT:    s_mov_b32 s34, s32
-; SDAG-NEXT:    s_addk_i32 s32, 0x1000
 ; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    buffer_store_dword v4, off, s[16:19], 0 offset:16
-; SDAG-NEXT:    s_mov_b32 s32, s34
-; SDAG-NEXT:    s_mov_b32 s34, s5
-; SDAG-NEXT:    s_mov_b32 s33, s4
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index 17ab8fc780fb41..6bf126af5ade23 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -457,58 +457,27 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr
 ;
 ; EG-LABEL: v_ctpop_v4i16:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 3, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 37, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XY, T0.X, 1
+; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T6.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 6:
-; EG-NEXT:     VTX_READ_64 T8.XY, T0.X, 0, #1
+; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 8:
-; EG-NEXT:     MOV T0.Y, T4.X,
-; EG-NEXT:     LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
 ; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
 ; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT:    ALU clause starting at 12:
-; EG-NEXT:     AND_INT * T0.W, T8.X, literal.x,
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     BCNT_INT T0.W, PV.W,
-; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT:    -65536(nan), 0(0.000000e+00)
-; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
-; EG-NEXT:     MOV * T4.X, PV.W,
-; EG-NEXT:     MOV T0.X, PV.X,
-; EG-NEXT:     LSHR * T0.W, T8.X, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BCNT_INT T0.W, PV.W,
-; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT:     MOV T4.X, PV.W,
-; EG-NEXT:     MOV * T0.X, T5.X,
-; EG-NEXT:     AND_INT * T0.W, T8.Y, literal.x,
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     BCNT_INT T0.W, PV.W,
-; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
-; EG-NEXT:    -65536(nan), 0(0.000000e+00)
-; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
-; EG-NEXT:     MOV * T5.X, PV.W,
-; EG-NEXT:     MOV T0.X, PV.X,
-; EG-NEXT:     LSHR * T0.W, T8.Y, literal.x,
+; EG-NEXT:    ALU clause starting at 11:
+; EG-NEXT:     LSHR * T0.W, T0.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BCNT_INT T0.W, PV.W,
-; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
+; EG-NEXT:     BCNT_INT T0.Y, PV.W,
+; EG-NEXT:     AND_INT * T0.W, T0.X, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT:     OR_INT * T8.Y, T1.W, PV.W,
+; EG-NEXT:     BCNT_INT T0.X, PV.W,
+; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     MOV T5.X, PV.Y,
-; EG-NEXT:     MOV * T8.X, T4.X,
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid
   %val = load <4 x i16>, ptr addrspace(1) %in.gep, align 16
@@ -601,94 +570,33 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr
 ;
 ; EG-LABEL: v_ctpop_v8i16:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 3, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 73, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T12.X, 1
+; EG-NEXT:    ALU 13, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T8.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 6:
-; EG-NEXT:     VTX_READ_128 T12.XYZW, T0.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 8:
-; EG-NEXT:     MOV T0.Y, T4.X,
-; EG-NEXT:     LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
 ; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT:    ALU clause starting at 12:
-; EG-NEXT:     LSHR * T0.W, T12.X, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BCNT_INT * T0.W, PV.W,
-; EG-NEXT:     LSHL T0.W, PV.W, literal.x,
-; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
-; EG-NEXT:     MOV * T4.X, PV.W,
-; EG-NEXT:     MOV T0.X, PV.X,
-; EG-NEXT:     AND_INT * T0.W, T12.X, literal.x,
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     BCNT_INT T0.W, PV.W,
-; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT:    -65536(nan), 0(0.000000e+00)
-; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
-; EG-NEXT:     MOV T4.X, PV.W,
-; EG-NEXT:     MOV * T0.X, T5.X,
-; EG-NEXT:     LSHR * T0.W, T12.Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BCNT_INT T0.W, PV.W,
-; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT:     MOV * T5.X, PV.W,
-; EG-NEXT:     MOV T0.X, PV.X,
-; EG-NEXT:     AND_INT * T0.W, T12.Y, literal.x,
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     BCNT_INT T0.W, PV.W,
-; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT:    -65536(nan), 0(0.000000e+00)
-; EG-NEXT:     OR_INT * T0.Y, PS, PV.W,
-; EG-NEXT:     MOV T5.X, PV.Y,
-; EG-NEXT:     MOV * T0.X, T8.X,
-; EG-NEXT:     LSHR * T0.W, T12.Z, literal.x,
+; EG-NEXT:    ALU clause starting at 11:
+; EG-NEXT:     LSHR * T0.W, T0.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BCNT_INT T0.W, PV.W,
-; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT:     MOV * T8.X, PV.W,
-; EG-NEXT:     MOV T0.X, PV.X,
-; EG-NEXT:     AND_INT * T0.W, T12.Z, literal.x,
+; EG-NEXT:     AND_INT * T1.W, T0.Z, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     BCNT_INT T0.W, PV.W,
-; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT:    -65536(nan), 0(0.000000e+00)
-; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
-; EG-NEXT:     MOV T8.X, PV.W,
-; EG-NEXT:     MOV * T0.X, T9.X,
-; EG-NEXT:     LSHR * T0.W, T12.W, literal.x,
+; EG-NEXT:     BCNT_INT T0.Z, PS,
+; EG-NEXT:     LSHR * T1.W, T0.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BCNT_INT T0.W, PV.W,
+; EG-NEXT:     BCNT_INT T0.Y, PV.W,
 ; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT:     MOV * T9.X, PV.W,
-; EG-NEXT:     MOV T0.X, PV.X,
-; EG-NEXT:     AND_INT * T0.W, T12.W, literal.x,
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     BCNT_INT T0.W, PV.W,
-; EG-NEXT:   ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/125637