[llvm] Reland: CodeGen: Record MMOs in finalizeBundle (PR #166689)

via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 5 18:50:24 PST 2025


Nicolai =?utf-8?q?Hähnle?= <nicolai.haehnle at amd.com>
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/166689 at github.com>


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-globalisel

Author: Nicolai Hähnle (nhaehnle)

<details>
<summary>Changes</summary>

(original PR: #<!-- -->166210)

---

Patch is 2.24 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166689.diff


50 Files Affected:

- (modified) llvm/lib/CodeGen/MIRParser/MIParser.cpp (+2) 
- (modified) llvm/lib/CodeGen/MachineInstrBundle.cpp (+6) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+3) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll (+10-10) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll (+3-4) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+3827-4097) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+141-160) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+288-343) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll (+74-83) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll (+108-123) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll (+96-100) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll (+150-182) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll (+279-330) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+200-212) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+456-503) 
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll (+39-10) 
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+6-6) 
- (modified) llvm/test/CodeGen/AMDGPU/ds_write2.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (+32-31) 
- (modified) llvm/test/CodeGen/AMDGPU/finalizebundle.mir (+52) 
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll (+50-50) 
- (modified) llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll (+13-8) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll (+64-92) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+9-9) 
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i32.ll (+5-5) 
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i8.ll (+7-8) 
- (modified) llvm/test/CodeGen/AMDGPU/load-local-i16.ll (+105-109) 
- (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll (+14-17) 
- (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll (+27-31) 
- (modified) llvm/test/CodeGen/AMDGPU/max.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll (+872-1021) 
- (modified) llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/packed-fp32.ll (+103-100) 
- (modified) llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir (+3-2) 
- (modified) llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+21-20) 
- (modified) llvm/test/CodeGen/AMDGPU/scratch-simple.ll (+870-868) 
- (modified) llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll (+12-3) 
- (modified) llvm/test/CodeGen/AMDGPU/spill-agpr.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (+10-12) 
- (modified) llvm/test/CodeGen/AMDGPU/stack-realign.ll (+1-1) 
- (modified) llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir (+23-22) 


``````````diff
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index 4795d81e3f348..434a579c3be3f 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -1161,6 +1161,8 @@ bool MIParser::parse(MachineInstr *&MI) {
       MemOperands.push_back(MemOp);
       if (Token.isNewlineOrEOF())
         break;
+      if (OpCode == TargetOpcode::BUNDLE && Token.is(MIToken::lbrace))
+        break;
       if (Token.isNot(MIToken::comma))
         return error("expected ',' before the next machine memory operand");
       lex();
diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp
index 88d81993fbe55..a8dc614288f20 100644
--- a/llvm/lib/CodeGen/MachineInstrBundle.cpp
+++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -137,6 +137,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
   SmallSet<Register, 8> KilledUseSet;
   SmallSet<Register, 8> UndefUseSet;
   SmallVector<std::pair<Register, Register>> TiedOperands;
+  SmallVector<MachineInstr *> MemMIs;
   for (auto MII = FirstMI; MII != LastMI; ++MII) {
     // Debug instructions have no effects to track.
     if (MII->isDebugInstr())
@@ -200,6 +201,9 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
       MIB.setMIFlag(MachineInstr::FrameSetup);
     if (MII->getFlag(MachineInstr::FrameDestroy))
       MIB.setMIFlag(MachineInstr::FrameDestroy);
+
+    if (MII->mayLoadOrStore())
+      MemMIs.push_back(&*MII);
   }
 
   for (Register Reg : LocalDefs) {
@@ -225,6 +229,8 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
     assert(UseIdx < ExternUses.size());
     MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx);
   }
+
+  MIB->cloneMergedMemRefs(MF, MemMIs);
 }
 
 /// finalizeBundle - Same functionality as the previous finalizeBundle except
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9460145d47111..6ce18ea921a9b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3917,6 +3917,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
   if (isLDSDMA(MIa) || isLDSDMA(MIb))
     return false;
 
+  if (MIa.isBundle() || MIb.isBundle())
+    return false;
+
   // TODO: Should we check the address space from the MachineMemOperand? That
   // would allow us to distinguish objects we know don't alias based on the
   // underlying address space, even if it was lowered to a different one,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
index 1812e17800e71..10e83b70a57d4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
@@ -189,15 +189,11 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s4
-; GFX10-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX10-NEXT:    s_lshr_b32 s8, s2, 16
-; GFX10-NEXT:    s_and_b32 s9, 0xffff, s2
 ; GFX10-NEXT:    s_lshr_b32 s5, s5, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX10-NEXT:    s_lshr_b32 s0, s7, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s6
-; GFX10-NEXT:    v_mov_b32_e32 v7, s1
-; GFX10-NEXT:    s_lshr_b32 s1, s9, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v8, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v9, s0
 ; GFX10-NEXT:    ds_write_b8 v1, v0
@@ -208,18 +204,22 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
 ; GFX10-NEXT:    ds_write_b8 v1, v8 offset:1
 ; GFX10-NEXT:    ds_write_b8 v1, v9 offset:5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s8
-; GFX10-NEXT:    v_mov_b32_e32 v3, s2
-; GFX10-NEXT:    v_mov_b32_e32 v10, s1
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 24
+; GFX10-NEXT:    s_and_b32 s9, 0xffff, s2
 ; GFX10-NEXT:    s_lshr_b32 s0, s2, 24
-; GFX10-NEXT:    ds_write_b8 v1, v7 offset:7
-; GFX10-NEXT:    ds_write_b8 v1, v3 offset:8
-; GFX10-NEXT:    ds_write_b8 v1, v10 offset:9
+; GFX10-NEXT:    v_mov_b32_e32 v7, s1
+; GFX10-NEXT:    s_lshr_b32 s1, s9, 8
+; GFX10-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX10-NEXT:    ds_write_b8 v1, v0 offset:10
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    s_and_b32 s0, 0xffff, s3
-; GFX10-NEXT:    s_lshr_b32 s1, s3, 16
+; GFX10-NEXT:    v_mov_b32_e32 v10, s1
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, 8
+; GFX10-NEXT:    s_lshr_b32 s1, s3, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s3
+; GFX10-NEXT:    ds_write_b8 v1, v7 offset:7
+; GFX10-NEXT:    ds_write_b8 v1, v3 offset:8
+; GFX10-NEXT:    ds_write_b8 v1, v10 offset:9
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX10-NEXT:    s_lshr_b32 s0, s3, 24
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index b33b8a7d8cd72..4a22a911c60b7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -272,10 +272,6 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GFX906-NEXT:    buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
 ; GFX906-NEXT:    buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
-; GFX906-NEXT:    global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
 ; GFX906-NEXT:    global_load_dwordx4 v[17:20], v4, s[0:1] offset:64
 ; GFX906-NEXT:    global_load_dwordx4 v[21:24], v4, s[0:1] offset:80
 ; GFX906-NEXT:    global_load_dwordx4 v[25:28], v4, s[0:1] offset:96
@@ -288,6 +284,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    global_load_dwordx4 v[53:56], v4, s[0:1] offset:208
 ; GFX906-NEXT:    global_load_dwordx4 v[57:60], v4, s[0:1] offset:224
 ; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1] offset:240
+; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
+; GFX906-NEXT:    global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 74552a500ac51..08e64da632d3b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -3105,22 +3105,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; SI-LABEL: bitcast_v32i32_to_v128i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
@@ -3253,6 +3237,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; kill: killed $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; SI-NEXT:    ; implicit-def: $vgpr45
 ; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr41
@@ -3284,14 +3284,13 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; kill: killed $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB12_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v33, v31, v32, 24
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -3523,7 +3522,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    s_cbranch_execz .LBB12_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
 ; SI-NEXT:    v_add_i32_e32 v31, vcc, 3, v31
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v32, vcc, 3, v32
 ; SI-NEXT:    v_alignbit_b32 v33, v31, v32, 24
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -4317,22 +4315,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; VI-LABEL: bitcast_v32i32_to_v128i8:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
@@ -4437,6 +4419,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr59
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
@@ -4542,129 +4540,129 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v26
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v26
+; VI-NEXT:    v_lshrrev_b64 v[54:55], 24, v[31:32]
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v26
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v25
+; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v25
+; VI-NEXT:    v_lshrrev_b64 v[54:55], 24, v[29:30]
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v24
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
+; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v24
+; VI-NEXT:    v_lshrrev_b64 v[54:55], 24, v[27:28]
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v23
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v23
+; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v22
+; VI-NEXT:    v_lshrrev_b64 v[54:55], 24, v[25:26]
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v22
+; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v21
+; VI-NEXT:    v_lshrrev_b64 v[54:55], 24, v[23:24]
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v21
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v20
+; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v20
+; VI-NEXT:    v_lshrrev_b64 v[54:55], 24, v[21:22]
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v20
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v19
+; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v19
+; VI-NEXT:    v_lshrrev_b64 v[54:55], 24, v[19:20]
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v18
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v18
+; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v18
+; VI-NEXT:    v_lshrrev_b64 v[54:55], 24, v[17:18]
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v17
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v17
+; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v16
+; VI-NEXT:    v_lshrrev_b64 v[54:55], 24, v[15:16]
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/166689


More information about the llvm-commits mailing list