[llvm-branch-commits] [llvm] CodeGen: Record MMOs in finalizeBundle (PR #166210)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Nov 3 11:04:44 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-globalisel
Author: Nicolai Hähnle (nhaehnle)
Changes:
This allows more accurate alias analysis to be applied at the bundle level.
This has a number of minor effects in post-RA scheduling that look mostly
beneficial to me, all of them in AMDGPU (the Thumb2 change is cosmetic).

The pre-existing (and unchanged) test in
CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll verifies that MIR
containing a bundle with MMOs can be parsed successfully.
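
For context, the functional core of the change is small: while finalizeBundle walks the instructions of a bundle, it now also collects their MachineMemOperands and attaches the combined list to the BUNDLE header. Below is a minimal standalone sketch of that step, written as an illustration only; the helper name is hypothetical and not part of the patch, which does this inline in finalizeBundle (see the diff further down).

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"

using namespace llvm;

// Hypothetical helper: gather the memory operands of every instruction in
// [FirstMI, LastMI) and record them on the BUNDLE instruction built by MIB.
static void recordBundleMemRefs(MachineBasicBlock &MBB,
                                MachineBasicBlock::instr_iterator FirstMI,
                                MachineBasicBlock::instr_iterator LastMI,
                                MachineInstrBuilder &MIB) {
  MachineFunction &MF = *MBB.getParent();
  SmallVector<MachineMemOperand *> MMOs;
  for (auto MII = FirstMI; MII != LastMI; ++MII) {
    // Debug instructions carry no memory operands worth recording.
    if (MII->isDebugInstr())
      continue;
    MMOs.append(MII->memoperands_begin(), MII->memoperands_end());
  }
  // The BUNDLE header now advertises every access made inside the bundle,
  // so bundle-level alias queries (e.g. in the post-RA scheduler) no longer
  // have to conservatively treat the bundle as touching unknown memory.
  MIB->setMemRefs(MF, MMOs);
}
```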
---
**Stack**:
- [5/5] #166213
- [4/5] #166212
- [3/5] #166211
- [2/5] #166210 ⬅
- [1/5] #166209
⚠️ *Part of a stack created by [spr](https://github.com/ejoffe/spr). Merging this PR using the GitHub UI may have unexpected results.*
---
Patch is 2.73 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166210.diff
52 Files Affected:
- (modified) llvm/lib/CodeGen/MIRParser/MIParser.cpp (+2)
- (modified) llvm/lib/CodeGen/MachineInstrBundle.cpp (+5)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll (+3-4)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+4839-5115)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+172-191)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+724-779)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll (+143-152)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll (+172-185)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll (+184-186)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll (+252-282)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll (+394-445)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+372-384)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+657-704)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll (+39-10)
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/ds_write2.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (+32-31)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll (+76-77)
- (modified) llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll (+13-8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll (+64-92)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i32.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i8.ll (+7-8)
- (modified) llvm/test/CodeGen/AMDGPU/load-local-i16.ll (+105-109)
- (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll (+14-17)
- (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll (+27-31)
- (modified) llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/max.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll (+968-1117)
- (modified) llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/packed-fp32.ll (+103-100)
- (modified) llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+21-20)
- (modified) llvm/test/CodeGen/AMDGPU/scratch-simple.ll (+1192-1178)
- (modified) llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll (+12-3)
- (modified) llvm/test/CodeGen/AMDGPU/spill-agpr.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (+10-12)
- (modified) llvm/test/CodeGen/AMDGPU/stack-realign.ll (+1-5)
- (modified) llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir (+23-22)
``````````diff
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index 4795d81e3f348..434a579c3be3f 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -1161,6 +1161,8 @@ bool MIParser::parse(MachineInstr *&MI) {
MemOperands.push_back(MemOp);
if (Token.isNewlineOrEOF())
break;
+ if (OpCode == TargetOpcode::BUNDLE && Token.is(MIToken::lbrace))
+ break;
if (Token.isNot(MIToken::comma))
return error("expected ',' before the next machine memory operand");
lex();
diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp
index 1f47dc50351d2..522e237d2096d 100644
--- a/llvm/lib/CodeGen/MachineInstrBundle.cpp
+++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -137,6 +137,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
SmallSet<Register, 8> KilledUseSet;
SmallSet<Register, 8> UndefUseSet;
SmallVector<std::pair<Register, Register>> TiedOperands;
+ SmallVector<MachineMemOperand *> MMOs;
for (auto MII = FirstMI; MII != LastMI; ++MII) {
// Debug instructions have no effects to track.
if (MII->isDebugInstr())
@@ -201,6 +202,8 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
MIB.setMIFlag(MachineInstr::FrameSetup);
if (MII->getFlag(MachineInstr::FrameDestroy))
MIB.setMIFlag(MachineInstr::FrameDestroy);
+
+ MMOs.insert(MMOs.end(), MII->memoperands_begin(), MII->memoperands_end());
}
for (Register Reg : LocalDefs) {
@@ -226,6 +229,8 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
assert(UseIdx < ExternUses.size());
MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx);
}
+
+ MIB->setMemRefs(MF, MMOs);
}
/// finalizeBundle - Same functionality as the previous finalizeBundle except
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index c2129c20e4543..6076a2eec44bc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -33,7 +33,6 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
; GCN-NEXT: v_mov_b32_e32 v13, s49
; GCN-NEXT: v_mov_b32_e32 v14, s50
; GCN-NEXT: v_mov_b32_e32 v15, s51
-; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
@@ -51,6 +50,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
; GCN-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:56
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:60
; GCN-NEXT: v_mov_b32_e32 v0, s52
+; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64
; GCN-NEXT: v_mov_b32_e32 v0, s53
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
index 1812e17800e71..10e83b70a57d4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
@@ -189,15 +189,11 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX10-NEXT: v_mov_b32_e32 v2, s1
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: s_lshr_b32 s1, s1, 24
; GFX10-NEXT: s_lshr_b32 s8, s2, 16
-; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
; GFX10-NEXT: s_lshr_b32 s5, s5, 8
; GFX10-NEXT: v_mov_b32_e32 v5, s0
; GFX10-NEXT: s_lshr_b32 s0, s7, 8
; GFX10-NEXT: v_mov_b32_e32 v6, s6
-; GFX10-NEXT: v_mov_b32_e32 v7, s1
-; GFX10-NEXT: s_lshr_b32 s1, s9, 8
; GFX10-NEXT: v_mov_b32_e32 v8, s5
; GFX10-NEXT: v_mov_b32_e32 v9, s0
; GFX10-NEXT: ds_write_b8 v1, v0
@@ -208,18 +204,22 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX10-NEXT: ds_write_b8 v1, v8 offset:1
; GFX10-NEXT: ds_write_b8 v1, v9 offset:5
; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: v_mov_b32_e32 v3, s2
-; GFX10-NEXT: v_mov_b32_e32 v10, s1
+; GFX10-NEXT: s_lshr_b32 s1, s1, 24
+; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
; GFX10-NEXT: s_lshr_b32 s0, s2, 24
-; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
-; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
-; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
+; GFX10-NEXT: v_mov_b32_e32 v7, s1
+; GFX10-NEXT: s_lshr_b32 s1, s9, 8
+; GFX10-NEXT: v_mov_b32_e32 v3, s2
; GFX10-NEXT: ds_write_b8 v1, v0 offset:10
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_and_b32 s0, 0xffff, s3
-; GFX10-NEXT: s_lshr_b32 s1, s3, 16
+; GFX10-NEXT: v_mov_b32_e32 v10, s1
; GFX10-NEXT: s_lshr_b32 s0, s0, 8
+; GFX10-NEXT: s_lshr_b32 s1, s3, 16
; GFX10-NEXT: v_mov_b32_e32 v2, s3
+; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
+; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
+; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
; GFX10-NEXT: v_mov_b32_e32 v3, s0
; GFX10-NEXT: s_lshr_b32 s0, s3, 24
; GFX10-NEXT: v_mov_b32_e32 v4, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index b33b8a7d8cd72..4a22a911c60b7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -272,10 +272,6 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
-; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:64
; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:80
; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:96
@@ -288,6 +284,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:208
; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:224
; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:240
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
+; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB6_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 74552a500ac51..746ffcff5667a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -3105,22 +3105,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-LABEL: bitcast_v32i32_to_v128i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
@@ -3253,6 +3237,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr41
@@ -3284,14 +3284,13 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB12_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -3523,7 +3522,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-NEXT: s_cbranch_execz .LBB12_4
; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -3946,8 +3944,24 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v13
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
@@ -4295,44 +4309,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v32i32_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -4437,6 +4419,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -4542,129 +4540,129 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, ...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/166210