[llvm] Revert "CodeGen: Record MMOs in finalizeBundle" (PR #166520)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 5 01:00:30 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Jan Patrick Lehr (jplehr)
<details>
<summary>Changes</summary>
Reverts llvm/llvm-project#166210
Reason: buildbot failures on the libc-on-GPU bot: https://lab.llvm.org/buildbot/#/builders/10/builds/16711
---
Patch is 2.73 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166520.diff
53 Files Affected:
- (modified) llvm/lib/CodeGen/MIRParser/MIParser.cpp (-2)
- (modified) llvm/lib/CodeGen/MachineInstrBundle.cpp (-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll (+4-3)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+5115-4839)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+191-172)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+779-724)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll (+152-143)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll (+185-172)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll (+186-184)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll (+282-252)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll (+445-394)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+384-372)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+704-657)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll (+10-39)
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/ds_write2.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (+31-32)
- (modified) llvm/test/CodeGen/AMDGPU/finalizebundle.mir (-52)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll (+77-76)
- (modified) llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll (+8-13)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll (+92-64)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i32.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i8.ll (+8-7)
- (modified) llvm/test/CodeGen/AMDGPU/load-local-i16.ll (+109-105)
- (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll (+17-14)
- (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll (+31-27)
- (modified) llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/max.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll (+1060-911)
- (modified) llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/packed-fp32.ll (+100-103)
- (modified) llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir (+2-3)
- (modified) llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+20-21)
- (modified) llvm/test/CodeGen/AMDGPU/scratch-simple.ll (+1183-1197)
- (modified) llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll (+3-12)
- (modified) llvm/test/CodeGen/AMDGPU/spill-agpr.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (+12-10)
- (modified) llvm/test/CodeGen/AMDGPU/stack-realign.ll (+1-1)
- (modified) llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir (+22-23)
``````````diff
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index 434a579c3be3f..4795d81e3f348 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -1161,8 +1161,6 @@ bool MIParser::parse(MachineInstr *&MI) {
MemOperands.push_back(MemOp);
if (Token.isNewlineOrEOF())
break;
- if (OpCode == TargetOpcode::BUNDLE && Token.is(MIToken::lbrace))
- break;
if (Token.isNot(MIToken::comma))
return error("expected ',' before the next machine memory operand");
lex();
diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp
index a8dc614288f20..88d81993fbe55 100644
--- a/llvm/lib/CodeGen/MachineInstrBundle.cpp
+++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -137,7 +137,6 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
SmallSet<Register, 8> KilledUseSet;
SmallSet<Register, 8> UndefUseSet;
SmallVector<std::pair<Register, Register>> TiedOperands;
- SmallVector<MachineInstr *> MemMIs;
for (auto MII = FirstMI; MII != LastMI; ++MII) {
// Debug instructions have no effects to track.
if (MII->isDebugInstr())
@@ -201,9 +200,6 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
MIB.setMIFlag(MachineInstr::FrameSetup);
if (MII->getFlag(MachineInstr::FrameDestroy))
MIB.setMIFlag(MachineInstr::FrameDestroy);
-
- if (MII->mayLoadOrStore())
- MemMIs.push_back(&*MII);
}
for (Register Reg : LocalDefs) {
@@ -229,8 +225,6 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
assert(UseIdx < ExternUses.size());
MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx);
}
-
- MIB->cloneMergedMemRefs(MF, MemMIs);
}
/// finalizeBundle - Same functionality as the previous finalizeBundle except
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index 6076a2eec44bc..c2129c20e4543 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -33,6 +33,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
; GCN-NEXT: v_mov_b32_e32 v13, s49
; GCN-NEXT: v_mov_b32_e32 v14, s50
; GCN-NEXT: v_mov_b32_e32 v15, s51
+; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
@@ -50,7 +51,6 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
; GCN-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:56
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:60
; GCN-NEXT: v_mov_b32_e32 v0, s52
-; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64
; GCN-NEXT: v_mov_b32_e32 v0, s53
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
index 10e83b70a57d4..1812e17800e71 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
@@ -189,11 +189,15 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX10-NEXT: v_mov_b32_e32 v2, s1
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: s_lshr_b32 s1, s1, 24
; GFX10-NEXT: s_lshr_b32 s8, s2, 16
+; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
; GFX10-NEXT: s_lshr_b32 s5, s5, 8
; GFX10-NEXT: v_mov_b32_e32 v5, s0
; GFX10-NEXT: s_lshr_b32 s0, s7, 8
; GFX10-NEXT: v_mov_b32_e32 v6, s6
+; GFX10-NEXT: v_mov_b32_e32 v7, s1
+; GFX10-NEXT: s_lshr_b32 s1, s9, 8
; GFX10-NEXT: v_mov_b32_e32 v8, s5
; GFX10-NEXT: v_mov_b32_e32 v9, s0
; GFX10-NEXT: ds_write_b8 v1, v0
@@ -204,22 +208,18 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX10-NEXT: ds_write_b8 v1, v8 offset:1
; GFX10-NEXT: ds_write_b8 v1, v9 offset:5
; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: s_lshr_b32 s1, s1, 24
-; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
-; GFX10-NEXT: s_lshr_b32 s0, s2, 24
-; GFX10-NEXT: v_mov_b32_e32 v7, s1
-; GFX10-NEXT: s_lshr_b32 s1, s9, 8
; GFX10-NEXT: v_mov_b32_e32 v3, s2
+; GFX10-NEXT: v_mov_b32_e32 v10, s1
+; GFX10-NEXT: s_lshr_b32 s0, s2, 24
+; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
+; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
+; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
; GFX10-NEXT: ds_write_b8 v1, v0 offset:10
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_and_b32 s0, 0xffff, s3
-; GFX10-NEXT: v_mov_b32_e32 v10, s1
-; GFX10-NEXT: s_lshr_b32 s0, s0, 8
; GFX10-NEXT: s_lshr_b32 s1, s3, 16
+; GFX10-NEXT: s_lshr_b32 s0, s0, 8
; GFX10-NEXT: v_mov_b32_e32 v2, s3
-; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
-; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
-; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
; GFX10-NEXT: v_mov_b32_e32 v3, s0
; GFX10-NEXT: s_lshr_b32 s0, s3, 24
; GFX10-NEXT: v_mov_b32_e32 v4, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index 4a22a911c60b7..b33b8a7d8cd72 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -272,6 +272,10 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
+; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:64
; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:80
; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:96
@@ -284,9 +288,6 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:208
; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:224
; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:240
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
-; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
-; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB6_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 746ffcff5667a..74552a500ac51 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -3105,6 +3105,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-LABEL: bitcast_v32i32_to_v128i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
@@ -3237,22 +3253,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr41
@@ -3284,13 +3284,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB12_2
; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -3522,6 +3523,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-NEXT: s_cbranch_execz .LBB12_4
; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -3944,24 +3946,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v13
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
@@ -4309,12 +4295,44 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v32i32_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -4419,22 +4437,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -4540,129 +4542,129 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
-...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/166520
More information about the llvm-commits mailing list