[llvm] [AMDGPU][SelectionDAG] Use COPY instead of S_MOV_B32 to assign values to M0 (PR #132957)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 25 10:10:44 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Juan Manuel Martinez Caamaño (jmmartinez)
<details>
<summary>Changes</summary>
This is consistent with what's done in GISel. Using `COPY` allows the register coalescer to remove the redundant intermediate `s_mov_b32` instructions by using `m0` directly as the result register.
---
Patch is 53.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132957.diff
12 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+6-4)
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll (+7-11)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+41-63)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll (+5-9)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll (+4-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll (+10-15)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll (+7-11)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll (+10-23)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll (+10-22)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll (+14-28)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll (+26-54)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.ll (+14-28)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8657c0389cd40..b0c18715ef810 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4691,7 +4691,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
} else {
// Move index from VCC into M0
if (Offset == 0) {
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
.addReg(CurrentIdxReg, RegState::Kill);
} else {
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
@@ -4805,7 +4805,7 @@ static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
if (Offset == 0) {
// clang-format off
- BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
.add(*Idx);
// clang-format on
} else {
@@ -5400,9 +5400,11 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return BB;
}
case AMDGPU::SI_INIT_M0: {
+ MachineOperand &M0Init = MI.getOperand(0);
BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
- TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .add(MI.getOperand(0));
+ TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
+ AMDGPU::M0)
+ .add(M0Init);
MI.eraseFromParent();
return BB;
}
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index a72e74167d564..10de973dac0c5 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -301,12 +301,11 @@ define amdgpu_kernel void @double8_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_mov_b32 s10, s0
; GCN-NEXT: s_mov_b32 s12, s0
; GCN-NEXT: s_mov_b32 s14, s0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s18, s18, 1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v15, s15
-; GCN-NEXT: s_mov_b32 m0, s18
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 m0, s18, 1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v4, s4
@@ -352,11 +351,10 @@ define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_mov_b32 s10, s0
; GCN-NEXT: s_mov_b32 s12, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s16, s16, 1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v15, s15
-; GCN-NEXT: s_mov_b32 m0, s16
+; GCN-NEXT: s_lshl_b32 m0, s16, 1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v4, s4
@@ -451,12 +449,11 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_mov_b32 s60, s36
; GCN-NEXT: s_mov_b32 s62, s36
; GCN-NEXT: s_mov_b32 s64, s36
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s2, s2, 1
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: v_mov_b32_e32 v31, s67
-; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 m0, s2, 1
; GCN-NEXT: v_mov_b32_e32 v2, s38
; GCN-NEXT: v_mov_b32_e32 v3, s39
; GCN-NEXT: v_mov_b32_e32 v4, s40
@@ -535,12 +532,11 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_mov_b32 s62, s36
; GCN-NEXT: s_mov_b32 s64, s36
; GCN-NEXT: s_mov_b32 s66, s36
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s2, s2, 1
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: v_mov_b32_e32 v31, s67
-; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 m0, s2, 1
; GCN-NEXT: v_mov_b32_e32 v2, s38
; GCN-NEXT: v_mov_b32_e32 v3, s39
; GCN-NEXT: v_mov_b32_e32 v4, s40
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index c75dc539cdcee..d0b54a866718c 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -154,8 +154,7 @@ define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) {
; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT: s_mov_b32 s2, -1
; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-MOVREL-NEXT: s_add_i32 s6, s6, 1
-; SI-MOVREL-NEXT: s_mov_b32 m0, s6
+; SI-MOVREL-NEXT: s_add_i32 m0, s6, 1
; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
@@ -183,8 +182,7 @@ define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) {
; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1
-; VI-MOVREL-NEXT: s_mov_b32 m0, s2
+; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1
; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
@@ -439,12 +437,12 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou
; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT: s_mov_b32 s2, -1
; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-MOVREL-NEXT: s_add_i32 s6, s6, 1
+; SI-MOVREL-NEXT: s_add_i32 m0, s6, 1
; SI-MOVREL-NEXT: s_or_b32 s8, s8, 1
; SI-MOVREL-NEXT: s_or_b32 s4, s23, 16
; SI-MOVREL-NEXT: s_or_b32 s5, s22, 15
-; SI-MOVREL-NEXT: s_or_b32 s7, s21, 14
-; SI-MOVREL-NEXT: s_or_b32 s20, s20, 13
+; SI-MOVREL-NEXT: s_or_b32 s6, s21, 14
+; SI-MOVREL-NEXT: s_or_b32 s7, s20, 13
; SI-MOVREL-NEXT: s_or_b32 s19, s19, 12
; SI-MOVREL-NEXT: s_or_b32 s18, s18, 11
; SI-MOVREL-NEXT: s_or_b32 s17, s17, 10
@@ -457,7 +455,6 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou
; SI-MOVREL-NEXT: s_or_b32 s10, s10, 3
; SI-MOVREL-NEXT: s_or_b32 s9, s9, 2
; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s8
-; SI-MOVREL-NEXT: s_mov_b32 m0, s6
; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s9
; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s10
; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s11
@@ -469,8 +466,8 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou
; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s17
; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s18
; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s19
-; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s20
-; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s7
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s7
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s6
; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s5
; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s4
; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
@@ -483,16 +480,16 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou
; VI-MOVREL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1
+; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1
; VI-MOVREL-NEXT: s_or_b32 s10, s10, 3
; VI-MOVREL-NEXT: s_or_b32 s9, s9, 2
; VI-MOVREL-NEXT: s_or_b32 s8, s8, 1
-; VI-MOVREL-NEXT: s_or_b32 s3, s23, 16
-; VI-MOVREL-NEXT: s_or_b32 s4, s22, 15
-; VI-MOVREL-NEXT: s_or_b32 s5, s21, 14
-; VI-MOVREL-NEXT: s_or_b32 s6, s20, 13
-; VI-MOVREL-NEXT: s_or_b32 s7, s19, 12
-; VI-MOVREL-NEXT: s_or_b32 s18, s18, 11
+; VI-MOVREL-NEXT: s_or_b32 s2, s23, 16
+; VI-MOVREL-NEXT: s_or_b32 s3, s22, 15
+; VI-MOVREL-NEXT: s_or_b32 s4, s21, 14
+; VI-MOVREL-NEXT: s_or_b32 s5, s20, 13
+; VI-MOVREL-NEXT: s_or_b32 s6, s19, 12
+; VI-MOVREL-NEXT: s_or_b32 s7, s18, 11
; VI-MOVREL-NEXT: s_or_b32 s17, s17, 10
; VI-MOVREL-NEXT: s_or_b32 s16, s16, 9
; VI-MOVREL-NEXT: s_or_b32 s15, s15, 8
@@ -503,7 +500,6 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou
; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s8
; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s9
; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s10
-; VI-MOVREL-NEXT: s_mov_b32 m0, s2
; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s11
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s12
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s13
@@ -511,12 +507,12 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou
; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s15
; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s16
; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s17
-; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s18
-; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s7
-; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s6
-; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s5
-; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s4
-; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s7
+; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s6
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s4
+; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s2
; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0
; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0
; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1
@@ -2079,7 +2075,7 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) {
; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-MOVREL-NEXT: s_add_i32 s6, s6, 1
+; SI-MOVREL-NEXT: s_add_i32 m0, s6, 1
; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
@@ -2094,7 +2090,6 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) {
; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000
-; SI-MOVREL-NEXT: s_mov_b32 m0, s6
; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT: s_mov_b32 s2, -1
; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16
@@ -2112,8 +2107,7 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) {
; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1
-; VI-MOVREL-NEXT: s_mov_b32 m0, s2
+; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
@@ -2435,7 +2429,7 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou
; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-MOVREL-NEXT: s_and_b32 s4, s6, 0xffff
+; SI-MOVREL-NEXT: s_and_b32 m0, s6, 0xffff
; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
@@ -2450,7 +2444,6 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou
; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000
-; SI-MOVREL-NEXT: s_mov_b32 m0, s4
; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT: s_mov_b32 s2, -1
; SI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16
@@ -2468,8 +2461,7 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou
; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-MOVREL-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-MOVREL-NEXT: s_mov_b32 m0, s2
+; VI-MOVREL-NEXT: s_and_b32 m0, s2, 0xffff
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
@@ -2794,7 +2786,7 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out,
; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT: s_sext_i32_i16 s4, s6
-; SI-MOVREL-NEXT: s_add_i32 s4, s4, 1
+; SI-MOVREL-NEXT: s_add_i32 m0, s4, 1
; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
@@ -2809,7 +2801,6 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out,
; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000
-; SI-MOVREL-NEXT: s_mov_b32 m0, s4
; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT: s_mov_b32 s2, -1
; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16
@@ -2828,8 +2819,7 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out,
; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT: s_sext_i32_i16 s2, s2
-; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1
-; VI-MOVREL-NEXT: s_mov_b32 m0, s2
+; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
@@ -6932,9 +6922,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
-; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-MOVREL-NEXT: s_add_i32 s3, s2, 1
; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_add_i32 m0, s2, 1
; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
@@ -6948,10 +6938,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
; SI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000
-; SI-MOVREL-NEXT: s_mov_b32 m0, s3
; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32
-; SI-MOVREL-NEXT: s_add_i32 s2, s2, 2
; SI-MOVREL-NEXT: v_mov_b32_e32 v31, v15
+; SI-MOVREL-NEXT: s_add_i32 m0, s2, 2
; SI-MOVREL-NEXT: v_mov_b32_e32 v30, v14
; SI-MOVREL-NEXT: v_mov_b32_e32 v29, v13
; SI-MOVREL-NEXT: v_mov_b32_e32 v28, v12
@@ -6967,7 +6956,6 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; SI-MOVREL-NEXT: v_mov_b32_e32 v18, v2
; SI-MOVREL-NEXT: v_mov_b32_e32 v17, v1
; SI-MOVREL-NEXT: v_mov_b32_e32 v16, v0
-; SI-MOVREL-NEXT: s_mov_b32 m0, s2
; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT: s_mov_b32 s2, -1
; SI-MOVREL-NEXT: v_movreld_b32_e32 v16, v32
@@ -6988,9 +6976,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
-; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-MOVREL-NEXT: s_add_i32 s3, s2, 1
; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
@@ -7004,11 +6992,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
; VI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000
-; VI-MOVREL-NEXT: s_mov_b32 m0, s3
-; VI-MOVREL-NEXT: s_add_i32 s2, s2, 2
; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32
+; VI-MOVREL-NEXT: s_add_i32 m0, s2, 2
; VI-MOVREL-NEXT: v_mov_b32_e32 v31, v15
-; VI-MOVREL-NEXT: s_mov_b32 m0, s2
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT: v_mov_b32_e32 v30, v14
; VI-MOVREL-NEXT: v_mov_b32_e32 v29, v13
@@ -8057,8 +8043,7 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; SI-MOVREL-NEXT: s_add_i32 s12, s12, 15
-; SI-MOVREL-NEXT: s_mov_b32 m0, s12
+; SI-MOVREL-NEXT: s_add_i32 m0, s12, 15
; SI-MOVREL-NEXT: s_mov_b32 s4, s0
; SI-MOVREL-NEXT: s_mov_b32 s5, s1
; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
@@ -8089,9 +8074,8 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out
; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3
-; VI-MOVREL-NEXT: s_add_i32 s6, s6, 15
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2
-; VI-MOVREL-NEXT: s_mov_b32 m0, s6
+; VI-MOVREL-NEXT: s_add_i32 m0, s6, 15
; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0
@@ -8321,8 +8305,7 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; SI-MOVREL-NEXT: s_add_i32 s12, s12, 16
-; SI-MOVREL-NEXT: s_mov_b32 m0, s12
+; SI-MOVREL-NEXT: s_add_i32 m0, s12, 16
; SI-MOVREL-NEXT: s_mov_b32 s4, s0
; SI-MOVREL-NEXT: s_mov_b32 s5, s1
; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
@@ -8353,9 +8336,8 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p
; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3
-; VI-MOVREL-NEXT: s_add_i32 s6, s6, 16
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2
-; VI-MOVREL-NEXT: s_mov_b32 m0, s6
+; VI-MOVREL-NEXT: s_add_i32 m0, s6, 16
; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0
@@ -8586,9 +8568,8 @@ define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out,
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: s_lshl_b32 m0, s12, 2
; SI-MOVREL-NEXT: s_mov_b32 s4, s0
-; SI-MOVREL-NEXT: s_lshl_b32 s0, s12, 2
-; SI-MOVREL-NEXT: s_mov_b32 m0, s0
; SI-MOVREL-NEXT: s_mov_b32 s5, s1
; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v1
; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -8618,12 +8599,11 @@ define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out,
; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3
-; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0
-; VI-MOVREL-NEXT: s_lshl_b32 s0, s6, 2
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2
-; VI-MOVREL-NEXT: s_mov_b32 m0, s0
+; VI-MOVREL-NEXT: s_lshl_b32 m0, s6, 2
; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0
; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s1
; VI-MOVREL-NEXT: v_movrels_b32_e32 v0, v1
; VI-MOVREL-NEXT: flat_store_dword v[16:17], v0
@@ -8862,7 +8842,7 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out,
; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x40a00000
; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-MOVREL-NEXT: s_lshl_b32 s4, s6, 2
+; SI-MOVREL-NEXT: s_lshl_b32 m0, s6, 2
; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s8
; SI-MOVREL-NEXT: v_...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/132957
More information about the llvm-commits
mailing list