[llvm] [AMDGPU][SelectionDAG] Use COPY instead of S_MOV_B32 to assign values to M0 (PR #132957)
Juan Manuel Martinez Caamaño via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 25 10:10:08 PDT 2025
https://github.com/jmmartinez created https://github.com/llvm/llvm-project/pull/132957
This is consistent with what's done on GISel. This allows the register coalescer to remove the redundant intermediate `s_mov_b32` instructions by using `m0` directly as the result register.
>From c76738a6d366ec7f1be3cb1d133afca1a6f92478 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Mon, 24 Mar 2025 16:16:19 +0100
Subject: [PATCH] [AMDGPU][SelectionDAG] Use COPY instead of S_MOV_B32 to
assign values to M0
This is consistent with what happens on the GISel side, and it allows the
register coalescer to remove the redundant intermediate s_mov_b32.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 +-
.../CodeGen/AMDGPU/extract_vector_dynelt.ll | 18 ++-
.../CodeGen/AMDGPU/indirect-addressing-si.ll | 104 +++++++-----------
.../CodeGen/AMDGPU/insert_vector_dynelt.ll | 14 +--
llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 10 +-
.../CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll | 25 ++---
.../CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll | 18 ++-
.../AMDGPU/llvm.amdgcn.ds.gws.barrier.ll | 33 ++----
.../CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll | 32 ++----
.../llvm.amdgcn.struct.buffer.load.lds.ll | 42 +++----
...mdgcn.struct.ptr.buffer.load.lds.gfx950.ll | 80 +++++---------
.../llvm.amdgcn.struct.ptr.buffer.load.lds.ll | 42 +++----
12 files changed, 154 insertions(+), 274 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8657c0389cd40..b0c18715ef810 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4691,7 +4691,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
} else {
// Move index from VCC into M0
if (Offset == 0) {
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
.addReg(CurrentIdxReg, RegState::Kill);
} else {
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
@@ -4805,7 +4805,7 @@ static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
if (Offset == 0) {
// clang-format off
- BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
.add(*Idx);
// clang-format on
} else {
@@ -5400,9 +5400,11 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return BB;
}
case AMDGPU::SI_INIT_M0: {
+ MachineOperand &M0Init = MI.getOperand(0);
BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
- TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .add(MI.getOperand(0));
+ TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
+ AMDGPU::M0)
+ .add(M0Init);
MI.eraseFromParent();
return BB;
}
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index a72e74167d564..10de973dac0c5 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -301,12 +301,11 @@ define amdgpu_kernel void @double8_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_mov_b32 s10, s0
; GCN-NEXT: s_mov_b32 s12, s0
; GCN-NEXT: s_mov_b32 s14, s0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s18, s18, 1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v15, s15
-; GCN-NEXT: s_mov_b32 m0, s18
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 m0, s18, 1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v4, s4
@@ -352,11 +351,10 @@ define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_mov_b32 s10, s0
; GCN-NEXT: s_mov_b32 s12, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s16, s16, 1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v15, s15
-; GCN-NEXT: s_mov_b32 m0, s16
+; GCN-NEXT: s_lshl_b32 m0, s16, 1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v4, s4
@@ -451,12 +449,11 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_mov_b32 s60, s36
; GCN-NEXT: s_mov_b32 s62, s36
; GCN-NEXT: s_mov_b32 s64, s36
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s2, s2, 1
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: v_mov_b32_e32 v31, s67
-; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 m0, s2, 1
; GCN-NEXT: v_mov_b32_e32 v2, s38
; GCN-NEXT: v_mov_b32_e32 v3, s39
; GCN-NEXT: v_mov_b32_e32 v4, s40
@@ -535,12 +532,11 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_mov_b32 s62, s36
; GCN-NEXT: s_mov_b32 s64, s36
; GCN-NEXT: s_mov_b32 s66, s36
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s2, s2, 1
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: v_mov_b32_e32 v31, s67
-; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 m0, s2, 1
; GCN-NEXT: v_mov_b32_e32 v2, s38
; GCN-NEXT: v_mov_b32_e32 v3, s39
; GCN-NEXT: v_mov_b32_e32 v4, s40
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index c75dc539cdcee..d0b54a866718c 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -154,8 +154,7 @@ define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) {
; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT: s_mov_b32 s2, -1
; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-MOVREL-NEXT: s_add_i32 s6, s6, 1
-; SI-MOVREL-NEXT: s_mov_b32 m0, s6
+; SI-MOVREL-NEXT: s_add_i32 m0, s6, 1
; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
@@ -183,8 +182,7 @@ define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) {
; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1
-; VI-MOVREL-NEXT: s_mov_b32 m0, s2
+; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1
; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
@@ -439,12 +437,12 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou
; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT: s_mov_b32 s2, -1
; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-MOVREL-NEXT: s_add_i32 s6, s6, 1
+; SI-MOVREL-NEXT: s_add_i32 m0, s6, 1
; SI-MOVREL-NEXT: s_or_b32 s8, s8, 1
; SI-MOVREL-NEXT: s_or_b32 s4, s23, 16
; SI-MOVREL-NEXT: s_or_b32 s5, s22, 15
-; SI-MOVREL-NEXT: s_or_b32 s7, s21, 14
-; SI-MOVREL-NEXT: s_or_b32 s20, s20, 13
+; SI-MOVREL-NEXT: s_or_b32 s6, s21, 14
+; SI-MOVREL-NEXT: s_or_b32 s7, s20, 13
; SI-MOVREL-NEXT: s_or_b32 s19, s19, 12
; SI-MOVREL-NEXT: s_or_b32 s18, s18, 11
; SI-MOVREL-NEXT: s_or_b32 s17, s17, 10
@@ -457,7 +455,6 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou
; SI-MOVREL-NEXT: s_or_b32 s10, s10, 3
; SI-MOVREL-NEXT: s_or_b32 s9, s9, 2
; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s8
-; SI-MOVREL-NEXT: s_mov_b32 m0, s6
; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s9
; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s10
; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s11
@@ -469,8 +466,8 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou
; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s17
; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s18
; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s19
-; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s20
-; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s7
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s7
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s6
; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s5
; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s4
; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
@@ -483,16 +480,16 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou
; VI-MOVREL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1
+; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1
; VI-MOVREL-NEXT: s_or_b32 s10, s10, 3
; VI-MOVREL-NEXT: s_or_b32 s9, s9, 2
; VI-MOVREL-NEXT: s_or_b32 s8, s8, 1
-; VI-MOVREL-NEXT: s_or_b32 s3, s23, 16
-; VI-MOVREL-NEXT: s_or_b32 s4, s22, 15
-; VI-MOVREL-NEXT: s_or_b32 s5, s21, 14
-; VI-MOVREL-NEXT: s_or_b32 s6, s20, 13
-; VI-MOVREL-NEXT: s_or_b32 s7, s19, 12
-; VI-MOVREL-NEXT: s_or_b32 s18, s18, 11
+; VI-MOVREL-NEXT: s_or_b32 s2, s23, 16
+; VI-MOVREL-NEXT: s_or_b32 s3, s22, 15
+; VI-MOVREL-NEXT: s_or_b32 s4, s21, 14
+; VI-MOVREL-NEXT: s_or_b32 s5, s20, 13
+; VI-MOVREL-NEXT: s_or_b32 s6, s19, 12
+; VI-MOVREL-NEXT: s_or_b32 s7, s18, 11
; VI-MOVREL-NEXT: s_or_b32 s17, s17, 10
; VI-MOVREL-NEXT: s_or_b32 s16, s16, 9
; VI-MOVREL-NEXT: s_or_b32 s15, s15, 8
@@ -503,7 +500,6 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou
; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s8
; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s9
; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s10
-; VI-MOVREL-NEXT: s_mov_b32 m0, s2
; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s11
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s12
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s13
@@ -511,12 +507,12 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou
; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s15
; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s16
; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s17
-; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s18
-; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s7
-; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s6
-; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s5
-; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s4
-; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s7
+; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s6
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s4
+; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s2
; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0
; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0
; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1
@@ -2079,7 +2075,7 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) {
; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-MOVREL-NEXT: s_add_i32 s6, s6, 1
+; SI-MOVREL-NEXT: s_add_i32 m0, s6, 1
; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
@@ -2094,7 +2090,6 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) {
; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000
-; SI-MOVREL-NEXT: s_mov_b32 m0, s6
; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT: s_mov_b32 s2, -1
; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16
@@ -2112,8 +2107,7 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) {
; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1
-; VI-MOVREL-NEXT: s_mov_b32 m0, s2
+; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
@@ -2435,7 +2429,7 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou
; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-MOVREL-NEXT: s_and_b32 s4, s6, 0xffff
+; SI-MOVREL-NEXT: s_and_b32 m0, s6, 0xffff
; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
@@ -2450,7 +2444,6 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou
; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000
-; SI-MOVREL-NEXT: s_mov_b32 m0, s4
; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT: s_mov_b32 s2, -1
; SI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16
@@ -2468,8 +2461,7 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou
; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-MOVREL-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-MOVREL-NEXT: s_mov_b32 m0, s2
+; VI-MOVREL-NEXT: s_and_b32 m0, s2, 0xffff
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
@@ -2794,7 +2786,7 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out,
; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; SI-MOVREL-NEXT: s_sext_i32_i16 s4, s6
-; SI-MOVREL-NEXT: s_add_i32 s4, s4, 1
+; SI-MOVREL-NEXT: s_add_i32 m0, s4, 1
; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
@@ -2809,7 +2801,6 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out,
; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000
-; SI-MOVREL-NEXT: s_mov_b32 m0, s4
; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT: s_mov_b32 s2, -1
; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16
@@ -2828,8 +2819,7 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out,
; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT: s_sext_i32_i16 s2, s2
-; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1
-; VI-MOVREL-NEXT: s_mov_b32 m0, s2
+; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
@@ -6932,9 +6922,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
-; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-MOVREL-NEXT: s_add_i32 s3, s2, 1
; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_add_i32 m0, s2, 1
; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
@@ -6948,10 +6938,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
; SI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000
-; SI-MOVREL-NEXT: s_mov_b32 m0, s3
; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32
-; SI-MOVREL-NEXT: s_add_i32 s2, s2, 2
; SI-MOVREL-NEXT: v_mov_b32_e32 v31, v15
+; SI-MOVREL-NEXT: s_add_i32 m0, s2, 2
; SI-MOVREL-NEXT: v_mov_b32_e32 v30, v14
; SI-MOVREL-NEXT: v_mov_b32_e32 v29, v13
; SI-MOVREL-NEXT: v_mov_b32_e32 v28, v12
@@ -6967,7 +6956,6 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; SI-MOVREL-NEXT: v_mov_b32_e32 v18, v2
; SI-MOVREL-NEXT: v_mov_b32_e32 v17, v1
; SI-MOVREL-NEXT: v_mov_b32_e32 v16, v0
-; SI-MOVREL-NEXT: s_mov_b32 m0, s2
; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT: s_mov_b32 s2, -1
; SI-MOVREL-NEXT: v_movreld_b32_e32 v16, v32
@@ -6988,9 +6976,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
-; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-MOVREL-NEXT: s_add_i32 s3, s2, 1
; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
@@ -7004,11 +6992,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
; VI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000
-; VI-MOVREL-NEXT: s_mov_b32 m0, s3
-; VI-MOVREL-NEXT: s_add_i32 s2, s2, 2
; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32
+; VI-MOVREL-NEXT: s_add_i32 m0, s2, 2
; VI-MOVREL-NEXT: v_mov_b32_e32 v31, v15
-; VI-MOVREL-NEXT: s_mov_b32 m0, s2
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT: v_mov_b32_e32 v30, v14
; VI-MOVREL-NEXT: v_mov_b32_e32 v29, v13
@@ -8057,8 +8043,7 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; SI-MOVREL-NEXT: s_add_i32 s12, s12, 15
-; SI-MOVREL-NEXT: s_mov_b32 m0, s12
+; SI-MOVREL-NEXT: s_add_i32 m0, s12, 15
; SI-MOVREL-NEXT: s_mov_b32 s4, s0
; SI-MOVREL-NEXT: s_mov_b32 s5, s1
; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
@@ -8089,9 +8074,8 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out
; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3
-; VI-MOVREL-NEXT: s_add_i32 s6, s6, 15
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2
-; VI-MOVREL-NEXT: s_mov_b32 m0, s6
+; VI-MOVREL-NEXT: s_add_i32 m0, s6, 15
; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0
@@ -8321,8 +8305,7 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; SI-MOVREL-NEXT: s_add_i32 s12, s12, 16
-; SI-MOVREL-NEXT: s_mov_b32 m0, s12
+; SI-MOVREL-NEXT: s_add_i32 m0, s12, 16
; SI-MOVREL-NEXT: s_mov_b32 s4, s0
; SI-MOVREL-NEXT: s_mov_b32 s5, s1
; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
@@ -8353,9 +8336,8 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p
; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3
-; VI-MOVREL-NEXT: s_add_i32 s6, s6, 16
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2
-; VI-MOVREL-NEXT: s_mov_b32 m0, s6
+; VI-MOVREL-NEXT: s_add_i32 m0, s6, 16
; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0
@@ -8586,9 +8568,8 @@ define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out,
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: s_lshl_b32 m0, s12, 2
; SI-MOVREL-NEXT: s_mov_b32 s4, s0
-; SI-MOVREL-NEXT: s_lshl_b32 s0, s12, 2
-; SI-MOVREL-NEXT: s_mov_b32 m0, s0
; SI-MOVREL-NEXT: s_mov_b32 s5, s1
; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v1
; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -8618,12 +8599,11 @@ define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out,
; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3
-; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0
-; VI-MOVREL-NEXT: s_lshl_b32 s0, s6, 2
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2
-; VI-MOVREL-NEXT: s_mov_b32 m0, s0
+; VI-MOVREL-NEXT: s_lshl_b32 m0, s6, 2
; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0
; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s1
; VI-MOVREL-NEXT: v_movrels_b32_e32 v0, v1
; VI-MOVREL-NEXT: flat_store_dword v[16:17], v0
@@ -8862,7 +8842,7 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out,
; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x40a00000
; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-MOVREL-NEXT: s_lshl_b32 s4, s6, 2
+; SI-MOVREL-NEXT: s_lshl_b32 m0, s6, 2
; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s8
; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s9
; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s10
@@ -8879,7 +8859,6 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out,
; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s21
; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s22
; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s23
-; SI-MOVREL-NEXT: s_mov_b32 m0, s4
; SI-MOVREL-NEXT: s_mov_b32 s2, -1
; SI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16
; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
@@ -8895,9 +8874,8 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out,
; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x40a00000
; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-MOVREL-NEXT: s_lshl_b32 s2, s2, 2
+; VI-MOVREL-NEXT: s_lshl_b32 m0, s2, 2
; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s8
-; VI-MOVREL-NEXT: s_mov_b32 m0, s2
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s9
; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s10
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 6f0c850117208..4b9da7b49e997 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -654,7 +654,7 @@ define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %v
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s2, s2, 1
+; GCN-NEXT: s_lshl_b32 m0, s2, 1
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
@@ -671,7 +671,6 @@ define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %v
; GCN-NEXT: v_mov_b32_e32 v13, s21
; GCN-NEXT: v_mov_b32_e32 v14, s22
; GCN-NEXT: v_mov_b32_e32 v15, s23
-; GCN-NEXT: s_mov_b32 m0, s2
; GCN-NEXT: s_add_u32 s2, s0, 48
; GCN-NEXT: v_movreld_b32_e32 v0, 0
; GCN-NEXT: s_addc_u32 s3, s1, 0
@@ -720,15 +719,14 @@ define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %v
; GCN-NEXT: v_mov_b32_e32 v4, s12
; GCN-NEXT: v_mov_b32_e32 v5, s13
; GCN-NEXT: v_mov_b32_e32 v6, s14
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s0, s0, 1
; GCN-NEXT: v_mov_b32_e32 v7, s15
; GCN-NEXT: v_mov_b32_e32 v9, s1
; GCN-NEXT: v_mov_b32_e32 v10, s2
; GCN-NEXT: v_mov_b32_e32 v11, s3
; GCN-NEXT: v_mov_b32_e32 v12, s16
; GCN-NEXT: v_mov_b32_e32 v13, s17
-; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 m0, s0, 1
; GCN-NEXT: v_movreld_b32_e32 v0, 0
; GCN-NEXT: s_add_u32 s0, s6, 16
; GCN-NEXT: v_movreld_b32_e32 v1, v16
@@ -765,8 +763,7 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double>
; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s36
-; GCN-NEXT: s_lshl_b32 s0, s0, 1
-; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: s_lshl_b32 m0, s0, 1
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: v_mov_b32_e32 v2, s38
@@ -872,8 +869,7 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double>
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: v_mov_b32_e32 v4, s12
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s0, s0, 1
-; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: s_lshl_b32 m0, s0, 1
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v5, s13
; GCN-NEXT: v_mov_b32_e32 v6, s14
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 20d2b12a1ebfe..837c18fe7af0a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -2266,13 +2266,12 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4
define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 x double> %a, i32 %b) #0 {
; SI-LABEL: dynamic_insertelement_v8f64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s4, s[8:9], 0x20
; SI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x10
+; SI-NEXT: s_load_dword s4, s[8:9], 0x20
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: v_mov_b32_e32 v16, 0x40200000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_lshl_b32 s4, s4, 1
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
@@ -2289,7 +2288,7 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8
; SI-NEXT: v_mov_b32_e32 v13, s25
; SI-NEXT: v_mov_b32_e32 v14, s26
; SI-NEXT: v_mov_b32_e32 v15, s27
-; SI-NEXT: s_mov_b32 m0, s4
+; SI-NEXT: s_lshl_b32 m0, s4, 1
; SI-NEXT: v_movreld_b32_e32 v0, 0
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_movreld_b32_e32 v1, v16
@@ -2301,13 +2300,12 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8
;
; VI-LABEL: dynamic_insertelement_v8f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[8:9], 0x80
; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40
+; VI-NEXT: s_load_dword s4, s[8:9], 0x80
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v16, 0x40200000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s4, s4, 1
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
@@ -2324,7 +2322,7 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8
; VI-NEXT: v_mov_b32_e32 v13, s25
; VI-NEXT: v_mov_b32_e32 v14, s26
; VI-NEXT: v_mov_b32_e32 v15, s27
-; VI-NEXT: s_mov_b32 m0, s4
+; VI-NEXT: s_lshl_b32 m0, s4, 1
; VI-NEXT: v_movreld_b32_e32 v0, 0
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_movreld_b32_e32 v1, v16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll
index d97fc356b30fc..2776e24379b9d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll
@@ -1,11 +1,11 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9,GCN-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-SDAG,GCN-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-SDAG,GCN-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9,CIPLUS-SDAG,GCN-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9,CIPLUS-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
; GCN-LABEL: {{^}}ds_append_lds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
@@ -35,8 +35,7 @@ define amdgpu_kernel void @ds_append_lds_max_offset(ptr addrspace(3) %lds, ptr a
; GCN-LABEL: {{^}}ds_append_no_fold_offset_si:
; GCN: s_load_dword [[PTR:s[0-9]+]]
-; SI: s_add_i32 [[PTR]], [[PTR]], 16
-; SI: s_mov_b32 m0, [[PTR]]
+; SI: s_add_i32 m0, [[PTR]], 16
; SI: ds_append [[RESULT:v[0-9]+]]{{$}}
; CIPLUS: s_mov_b32 m0, [[PTR]]
@@ -55,12 +54,8 @@ define amdgpu_kernel void @ds_append_no_fold_offset_si(ptr addrspace(4) %lds.ptr
; GCN-LABEL: {{^}}ds_append_lds_over_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
-; SI-SDAG: s_bitset1_b32 [[PTR]], 16
-; CIPLUS-SDAG: s_add_i32 [[PTR]], [[PTR]], 0x10000
-; GCN-SDAG: s_mov_b32 m0, [[PTR]]
-
-; SI-GISEL: s_bitset1_b32 m0, 16
-; CIPLUS-GISEL: s_add_u32 m0, [[PTR]], 0x10000
+; SI: s_or_b32 m0, [[PTR]], 0x10000
+; CIPLUS: s_add_{{i|u}}32 m0, [[PTR]], 0x10000
; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll
index ffd0142d1d42c..5795af702f34f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll
@@ -1,10 +1,10 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9,GCN-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,GCN-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,GCN-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9,GCN-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
; GCN-LABEL: {{^}}ds_consume_lds:
@@ -35,8 +35,7 @@ define amdgpu_kernel void @ds_consume_lds_max_offset(ptr addrspace(3) %lds, ptr
; GCN-LABEL: {{^}}ds_consume_no_fold_offset_si:
; GCN: s_load_dword [[PTR:s[0-9]+]]
-; SI: s_add_i32 [[PTR]], [[PTR]], 16
-; SI: s_mov_b32 m0, [[PTR]]
+; SI: s_add_i32 m0, [[PTR]], 16
; SI: ds_consume [[RESULT:v[0-9]+]]{{$}}
; CIPLUS: s_mov_b32 m0, [[PTR]]
@@ -55,11 +54,8 @@ define amdgpu_kernel void @ds_consume_no_fold_offset_si(ptr addrspace(4) %lds.pt
; GCN-LABEL: {{^}}ds_consume_lds_over_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
-; SI: s_bitset1_b32 [[PTR]], 16
-; CIPLUS-SDAG: s_add_i32 [[PTR]], [[PTR]], 0x10000
-; CIPLUS-GISEL: s_add_u32 [[PTR]], [[PTR]], 0x10000
-
-; GCN-SDAG: s_mov_b32 m0, [[PTR]]
+; SI: s_or_b32 m0, [[PTR]], 0x10000
+; CIPLUS: s_add_{{i|u}}32 m0, [[PTR]], 0x10000
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
index a6fd38cab13d0..1e031517adb30 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
@@ -4,12 +4,12 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG,GFX9 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL,GFX9 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG,GFX10 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL,GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG,GFX10 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL,GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX9 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX9 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s
; Make sure the op is emitted bundled with a waitcnt with and without the retry loop, and the bundle is not removed by ExpandPostRAPseudos.
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos -verify-machineinstrs < %s | FileCheck -check-prefix=MIR %s
@@ -61,11 +61,7 @@ define amdgpu_kernel void @gws_barrier_offset63(i32 %val) #0 {
; GCN-LABEL: {{^}}gws_barrier_sgpr_offset:
; NOLOOP-DAG: s_load_{{dwordx2|b64}} s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]]
-; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
-; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}
-
-; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16
-
+; NOLOOP-DAG: s_lshl_b32 m0, s[[OFFSET]], 16
; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]]
; NOLOOP: ds_gws_barrier [[GWS_VAL]] gds{{$}}
@@ -78,10 +74,7 @@ define amdgpu_kernel void @gws_barrier_sgpr_offset(i32 %val, i32 %offset) #0 {
; GCN-LABEL: {{^}}gws_barrier_sgpr_offset_add1:
; NOLOOP-DAG: s_load_{{dwordx2|b64}} s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]]
-; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
-; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}
-
-; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16
+; NOLOOP-DAG: s_lshl_b32 m0, s[[OFFSET]], 16
; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]]
; NOLOOP: ds_gws_barrier [[GWS_VAL]] offset:1 gds{{$}}
@@ -95,10 +88,7 @@ define amdgpu_kernel void @gws_barrier_sgpr_offset_add1(i32 %val, i32 %offset.ba
; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]]
; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
-; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
-; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}
-
-; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16
+; NOLOOP-DAG: s_lshl_b32 m0, [[READLANE]], 16
; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], [[BAR_NUM]]
; NOLOOP: ds_gws_barrier [[GWS_VAL]] gds{{$}}
@@ -113,10 +103,7 @@ define amdgpu_kernel void @gws_barrier_vgpr_offset(i32 %val) #0 {
; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]]
; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
-; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
-; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}
-
-; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16
+; NOLOOP-DAG: s_lshl_b32 m0, [[READLANE]], 16
; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], [[BAR_NUM]]
; NOLOOP: ds_gws_barrier [[GWS_VAL]] offset:3 gds{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
index 5db68e3a6c202..0949a60eae185 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
@@ -4,12 +4,12 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
; Minimum offset
; GCN-LABEL: {{^}}gws_init_offset0:
@@ -55,10 +55,7 @@ define amdgpu_kernel void @gws_init_offset63(i32 %val) #0 {
; GCN-LABEL: {{^}}gws_init_sgpr_offset:
; NOLOOP-DAG: s_load_{{dwordx2|b64}} s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]]
-; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
-; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}
-
-; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16
+; NOLOOP-DAG: s_lshl_b32 m0, s[[OFFSET]], 16
; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]]
; NOLOOP: ds_gws_init [[GWS_VAL]] gds{{$}}
@@ -71,10 +68,7 @@ define amdgpu_kernel void @gws_init_sgpr_offset(i32 %val, i32 %offset) #0 {
; GCN-LABEL: {{^}}gws_init_sgpr_offset_add1:
; NOLOOP-DAG: s_load_{{dwordx2|b64}} s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]]
-; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
-; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}
-
-; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16
+; NOLOOP-DAG: s_lshl_b32 m0, s[[OFFSET]], 16
; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]]
; NOLOOP: ds_gws_init [[GWS_VAL]] offset:1 gds{{$}}
@@ -88,10 +82,7 @@ define amdgpu_kernel void @gws_init_sgpr_offset_add1(i32 %val, i32 %offset.base)
; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]]
; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
-; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
-; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}
-
-; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16
+; NOLOOP-DAG: s_lshl_b32 m0, [[READLANE]], 16
; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
; NOLOOP: ds_gws_init v0 gds{{$}}
@@ -106,10 +97,7 @@ define amdgpu_kernel void @gws_init_vgpr_offset(i32 %val) #0 {
; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]]
; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
-; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
-; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}
-
-; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16
+; NOLOOP-DAG: s_lshl_b32 m0, [[READLANE]], 16
; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
; NOLOOP: ds_gws_init v0 offset:3 gds{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll
index 21dc07cf28fd6..5b752949859f2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll
@@ -1,36 +1,22 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,SDAG
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GISEL
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
declare void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
-; SDAG-LABEL: buffer_load_lds_dword:
-; SDAG: ; %bb.0: ; %main_body
-; SDAG-NEXT: v_mov_b32_e32 v0, 8
-; SDAG-NEXT: s_mov_b32 m0, s4
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds
-; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds
-; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds
-; SDAG-NEXT: v_mov_b32_e32 v0, s4
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: ds_read_b32 v0, v0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: buffer_load_lds_dword:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: s_mov_b32 m0, s4
-; GISEL-NEXT: v_mov_b32_e32 v0, 8
-; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds
-; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds
-; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds
-; GISEL-NEXT: v_mov_b32_e32 v0, s4
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: ds_read_b32 v0, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: ; return to shader part epilog
+; GCN-LABEL: buffer_load_lds_dword:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, 8
+; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds
+; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds
+; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
main_body:
call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll
index 654e72daffedd..c1b8df0898076 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950 %s
; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s
; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s
@@ -14,32 +14,18 @@ declare void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr
;---------------------------------------------------------------------
define amdgpu_ps float @buffer_load_lds_dwordx3(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
-; GFX950-SDAG-LABEL: buffer_load_lds_dwordx3:
-; GFX950-SDAG: ; %bb.0:
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 8
-; GFX950-SDAG-NEXT: s_mov_b32 m0, s4
-; GFX950-SDAG-NEXT: s_nop 0
-; GFX950-SDAG-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen lds
-; GFX950-SDAG-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:4 sc0 lds
-; GFX950-SDAG-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:8 nt lds
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX950-SDAG-NEXT: ds_read_b32 v0, v0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX950-GISEL-LABEL: buffer_load_lds_dwordx3:
-; GFX950-GISEL: ; %bb.0:
-; GFX950-GISEL-NEXT: s_mov_b32 m0, s4
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 8
-; GFX950-GISEL-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen lds
-; GFX950-GISEL-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:4 sc0 lds
-; GFX950-GISEL-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:8 nt lds
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s4
-; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX950-GISEL-NEXT: ds_read_b32 v0, v0
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: ; return to shader part epilog
+; GFX950-LABEL: buffer_load_lds_dwordx3:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_mov_b32 m0, s4
+; GFX950-NEXT: v_mov_b32_e32 v0, 8
+; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen lds
+; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:4 sc0 lds
+; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:8 nt lds
+; GFX950-NEXT: v_mov_b32_e32 v0, s4
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: ds_read_b32 v0, v0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: ; return to shader part epilog
call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 8, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 8, i32 0, i32 0, i32 4, i32 1)
call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 8, i32 0, i32 0, i32 8, i32 2)
@@ -107,32 +93,18 @@ define amdgpu_ps void @buffer_load_lds_dwordx3_vs_imm_offset(ptr addrspace(8) in
;---------------------------------------------------------------------
define amdgpu_ps float @buffer_load_lds_dwordx4(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
-; GFX950-SDAG-LABEL: buffer_load_lds_dwordx4:
-; GFX950-SDAG: ; %bb.0:
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 8
-; GFX950-SDAG-NEXT: s_mov_b32 m0, s4
-; GFX950-SDAG-NEXT: s_nop 0
-; GFX950-SDAG-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen lds
-; GFX950-SDAG-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:4 sc0 lds
-; GFX950-SDAG-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:8 nt lds
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX950-SDAG-NEXT: ds_read_b32 v0, v0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX950-GISEL-LABEL: buffer_load_lds_dwordx4:
-; GFX950-GISEL: ; %bb.0:
-; GFX950-GISEL-NEXT: s_mov_b32 m0, s4
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 8
-; GFX950-GISEL-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen lds
-; GFX950-GISEL-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:4 sc0 lds
-; GFX950-GISEL-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:8 nt lds
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s4
-; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX950-GISEL-NEXT: ds_read_b32 v0, v0
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: ; return to shader part epilog
+; GFX950-LABEL: buffer_load_lds_dwordx4:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_mov_b32 m0, s4
+; GFX950-NEXT: v_mov_b32_e32 v0, 8
+; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen lds
+; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:4 sc0 lds
+; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:8 nt lds
+; GFX950-NEXT: v_mov_b32_e32 v0, s4
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: ds_read_b32 v0, v0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: ; return to shader part epilog
call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 8, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 8, i32 0, i32 0, i32 4, i32 1)
call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 8, i32 0, i32 0, i32 8, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.ll
index 04a9f926acd5b..35c959f2e805c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.ll
@@ -1,36 +1,22 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,SDAG
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GISEL
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
declare void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
define amdgpu_ps float @buffer_load_lds_dword(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
-; SDAG-LABEL: buffer_load_lds_dword:
-; SDAG: ; %bb.0: ; %main_body
-; SDAG-NEXT: v_mov_b32_e32 v0, 8
-; SDAG-NEXT: s_mov_b32 m0, s4
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds
-; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds
-; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds
-; SDAG-NEXT: v_mov_b32_e32 v0, s4
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: ds_read_b32 v0, v0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: buffer_load_lds_dword:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: s_mov_b32 m0, s4
-; GISEL-NEXT: v_mov_b32_e32 v0, 8
-; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds
-; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds
-; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds
-; GISEL-NEXT: v_mov_b32_e32 v0, s4
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: ds_read_b32 v0, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: ; return to shader part epilog
+; GCN-LABEL: buffer_load_lds_dword:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, 8
+; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds
+; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds
+; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
main_body:
call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1)
More information about the llvm-commits
mailing list