[llvm] c8a9872 - AMDGPU/GlobalISel: Look through copies in getPtrBaseWithConstantOffset
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 17 09:31:44 PDT 2020
Author: Matt Arsenault
Date: 2020-08-17T12:31:38-04:00
New Revision: c8a98722590c57842720d6055df0875b5f6b40a4
URL: https://github.com/llvm/llvm-project/commit/c8a98722590c57842720d6055df0875b5f6b40a4
DIFF: https://github.com/llvm/llvm-project/commit/c8a98722590c57842720d6055df0875b5f6b40a4.diff
LOG: AMDGPU/GlobalISel: Look through copies in getPtrBaseWithConstantOffset
We may have an SGPR->VGPR copy if a totally uniform pointer
calculation is used for a VGPR pointer operand.
Also hack around a bug in MUBUF matching that would incorrectly select
MUBUF for global accesses when flat was requested. This should really be
a predicate on the parent pattern, but the DAG path always checked this
manually inside the complex pattern.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 5ab1af811e950..f64aaf5062c50 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3488,7 +3488,7 @@ AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) c
std::pair<Register, int64_t>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
Register Root, const MachineRegisterInfo &MRI) const {
- MachineInstr *RootI = MRI.getVRegDef(Root);
+ MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
return {Root, 0};
@@ -3679,6 +3679,11 @@ bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
MachineOperand &Root, Register &RSrcReg, Register &SOffset,
int64_t &Offset) const {
+
+ // FIXME: Pattern should not reach here.
+ if (STI.useFlatForGlobal())
+ return false;
+
MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
if (shouldUseAddr64(AddrData))
return false;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
index 02039a39e4b9d..7b5d677c5ab0c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
@@ -9,19 +9,18 @@ define amdgpu_kernel void @use_lds_globals(i32 addrspace(1)* %out, i32 addrspace
; CHECK-LABEL: use_lds_globals:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: s_add_u32 s2, 4, 4
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-NEXT: v_mov_b32_e32 v0, 4
; CHECK-NEXT: s_mov_b32 m0, -1
-; CHECK-NEXT: ds_read_b32 v2, v0
+; CHECK-NEXT: ds_read_b32 v3, v0 offset:4
+; CHECK-NEXT: v_mov_b32_e32 v2, 9
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_add_u32 s0, s0, 4
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-NEXT: flat_store_dword v[0:1], v2
-; CHECK-NEXT: v_mov_b32_e32 v0, 9
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: ds_write_b32 v1, v0
+; CHECK-NEXT: flat_store_dword v[0:1], v3
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: ds_write_b32 v0, v2
; CHECK-NEXT: s_endpgm
entry:
%tmp0 = getelementptr [128 x i32], [128 x i32] addrspace(3)* @lds_512_4, i32 0, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
index 527a66c48142c..8817de69bdbab 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
@@ -72,9 +72,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out,
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s2, s2, 16
; CI-NEXT: v_mov_b32_e32 v1, s2
-; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0
+; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_waitcnt lgkmcnt(0)
@@ -88,9 +87,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out,
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s2, s2, 16
; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0
+; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -164,9 +162,8 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %pt
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s0, s0, 16
; CI-NEXT: v_mov_b32_e32 v1, s0
-; CI-NEXT: ds_dec_rtn_u32 v0, v1, v0
+; CI-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16
; CI-NEXT: s_endpgm
;
; VI-LABEL: lds_atomic_dec_noret_i32_offset:
@@ -175,9 +172,8 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %pt
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 16
; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: ds_dec_rtn_u32 v0, v1, v0
+; VI-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16
; VI-NEXT: s_endpgm
; GFX9-LABEL: lds_atomic_dec_noret_i32_offset:
; GFX9: ; %bb.0:
@@ -1256,9 +1252,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out,
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: s_waitcnt lgkmcnt(0)
@@ -1273,9 +1268,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out,
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -1345,9 +1339,8 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %pt
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
; CI-NEXT: s_endpgm
;
; VI-LABEL: lds_atomic_dec_noret_i64_offset:
@@ -1357,9 +1350,8 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %pt
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
; VI-NEXT: s_endpgm
; GFX9-LABEL: lds_atomic_dec_noret_i64_offset:
; GFX9: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
index 74c6fe270b794..ce898c2a73d4c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
@@ -76,9 +76,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out,
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s2, s2, 16
; CI-NEXT: v_mov_b32_e32 v1, s2
-; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0
+; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_waitcnt lgkmcnt(0)
@@ -92,9 +91,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out,
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s2, s2, 16
; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0
+; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -105,11 +103,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out,
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX9-NEXT: v_mov_b32_e32 v1, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s2, s2, 16
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: ds_inc_rtn_u32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -161,9 +158,8 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %pt
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s0, s0, 16
; CI-NEXT: v_mov_b32_e32 v1, s0
-; CI-NEXT: ds_inc_rtn_u32 v0, v1, v0
+; CI-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16
; CI-NEXT: s_endpgm
;
; VI-LABEL: lds_atomic_inc_noret_i32_offset:
@@ -172,19 +168,17 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %pt
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 16
; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: ds_inc_rtn_u32 v0, v1, v0
+; VI-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: lds_atomic_inc_noret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v1, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 16
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16
; GFX9-NEXT: s_endpgm
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
@@ -622,9 +616,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out,
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: s_waitcnt lgkmcnt(0)
@@ -639,9 +632,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out,
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -655,9 +647,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out,
; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s2, s2, 32
; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -713,9 +704,8 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %pt
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
; CI-NEXT: s_endpgm
;
; VI-LABEL: lds_atomic_inc_noret_i64_offset:
@@ -725,9 +715,8 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %pt
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: lds_atomic_inc_noret_i64_offset:
@@ -736,9 +725,8 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %pt
; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 32
; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
; GFX9-NEXT: s_endpgm
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll
index 7616bd111ab28..dd6bd2e5835f4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll
@@ -26,20 +26,18 @@ define amdgpu_ps float @ds_fadd_f32_ss(float addrspace(3)* inreg %ptr, float inr
define amdgpu_ps float @ds_fadd_f32_ss_offset(float addrspace(3)* inreg %ptr, float inreg %val) {
; GFX8-LABEL: ds_fadd_f32_ss_offset:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_add_u32 s0, s2, 0x200
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_add_rtn_f32 v0, v0, v1
+; GFX8-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: ds_fadd_f32_ss_offset:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 s0, s2, 0x200
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(3)* %ptr, i32 128
@@ -69,19 +67,17 @@ define amdgpu_ps void @ds_fadd_f32_ss_nortn(float addrspace(3)* inreg %ptr, floa
define amdgpu_ps void @ds_fadd_f32_ss_offset_nortn(float addrspace(3)* inreg %ptr, float inreg %val) {
; GFX8-LABEL: ds_fadd_f32_ss_offset_nortn:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_add_u32 s0, s2, 0x200
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_add_rtn_f32 v0, v0, v1
+; GFX8-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: ds_fadd_f32_ss_offset_nortn:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 s0, s2, 0x200
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512
; GFX9-NEXT: s_endpgm
%gep = getelementptr float, float addrspace(3)* %ptr, i32 128
%unused = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll
index 1128be4c7118b..f5092c1fc400f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll
@@ -53,20 +53,18 @@ define amdgpu_ps float @ds_fmax_f32_ss(float addrspace(3)* inreg %ptr, float inr
define amdgpu_ps float @ds_fmax_f32_ss_offset(float addrspace(3)* inreg %ptr, float inreg %val) {
; GFX8-LABEL: ds_fmax_f32_ss_offset:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_add_u32 s0, s2, 0x200
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1
+; GFX8-NEXT: ds_max_rtn_f32 v0, v1, v0 offset:512
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: ds_fmax_f32_ss_offset:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 s0, s2, 0x200
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: ds_max_rtn_f32 v0, v1, v0 offset:512
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
; GFX8-MIR-LABEL: name: ds_fmax_f32_ss_offset
@@ -74,12 +72,10 @@ define amdgpu_ps float @ds_fmax_f32_ss_offset(float addrspace(3)* inreg %ptr, fl
; GFX8-MIR: liveins: $sgpr2, $sgpr3
; GFX8-MIR: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
; GFX8-MIR: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX8-MIR: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 512
- ; GFX8-MIR: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
- ; GFX8-MIR: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_U32_]]
- ; GFX8-MIR: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX8-MIR: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
; GFX8-MIR: $m0 = S_MOV_B32 -1
- ; GFX8-MIR: [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY2]], [[COPY3]], 0, 0, implicit $m0, implicit $exec :: (load store 4 on %ir.gep, addrspace 3)
+ ; GFX8-MIR: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX8-MIR: [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY3]], [[COPY2]], 512, 0, implicit $m0, implicit $exec :: (load store 4 on %ir.gep, addrspace 3)
; GFX8-MIR: $vgpr0 = COPY [[DS_MAX_RTN_F32_]]
; GFX8-MIR: SI_RETURN_TO_EPILOG implicit $vgpr0
; GFX9-MIR-LABEL: name: ds_fmax_f32_ss_offset
@@ -87,11 +83,9 @@ define amdgpu_ps float @ds_fmax_f32_ss_offset(float addrspace(3)* inreg %ptr, fl
; GFX9-MIR: liveins: $sgpr2, $sgpr3
; GFX9-MIR: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
; GFX9-MIR: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX9-MIR: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 512
- ; GFX9-MIR: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
- ; GFX9-MIR: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_U32_]]
- ; GFX9-MIR: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
- ; GFX9-MIR: [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY2]], [[COPY3]], 0, 0, implicit $exec :: (load store 4 on %ir.gep, addrspace 3)
+ ; GFX9-MIR: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX9-MIR: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX9-MIR: [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY3]], [[COPY2]], 512, 0, implicit $exec :: (load store 4 on %ir.gep, addrspace 3)
; GFX9-MIR: $vgpr0 = COPY [[DS_MAX_RTN_F32_gfx9_]]
; GFX9-MIR: SI_RETURN_TO_EPILOG implicit $vgpr0
%gep = getelementptr float, float addrspace(3)* %ptr, i32 128
@@ -140,42 +134,36 @@ define amdgpu_ps void @ds_fmax_f32_ss_nortn(float addrspace(3)* inreg %ptr, floa
define amdgpu_ps void @ds_fmax_f32_ss_offset_nortn(float addrspace(3)* inreg %ptr, float inreg %val) {
; GFX8-LABEL: ds_fmax_f32_ss_offset_nortn:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_add_u32 s0, s2, 0x200
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1
+; GFX8-NEXT: ds_max_rtn_f32 v0, v1, v0 offset:512
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: ds_fmax_f32_ss_offset_nortn:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 s0, s2, 0x200
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: ds_max_rtn_f32 v0, v1, v0 offset:512
; GFX9-NEXT: s_endpgm
; GFX8-MIR-LABEL: name: ds_fmax_f32_ss_offset_nortn
; GFX8-MIR: bb.1 (%ir-block.0):
; GFX8-MIR: liveins: $sgpr2, $sgpr3
; GFX8-MIR: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
; GFX8-MIR: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX8-MIR: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 512
- ; GFX8-MIR: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
- ; GFX8-MIR: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_U32_]]
- ; GFX8-MIR: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX8-MIR: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
; GFX8-MIR: $m0 = S_MOV_B32 -1
- ; GFX8-MIR: [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY2]], [[COPY3]], 0, 0, implicit $m0, implicit $exec :: (load store 4 on %ir.gep, addrspace 3)
+ ; GFX8-MIR: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX8-MIR: [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY3]], [[COPY2]], 512, 0, implicit $m0, implicit $exec :: (load store 4 on %ir.gep, addrspace 3)
; GFX8-MIR: S_ENDPGM 0
; GFX9-MIR-LABEL: name: ds_fmax_f32_ss_offset_nortn
; GFX9-MIR: bb.1 (%ir-block.0):
; GFX9-MIR: liveins: $sgpr2, $sgpr3
; GFX9-MIR: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
; GFX9-MIR: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX9-MIR: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 512
- ; GFX9-MIR: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
- ; GFX9-MIR: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_U32_]]
- ; GFX9-MIR: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
- ; GFX9-MIR: [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY2]], [[COPY3]], 0, 0, implicit $exec :: (load store 4 on %ir.gep, addrspace 3)
+ ; GFX9-MIR: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX9-MIR: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX9-MIR: [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY3]], [[COPY2]], 512, 0, implicit $exec :: (load store 4 on %ir.gep, addrspace 3)
; GFX9-MIR: S_ENDPGM 0
%gep = getelementptr float, float addrspace(3)* %ptr, i32 128
%unused = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll
index 91c869d5ea8fb..738ecba6d4795 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll
@@ -26,20 +26,18 @@ define amdgpu_ps float @ds_fmin_f32_ss(float addrspace(3)* inreg %ptr, float inr
define amdgpu_ps float @ds_fmin_f32_ss_offset(float addrspace(3)* inreg %ptr, float inreg %val) {
; GFX8-LABEL: ds_fmin_f32_ss_offset:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_add_u32 s0, s2, 0x200
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1
+; GFX8-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: ds_fmin_f32_ss_offset:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 s0, s2, 0x200
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(3)* %ptr, i32 128
@@ -69,19 +67,17 @@ define amdgpu_ps void @ds_fmin_f32_ss_nortn(float addrspace(3)* inreg %ptr, floa
define amdgpu_ps void @ds_fmin_f32_ss_offset_nortn(float addrspace(3)* inreg %ptr, float inreg %val) {
; GFX8-LABEL: ds_fmin_f32_ss_offset_nortn:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_add_u32 s0, s2, 0x200
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1
+; GFX8-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: ds_fmin_f32_ss_offset_nortn:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 s0, s2, 0x200
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512
; GFX9-NEXT: s_endpgm
%gep = getelementptr float, float addrspace(3)* %ptr, i32 128
%unused = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false)