[llvm] 6e29846 - [AMDGPU] Fix whole wavefront mode
Sebastian Neubauer via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 17 09:31:18 PDT 2020
Author: Sebastian Neubauer
Date: 2020-03-17T17:23:23+01:00
New Revision: 6e29846b29d2bcaa8a7a3d869f24f242bd93d272
URL: https://github.com/llvm/llvm-project/commit/6e29846b29d2bcaa8a7a3d869f24f242bd93d272
DIFF: https://github.com/llvm/llvm-project/commit/6e29846b29d2bcaa8a7a3d869f24f242bd93d272.diff
LOG: [AMDGPU] Fix whole wavefront mode
We cannot move the start of whole wavefront mode (WWM) above a copy of the exec register: the copy must read the exact exec mask, not the all-ones mask that is active inside WWM.
Differential Revision: https://reviews.llvm.org/D76232
Added:
Modified:
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
llvm/test/CodeGen/AMDGPU/wqm.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 2f33bc8467be..6b2b54d896fa 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -171,8 +171,6 @@ class SIWholeQuadMode : public MachineFunctionPass {
void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
char analyzeFunction(MachineFunction &MF);
- bool requiresCorrectState(const MachineInstr &MI) const;
-
MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before);
MachineBasicBlock::iterator
@@ -526,36 +524,6 @@ char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
return GlobalFlags;
}
-/// Whether \p MI really requires the exec state computed during analysis.
-///
-/// Scalar instructions must occasionally be marked WQM for correct propagation
-/// (e.g. thread masks leading up to branches), but when it comes to actual
-/// execution, they don't care about EXEC.
-bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
- if (MI.isTerminator())
- return true;
-
- // Skip instructions that are not affected by EXEC
- if (TII->isScalarUnit(MI))
- return false;
-
- // Generic instructions such as COPY will either disappear by register
- // coalescing or be lowered to SALU or VALU instructions.
- if (MI.isTransient()) {
- if (MI.getNumExplicitOperands() >= 1) {
- const MachineOperand &Op = MI.getOperand(0);
- if (Op.isReg()) {
- if (TRI->isSGPRReg(*MRI, Op.getReg())) {
- // SGPR instructions are not affected by EXEC
- return false;
- }
- }
- }
- }
-
- return true;
-}
-
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before) {
@@ -742,7 +710,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (II != IE) {
MachineInstr &MI = *II;
- if (requiresCorrectState(MI)) {
+ if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
auto III = Instructions.find(&MI);
if (III != Instructions.end()) {
if (III->second.Needs & StateWWM)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 55796554b08e..3695586633df 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -375,9 +375,9 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
;
; GFX8-LABEL: add_i32_varying:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -428,9 +428,9 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
;
; GFX9-LABEL: add_i32_varying:
; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -480,9 +480,9 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
;
; GFX1064-LABEL: add_i32_varying:
; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -539,10 +539,10 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
;
; GFX1032-LABEL: add_i32_varying:
; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
@@ -614,9 +614,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
;
; GFX8-LABEL: add_i32_varying_gfx1032:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -667,9 +667,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
;
; GFX9-LABEL: add_i32_varying_gfx1032:
; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -719,9 +719,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
;
; GFX1064-LABEL: add_i32_varying_gfx1032:
; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -778,10 +778,10 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
;
; GFX1032-LABEL: add_i32_varying_gfx1032:
; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
@@ -853,9 +853,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
;
; GFX8-LABEL: add_i32_varying_gfx1064:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -906,9 +906,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
;
; GFX9-LABEL: add_i32_varying_gfx1064:
; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -958,9 +958,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
;
; GFX1064-LABEL: add_i32_varying_gfx1064:
; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -1017,10 +1017,10 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
;
; GFX1032-LABEL: add_i32_varying_gfx1064:
; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
@@ -1934,9 +1934,9 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
;
; GFX8-LABEL: sub_i32_varying:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -1987,9 +1987,9 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
;
; GFX9-LABEL: sub_i32_varying:
; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -2039,9 +2039,9 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
;
; GFX1064-LABEL: sub_i32_varying:
; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -2098,10 +2098,10 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
;
; GFX1032-LABEL: sub_i32_varying:
; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
@@ -2917,9 +2917,9 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
;
; GFX8-LABEL: or_i32_varying:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -2970,9 +2970,9 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
;
; GFX9-LABEL: or_i32_varying:
; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -3022,9 +3022,9 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
;
; GFX1064-LABEL: or_i32_varying:
; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -3081,10 +3081,10 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
;
; GFX1032-LABEL: or_i32_varying:
; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
@@ -3159,9 +3159,9 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
;
; GFX8-LABEL: xor_i32_varying:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -3212,9 +3212,9 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
;
; GFX9-LABEL: xor_i32_varying:
; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -3264,9 +3264,9 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
;
; GFX1064-LABEL: xor_i32_varying:
; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -3323,10 +3323,10 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
;
; GFX1032-LABEL: xor_i32_varying:
; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
@@ -4265,9 +4265,9 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
;
; GFX8-LABEL: umax_i32_varying:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -4318,9 +4318,9 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
;
; GFX9-LABEL: umax_i32_varying:
; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -4370,9 +4370,9 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
;
; GFX1064-LABEL: umax_i32_varying:
; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -4429,10 +4429,10 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
;
; GFX1032-LABEL: umax_i32_varying:
; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
index ec285db3aaba..288afea1f5e9 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -107,3 +107,38 @@ body: |
S_ENDPGM 0
...
+
+---
+# Ensure that wwm is not put around an EXEC copy
+#CHECK-LABEL: name: copy_exec
+#CHECK: %7:sreg_64 = COPY $exec
+#CHECK-NEXT: %14:sreg_64 = ENTER_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec
+#CHECK-NEXT: %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+#CHECK-NEXT: $exec = EXIT_WWM %14
+#CHECK-NEXT: %9:vgpr_32 = V_MBCNT_LO_U32_B32_e64 %7.sub0, 0, implicit $exec
+name: copy_exec
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %4:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
+ %5:sreg_32 = S_MOV_B32 0
+ %6:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %4, %5, 0, 0, 0, 0, 0, 0, implicit $exec
+
+ %8:sreg_64 = COPY $exec
+ %9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %10:vgpr_32 = V_MBCNT_LO_U32_B32_e64 %8.sub0:sreg_64, 0, implicit $exec
+ %11:vgpr_32 = V_MOV_B32_dpp %9:vgpr_32, %10:vgpr_32, 312, 15, 15, 0, implicit $exec
+ %12:sreg_32 = V_READLANE_B32 %11:vgpr_32, 63
+ early-clobber %13:sreg_32 = WWM %9:vgpr_32, implicit $exec
+
+ %14:vgpr_32 = COPY %13
+ BUFFER_STORE_DWORD_OFFSET_exact killed %14, %4, %5, 4, 0, 0, 0, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
More information about the llvm-commits mailing list