[llvm] 6e29846 - [AMDGPU] Fix whole wavefront mode

Sebastian Neubauer via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 17 09:31:18 PDT 2020


Author: Sebastian Neubauer
Date: 2020-03-17T17:23:23+01:00
New Revision: 6e29846b29d2bcaa8a7a3d869f24f242bd93d272

URL: https://github.com/llvm/llvm-project/commit/6e29846b29d2bcaa8a7a3d869f24f242bd93d272
DIFF: https://github.com/llvm/llvm-project/commit/6e29846b29d2bcaa8a7a3d869f24f242bd93d272.diff

LOG: [AMDGPU] Fix whole wavefront mode

We cannot move WWM over copies of the exec register, because such a copy reads EXEC and therefore needs the exact exec mask rather than the all-ones mask that WWM installs.

Differential Revision: https://reviews.llvm.org/D76232
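
For readers skimming the diff below: the patch drops SIWholeQuadMode::requiresCorrectState(), which exempted transient SGPR-defining instructions (including a COPY of $exec) from needing the real exec mask, and instead asks whether an instruction is a terminator or may read EXEC. A minimal sketch of that kind of test follows; needsExactExec is a hypothetical helper written for illustration only, not the in-tree SIInstrInfo::mayReadEXEC implementation.

    // Illustration only: a simplified stand-in for "this instruction needs
    // the exact exec mask"; the real logic lives in SIInstrInfo::mayReadEXEC.
    #include "MCTargetDesc/AMDGPUMCTargetDesc.h" // AMDGPU::EXEC (in-tree include path)
    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/TargetRegisterInfo.h"

    using namespace llvm;

    static bool needsExactExec(const MachineInstr &MI,
                               const TargetRegisterInfo &TRI) {
      // Terminators always need the exact mask.
      if (MI.isTerminator())
        return true;

      // Anything that reads EXEC, explicitly or implicitly, needs the exact
      // mask. In particular, 'COPY $exec' reads EXEC as its source operand
      // and is caught here, whereas the removed requiresCorrectState()
      // exempted it merely because its destination was an SGPR.
      return MI.readsRegister(AMDGPU::EXEC, &TRI);
    }

In the pass itself the condition is written inline as MI.isTerminator() || TII->mayReadEXEC(*MRI, MI), as the processBlock hunk below shows.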

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
    llvm/test/CodeGen/AMDGPU/wqm.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 2f33bc8467be..6b2b54d896fa 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -171,8 +171,6 @@ class SIWholeQuadMode : public MachineFunctionPass {
   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
   char analyzeFunction(MachineFunction &MF);
 
-  bool requiresCorrectState(const MachineInstr &MI) const;
-
   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator Before);
   MachineBasicBlock::iterator
@@ -526,36 +524,6 @@ char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
   return GlobalFlags;
 }
 
-/// Whether \p MI really requires the exec state computed during analysis.
-///
-/// Scalar instructions must occasionally be marked WQM for correct propagation
-/// (e.g. thread masks leading up to branches), but when it comes to actual
-/// execution, they don't care about EXEC.
-bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
-  if (MI.isTerminator())
-    return true;
-
-  // Skip instructions that are not affected by EXEC
-  if (TII->isScalarUnit(MI))
-    return false;
-
-  // Generic instructions such as COPY will either disappear by register
-  // coalescing or be lowered to SALU or VALU instructions.
-  if (MI.isTransient()) {
-    if (MI.getNumExplicitOperands() >= 1) {
-      const MachineOperand &Op = MI.getOperand(0);
-      if (Op.isReg()) {
-        if (TRI->isSGPRReg(*MRI, Op.getReg())) {
-          // SGPR instructions are not affected by EXEC
-          return false;
-        }
-      }
-    }
-  }
-
-  return true;
-}
-
 MachineBasicBlock::iterator
 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator Before) {
@@ -742,7 +710,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
     if (II != IE) {
       MachineInstr &MI = *II;
 
-      if (requiresCorrectState(MI)) {
+      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
         auto III = Instructions.find(&MI);
         if (III != Instructions.end()) {
           if (III->second.Needs & StateWWM)

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 55796554b08e..3695586633df 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -375,9 +375,9 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX8-LABEL: add_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -428,9 +428,9 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX9-LABEL: add_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -480,9 +480,9 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX1064-LABEL: add_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -539,10 +539,10 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX1032-LABEL: add_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    ; implicit-def: $vcc_hi
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
@@ -614,9 +614,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
 ;
 ; GFX8-LABEL: add_i32_varying_gfx1032:
 ; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -667,9 +667,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
 ;
 ; GFX9-LABEL: add_i32_varying_gfx1032:
 ; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -719,9 +719,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
 ;
 ; GFX1064-LABEL: add_i32_varying_gfx1032:
 ; GFX1064:       ; %bb.0: ; %entry
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -778,10 +778,10 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
 ;
 ; GFX1032-LABEL: add_i32_varying_gfx1032:
 ; GFX1032:       ; %bb.0: ; %entry
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    ; implicit-def: $vcc_hi
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
@@ -853,9 +853,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
 ;
 ; GFX8-LABEL: add_i32_varying_gfx1064:
 ; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -906,9 +906,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
 ;
 ; GFX9-LABEL: add_i32_varying_gfx1064:
 ; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -958,9 +958,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
 ;
 ; GFX1064-LABEL: add_i32_varying_gfx1064:
 ; GFX1064:       ; %bb.0: ; %entry
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -1017,10 +1017,10 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
 ;
 ; GFX1032-LABEL: add_i32_varying_gfx1064:
 ; GFX1032:       ; %bb.0: ; %entry
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    ; implicit-def: $vcc_hi
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
@@ -1934,9 +1934,9 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX8-LABEL: sub_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -1987,9 +1987,9 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX9-LABEL: sub_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -2039,9 +2039,9 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX1064-LABEL: sub_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -2098,10 +2098,10 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX1032-LABEL: sub_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    ; implicit-def: $vcc_hi
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
@@ -2917,9 +2917,9 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX8-LABEL: or_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -2970,9 +2970,9 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX9-LABEL: or_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -3022,9 +3022,9 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX1064-LABEL: or_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -3081,10 +3081,10 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX1032-LABEL: or_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    ; implicit-def: $vcc_hi
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
@@ -3159,9 +3159,9 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX8-LABEL: xor_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -3212,9 +3212,9 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX9-LABEL: xor_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -3264,9 +3264,9 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX1064-LABEL: xor_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -3323,10 +3323,10 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX1032-LABEL: xor_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    ; implicit-def: $vcc_hi
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
@@ -4265,9 +4265,9 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX8-LABEL: umax_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -4318,9 +4318,9 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX9-LABEL: umax_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -4370,9 +4370,9 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX1064-LABEL: umax_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
@@ -4429,10 +4429,10 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
 ;
 ; GFX1032-LABEL: umax_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    ; implicit-def: $vcc_hi
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
index ec285db3aaba..288afea1f5e9 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -107,3 +107,38 @@ body:             |
     S_ENDPGM 0
 
 ...
+
+---
+# Ensure that wwm is not put around an EXEC copy
+#CHECK-LABEL: name: copy_exec
+#CHECK: %7:sreg_64 = COPY $exec
+#CHECK-NEXT: %14:sreg_64 = ENTER_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec
+#CHECK-NEXT: %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+#CHECK-NEXT: $exec = EXIT_WWM %14
+#CHECK-NEXT: %9:vgpr_32 = V_MBCNT_LO_U32_B32_e64 %7.sub0, 0, implicit $exec
+name:            copy_exec
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+
+    %3:sgpr_32 = COPY $sgpr3
+    %2:sgpr_32 = COPY $sgpr2
+    %1:sgpr_32 = COPY $sgpr1
+    %0:sgpr_32 = COPY $sgpr0
+    %4:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
+    %5:sreg_32 = S_MOV_B32 0
+    %6:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %4, %5, 0, 0, 0, 0, 0, 0, implicit $exec
+
+    %8:sreg_64 = COPY $exec
+    %9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %10:vgpr_32 = V_MBCNT_LO_U32_B32_e64 %8.sub0:sreg_64, 0, implicit $exec
+    %11:vgpr_32 = V_MOV_B32_dpp %9:vgpr_32, %10:vgpr_32, 312, 15, 15, 0, implicit $exec
+    %12:sreg_32 = V_READLANE_B32 %11:vgpr_32, 63
+    early-clobber %13:sreg_32 = WWM %9:vgpr_32, implicit $exec
+
+    %14:vgpr_32 = COPY %13
+    BUFFER_STORE_DWORD_OFFSET_exact killed %14, %4, %5, 4, 0, 0, 0, 0, 0, implicit $exec
+    S_ENDPGM 0
+
+...
