[llvm] c262b69 - [AMDGPU] Fix endcf collapse

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 13 13:50:31 PDT 2020


Author: Stanislav Mekhanoshin
Date: 2020-03-13T13:50:21-07:00
New Revision: c262b69dcc0a280ec4d551244b3571123c36a370

URL: https://github.com/llvm/llvm-project/commit/c262b69dcc0a280ec4d551244b3571123c36a370
DIFF: https://github.com/llvm/llvm-project/commit/c262b69dcc0a280ec4d551244b3571123c36a370.diff

LOG: [AMDGPU] Fix endcf collapse

Only collapse the inner endcf if the outer one belongs to SI_IF.
If it belongs to SI_ELSE instead, the mask being restored is in
fact a partial inverse of what we need.

Differential Revision: https://reviews.llvm.org/D76154
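
Not part of the patch itself, but to make the mask algebra concrete, here is a
minimal standalone sketch (hypothetical 4-lane exec masks modeled as plain
integers, not actual LLVM or AMDGPU code) of the lowered SI_IF sequence and of
why its END_CF makes an immediately preceding inner END_CF redundant, while a
mask saved on the SI_ELSE path does not:

    // Hypothetical illustration of the exec-mask algebra; lane masks
    // are modeled as plain integers, not real AMDGPU registers.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t exec = 0b1111; // four active lanes at the SI_IF
      uint64_t cond = 0b0011; // lanes where the branch condition holds

      // Lowered SI_IF: COPY of exec (the register LoweredIf records),
      // AND with the condition, XOR to get the lanes skipping "then".
      uint64_t copy  = exec;        // COPY
      uint64_t tmp   = copy & cond; // S_AND_B64 -> 0b0011
      uint64_t saved = tmp ^ copy;  // S_XOR_B64 -> 0b1100 (SI_IF result)
      exec = tmp;                   // S_MOV_B64_term

      // END_CF of the SI_IF: OR the saved lanes back into exec. This
      // alone rebuilds the full pre-if mask, so an inner END_CF placed
      // immediately before it is redundant and may be erased.
      exec |= saved;                // S_OR_B64
      assert(exec == 0b1111);

      // SI_ELSE, by contrast, builds its saved value with S_OR_SAVEEXEC
      // plus an XOR of exec, so the value its END_CF ORs back covers the
      // opposite set of lanes: a partial inverse of what a collapsed
      // inner END_CF would have needed to restore.
      return 0;
    }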

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
    llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
    llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
    llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index de9d8fa29005..b46a515c0b5e 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -76,7 +76,7 @@ using namespace llvm;
 
 static cl::opt<bool>
 RemoveRedundantEndcf("amdgpu-remove-redundant-endcf",
-    cl::init(false), cl::ReallyHidden);
+    cl::init(true), cl::ReallyHidden);
 
 namespace {
 
@@ -87,6 +87,7 @@ class SILowerControlFlow : public MachineFunctionPass {
   LiveIntervals *LIS = nullptr;
   MachineRegisterInfo *MRI = nullptr;
   DenseSet<const MachineInstr*> LoweredEndCf;
+  DenseSet<Register> LoweredIf;
 
   const TargetRegisterClass *BoolRC = nullptr;
   unsigned AndOpc;
@@ -212,6 +213,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
     .addReg(Exec)
     .addReg(Exec, RegState::ImplicitDefine);
+  LoweredIf.insert(CopyReg);
 
   Register Tmp = MRI->createVirtualRegister(BoolRC);
 
@@ -453,11 +455,19 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
       skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI.getIterator()));
     if (Next != MBB.end() && (Next->getOpcode() == AMDGPU::SI_END_CF ||
                               LoweredEndCf.count(&*Next))) {
-      LLVM_DEBUG(dbgs() << "Skip redundant "; MI.dump());
-      if (LIS)
-        LIS->RemoveMachineInstrFromMaps(MI);
-      MI.eraseFromParent();
-      return;
+      // Only skip inner END_CF if outer ENDCF belongs to SI_IF.
+      // If that belongs to SI_ELSE then saved mask has an inverted value.
+      Register SavedExec = Next->getOperand(0).getReg();
+      const MachineInstr *Def = MRI->getUniqueVRegDef(SavedExec);
+      // A lowered SI_IF turns definition into COPY of exec.
+      if (Def && (Def->getOpcode() == AMDGPU::SI_IF ||
+                  LoweredIf.count(SavedExec))) {
+        LLVM_DEBUG(dbgs() << "Skip redundant "; MI.dump());
+        if (LIS)
+          LIS->RemoveMachineInstrFromMaps(MI);
+        MI.eraseFromParent();
+        return;
+      }
     }
   }
 
@@ -617,6 +627,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   }
 
   LoweredEndCf.clear();
+  LoweredIf.clear();
 
   return true;
 }

diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index c2676eaeb989..1af2ca55308b 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -142,16 +142,15 @@ bb.outer.end:                                        ; preds = %bb, %bb.then, %b
 ; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:BB[0-9_]+]]
 ; GCN-NEXT: ; %bb.{{[0-9]+}}:
 ; GCN:      store_dword
-; GCN-NEXT: s_and_b64 exec, exec,
+; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_ELSE:s\[[0-9:]+\]]],
 ; GCN-NEXT: s_cbranch_execz [[FLOW1:BB[0-9_]+]]
 ; GCN-NEXT: ; %bb.{{[0-9]+}}:
 ; GCN:      store_dword
 ; GCN-NEXT: [[FLOW1]]:
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER3]]
-; GCN-NOT:  s_or_b64 exec
-; GCN-NOT:  {{^.*:}}
-; GCN: ds_write_b32
-; GCN: s_endpgm
+; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_ELSE]]
+; GCN:      s_or_b64 exec, exec, [[SAVEEXEC_OUTER3]]
+; GCN:      ds_write_b32
+; GCN:      s_endpgm
 define amdgpu_kernel void @nested_if_else_if(i32 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()

diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
index 277797fc2fba..7760c921149e 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
@@ -834,3 +834,124 @@ body:             |
 
     SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1
+
+...
+
+# Both s_or_b64 shall be preserved since the outer SI_END_CF belongs to SI_ELSE.
+
+---
+name: simple_outer_if_else
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GCN-LABEL: name: simple_outer_if_else
+  ; GCN: bb.0:
+  ; GCN:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN:   liveins: $vgpr0, $sgpr0_sgpr1
+  ; GCN:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
+  ; GCN:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GCN:   [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 1, [[COPY1]], implicit $exec
+  ; GCN:   [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
+  ; GCN:   [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_LT_U32_e64_]], implicit-def dead $scc
+  ; GCN:   [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc
+  ; GCN:   $exec = S_MOV_B64_term killed [[S_AND_B64_]]
+  ; GCN:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GCN: bb.1:
+  ; GCN:   successors: %bb.2(0x80000000)
+  ; GCN:   S_BRANCH %bb.2
+  ; GCN: bb.2:
+  ; GCN:   successors: %bb.3(0x40000000), %bb.6(0x40000000)
+  ; GCN:   [[COPY3:%[0-9]+]]:sreg_64 = COPY [[S_XOR_B64_]]
+  ; GCN:   [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 [[COPY3]], implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GCN:   $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+  ; GCN:   S_CBRANCH_EXECZ %bb.6, implicit $exec
+  ; GCN: bb.3:
+  ; GCN:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
+  ; GCN:   undef %5.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4)
+  ; GCN:   undef %6.sub0:vreg_64 = V_LSHLREV_B32_e32 2, [[COPY1]], implicit $exec
+  ; GCN:   %6.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY %5.sub1
+  ; GCN:   undef %8.sub0:vreg_64, %9:sreg_64_xexec = V_ADD_I32_e64 %5.sub0, %6.sub0, 0, implicit $exec
+  ; GCN:   %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY4]], %9, 0, implicit $exec
+  ; GCN:   %5.sub3:sgpr_128 = S_MOV_B32 61440
+  ; GCN:   %5.sub2:sgpr_128 = S_MOV_B32 0
+  ; GCN:   BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+  ; GCN:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec
+  ; GCN:   [[COPY5:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
+  ; GCN:   [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY5]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
+  ; GCN:   $exec = S_MOV_B64_term killed [[S_AND_B64_1]]
+  ; GCN:   S_CBRANCH_EXECZ %bb.3, implicit $exec
+  ; GCN: bb.4:
+  ; GCN:   successors: %bb.5(0x80000000)
+  ; GCN:   %5.sub0:sgpr_128 = COPY %5.sub2
+  ; GCN:   %5.sub1:sgpr_128 = COPY %5.sub2
+  ; GCN:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+  ; GCN:   BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+  ; GCN: bb.5:
+  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   $exec = S_OR_B64 $exec, [[COPY5]], implicit-def $scc
+  ; GCN: bb.6:
+  ; GCN:   $exec = S_OR_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+  ; GCN:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
+  ; GCN:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN:   $m0 = S_MOV_B32 -1
+  ; GCN:   DS_WRITE_B32 [[V_MOV_B32_e32_2]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
+  ; GCN:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64 = COPY $sgpr0_sgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %2:sreg_64 = V_CMP_LT_U32_e64 1, %0, implicit $exec
+    %3:sreg_64 = SI_IF %2:sreg_64, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3, %bb.6
+    %4:sreg_64 = SI_ELSE %3:sreg_64, %bb.6, 0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.3:
+    successors: %bb.3, %bb.4
+
+    undef %5.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %1, 9, 0, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4)
+    undef %6.sub0:vreg_64 = V_LSHLREV_B32_e32 2, %0, implicit $exec
+    %6.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+    %7:vgpr_32 = COPY %5.sub1
+    undef %8.sub0:vreg_64, %9:sreg_64_xexec = V_ADD_I32_e64 %5.sub0, %6.sub0, 0, implicit $exec
+    %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec
+    %5.sub3:sgpr_128 = S_MOV_B32 61440
+    %5.sub2:sgpr_128 = S_MOV_B32 0
+    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec
+    %12:sreg_64 = SI_IF %11:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.4:
+    successors: %bb.5
+
+    %5.sub0:sgpr_128 = COPY %5.sub2
+    %5.sub1:sgpr_128 = COPY %5.sub2
+    %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+
+  bb.5:
+    successors: %bb.6
+
+    SI_END_CF %12:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.6:
+    SI_END_CF %4:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %15:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
+    %16:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    $m0 = S_MOV_B32 -1
+    DS_WRITE_B32 %16, %15, 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
+    S_ENDPGM 0
+
+...

diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 9f717df480fb..5f0b5aaecac9 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -58,7 +58,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
 ; GFX9-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[10:11], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz BB1_4
+; GFX9-NEXT:    s_cbranch_execz BB1_3
 ; GFX9-NEXT:  ; %bb.1: ; %bb19
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, v6
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffffff, v6
@@ -100,9 +100,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[12:13]
 ; GFX9-NEXT:    s_cbranch_execnz BB1_2
-; GFX9-NEXT:  ; %bb.3: ; %Flow
-; GFX9-NEXT:    s_or_b64 exec, exec, s[12:13]
-; GFX9-NEXT:  BB1_4: ; %Flow3
+; GFX9-NEXT:  BB1_3: ; %Flow3
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]


        

