[llvm] r286133 - AMDGPU: Preserve vcc undef flags when inverting branch

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 7 11:09:27 PST 2016


Author: arsenm
Date: Mon Nov  7 13:09:27 2016
New Revision: 286133

URL: http://llvm.org/viewvc/llvm-project?rev=286133&view=rev
Log:
AMDGPU: Preserve vcc undef flags when inverting branch

If the branch was on a read-undef of vcc, passes that used
analyzeBranch to invert the branch condition wouldn't preserve
the undef flag, resulting in a verifier error.

Fixes verifier failures in a future commit.

Also fix a verifier error when inserting the copy for the vccz
corruption bug workaround.

Added:
    llvm/trunk/test/CodeGen/MIR/AMDGPU/invert-br-undef-vcc.mir
    llvm/trunk/test/CodeGen/MIR/AMDGPU/vccz-corrupt-bug-workaround.mir
Modified:
    llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp

Modified: llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp?rev=286133&r1=286132&r2=286133&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp Mon Nov  7 13:09:27 2016
@@ -178,8 +178,10 @@ FunctionPass *llvm::createSIInsertWaitsP
 
 const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
 
-static bool readsVCCZ(unsigned Opcode) {
-  return Opcode == AMDGPU::S_CBRANCH_VCCNZ || Opcode == AMDGPU::S_CBRANCH_VCCZ;
+static bool readsVCCZ(const MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
+         !MI.getOperand(1).isUndef();
 }
 
 bool SIInsertWaits::hasOutstandingLGKM() const {
@@ -574,7 +576,7 @@ bool SIInsertWaits::runOnMachineFunction
         }
 
         // Check if we need to apply the bug work-around
-        if (readsVCCZ(I->getOpcode()) && VCCZCorrupt) {
+        if (VCCZCorrupt && readsVCCZ(*I)) {
           DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
 
           // Wait on everything, not just LGKM.  vccz reads usually come from
@@ -589,7 +591,7 @@ bool SIInsertWaits::runOnMachineFunction
           // vcc and then writing it back to the register.
           BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
                   AMDGPU::VCC)
-                  .addReg(AMDGPU::VCC);
+            .addReg(AMDGPU::VCC);
         }
       }
 

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=286133&r1=286132&r2=286133&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Mon Nov  7 13:09:27 2016
@@ -1196,6 +1196,7 @@ bool SIInstrInfo::analyzeBranchImpl(Mach
 
   MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
   Cond.push_back(MachineOperand::CreateImm(Pred));
+  Cond.push_back(I->getOperand(1)); // Save the branch register.
 
   ++I;
 
@@ -1298,9 +1299,16 @@ unsigned SIInstrInfo::insertBranch(Machi
     = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
 
   if (!FBB) {
-    BuildMI(&MBB, DL, get(Opcode))
+    Cond[1].isUndef();
+    MachineInstr *CondBr =
+      BuildMI(&MBB, DL, get(Opcode))
       .addMBB(TBB);
 
+    // Copy the flags onto the implicit condition register operand.
+    MachineOperand &CondReg = CondBr->getOperand(1);
+    CondReg.setIsUndef(Cond[1].isUndef());
+    CondReg.setIsKill(Cond[1].isKill());
+
     if (BytesAdded)
       *BytesAdded = 4;
     return 1;
@@ -1308,11 +1316,16 @@ unsigned SIInstrInfo::insertBranch(Machi
 
   assert(TBB && FBB);
 
-  BuildMI(&MBB, DL, get(Opcode))
+  MachineInstr *CondBr =
+    BuildMI(&MBB, DL, get(Opcode))
     .addMBB(TBB);
   BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
     .addMBB(FBB);
 
+  MachineOperand &CondReg = CondBr->getOperand(1);
+  CondReg.setIsUndef(Cond[1].isUndef());
+  CondReg.setIsKill(Cond[1].isKill());
+
   if (BytesAdded)
       *BytesAdded = 8;
 
@@ -1321,7 +1334,7 @@ unsigned SIInstrInfo::insertBranch(Machi
 
 bool SIInstrInfo::reverseBranchCondition(
   SmallVectorImpl<MachineOperand> &Cond) const {
-  assert(Cond.size() == 1);
+  assert(Cond.size() == 2);
   Cond[0].setImm(-Cond[0].getImm());
   return false;
 }

Added: llvm/trunk/test/CodeGen/MIR/AMDGPU/invert-br-undef-vcc.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/MIR/AMDGPU/invert-br-undef-vcc.mir?rev=286133&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/MIR/AMDGPU/invert-br-undef-vcc.mir (added)
+++ llvm/trunk/test/CodeGen/MIR/AMDGPU/invert-br-undef-vcc.mir Mon Nov  7 13:09:27 2016
@@ -0,0 +1,89 @@
+# RUN: llc -run-pass block-placement -march=amdgcn -verify-machineinstrs -o - %s | FileCheck %s
+--- |
+
+  define void @invert_br_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
+  entry:
+    br i1 undef, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
+
+  else:                                             ; preds = %entry
+    store volatile i32 100, i32 addrspace(1)* undef
+    br label %done, !structurizecfg.uniform !0
+
+  if:                                               ; preds = %entry
+    store volatile i32 9, i32 addrspace(1)* undef
+    br label %done, !structurizecfg.uniform !0
+
+  done:                                             ; preds = %if, %else
+    %value = phi i32 [ 0, %if ], [ 1, %else ]
+    store i32 %value, i32 addrspace(1)* %out
+    ret void
+  }
+
+  attributes #0 = { nounwind }
+
+  !0 = !{}
+
+...
+---
+# CHECK-LABEL: name: invert_br_undef_vcc
+# CHECK: S_CBRANCH_VCCZ %bb.1.else, implicit undef %vcc
+
+name:            invert_br_undef_vcc
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+liveins:
+  - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0.entry:
+    successors: %bb.2.if, %bb.1.else
+    liveins: %sgpr0_sgpr1
+
+    %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %sgpr7 = S_MOV_B32 61440
+    %sgpr6 = S_MOV_B32 -1
+    S_CBRANCH_VCCNZ %bb.2.if, implicit undef %vcc
+
+  bb.1.else:
+    successors: %bb.3.done
+    liveins: %sgpr6, %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+    %vgpr0 = V_MOV_B32_e32 100, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
+    S_BRANCH %bb.3.done
+
+  bb.2.if:
+    successors: %bb.3.done
+    liveins: %sgpr6, %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+    %vgpr0 = V_MOV_B32_e32 9, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+    %vgpr0 = V_MOV_B32_e32 0, implicit %exec
+
+  bb.3.done:
+    liveins: %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+    %sgpr3 = S_MOV_B32 61440
+    %sgpr2 = S_MOV_B32 -1
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.out)
+    S_ENDPGM
+
+...

Added: llvm/trunk/test/CodeGen/MIR/AMDGPU/vccz-corrupt-bug-workaround.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/MIR/AMDGPU/vccz-corrupt-bug-workaround.mir?rev=286133&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/MIR/AMDGPU/vccz-corrupt-bug-workaround.mir (added)
+++ llvm/trunk/test/CodeGen/MIR/AMDGPU/vccz-corrupt-bug-workaround.mir Mon Nov  7 13:09:27 2016
@@ -0,0 +1,177 @@
+# RUN: llc -run-pass si-insert-waits -march=amdgcn -mcpu=tahiti -o - %s | FileCheck %s
+--- |
+
+  define void @vccz_corrupt_workaround(float %cond, i32 addrspace(1)* %out) #0 {
+  entry:
+    %cmp0 = fcmp oeq float %cond, 0.000000e+00
+    br i1 %cmp0, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
+
+  else:                                             ; preds = %entry
+    store volatile i32 100, i32 addrspace(1)* undef
+    br label %done, !structurizecfg.uniform !0
+
+  if:                                               ; preds = %entry
+    store volatile i32 9, i32 addrspace(1)* undef
+    br label %done, !structurizecfg.uniform !0
+
+  done:                                             ; preds = %if, %else
+    %value = phi i32 [ 0, %if ], [ 1, %else ]
+    store i32 %value, i32 addrspace(1)* %out
+    ret void
+  }
+
+  define void @vccz_corrupt_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
+  entry:
+    br i1 undef, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
+
+  else:                                             ; preds = %entry
+    store volatile i32 100, i32 addrspace(1)* undef
+    br label %done, !structurizecfg.uniform !0
+
+  if:                                               ; preds = %entry
+    store volatile i32 9, i32 addrspace(1)* undef
+    br label %done, !structurizecfg.uniform !0
+
+  done:                                             ; preds = %if, %else
+    %value = phi i32 [ 0, %if ], [ 1, %else ]
+    store i32 %value, i32 addrspace(1)* %out
+    ret void
+  }
+
+  attributes #0 = { nounwind }
+  attributes #1 = { readnone }
+
+  !0 = !{}
+
+...
+---
+# CHECK-LABEL: name: vccz_corrupt_workaround
+# CHECK: %vcc = V_CMP_EQ_F32
+# CHECK-NEXT: %vcc = S_MOV_B64 %vcc
+# CHECK-NEXT: S_CBRANCH_VCCZ %bb.2.else, implicit killed %vcc
+
+name:            vccz_corrupt_workaround
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+liveins:
+  - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0.entry:
+    successors: %bb.2.if, %bb.1.else
+    liveins: %sgpr0_sgpr1
+
+    %sgpr2 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 9, 0 :: (non-temporal dereferenceable invariant load 4 from `float addrspace(2)* undef`)
+    %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %sgpr7 = S_MOV_B32 61440
+    %sgpr6 = S_MOV_B32 -1
+    %vcc = V_CMP_EQ_F32_e64 0, 0, 0, %sgpr2, 0, 0, implicit %exec
+    S_CBRANCH_VCCZ %bb.1.else, implicit killed %vcc
+
+  bb.2.if:
+    successors: %bb.3.done
+    liveins: %sgpr6, %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+    %vgpr0 = V_MOV_B32_e32 9, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+    %vgpr0 = V_MOV_B32_e32 0, implicit %exec
+    S_BRANCH %bb.3.done
+
+  bb.1.else:
+    successors: %bb.3.done
+    liveins: %sgpr6, %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+    %vgpr0 = V_MOV_B32_e32 100, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
+
+  bb.3.done:
+    liveins: %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+    %sgpr3 = S_MOV_B32 61440
+    %sgpr2 = S_MOV_B32 -1
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.out)
+    S_ENDPGM
+
+...
+---
+# CHECK-LABEL: name: vccz_corrupt_undef_vcc
+# CHECK: S_WAITCNT
+# CHECK-NEXT: S_CBRANCH_VCCZ %bb.2.else, implicit undef %vcc
+
+name:            vccz_corrupt_undef_vcc
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+liveins:
+  - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0.entry:
+    successors: %bb.2.if, %bb.1.else
+    liveins: %sgpr0_sgpr1
+
+    %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %sgpr7 = S_MOV_B32 61440
+    %sgpr6 = S_MOV_B32 -1
+    S_CBRANCH_VCCZ %bb.1.else, implicit undef %vcc
+
+  bb.2.if:
+    successors: %bb.3.done
+    liveins: %sgpr6, %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+    %vgpr0 = V_MOV_B32_e32 9, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+    %vgpr0 = V_MOV_B32_e32 0, implicit %exec
+    S_BRANCH %bb.3.done
+
+  bb.1.else:
+    successors: %bb.3.done
+    liveins: %sgpr6, %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+    %vgpr0 = V_MOV_B32_e32 100, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
+
+  bb.3.done:
+    liveins: %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+    %sgpr3 = S_MOV_B32 61440
+    %sgpr2 = S_MOV_B32 -1
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.out)
+    S_ENDPGM
+
+...




More information about the llvm-commits mailing list