[llvm] d619964 - AMDGPU: Increase branch size estimate with offset bug

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 23 07:34:31 PDT 2020


Author: Matt Arsenault
Date: 2020-10-23T10:34:24-04:00
New Revision: d61996473dd9481da9ffc40cea608a5e6eaacf06

URL: https://github.com/llvm/llvm-project/commit/d61996473dd9481da9ffc40cea608a5e6eaacf06
DIFF: https://github.com/llvm/llvm-project/commit/d61996473dd9481da9ffc40cea608a5e6eaacf06.diff

LOG: AMDGPU: Increase branch size estimate with offset bug

This will be relaxed to insert a nop if the offset hits the bad value,
so overestimate branch instruction sizes.

Added: 
    llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SOPInstructions.td

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 99d229c9b74e..91723d34f080 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -323,7 +323,10 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // The isPseudo check really shouldn't be here, but unfortunately there are
     // some negative lit tests that depend on being able to continue through
     // here even when pseudo instructions haven't been lowered.
-    if (!MI->isPseudo() && STI.isCPUStringValid(STI.getCPU())) {
+    //
+    // We also overestimate branch sizes with the offset bug.
+    if (!MI->isPseudo() && STI.isCPUStringValid(STI.getCPU()) &&
+        (!STI.hasOffset3fBug() || !MI->isBranch())) {
       SmallVector<MCFixup, 4> Fixups;
       SmallVector<char, 16> CodeBytes;
       raw_svector_ostream CodeStream(CodeBytes);

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index dbd3d3517295..076b7c5d2a38 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2284,7 +2284,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
     BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
       .addMBB(TBB);
     if (BytesAdded)
-      *BytesAdded = 4;
+      *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
     return 1;
   }
 
@@ -2311,7 +2311,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
     fixImplicitOperands(*CondBr);
 
     if (BytesAdded)
-      *BytesAdded = 4;
+      *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
     return 1;
   }
 
@@ -2328,7 +2328,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
   CondReg.setIsKill(Cond[1].isKill());
 
   if (BytesAdded)
-      *BytesAdded = 8;
+    *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
 
   return 2;
 }
@@ -6628,8 +6628,16 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
 
   // If we have a definitive size, we can use it. Otherwise we need to inspect
   // the operands to know the size.
-  if (isFixedSize(MI))
-    return DescSize;
+  if (isFixedSize(MI)) {
+    unsigned Size = DescSize;
+
+    // If we hit the buggy offset, an extra nop will be inserted in MC so
+    // estimate the worst case.
+    if (MI.isBranch() && ST.hasOffset3fBug())
+      Size += 4;
+
+    return Size;
+  }
 
   // 4-byte instructions may have a 32-bit literal encoded after them. Check
   // operands that could ever be literals.

diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index f92e51d23ea8..77c7e6068258 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1058,6 +1058,7 @@ class SOPP_Pseudo<string opName, dag ins,
   let hasSideEffects = 0;
   let SALU = 1;
   let SOPP = 1;
+  let FixedSize = 1;
   let SchedRW = [WriteSALU];
   let UseNamedOperandTable = 1;
   bits <16> simm16;

diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
new file mode 100644
index 000000000000..a651edc759c1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s
+
+; For gfx1010, overestimate the branch size in case we need to insert
+; a nop for the buggy offset.
+
+; GCN-LABEL: long_forward_scc_branch_3f_offset_bug:
+; GFX1030: s_cmp_lg_u32
+; GFX1030-NEXT: s_cbranch_scc1  [[ENDBB:BB[0-9]+_[0-9]+]]
+
+; GFX1010: s_cmp_lg_u32
+; GFX1010-NEXT: s_cbranch_scc0  [[RELAX_BB:BB[0-9]+_[0-9]+]]
+; GFX1010: s_getpc_b64
+; GFX1010-NEXT: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, [[ENDBB:BB[0-9]+_[0-9]+]]-(BB
+; GFX1010-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}
+; GFX1010: [[RELAX_BB]]:
+
+; GCN: v_nop
+; GCN: s_sleep
+; GCN: s_cbranch_scc1
+
+; GCN: [[ENDBB]]:
+; GCN: global_store_dword
+define amdgpu_kernel void @long_forward_scc_branch_3f_offset_bug(i32 addrspace(1)* %arg, i32 %cnd0) #0 {
+bb0:
+  %cmp0 = icmp eq i32 %cnd0, 0
+  br i1 %cmp0, label %bb2, label %bb3
+
+bb2:
+  %val = call i32 asm sideeffect
+   "s_mov_b32 $0, 0
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64", "=s"()   ; 20 * 12 = 240
+  call void @llvm.amdgcn.s.sleep(i32 0) ; +4 = 244
+  %cmp1 = icmp eq i32 %val, 0           ; +4 = 248
+  br i1 %cmp1, label %bb2, label %bb3   ; +4 (gfx1030), +8 with workaround (gfx1010)
+
+bb3:
+  store volatile i32 %cnd0, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}long_forward_exec_branch_3f_offset_bug:
+; GFX1030: v_cmp_eq_u32
+; GFX1030: s_and_saveexec_b32
+; GFX1030-NEXT: s_cbranch_execnz [[RELAX_BB:BB[0-9]+_[0-9]+]]
+
+; GFX1010: v_cmp_eq_u32
+; GFX1010: s_and_saveexec_b32
+; GFX1010-NEXT: s_cbranch_execnz  [[RELAX_BB:BB[0-9]+_[0-9]+]]
+
+; GCN: s_getpc_b64
+; GCN-NEXT: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, [[ENDBB:BB[0-9]+_[0-9]+]]-(BB
+; GCN-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}
+; GCN: [[RELAX_BB]]:
+
+; GCN: v_nop
+; GCN: s_sleep
+; GCN: s_cbranch_execz
+
+; GCN: [[ENDBB]]:
+; GCN: global_store_dword
+define void @long_forward_exec_branch_3f_offset_bug(i32 addrspace(1)* %arg, i32 %cnd0) #0 {
+bb0:
+  %cmp0 = icmp eq i32 %cnd0, 0
+  br i1 %cmp0, label %bb2, label %bb3
+
+bb2:
+  %val = call i32 asm sideeffect
+   "v_mov_b32 $0, 0
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64", "=v"()   ; 20 * 12 = 240
+  call void @llvm.amdgcn.s.sleep(i32 0) ; +4 = 244
+  %cmp1 = icmp eq i32 %val, 0           ; +4 = 248
+  br i1 %cmp1, label %bb2, label %bb3   ; +4 (gfx1030), +8 with workaround (gfx1010)
+
+bb3:
+  store volatile i32 %cnd0, i32 addrspace(1)* %arg
+  ret void
+}
+
+declare void @llvm.amdgcn.s.sleep(i32 immarg)


        


More information about the llvm-commits mailing list