[llvm-branch-commits] [llvm] e07c05b - [AMDGPU] Clear bodies of function with incompatible features
Pierre van Houtryve via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Nov 30 03:55:58 PST 2022
Author: Pierre van Houtryve
Date: 2022-11-30T06:14:35-05:00
New Revision: e07c05bc91ae1dfb625b7b0d93a83e5c6039fcb2
URL: https://github.com/llvm/llvm-project/commit/e07c05bc91ae1dfb625b7b0d93a83e5c6039fcb2
DIFF: https://github.com/llvm/llvm-project/commit/e07c05bc91ae1dfb625b7b0d93a83e5c6039fcb2.diff
LOG: [AMDGPU] Clear bodies of function with incompatible features
Adds a new pass that replaces the body of a function with trap+unreachable
if it uses features that are not supported on the current GPU.
This change is aimed at preventing crashes when building code at O0 that
uses idioms such as `if (ISA_VERSION >= N) intrinsic_a(); else intrinsic_b();`
where ISA_VERSION is not constexpr, and intrinsic_a is not selectable
on older targets.
This is a pattern that's used all over the ROCm device libs. The main
motive behind this change is to allow code using ROCm device libs
to be built at O0.
Note: the feature checking logic is done ad-hoc in the pass. There is no other
pass that needs (or will need in the foreseeable future) to do similar
feature-checking logic so I did not see a need to generalize the feature
checking logic yet. It can (and should probably) be generalized later and
moved to a TargetInfo-like class or helper file.
Added:
llvm/lib/Target/AMDGPU/AMDGPUClearIncompatibleFunctions.cpp
llvm/test/CodeGen/AMDGPU/clear-incompatible-functions.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPU.h
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/lib/Target/AMDGPU/CMakeLists.txt
llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 355aa0ba465b4..6a9ac1d165724 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -47,6 +47,7 @@ FunctionPass *createSIFormMemoryClausesPass();
FunctionPass *createSIPostRABundlerPass();
FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *);
FunctionPass *createAMDGPUUseNativeCallsPass();
+FunctionPass *createAMDGPUClearIncompatibleFunctionsPass(const TargetMachine *);
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPULateCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
@@ -287,6 +288,9 @@ extern char &AMDGPUAnnotateUniformValuesPassID;
void initializeAMDGPUCodeGenPreparePass(PassRegistry&);
extern char &AMDGPUCodeGenPrepareID;
+void initializeAMDGPUClearIncompatibleFunctionsPass(PassRegistry &);
+extern char &AMDGPUClearIncompatibleFunctionsID;
+
void initializeAMDGPULateCodeGenPreparePass(PassRegistry &);
extern char &AMDGPULateCodeGenPrepareID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUClearIncompatibleFunctions.cpp b/llvm/lib/Target/AMDGPU/AMDGPUClearIncompatibleFunctions.cpp
new file mode 100644
index 0000000000000..e0ea3aac5b7f5
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUClearIncompatibleFunctions.cpp
@@ -0,0 +1,120 @@
+//===-- AMDGPUClearIncompatibleFunctions.cpp ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass replaces the bodies of functions that have attributes incompatible
+/// with the current target with trap/unreachable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/Pass.h"
+
+#define DEBUG_TYPE "amdgpu-clear-incompatible-functions"
+
+using namespace llvm;
+
+namespace llvm {
+// Table of subtarget feature key/value entries, declared here so this file can
+// map a feature value back to its name. NOTE(review): presumably defined by
+// the generated AMDGPU subtarget tables -- confirm the array bound stays in
+// sync with that definition.
+extern const SubtargetFeatureKV
+    AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures - 1];
+} // end namespace llvm
+
+namespace {
+
+using Generation = AMDGPUSubtarget::Generation;
+
+/// Function pass that replaces the body of a function with trap/unreachable
+/// when the function uses features incompatible with the current target.
+class AMDGPUClearIncompatibleFunctions : public FunctionPass {
+  /// Target machine used to look up the subtarget of each visited function.
+  const TargetMachine *TM = nullptr;
+
+public:
+  static char ID;
+
+  /// \p TM is expected to be non-null; this is asserted on construction.
+  AMDGPUClearIncompatibleFunctions(const TargetMachine *TM = nullptr)
+      : FunctionPass(ID), TM(TM) {
+    assert(TM && "No TargetMachine!");
+  }
+
+  StringRef getPassName() const override {
+    return "AMDGPU Clear Incompatible Functions Bodies";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // Intentionally empty: when the pass changes a function, no analyses are
+    // reported as preserved.
+  }
+
+  bool runOnFunction(Function &F) override;
+};
+
+/// Each entry pairs a subtarget feature with the oldest GPU generation that
+/// supports it. A function using one of these features on an older generation
+/// gets its body cleared by the pass.
+constexpr std::array<std::pair<unsigned, Generation>, 6> FeatureAndMinGen = {{
+    {AMDGPU::FeatureGFX11Insts, Generation::GFX11},
+    {AMDGPU::FeatureGFX10Insts, Generation::GFX10},
+    {AMDGPU::FeatureGFX9Insts, Generation::GFX9},
+    {AMDGPU::FeatureGFX8Insts, Generation::VOLCANIC_ISLANDS},
+    {AMDGPU::FeatureDPP, Generation::VOLCANIC_ISLANDS},
+    {AMDGPU::Feature16BitInsts, Generation::VOLCANIC_ISLANDS},
+}};
+
+/// Returns the key of the AMDGPUFeatureKV entry whose value equals \p Feature.
+/// Running off the end of the table without a match is treated as
+/// unreachable.
+StringRef GetFeatureName(unsigned Feature) {
+  for (const SubtargetFeatureKV &Entry : AMDGPUFeatureKV) {
+    if (Entry.Value != Feature)
+      continue;
+    return Entry.Key;
+  }
+
+  llvm_unreachable("Unknown Target feature");
+}
+
+} // end anonymous namespace
+
+/// Checks the subtarget selected for \p F against FeatureAndMinGen. For every
+/// listed feature the subtarget has while its generation is older than the
+/// feature's minimum generation, a warning diagnostic is emitted. If at least
+/// one such feature was found, the body of \p F is replaced with a single
+/// block containing a trap followed by unreachable.
+///
+/// \returns true if the body of \p F was replaced, false otherwise.
+bool AMDGPUClearIncompatibleFunctions::runOnFunction(Function &F) {
+  if (skipFunction(F) || F.empty())
+    return false;
+
+  const auto *Subtarget =
+      static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F));
+  const Generation TargetGen = Subtarget->getGeneration();
+
+  // The feature table describes GCN features only, so generations older than
+  // Southern Islands are left untouched.
+  if (TargetGen < Generation::SOUTHERN_ISLANDS)
+    return false;
+
+  LLVMContext &Context = F.getContext();
+
+  // Keep scanning after the first hit so that every unsupported feature gets
+  // its own diagnostic.
+  bool HasIncompatibleFeature = false;
+  for (const auto &Entry : FeatureAndMinGen) {
+    const unsigned Feature = Entry.first;
+    const Generation RequiredGen = Entry.second;
+    if (!Subtarget->hasFeature(Feature) || TargetGen >= RequiredGen)
+      continue;
+
+    HasIncompatibleFeature = true;
+
+    std::string Message = "+";
+    Message += GetFeatureName(Feature).str();
+    Message +=
+        " is not supported on the current target. Deleting function body.";
+    Context.diagnose(DiagnosticInfoUnsupported(F, Message, DiagnosticLocation(),
+                                               DS_Warning));
+  }
+
+  if (!HasIncompatibleFeature)
+    return false;
+
+  // Discard the existing body; the assert records the expectation that no
+  // basic blocks remain afterwards.
+  F.dropAllReferences();
+  assert(F.empty());
+
+  // Rebuild the function as a lone "entry" block that traps.
+  IRBuilder<> Builder(BasicBlock::Create(Context, "entry", &F));
+  Builder.CreateIntrinsic(Intrinsic::trap, {}, {});
+  Builder.CreateUnreachable();
+  return true;
+}
+
+// Register the pass under DEBUG_TYPE with its human-readable name.
+INITIALIZE_PASS(AMDGPUClearIncompatibleFunctions, DEBUG_TYPE,
+                "AMDGPU Clear Incompatible Functions Bodies", false, false)
+
+// Storage for the pass identifier declared in the class.
+char AMDGPUClearIncompatibleFunctions::ID = 0;
+
+/// Creates a new instance of the pass bound to \p TM. The returned pointer is
+/// a freshly allocated pass object.
+FunctionPass *
+llvm::createAMDGPUClearIncompatibleFunctionsPass(const TargetMachine *TM) {
+  return new AMDGPUClearIncompatibleFunctions(TM);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0cecf95c007d0..84d6879d4c36c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -213,6 +213,13 @@ static cl::opt<bool> EarlyInlineAll(
cl::init(false),
cl::Hidden);
+// Hidden switch (on by default) that AMDGPUPassConfig::addPreISel consults
+// before scheduling the pass that clears bodies of functions using features
+// the target GPU does not support.
+static cl::opt<bool> ClearIncompatibleFunctionsBodies(
+    "amdgpu-incompatible-features-clear-fns",
+    cl::Hidden,
+    // The trailing space after "they" matters: adjacent string literals are
+    // concatenated verbatim, so without it the help text reads "theyuse".
+    cl::desc("Enable deletion of function bodies when they "
+             "use features not supported by the target GPU"),
+    cl::init(true));
+
static cl::opt<bool> EnableSDWAPeephole(
"amdgpu-sdwa-peephole",
cl::desc("Enable SDWA peepholer"),
@@ -376,6 +383,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPULateCodeGenPreparePass(*PR);
initializeAMDGPUPropagateAttributesEarlyPass(*PR);
initializeAMDGPUPropagateAttributesLatePass(*PR);
+ initializeAMDGPUClearIncompatibleFunctionsPass(*PR);
initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
initializeAMDGPULowerModuleLDSPass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -1058,6 +1066,10 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
bool AMDGPUPassConfig::addPreISel() {
if (TM->getOptLevel() > CodeGenOpt::None)
addPass(createFlattenCFGPass());
+
+  // Scheduled at every optimization level unless switched off through the
+  // ClearIncompatibleFunctionsBodies command-line option.
+  if (ClearIncompatibleFunctionsBodies)
+    addPass(
+        createAMDGPUClearIncompatibleFunctionsPass(&getAMDGPUTargetMachine()));
+
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 76ae4faf792e7..1d7b0d6c26dc9 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -50,6 +50,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUAtomicOptimizer.cpp
AMDGPUAttributor.cpp
AMDGPUCallLowering.cpp
+ AMDGPUClearIncompatibleFunctions.cpp
AMDGPUCodeGenPrepare.cpp
AMDGPUCombinerHelper.cpp
AMDGPUCtorDtorLowering.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
index 5dfde116785db..ac121f9e6661c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -stop-after=legalizer -o - %s | FileCheck %s
+; RUN: llc -global-isel -amdgpu-incompatible-features-clear-fns=0 -mtriple=amdgcn-amd-amdhsa -stop-after=legalizer -o - %s | FileCheck %s
; Make sure legalizer info doesn't assert on dummy targets
diff --git a/llvm/test/CodeGen/AMDGPU/clear-incompatible-functions.ll b/llvm/test/CodeGen/AMDGPU/clear-incompatible-functions.ll
new file mode 100644
index 0000000000000..a19497d6a03b6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/clear-incompatible-functions.ll
@@ -0,0 +1,628 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s 2>%t | FileCheck -check-prefix=GFX7 %s
+; RUN: FileCheck --check-prefixes=GFX8-WARN,GFX9-WARN,GFX10-WARN,GFX11-WARN %s < %t
+
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s 2>%t | FileCheck -check-prefix=GFX8 %s
+; RUN: FileCheck --check-prefixes=GFX9-WARN,GFX10-WARN,GFX11-WARN %s < %t
+
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s 2>%t | FileCheck -check-prefix=GFX9 %s
+; RUN: FileCheck --check-prefixes=GFX10-WARN,GFX11-WARN %s < %t
+
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s 2>%t | FileCheck -check-prefix=GFX10 %s
+; RUN: FileCheck --check-prefixes=GFX11-WARN %s < %t
+
+; Use --fatal-warnings to confirm no diagnostics are emitted for GFX11.
+; RUN: llc --fatal-warnings -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+
+; GFX8-WARN: needs_dpp {{.*}} +dpp is not supported on the current target. Deleting function body.
+; GFX8-WARN: needs_16bit_insts {{.*}} +16-bit-insts is not supported on the current target. Deleting function body.
+; GFX8-WARN: needs_gfx8_insts {{.*}} +gfx8-insts is not supported on the current target. Deleting function body.
+; GFX9-WARN: needs_gfx9_insts {{.*}} +gfx9-insts is not supported on the current target. Deleting function body.
+; GFX10-WARN: needs_gfx10_insts {{.*}} +gfx10-insts is not supported on the current target. Deleting function body.
+; GFX11-WARN: needs_gfx11_insts {{.*}} +gfx11-insts is not supported on the current target. Deleting function body.
+
+define void @needs_dpp(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #0 {
+; GFX7-LABEL: needs_dpp:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: needs_dpp:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX8-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT: ; %bb.1: ; %else
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v6
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v5, v7, vcc
+; GFX8-NEXT: ; implicit-def: $vgpr2
+; GFX8-NEXT: ; %bb.2: ; %Flow
+; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_cbranch_execz .LBB0_4
+; GFX8-NEXT: ; %bb.3: ; %if
+; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[2:3]
+; GFX8-NEXT: .LBB0_4: ; %endif
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[8:9]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: needs_dpp:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %else
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v7, vcc
+; GFX9-NEXT: ; implicit-def: $vgpr2
+; GFX9-NEXT: ; %bb.2: ; %Flow
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB0_4
+; GFX9-NEXT: ; %bb.3: ; %if
+; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; GFX9-NEXT: .LBB0_4: ; %endif
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: needs_dpp:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX10-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX10-NEXT: ; %bb.1: ; %else
+; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX10-NEXT: ; implicit-def: $vgpr2
+; GFX10-NEXT: ; %bb.2: ; %Flow
+; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT: s_cbranch_execz .LBB0_4
+; GFX10-NEXT: ; %bb.3: ; %if
+; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; GFX10-NEXT: .LBB0_4: ; %endif
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v[0:1], v[8:9], off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: needs_dpp:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX11-NEXT: v_cmpx_ne_u64_e32 0, v[4:5]
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: ; %bb.1: ; %else
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX11-NEXT: ; implicit-def: $vgpr2
+; GFX11-NEXT: ; %bb.2: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB0_4
+; GFX11-NEXT: ; %bb.3: ; %if
+; GFX11-NEXT: global_load_b64 v[8:9], v[2:3], off
+; GFX11-NEXT: .LBB0_4: ; %endif
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b64 v[0:1], v[8:9], off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = icmp eq i64 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = load i64, i64 addrspace(1)* %in
+ br label %endif
+
+else:
+ %2 = add i64 %a, %b
+ br label %endif
+
+endif:
+ %3 = phi i64 [%1, %if], [%2, %else]
+ store i64 %3, i64 addrspace(1)* %out
+ ret void
+}
+
+define void @needs_16bit_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #1 {
+; GFX7-LABEL: needs_16bit_insts:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: needs_16bit_insts:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX8-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT: ; %bb.1: ; %else
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v6
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v5, v7, vcc
+; GFX8-NEXT: ; implicit-def: $vgpr2
+; GFX8-NEXT: ; %bb.2: ; %Flow
+; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_cbranch_execz .LBB1_4
+; GFX8-NEXT: ; %bb.3: ; %if
+; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[2:3]
+; GFX8-NEXT: .LBB1_4: ; %endif
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[8:9]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: needs_16bit_insts:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %else
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v7, vcc
+; GFX9-NEXT: ; implicit-def: $vgpr2
+; GFX9-NEXT: ; %bb.2: ; %Flow
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB1_4
+; GFX9-NEXT: ; %bb.3: ; %if
+; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; GFX9-NEXT: .LBB1_4: ; %endif
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: needs_16bit_insts:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX10-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX10-NEXT: ; %bb.1: ; %else
+; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX10-NEXT: ; implicit-def: $vgpr2
+; GFX10-NEXT: ; %bb.2: ; %Flow
+; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT: s_cbranch_execz .LBB1_4
+; GFX10-NEXT: ; %bb.3: ; %if
+; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; GFX10-NEXT: .LBB1_4: ; %endif
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v[0:1], v[8:9], off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: needs_16bit_insts:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX11-NEXT: v_cmpx_ne_u64_e32 0, v[4:5]
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: ; %bb.1: ; %else
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX11-NEXT: ; implicit-def: $vgpr2
+; GFX11-NEXT: ; %bb.2: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB1_4
+; GFX11-NEXT: ; %bb.3: ; %if
+; GFX11-NEXT: global_load_b64 v[8:9], v[2:3], off
+; GFX11-NEXT: .LBB1_4: ; %endif
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b64 v[0:1], v[8:9], off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = icmp eq i64 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = load i64, i64 addrspace(1)* %in
+ br label %endif
+
+else:
+ %2 = add i64 %a, %b
+ br label %endif
+
+endif:
+ %3 = phi i64 [%1, %if], [%2, %else]
+ store i64 %3, i64 addrspace(1)* %out
+ ret void
+}
+
+define void @needs_gfx8_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #2 {
+; GFX7-LABEL: needs_gfx8_insts:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: needs_gfx8_insts:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX8-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT: ; %bb.1: ; %else
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v6
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v5, v7, vcc
+; GFX8-NEXT: ; implicit-def: $vgpr2
+; GFX8-NEXT: ; %bb.2: ; %Flow
+; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_cbranch_execz .LBB2_4
+; GFX8-NEXT: ; %bb.3: ; %if
+; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[2:3]
+; GFX8-NEXT: .LBB2_4: ; %endif
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[8:9]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: needs_gfx8_insts:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %else
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v7, vcc
+; GFX9-NEXT: ; implicit-def: $vgpr2
+; GFX9-NEXT: ; %bb.2: ; %Flow
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB2_4
+; GFX9-NEXT: ; %bb.3: ; %if
+; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; GFX9-NEXT: .LBB2_4: ; %endif
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: needs_gfx8_insts:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX10-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX10-NEXT: ; %bb.1: ; %else
+; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX10-NEXT: ; implicit-def: $vgpr2
+; GFX10-NEXT: ; %bb.2: ; %Flow
+; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT: s_cbranch_execz .LBB2_4
+; GFX10-NEXT: ; %bb.3: ; %if
+; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; GFX10-NEXT: .LBB2_4: ; %endif
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v[0:1], v[8:9], off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: needs_gfx8_insts:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX11-NEXT: v_cmpx_ne_u64_e32 0, v[4:5]
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: ; %bb.1: ; %else
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX11-NEXT: ; implicit-def: $vgpr2
+; GFX11-NEXT: ; %bb.2: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB2_4
+; GFX11-NEXT: ; %bb.3: ; %if
+; GFX11-NEXT: global_load_b64 v[8:9], v[2:3], off
+; GFX11-NEXT: .LBB2_4: ; %endif
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b64 v[0:1], v[8:9], off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = icmp eq i64 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = load i64, i64 addrspace(1)* %in
+ br label %endif
+
+else:
+ %2 = add i64 %a, %b
+ br label %endif
+
+endif:
+ %3 = phi i64 [%1, %if], [%2, %else]
+ store i64 %3, i64 addrspace(1)* %out
+ ret void
+}
+
+define void @needs_gfx9_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #3 {
+; GFX7-LABEL: needs_gfx9_insts:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: needs_gfx9_insts:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: needs_gfx9_insts:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %else
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v7, vcc
+; GFX9-NEXT: ; implicit-def: $vgpr2
+; GFX9-NEXT: ; %bb.2: ; %Flow
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB3_4
+; GFX9-NEXT: ; %bb.3: ; %if
+; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; GFX9-NEXT: .LBB3_4: ; %endif
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: needs_gfx9_insts:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX10-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX10-NEXT: ; %bb.1: ; %else
+; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX10-NEXT: ; implicit-def: $vgpr2
+; GFX10-NEXT: ; %bb.2: ; %Flow
+; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT: s_cbranch_execz .LBB3_4
+; GFX10-NEXT: ; %bb.3: ; %if
+; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; GFX10-NEXT: .LBB3_4: ; %endif
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v[0:1], v[8:9], off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: needs_gfx9_insts:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX11-NEXT: v_cmpx_ne_u64_e32 0, v[4:5]
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: ; %bb.1: ; %else
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX11-NEXT: ; implicit-def: $vgpr2
+; GFX11-NEXT: ; %bb.2: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB3_4
+; GFX11-NEXT: ; %bb.3: ; %if
+; GFX11-NEXT: global_load_b64 v[8:9], v[2:3], off
+; GFX11-NEXT: .LBB3_4: ; %endif
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b64 v[0:1], v[8:9], off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = icmp eq i64 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = load i64, i64 addrspace(1)* %in
+ br label %endif
+
+else:
+ %2 = add i64 %a, %b
+ br label %endif
+
+endif:
+ %3 = phi i64 [%1, %if], [%2, %else]
+ store i64 %3, i64 addrspace(1)* %out
+ ret void
+}
+
+define void @needs_gfx10_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #4 {
+; GFX7-LABEL: needs_gfx10_insts:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: needs_gfx10_insts:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: needs_gfx10_insts:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: needs_gfx10_insts:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX10-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX10-NEXT: ; %bb.1: ; %else
+; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX10-NEXT: ; implicit-def: $vgpr2
+; GFX10-NEXT: ; %bb.2: ; %Flow
+; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT: s_cbranch_execz .LBB4_4
+; GFX10-NEXT: ; %bb.3: ; %if
+; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; GFX10-NEXT: .LBB4_4: ; %endif
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v[0:1], v[8:9], off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: needs_gfx10_insts:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX11-NEXT: v_cmpx_ne_u64_e32 0, v[4:5]
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: ; %bb.1: ; %else
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX11-NEXT: ; implicit-def: $vgpr2
+; GFX11-NEXT: ; %bb.2: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB4_4
+; GFX11-NEXT: ; %bb.3: ; %if
+; GFX11-NEXT: global_load_b64 v[8:9], v[2:3], off
+; GFX11-NEXT: .LBB4_4: ; %endif
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b64 v[0:1], v[8:9], off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = icmp eq i64 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = load i64, i64 addrspace(1)* %in
+ br label %endif
+
+else:
+ %2 = add i64 %a, %b
+ br label %endif
+
+endif:
+ %3 = phi i64 [%1, %if], [%2, %else]
+ store i64 %3, i64 addrspace(1)* %out
+ ret void
+}
+
+define void @needs_gfx11_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #5 {
+; GFX7-LABEL: needs_gfx11_insts:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: needs_gfx11_insts:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: needs_gfx11_insts:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: needs_gfx11_insts:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: needs_gfx11_insts:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX11-NEXT: v_cmpx_ne_u64_e32 0, v[4:5]
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: ; %bb.1: ; %else
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX11-NEXT: ; implicit-def: $vgpr2
+; GFX11-NEXT: ; %bb.2: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB5_4
+; GFX11-NEXT: ; %bb.3: ; %if
+; GFX11-NEXT: global_load_b64 v[8:9], v[2:3], off
+; GFX11-NEXT: .LBB5_4: ; %endif
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b64 v[0:1], v[8:9], off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = icmp eq i64 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = load i64, i64 addrspace(1)* %in
+ br label %endif
+
+else:
+ %2 = add i64 %a, %b
+ br label %endif
+
+endif:
+ %3 = phi i64 [%1, %if], [%2, %else]
+ store i64 %3, i64 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { "target-features"="+dpp" }
+attributes #1 = { "target-features"="+16-bit-insts" }
+attributes #2 = { "target-features"="+gfx8-insts" }
+attributes #3 = { "target-features"="+gfx9-insts" }
+attributes #4 = { "target-features"="+gfx10-insts" }
+attributes #5 = { "target-features"="+gfx11-insts" }
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 9055e5c9ba799..3013cf9175cbc 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -59,6 +59,7 @@
; GCN-O0-NEXT: Lower SwitchInst's to branches
; GCN-O0-NEXT: Lower invoke and unwind, for unwindless code generators
; GCN-O0-NEXT: Remove unreachable blocks from the CFG
+; GCN-O0-NEXT: AMDGPU Clear Incompatible Functions Bodies
; GCN-O0-NEXT: Post-Dominator Tree Construction
; GCN-O0-NEXT: Dominator Tree Construction
; GCN-O0-NEXT: Natural Loop Information
@@ -238,6 +239,7 @@
; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O1-NEXT: Function Alias Analysis Results
; GCN-O1-NEXT: Flatten the CFG
+; GCN-O1-NEXT: AMDGPU Clear Incompatible Functions Bodies
; GCN-O1-NEXT: Dominator Tree Construction
; GCN-O1-NEXT: Post-Dominator Tree Construction
; GCN-O1-NEXT: Natural Loop Information
@@ -525,6 +527,7 @@
; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O1-OPTS-NEXT: Function Alias Analysis Results
; GCN-O1-OPTS-NEXT: Flatten the CFG
+; GCN-O1-OPTS-NEXT: AMDGPU Clear Incompatible Functions Bodies
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Natural Loop Information
@@ -820,6 +823,7 @@
; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O2-NEXT: Function Alias Analysis Results
; GCN-O2-NEXT: Flatten the CFG
+; GCN-O2-NEXT: AMDGPU Clear Incompatible Functions Bodies
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Post-Dominator Tree Construction
; GCN-O2-NEXT: Natural Loop Information
@@ -1130,6 +1134,7 @@
; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O3-NEXT: Function Alias Analysis Results
; GCN-O3-NEXT: Flatten the CFG
+; GCN-O3-NEXT: AMDGPU Clear Incompatible Functions Bodies
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Post-Dominator Tree Construction
; GCN-O3-NEXT: Natural Loop Information
More information about the llvm-branch-commits
mailing list