[llvm] [WIP][AMDGPU] combine uniform AMDGPU lane Intrinsics (PR #116953)
Pankaj Dwivedi via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 15 03:28:33 PST 2025
https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/116953
>From 1f450cf4d41138575c67ad396f9d9ff33f98c7b6 Mon Sep 17 00:00:00 2001
From: PankajDwivedi-25 <pankajkumar.divedi at amd.com>
Date: Thu, 21 Nov 2024 12:35:56 +0530
Subject: [PATCH 1/6] [WIP][AMDGPU] combine uniform AMDGPU lane Intrinsics
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 11 +
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 +
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 183 +++++++++++++++
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
.../amdgpu-uniform-intrinsic-combine.ll | 221 ++++++++++++++++++
5 files changed, 417 insertions(+)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 95d0ad0f9dc96a..33572bdb35eacc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -453,6 +453,17 @@ void initializeAMDGPUSetWavePriorityPass(PassRegistry &);
void initializeGCNRewritePartialRegUsesPass(llvm::PassRegistry &);
extern char &GCNRewritePartialRegUsesID;
+void initializeAMDGPUUniformIntrinsicCombinePass(PassRegistry &);
+extern char &AMDGPUUniformIntrinsicCombineID;
+FunctionPass *createAMDGPUUniformIntrinsicCombinePass();
+
+struct AMDGPUUniformIntrinsicCombinePass
+ : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
+ const AMDGPUTargetMachine &TM;
+ AMDGPUUniformIntrinsicCombinePass(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 174a90f0aa419d..bb0b7d97950132 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -65,6 +65,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
AMDGPUUnifyDivergentExitNodesPass())
FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
+FUNCTION_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass(*this))
#undef FUNCTION_PASS
#ifndef FUNCTION_ANALYSIS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
new file mode 100644
index 00000000000000..1288b70697e639
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -0,0 +1,183 @@
+//===-- AMDGPUUniformIntrinsicCombine.cpp
+//-----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass combines uniform intrinsic instructions.
+/// Unifrom Intrinsic combine uses pattern match to identify and optimize
+/// redundent intrinsic instruction.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+using namespace llvm::PatternMatch;
+
+namespace {
+
+class AMDGPUUniformIntrinsicCombine : public FunctionPass {
+public:
+ static char ID;
+ AMDGPUUniformIntrinsicCombine() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<UniformityInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ }
+};
+
+class AMDGPUUniformIntrinsicCombineImpl
+ : public InstVisitor<AMDGPUUniformIntrinsicCombineImpl> {
+private:
+ const UniformityInfo *UI;
+
+ void optimizeUniformIntrinsicInst(IntrinsicInst &II) const;
+
+public:
+ AMDGPUUniformIntrinsicCombineImpl() = delete;
+
+ AMDGPUUniformIntrinsicCombineImpl(const UniformityInfo *UI) : UI(UI) {}
+
+ bool run(Function &F);
+};
+
+} // namespace
+
+char AMDGPUUniformIntrinsicCombine::ID = 0;
+
+char &llvm::AMDGPUUniformIntrinsicCombineID = AMDGPUUniformIntrinsicCombine::ID;
+
+bool AMDGPUUniformIntrinsicCombine::runOnFunction(Function &F) {
+ if (skipFunction(F)) {
+ return false;
+ }
+
+ const UniformityInfo *UI =
+ &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+
+ return AMDGPUUniformIntrinsicCombineImpl(UI).run(F);
+}
+
+PreservedAnalyses
+AMDGPUUniformIntrinsicCombinePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+
+ const auto *UI = &AM.getResult<UniformityInfoAnalysis>(F);
+
+ // @todo check if it is required that this method must return bool, if so
+ // figure out what can be returned.
+ bool IsChanged = AMDGPUUniformIntrinsicCombineImpl(UI).run(F);
+
+ if (!IsChanged) {
+ return PreservedAnalyses::all();
+ }
+
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
+
+bool AMDGPUUniformIntrinsicCombineImpl::run(Function &F) {
+
+ // @todo check if it is required that this method must return bool, if so
+ // figure out what can be returned.
+ const bool IsChanged{false};
+
+ // Iterate over each instruction in the function to get the desired intrinsic
+ // inst to check for optimization.
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (auto *Call = dyn_cast<CallInst>(&I)) {
+ if (auto *Intrinsic = dyn_cast<IntrinsicInst>(Call)) {
+ optimizeUniformIntrinsicInst(*Intrinsic);
+ }
+ }
+ }
+ }
+
+ return IsChanged;
+}
+
+void AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
+ IntrinsicInst &II) const {
+ llvm::Intrinsic::ID IID = II.getIntrinsicID();
+
+ switch (IID) {
+ case Intrinsic::amdgcn_permlane64: {
+ Value *Src = II.getOperand(0);
+ if (UI->isUniform(Src)) {
+ return II.replaceAllUsesWith(Src);
+ }
+ break;
+ }
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readlane: {
+ Value *Srcv = II.getOperand(0);
+ if (UI->isUniform(Srcv)) {
+ return II.replaceAllUsesWith(Srcv);
+ }
+
+ // The rest of these may not be safe if the exec may not be the same between
+ // the def and use.
+ Value *Src = II.getArgOperand(0);
+ Instruction *SrcInst = dyn_cast<Instruction>(Src);
+ if (SrcInst && SrcInst->getParent() != II.getParent())
+ break;
+
+ // readfirstlane (readfirstlane x) -> readfirstlane x
+ // readlane (readfirstlane x), y -> readfirstlane x
+ if (match(Src,
+ PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
+ return II.replaceAllUsesWith(Src);
+ }
+
+ if (IID == Intrinsic::amdgcn_readfirstlane) {
+ // readfirstlane (readlane x, y) -> readlane x, y
+ if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
+ return II.replaceAllUsesWith(Src);
+ }
+ } else {
+ // readlane (readlane x, y), y -> readlane x, y
+ if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
+ PatternMatch::m_Value(),
+ PatternMatch::m_Specific(II.getArgOperand(1))))) {
+ return II.replaceAllUsesWith(Src);
+ }
+ }
+ break;
+ }
+ }
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombine, DEBUG_TYPE,
+ "AMDGPU uniformIntrinsic Combine", false, false)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombine, DEBUG_TYPE,
+ "AMDGPU uniformIntrinsic Combine", false, false)
+
+FunctionPass *llvm::createAMDGPUUniformIntrinsicCombinePass() {
+ return new AMDGPUUniformIntrinsicCombine();
+}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index fed29c3e14aae2..13e0fc61a82443 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -61,6 +61,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUHSAMetadataStreamer.cpp
AMDGPUInsertDelayAlu.cpp
AMDGPUInstCombineIntrinsic.cpp
+ AMDGPUUniformIntrinsicCombine.cpp
AMDGPUInstrInfo.cpp
AMDGPUInstructionSelector.cpp
AMDGPUISelDAGToDAG.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
new file mode 100644
index 00000000000000..6f5279bb717c77
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -0,0 +1,221 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes="instcombine,amdgpu-uniform-intrinsic-combine" -S < %s | FileCheck %s --check-prefixes=GFX,GFX10
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes="instcombine,amdgpu-uniform-intrinsic-combine" -S < %s | FileCheck %s --check-prefixes=GFX,GFX11
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes="instcombine,amdgpu-uniform-intrinsic-combine" -S < %s | FileCheck %s --check-prefixes=GFX,GFX12
+
+define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_constant(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.permlane64(i32 77)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_undef(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.permlane64(i32 undef)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_sgpr(ptr addrspace(1) %out, i32 %src) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_sgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.permlane64(i32 undef)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_vgpr(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_vgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
+;
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_vgpr_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_vgpr_expression(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
+;
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid2 = add i32 %tid, 1
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_constant(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readlane(i32 7, i32 5)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_undef(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readlane(i32 undef, i32 undef)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_sgpr(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_sgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC0]], i32 [[SRC1]])
+; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_vgpr(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_vgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tidx
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @readlane_vgpr_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_vgpr_expression(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
+; GFX-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %tidx2 = add i32 %tidx, 1
+ %tidy2 = add i32 %tidy, 2
+ %v = call i32 @llvm.amdgcn.readlane(i32 %tidx2, i32 %tidy2)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tidx
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_constant(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 7)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_undef(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 undef)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_sgpr(ptr addrspace(1) %out, i32 %src0) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_sgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
+; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_vgpr(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_vgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
+;
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_vgpr_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_vgpr_expression(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID2]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
+;
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid2 = add i32 %tid, 1
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid2)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10: {{.*}}
+; GFX11: {{.*}}
+; GFX12: {{.*}}
>From 4d52200a7b5915b4026cc4b63d29107a5c77d754 Mon Sep 17 00:00:00 2001
From: PankajDwivedi-25 <pankajkumar.divedi at amd.com>
Date: Fri, 22 Nov 2024 19:42:42 +0530
Subject: [PATCH 2/6] refactored and updated intrinsics handling
---
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 71 +++++++------------
1 file changed, 27 insertions(+), 44 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 1288b70697e639..00ea2505862316 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -1,5 +1,4 @@
-//===-- AMDGPUUniformIntrinsicCombine.cpp
-//-----------------------------------------===//
+//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -9,13 +8,16 @@
//
/// \file
/// This pass combines uniform intrinsic instructions.
-/// Unifrom Intrinsic combine uses pattern match to identify and optimize
-/// redundent intrinsic instruction.
+/// Uniform Intrinsic Combine uses pattern match to identify and optimize
+/// redundant intrinsic instructions.
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
@@ -42,7 +44,7 @@ class AMDGPUUniformIntrinsicCombine : public FunctionPass {
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.setPreservesCFG();
AU.addRequired<UniformityInfoWrapperPass>();
AU.addRequired<TargetPassConfig>();
}
@@ -53,7 +55,7 @@ class AMDGPUUniformIntrinsicCombineImpl
private:
const UniformityInfo *UI;
- void optimizeUniformIntrinsicInst(IntrinsicInst &II) const;
+ bool optimizeUniformIntrinsicInst(IntrinsicInst &II) const;
public:
AMDGPUUniformIntrinsicCombineImpl() = delete;
@@ -86,8 +88,6 @@ AMDGPUUniformIntrinsicCombinePass::run(Function &F,
const auto *UI = &AM.getResult<UniformityInfoAnalysis>(F);
- // @todo check if it is required that this method must return bool, if so
- // figure out what can be returned.
bool IsChanged = AMDGPUUniformIntrinsicCombineImpl(UI).run(F);
if (!IsChanged) {
@@ -96,14 +96,16 @@ AMDGPUUniformIntrinsicCombinePass::run(Function &F,
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<UniformityInfoAnalysis>();
+ PA.preserve<TargetLibraryAnalysis>();
return PA;
}
bool AMDGPUUniformIntrinsicCombineImpl::run(Function &F) {
- // @todo check if it is required that this method must return bool, if so
- // figure out what can be returned.
- const bool IsChanged{false};
+ bool IsChanged{false};
// Iterate over each instruction in the function to get the desired intrinsic
// inst to check for optimization.
@@ -111,7 +113,7 @@ bool AMDGPUUniformIntrinsicCombineImpl::run(Function &F) {
for (Instruction &I : BB) {
if (auto *Call = dyn_cast<CallInst>(&I)) {
if (auto *Intrinsic = dyn_cast<IntrinsicInst>(Call)) {
- optimizeUniformIntrinsicInst(*Intrinsic);
+ IsChanged |= optimizeUniformIntrinsicInst(*Intrinsic);
}
}
}
@@ -120,55 +122,36 @@ bool AMDGPUUniformIntrinsicCombineImpl::run(Function &F) {
return IsChanged;
}
-void AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
+bool AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
IntrinsicInst &II) const {
llvm::Intrinsic::ID IID = II.getIntrinsicID();
switch (IID) {
- case Intrinsic::amdgcn_permlane64: {
- Value *Src = II.getOperand(0);
- if (UI->isUniform(Src)) {
- return II.replaceAllUsesWith(Src);
- }
- break;
- }
+ case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
- Value *Srcv = II.getOperand(0);
- if (UI->isUniform(Srcv)) {
- return II.replaceAllUsesWith(Srcv);
- }
-
- // The rest of these may not be safe if the exec may not be the same between
- // the def and use.
Value *Src = II.getArgOperand(0);
+ // The below part may not be safe if the exec is not same between the def
+ // and use. Is this part stilll required??
Instruction *SrcInst = dyn_cast<Instruction>(Src);
if (SrcInst && SrcInst->getParent() != II.getParent())
break;
// readfirstlane (readfirstlane x) -> readfirstlane x
+ // readfirstlane (readlane x, y) -> readlane x, y
// readlane (readfirstlane x), y -> readfirstlane x
- if (match(Src,
- PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
- return II.replaceAllUsesWith(Src);
- }
-
- if (IID == Intrinsic::amdgcn_readfirstlane) {
- // readfirstlane (readlane x, y) -> readlane x, y
- if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
- return II.replaceAllUsesWith(Src);
- }
- } else {
- // readlane (readlane x, y), y -> readlane x, y
- if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
- PatternMatch::m_Value(),
- PatternMatch::m_Specific(II.getArgOperand(1))))) {
- return II.replaceAllUsesWith(Src);
- }
+ // readlane (readlane x, y), z -> readlane x, y
+ // All of the patterns above are handled the same way: whichever intrinsic
+ // is the inner one, it writes the same value to every output lane, so the
+ // outer intrinsic's source operand is uniform and can replace the call.
+ if (UI->isUniform(II.getOperandUse(0))) {
+ II.replaceAllUsesWith(Src);
+ return true;
}
break;
}
}
+ return false;
}
INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombine, DEBUG_TYPE,
>From 60b0b140998da703a431135d7ba95833a10daffc Mon Sep 17 00:00:00 2001
From: PankajDwivedi-25 <pankajkumar.divedi at amd.com>
Date: Fri, 22 Nov 2024 20:39:15 +0530
Subject: [PATCH 3/6] removed redundant casting
---
.../Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 11 ++---------
1 file changed, 2 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 00ea2505862316..b7321f85157507 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -111,10 +111,8 @@ bool AMDGPUUniformIntrinsicCombineImpl::run(Function &F) {
// inst to check for optimization.
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
- if (auto *Call = dyn_cast<CallInst>(&I)) {
- if (auto *Intrinsic = dyn_cast<IntrinsicInst>(Call)) {
- IsChanged |= optimizeUniformIntrinsicInst(*Intrinsic);
- }
+ if (auto *Intrinsic = dyn_cast<IntrinsicInst>(&I)) {
+ IsChanged |= optimizeUniformIntrinsicInst(*Intrinsic);
}
}
}
@@ -131,11 +129,6 @@ bool AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
Value *Src = II.getArgOperand(0);
- // The below part may not be safe if the exec is not same between the def
- // and use. Is this part stilll required??
- Instruction *SrcInst = dyn_cast<Instruction>(Src);
- if (SrcInst && SrcInst->getParent() != II.getParent())
- break;
// readfirstlane (readfirstlane x) -> readfirstlane x
// readfirstlane (readlane x, y) -> readlane x, y
>From 2f6339de3b8ef30e669cf17897a3ff84ef1a877b Mon Sep 17 00:00:00 2001
From: PankajDwivedi-25 <pankajkumar.divedi at amd.com>
Date: Fri, 6 Dec 2024 01:23:42 +0530
Subject: [PATCH 4/6] refactored, added more test
---
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 29 +--
.../amdgpu-uniform-intrinsic-combine.ll | 222 ++++++++++++++++--
2 files changed, 208 insertions(+), 43 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index b7321f85157507..94f1366691929d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -7,9 +7,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// This pass combines uniform intrinsic instructions.
-/// Uniform Intrinsic Combine uses pattern match to identify and optimize
-/// redundant intrinsic instructions.
+/// This pass simplifies certain intrinsic calls when the arguments are uniform.
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
@@ -21,6 +19,7 @@
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
@@ -35,14 +34,11 @@ using namespace llvm::AMDGPU;
using namespace llvm::PatternMatch;
namespace {
-
class AMDGPUUniformIntrinsicCombine : public FunctionPass {
public:
static char ID;
AMDGPUUniformIntrinsicCombine() : FunctionPass(ID) {}
-
bool runOnFunction(Function &F) override;
-
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<UniformityInfoWrapperPass>();
@@ -54,46 +50,36 @@ class AMDGPUUniformIntrinsicCombineImpl
: public InstVisitor<AMDGPUUniformIntrinsicCombineImpl> {
private:
const UniformityInfo *UI;
-
bool optimizeUniformIntrinsicInst(IntrinsicInst &II) const;
public:
AMDGPUUniformIntrinsicCombineImpl() = delete;
-
AMDGPUUniformIntrinsicCombineImpl(const UniformityInfo *UI) : UI(UI) {}
-
bool run(Function &F);
};
-
} // namespace
char AMDGPUUniformIntrinsicCombine::ID = 0;
-
char &llvm::AMDGPUUniformIntrinsicCombineID = AMDGPUUniformIntrinsicCombine::ID;
bool AMDGPUUniformIntrinsicCombine::runOnFunction(Function &F) {
if (skipFunction(F)) {
return false;
}
-
const UniformityInfo *UI =
&getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
-
return AMDGPUUniformIntrinsicCombineImpl(UI).run(F);
}
PreservedAnalyses
AMDGPUUniformIntrinsicCombinePass::run(Function &F,
FunctionAnalysisManager &AM) {
-
const auto *UI = &AM.getResult<UniformityInfoAnalysis>(F);
-
bool IsChanged = AMDGPUUniformIntrinsicCombineImpl(UI).run(F);
if (!IsChanged) {
return PreservedAnalyses::all();
}
-
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<LoopAnalysis>();
@@ -104,19 +90,14 @@ AMDGPUUniformIntrinsicCombinePass::run(Function &F,
}
bool AMDGPUUniformIntrinsicCombineImpl::run(Function &F) {
-
bool IsChanged{false};
-
// Iterate over each instruction in the function to get the desired intrinsic
// inst to check for optimization.
- for (BasicBlock &BB : F) {
- for (Instruction &I : BB) {
- if (auto *Intrinsic = dyn_cast<IntrinsicInst>(&I)) {
- IsChanged |= optimizeUniformIntrinsicInst(*Intrinsic);
- }
+ for (Instruction &I : instructions(F)) {
+ if (auto *Intrinsic = dyn_cast<IntrinsicInst>(&I)) {
+ IsChanged |= optimizeUniformIntrinsicInst(*Intrinsic);
}
}
-
return IsChanged;
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index 6f5279bb717c77..8f4b70c632e443 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -25,18 +25,20 @@ define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @permlane64_sgpr(ptr addrspace(1) %out, i32 %src) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_sgpr(
+define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_uniform(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[SRC]])
+; GFX-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
; GFX-NEXT: ret void
;
- %v = call i32 @llvm.amdgcn.permlane64(i32 undef)
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %src)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @permlane64_vgpr(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_vgpr(
+define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
@@ -52,8 +54,8 @@ define amdgpu_kernel void @permlane64_vgpr(i32 addrspace(1)* %out) {
ret void
}
-define amdgpu_kernel void @permlane64_vgpr_expression(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_vgpr_expression(
+define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
@@ -92,8 +94,8 @@ define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @readlane_sgpr(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_sgpr(
+define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC0]], i32 [[SRC1]])
; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
@@ -104,8 +106,8 @@ define amdgpu_kernel void @readlane_sgpr(ptr addrspace(1) %out, i32 %src0, i32 %
ret void
}
-define amdgpu_kernel void @readlane_vgpr(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_vgpr(
+define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -123,8 +125,8 @@ define amdgpu_kernel void @readlane_vgpr(i32 addrspace(1)* %out) {
ret void
}
-define amdgpu_kernel void @readlane_vgpr_expression(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_vgpr_expression(
+define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -167,8 +169,8 @@ define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @readfirstlane_sgpr(ptr addrspace(1) %out, i32 %src0) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_sgpr(
+define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
@@ -179,8 +181,8 @@ define amdgpu_kernel void @readfirstlane_sgpr(ptr addrspace(1) %out, i32 %src0)
ret void
}
-define amdgpu_kernel void @readfirstlane_vgpr(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_vgpr(
+define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
@@ -196,8 +198,8 @@ define amdgpu_kernel void @readfirstlane_vgpr(i32 addrspace(1)* %out) {
ret void
}
-define amdgpu_kernel void @readfirstlane_vgpr_expression(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_vgpr_expression(
+define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_expression(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
@@ -215,6 +217,188 @@ define amdgpu_kernel void @readfirstlane_vgpr_expression(i32 addrspace(1)* %out)
ret void
}
+define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
+ %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
+ %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_readlane(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+
+define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr addrspace(1) %out_max) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_boundary(
+; GFX-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
+; GFX-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
+; GFX-NEXT: ret void
+;
+ %min_v = call i32 @llvm.amdgcn.permlane64(i32 -2147483648)
+ store i32 %min_v, ptr addrspace(1) %out_min
+ %max_v = call i32 @llvm.amdgcn.permlane64(i32 2147483647)
+ store i32 %max_v, ptr addrspace(1) %out_max
+ ret void
+}
+
+define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_cross_lane(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = add i32 %tidx, 5
+ %v = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_random(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %random = xor i32 123, 456
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 %random)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_invalid(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_invalid(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: ret void
+;
+ %undef_v = call i32 @llvm.amdgcn.permlane64(i32 undef)
+ store i32 %undef_v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_expression(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[IDX2:%.*]] = shl i32 [[IDX1]], 1
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %idx1 = call i32 @llvm.amdgcn.workitem.id.x()
+ %idx2 = mul i32 %idx1, 2
+ %v = call i32 @llvm.amdgcn.readlane(i32 %idx1, i32 %idx2)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+; Test case: Ensure that a loop with a divergent exit and a uniform value
+; used by an intrinsic outside the loop is not optimized due to temporal divergence.
+
+define amdgpu_kernel void @test_divergent_exit(ptr addrspace(1) %out, i32 %max_iter, i32 %div_cond) {
+; GFX-LABEL: define amdgpu_kernel void @test_divergent_exit(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[MAX_ITER:%.*]], i32 [[DIV_COND:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[ENTRY:.*:]]
+; GFX-NEXT: [[ITER:%.*]] = alloca i32, align 4
+; GFX-NEXT: store i32 0, ptr [[ITER]], align 4
+; GFX-NEXT: br label %[[LOOP:.*]]
+; GFX: [[LOOP]]:
+; GFX-NEXT: [[ITER_VAL:%.*]] = load i32, ptr [[ITER]], align 4
+; GFX-NEXT: [[NEW_ITER:%.*]] = add i32 [[ITER_VAL]], 1
+; GFX-NEXT: store i32 [[NEW_ITER]], ptr [[ITER]], align 4
+; GFX-NEXT: [[COND1:%.*]] = icmp sgt i32 [[NEW_ITER]], [[MAX_ITER]]
+; GFX-NEXT: [[COND2:%.*]] = icmp eq i32 [[DIV_COND]], 0
+; GFX-NEXT: [[EXIT:%.*]] = or i1 [[COND1]], [[COND2]]
+; GFX-NEXT: br i1 [[EXIT]], label %[[EXIT_BLOCK:.*]], label %[[LOOP]]
+; GFX: [[EXIT_BLOCK]]:
+; GFX-NEXT: [[FINAL_VAL:%.*]] = load i32, ptr [[ITER]], align 4
+; GFX-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[FINAL_VAL]])
+; GFX-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+entry:
+ %iter = alloca i32, align 4
+ store i32 0, ptr %iter, align 4
+ br label %loop
+
+loop:
+ ; Increment loop counter
+ %iter_val = load i32, ptr %iter, align 4
+ %new_iter = add i32 %iter_val, 1
+ store i32 %new_iter, ptr %iter, align 4
+
+ ; Check exit conditions
+ %cond1 = icmp sgt i32 %new_iter, %max_iter
+ %cond2 = icmp eq i32 %div_cond, 0
+ %exit = or i1 %cond1, %cond2
+ br i1 %exit, label %exit_block, label %loop
+
+exit_block:
+ ; Use the uniform value in an intrinsic outside the loop
+ %final_val = load i32, ptr %iter, align 4
+ %result = call i32 @llvm.amdgcn.permlane64(i32 %final_val)
+ store i32 %result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.permlane64(i32)
+
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX10: {{.*}}
; GFX11: {{.*}}
>From d3d0ff2e5be690e665915e0179c9ba32d659c384 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Thu, 9 Jan 2025 23:24:16 +0530
Subject: [PATCH 5/6] integrated in pipeline, more tests added
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 9 +--
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 11 +++-
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 25 +++++---
.../amdgpu-simplify-trivial-waterfall-loop.ll | 49 +++++++++++++++
.../amdgpu-uniform-intrinsic-combine.ll | 59 ++++++++++++++++---
5 files changed, 132 insertions(+), 21 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 33572bdb35eacc..960da145b7bf02 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -453,14 +453,15 @@ void initializeAMDGPUSetWavePriorityPass(PassRegistry &);
void initializeGCNRewritePartialRegUsesPass(llvm::PassRegistry &);
extern char &GCNRewritePartialRegUsesID;
-void initializeAMDGPUUniformIntrinsicCombinePass(PassRegistry &);
-extern char &AMDGPUUniformIntrinsicCombineID;
-FunctionPass *createAMDGPUUniformIntrinsicCombinePass();
+void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &);
+extern char &AMDGPUUniformIntrinsicCombineLegacyPassID;
+FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass(
+ const AMDGPUTargetMachine *TM = nullptr);
struct AMDGPUUniformIntrinsicCombinePass
: public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
const AMDGPUTargetMachine &TM;
- AMDGPUUniformIntrinsicCombinePass(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
+ AMDGPUUniformIntrinsicCombinePass(const AMDGPUTargetMachine &TM) : TM(TM) {}
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 786baa6820e860..9fa71da12bc484 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -449,6 +449,11 @@ static cl::opt<bool>
cl::desc("Enable AMDGPUAttributorPass"),
cl::init(true), cl::Hidden);
+static cl::opt<bool> EnableUniformIntrinsicCombine(
+ "amdgpu-enable-uniform-intrinsic-combine",
+ cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
+ cl::init(true), cl::Hidden);
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -529,6 +534,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeGCNPreRALongBranchRegPass(*PR);
initializeGCNRewritePartialRegUsesPass(*PR);
initializeGCNRegPressurePrinterPass(*PR);
+ initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -770,13 +776,16 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
});
PB.registerPeepholeEPCallback(
- [](FunctionPassManager &FPM, OptimizationLevel Level) {
+ [this](FunctionPassManager &FPM, OptimizationLevel Level) {
if (Level == OptimizationLevel::O0)
return;
FPM.addPass(AMDGPUUseNativeCallsPass());
if (EnableLibCallSimplify)
FPM.addPass(AMDGPUSimplifyLibCallsPass());
+
+ if (EnableUniformIntrinsicCombine)
+ FPM.addPass(AMDGPUUniformIntrinsicCombinePass(*this));
});
PB.registerCGSCCOptimizerLateEPCallback(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 94f1366691929d..ad2c5e223aa89d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -34,10 +34,15 @@ using namespace llvm::AMDGPU;
using namespace llvm::PatternMatch;
namespace {
-class AMDGPUUniformIntrinsicCombine : public FunctionPass {
+class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass {
public:
static char ID;
- AMDGPUUniformIntrinsicCombine() : FunctionPass(ID) {}
+ const AMDGPUTargetMachine *AMDGPUTM;
+ AMDGPUUniformIntrinsicCombineLegacy(const AMDGPUTargetMachine *TM)
+ : FunctionPass(ID), AMDGPUTM(TM) {
+ initializeAMDGPUUniformIntrinsicCombineLegacyPass(
+ *PassRegistry::getPassRegistry());
+ }
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -59,10 +64,11 @@ class AMDGPUUniformIntrinsicCombineImpl
};
} // namespace
-char AMDGPUUniformIntrinsicCombine::ID = 0;
-char &llvm::AMDGPUUniformIntrinsicCombineID = AMDGPUUniformIntrinsicCombine::ID;
+char AMDGPUUniformIntrinsicCombineLegacy::ID = 0;
+char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID =
+ AMDGPUUniformIntrinsicCombineLegacy::ID;
-bool AMDGPUUniformIntrinsicCombine::runOnFunction(Function &F) {
+bool AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) {
if (skipFunction(F)) {
return false;
}
@@ -128,13 +134,14 @@ bool AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
return false;
}
-INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombine, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
"AMDGPU uniformIntrinsic Combine", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
-INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombine, DEBUG_TYPE,
+INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
"AMDGPU uniformIntrinsic Combine", false, false)
-FunctionPass *llvm::createAMDGPUUniformIntrinsicCombinePass() {
- return new AMDGPUUniformIntrinsicCombine();
+FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass(
+ const AMDGPUTargetMachine *TM) {
+ return new AMDGPUUniformIntrinsicCombineLegacy(TM);
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
new file mode 100644
index 00000000000000..2a1fddff5f3c8d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX,GFX10
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX,GFX11
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX,GFX12
+
+define amdgpu_kernel void @trivial_waterfall_loop(ptr addrspace(1) %out, i32 %src) {
+; CHECK-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
+; CHECK: store i32 %src, ptr addrspace(1) %out, align 4
+; CHECK-NOT: br label %loop
+; GFX-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
+; GFX-SAME: ptr addrspace(1) nocapture writeonly [[OUT:%.*]], i32 [[SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; GFX-NEXT: [[ENTRY:.*:]]
+; GFX-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+entry:
+ ; Initialize result to zero
+ %result = alloca i32, align 4
+ store i32 0, ptr %result, align 4
+ br label %loop
+
+loop:
+ ; Load the current result
+ %cur_result = load i32, ptr %result, align 4
+
+ ; Compute the next value
+ %next_value = add i32 %cur_result, %src
+
+ ; Apply the readfirstlane intrinsic for uniformity
+ %uniform_value = call i32 @llvm.amdgcn.readfirstlane(i32 %next_value)
+
+ ; Store the uniform result back
+ store i32 %uniform_value, ptr %result, align 4
+
+ ; This is a trivial loop that always exits after one iteration
+ br i1 true, label %exit, label %loop
+
+exit:
+ ; Store the result to the output pointer
+ %final_result = load i32, ptr %result, align 4
+ store i32 %final_result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10: {{.*}}
+; GFX11: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index 8f4b70c632e443..40c0d11c68e5e9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes="instcombine,amdgpu-uniform-intrinsic-combine" -S < %s | FileCheck %s --check-prefixes=GFX,GFX10
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes="instcombine,amdgpu-uniform-intrinsic-combine" -S < %s | FileCheck %s --check-prefixes=GFX,GFX11
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes="instcombine,amdgpu-uniform-intrinsic-combine" -S < %s | FileCheck %s --check-prefixes=GFX,GFX12
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=instcombine,amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s --check-prefixes=GFX,GFX10
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=instcombine,amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s --check-prefixes=GFX,GFX11
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=instcombine,amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s --check-prefixes=GFX,GFX12
define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
; GFX-LABEL: define amdgpu_kernel void @permlane64_constant(
@@ -28,7 +28,6 @@ define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
; GFX-LABEL: define amdgpu_kernel void @permlane64_uniform(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[SRC]])
; GFX-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
; GFX-NEXT: ret void
;
@@ -97,7 +96,6 @@ define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC0]], i32 [[SRC1]])
; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
; GFX-NEXT: ret void
;
@@ -172,7 +170,6 @@ define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) {
; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
; GFX-NEXT: ret void
;
@@ -396,8 +393,56 @@ exit_block:
ret void
}
-declare i32 @llvm.amdgcn.permlane64(i32)
+; Define the kernel function
+define amdgpu_kernel void @trivial_waterfall_loop(ptr addrspace(1) %out, i32 %src) {
+; CHECK-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
+; CHECK: store i32 %src, ptr addrspace(1) %out, align 4
+; CHECK-NOT: br label %loop
+; GFX-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[ENTRY:.*:]]
+; GFX-NEXT: [[RESULT:%.*]] = alloca i32, align 4
+; GFX-NEXT: store i32 0, ptr [[RESULT]], align 4
+; GFX-NEXT: br label %[[LOOP:.*]]
+; GFX: [[LOOP]]:
+; GFX-NEXT: [[CUR_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
+; GFX-NEXT: [[NEXT_VALUE:%.*]] = add i32 [[CUR_RESULT]], [[SRC]]
+; GFX-NEXT: [[UNIFORM_VALUE:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[NEXT_VALUE]])
+; GFX-NEXT: store i32 [[UNIFORM_VALUE]], ptr [[RESULT]], align 4
+; GFX-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP]]
+; GFX: [[EXIT]]:
+; GFX-NEXT: [[FINAL_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
+; GFX-NEXT: store i32 [[FINAL_RESULT]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+entry:
+ ; Initialize result to zero
+ %result = alloca i32, align 4
+ store i32 0, ptr %result, align 4
+ br label %loop
+loop:
+ ; Load the current result
+ %cur_result = load i32, ptr %result, align 4
+
+ ; Compute the next value
+ %next_value = add i32 %cur_result, %src
+
+ ; Apply the readfirstlane intrinsic for uniformity
+ %uniform_value = call i32 @llvm.amdgcn.readfirstlane(i32 %next_value)
+
+ ; Store the uniform result back
+ store i32 %uniform_value, ptr %result, align 4
+
+ ; This is a trivial loop that always exits after one iteration
+ br i1 true, label %exit, label %loop
+
+exit:
+ ; Store the result to the output pointer
+ %final_result = load i32, ptr %result, align 4
+ store i32 %final_result, ptr addrspace(1) %out, align 4
+ ret void
+}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX10: {{.*}}
>From 0b63ef35dada41093f98c86cca8b8163567904e7 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 15 Jan 2025 16:58:15 +0530
Subject: [PATCH 6/6] removed unused gfx checks
---
.../amdgpu-simplify-trivial-waterfall-loop.ll | 21 +-
.../amdgpu-uniform-intrinsic-combine.ll | 355 +++++++++---------
2 files changed, 179 insertions(+), 197 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
index 2a1fddff5f3c8d..56ba117ce1d304 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
@@ -1,18 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX,GFX10
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX,GFX11
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX,GFX12
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX10
define amdgpu_kernel void @trivial_waterfall_loop(ptr addrspace(1) %out, i32 %src) {
-; CHECK-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
-; CHECK: store i32 %src, ptr addrspace(1) %out, align 4
-; CHECK-NOT: br label %loop
-; GFX-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
-; GFX-SAME: ptr addrspace(1) nocapture writeonly [[OUT:%.*]], i32 [[SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; GFX-NEXT: [[ENTRY:.*:]]
-; GFX-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
+; GFX10-SAME: ptr addrspace(1) nocapture writeonly [[OUT:%.*]], i32 [[SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; GFX10-NEXT: [[ENTRY:.*:]]
+; GFX10-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
entry:
; Initialize result to zero
@@ -43,7 +38,3 @@ exit:
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10: {{.*}}
-; GFX11: {{.*}}
-; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index 40c0d11c68e5e9..ddedd435d35aa5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -1,14 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=instcombine,amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s --check-prefixes=GFX,GFX10
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=instcombine,amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s --check-prefixes=GFX,GFX11
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=instcombine,amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s --check-prefixes=GFX,GFX12
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=instcombine,amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s --check-prefixes=GFX10
define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_constant(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; GFX-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_constant(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX10-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 77)
store i32 %v, ptr addrspace(1) %out
@@ -16,9 +14,9 @@ define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_undef(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_undef(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -26,10 +24,10 @@ define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_uniform(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_uniform(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 %src)
store i32 %v, ptr addrspace(1) %out
@@ -37,14 +35,14 @@ define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
}
define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
-; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
+; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX10-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
@@ -54,15 +52,15 @@ define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
}
define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
-; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
+; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX10-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = add i32 %tid, 1
@@ -73,10 +71,10 @@ define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %o
}
define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_constant(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_constant(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 7, i32 5)
store i32 %v, ptr addrspace(1) %out
@@ -84,9 +82,9 @@ define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_undef(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_undef(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 undef, i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -94,10 +92,10 @@ define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
store i32 %v, ptr addrspace(1) %out
@@ -105,15 +103,15 @@ define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i3
}
define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
-; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX10-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -124,17 +122,17 @@ define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out)
}
define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
-; GFX-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
-; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
-; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX10-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
+; GFX10-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
+; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX10-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -147,10 +145,10 @@ define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out
}
define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_constant(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_constant(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 7)
store i32 %v, ptr addrspace(1) %out
@@ -158,9 +156,9 @@ define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_undef(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_undef(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -168,10 +166,10 @@ define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
store i32 %v, ptr addrspace(1) %out
@@ -179,14 +177,14 @@ define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i3
}
define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
-; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
+; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX10-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid)
@@ -196,15 +194,15 @@ define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out
}
define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_expression(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
-; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID2]] to i64
-; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_expression(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
+; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID2]] to i64
+; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX10-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = add i32 %tid, 1
@@ -215,10 +213,10 @@ define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
}
define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
%v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
@@ -227,13 +225,13 @@ define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %ou
}
define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX10-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX10-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -244,12 +242,12 @@ define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
+; GFX10-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
@@ -259,13 +257,13 @@ define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_readlane(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_readlane(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX10-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX10-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -277,11 +275,11 @@ define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr addrspace(1) %out_max) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_boundary(
-; GFX-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
-; GFX-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_boundary(
+; GFX10-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
+; GFX10-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
+; GFX10-NEXT: ret void
;
%min_v = call i32 @llvm.amdgcn.permlane64(i32 -2147483648)
store i32 %min_v, ptr addrspace(1) %out_min
@@ -291,13 +289,13 @@ define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr ad
}
define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_cross_lane(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_cross_lane(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = add i32 %tidx, 5
@@ -307,10 +305,10 @@ define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_random(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_random(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%random = xor i32 123, 456
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %random)
@@ -319,9 +317,9 @@ define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @permlane64_invalid(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_invalid(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_invalid(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: ret void
;
%undef_v = call i32 @llvm.amdgcn.permlane64(i32 undef)
store i32 %undef_v, ptr addrspace(1) %out
@@ -329,13 +327,13 @@ define amdgpu_kernel void @permlane64_invalid(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_expression(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[IDX2:%.*]] = shl i32 [[IDX1]], 1
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_expression(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[IDX2:%.*]] = shl i32 [[IDX1]], 1
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%idx1 = call i32 @llvm.amdgcn.workitem.id.x()
%idx2 = mul i32 %idx1, 2
@@ -348,25 +346,25 @@ define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
; used by an intrinsic outside the loop is not optimized due to temporal divergence.
define amdgpu_kernel void @test_divergent_exit(ptr addrspace(1) %out, i32 %max_iter, i32 %div_cond) {
-; GFX-LABEL: define amdgpu_kernel void @test_divergent_exit(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[MAX_ITER:%.*]], i32 [[DIV_COND:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[ENTRY:.*:]]
-; GFX-NEXT: [[ITER:%.*]] = alloca i32, align 4
-; GFX-NEXT: store i32 0, ptr [[ITER]], align 4
-; GFX-NEXT: br label %[[LOOP:.*]]
-; GFX: [[LOOP]]:
-; GFX-NEXT: [[ITER_VAL:%.*]] = load i32, ptr [[ITER]], align 4
-; GFX-NEXT: [[NEW_ITER:%.*]] = add i32 [[ITER_VAL]], 1
-; GFX-NEXT: store i32 [[NEW_ITER]], ptr [[ITER]], align 4
-; GFX-NEXT: [[COND1:%.*]] = icmp sgt i32 [[NEW_ITER]], [[MAX_ITER]]
-; GFX-NEXT: [[COND2:%.*]] = icmp eq i32 [[DIV_COND]], 0
-; GFX-NEXT: [[EXIT:%.*]] = or i1 [[COND1]], [[COND2]]
-; GFX-NEXT: br i1 [[EXIT]], label %[[EXIT_BLOCK:.*]], label %[[LOOP]]
-; GFX: [[EXIT_BLOCK]]:
-; GFX-NEXT: [[FINAL_VAL:%.*]] = load i32, ptr [[ITER]], align 4
-; GFX-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[FINAL_VAL]])
-; GFX-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @test_divergent_exit(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[MAX_ITER:%.*]], i32 [[DIV_COND:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[ENTRY:.*:]]
+; GFX10-NEXT: [[ITER:%.*]] = alloca i32, align 4
+; GFX10-NEXT: store i32 0, ptr [[ITER]], align 4
+; GFX10-NEXT: br label %[[LOOP:.*]]
+; GFX10: [[LOOP]]:
+; GFX10-NEXT: [[ITER_VAL:%.*]] = load i32, ptr [[ITER]], align 4
+; GFX10-NEXT: [[NEW_ITER:%.*]] = add i32 [[ITER_VAL]], 1
+; GFX10-NEXT: store i32 [[NEW_ITER]], ptr [[ITER]], align 4
+; GFX10-NEXT: [[COND1:%.*]] = icmp sgt i32 [[NEW_ITER]], [[MAX_ITER]]
+; GFX10-NEXT: [[COND2:%.*]] = icmp eq i32 [[DIV_COND]], 0
+; GFX10-NEXT: [[EXIT:%.*]] = or i1 [[COND1]], [[COND2]]
+; GFX10-NEXT: br i1 [[EXIT]], label %[[EXIT_BLOCK:.*]], label %[[LOOP]]
+; GFX10: [[EXIT_BLOCK]]:
+; GFX10-NEXT: [[FINAL_VAL:%.*]] = load i32, ptr [[ITER]], align 4
+; GFX10-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[FINAL_VAL]])
+; GFX10-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
entry:
%iter = alloca i32, align 4
@@ -395,25 +393,22 @@ exit_block:
; Define the kernel function
define amdgpu_kernel void @trivial_waterfall_loop(ptr addrspace(1) %out, i32 %src) {
-; CHECK-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
-; CHECK: store i32 %src, ptr addrspace(1) %out, align 4
-; CHECK-NOT: br label %loop
-; GFX-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[ENTRY:.*:]]
-; GFX-NEXT: [[RESULT:%.*]] = alloca i32, align 4
-; GFX-NEXT: store i32 0, ptr [[RESULT]], align 4
-; GFX-NEXT: br label %[[LOOP:.*]]
-; GFX: [[LOOP]]:
-; GFX-NEXT: [[CUR_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
-; GFX-NEXT: [[NEXT_VALUE:%.*]] = add i32 [[CUR_RESULT]], [[SRC]]
-; GFX-NEXT: [[UNIFORM_VALUE:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[NEXT_VALUE]])
-; GFX-NEXT: store i32 [[UNIFORM_VALUE]], ptr [[RESULT]], align 4
-; GFX-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP]]
-; GFX: [[EXIT]]:
-; GFX-NEXT: [[FINAL_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
-; GFX-NEXT: store i32 [[FINAL_RESULT]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[ENTRY:.*:]]
+; GFX10-NEXT: [[RESULT:%.*]] = alloca i32, align 4
+; GFX10-NEXT: store i32 0, ptr [[RESULT]], align 4
+; GFX10-NEXT: br label %[[LOOP:.*]]
+; GFX10: [[LOOP]]:
+; GFX10-NEXT: [[CUR_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
+; GFX10-NEXT: [[NEXT_VALUE:%.*]] = add i32 [[CUR_RESULT]], [[SRC]]
+; GFX10-NEXT: [[UNIFORM_VALUE:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[NEXT_VALUE]])
+; GFX10-NEXT: store i32 [[UNIFORM_VALUE]], ptr [[RESULT]], align 4
+; GFX10-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP]]
+; GFX10: [[EXIT]]:
+; GFX10-NEXT: [[FINAL_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
+; GFX10-NEXT: store i32 [[FINAL_RESULT]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
entry:
; Initialize result to zero
@@ -444,7 +439,3 @@ exit:
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10: {{.*}}
-; GFX11: {{.*}}
-; GFX12: {{.*}}
More information about the llvm-commits mailing list