[llvm] [AMDGPU] Enable "amdgpu-uniform-intrinsic-combine" pass in pipeline. (PR #128687)
Pankaj Dwivedi via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 25 02:08:26 PST 2025
https://github.com/PankajDwivedi-25 created https://github.com/llvm/llvm-project/pull/128687
This PR enables the "amdgpu-uniform-intrinsic-combine" pass in the pipeline.
It also introduces the "amdgpu-enable-uniform-intrinsic-combine" command-line flag to enable/disable the pass.
This PR will be merged once PR https://github.com/llvm/llvm-project/pull/116953 is merged.
>From c2f5258fad02d0545d0d0b6a58ca914a15fed1bd Mon Sep 17 00:00:00 2001
From: PankajDwivedi-25 <pankajkumar.divedi at amd.com>
Date: Thu, 21 Nov 2024 12:35:56 +0530
Subject: [PATCH 01/17] [WIP][AMDGPU] combine uniform AMDGPU lane Intrinsics
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 11 +
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 +
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 183 +++++++++++++++
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
.../amdgpu-uniform-intrinsic-combine.ll | 221 ++++++++++++++++++
5 files changed, 417 insertions(+)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 428355a739628..2971a90623394 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -466,6 +466,17 @@ extern char &GCNRewritePartialRegUsesID;
void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
extern char &AMDGPUWaitSGPRHazardsLegacyID;
+void initializeAMDGPUUniformIntrinsicCombinePass(PassRegistry &);
+extern char &AMDGPUUniformIntrinsicCombineID;
+FunctionPass *createAMDGPUUniformIntrinsicCombinePass();
+
+struct AMDGPUUniformIntrinsicCombinePass
+ : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
+ const AMDGPUTargetMachine &TM;
+ AMDGPUUniformIntrinsicCombinePass(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index e89d84c8a105f..02a121a2b18b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -67,6 +67,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
AMDGPUUnifyDivergentExitNodesPass())
FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
+FUNCTION_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass(*this))
#undef FUNCTION_PASS
#ifndef FUNCTION_ANALYSIS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
new file mode 100644
index 0000000000000..1288b70697e63
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -0,0 +1,183 @@
+//===-- AMDGPUUniformIntrinsicCombine.cpp
+//-----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass combines uniform intrinsic instructions.
+/// Unifrom Intrinsic combine uses pattern match to identify and optimize
+/// redundent intrinsic instruction.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+using namespace llvm::PatternMatch;
+
+namespace {
+
+class AMDGPUUniformIntrinsicCombine : public FunctionPass {
+public:
+ static char ID;
+ AMDGPUUniformIntrinsicCombine() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<UniformityInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ }
+};
+
+class AMDGPUUniformIntrinsicCombineImpl
+ : public InstVisitor<AMDGPUUniformIntrinsicCombineImpl> {
+private:
+ const UniformityInfo *UI;
+
+ void optimizeUniformIntrinsicInst(IntrinsicInst &II) const;
+
+public:
+ AMDGPUUniformIntrinsicCombineImpl() = delete;
+
+ AMDGPUUniformIntrinsicCombineImpl(const UniformityInfo *UI) : UI(UI) {}
+
+ bool run(Function &F);
+};
+
+} // namespace
+
+char AMDGPUUniformIntrinsicCombine::ID = 0;
+
+char &llvm::AMDGPUUniformIntrinsicCombineID = AMDGPUUniformIntrinsicCombine::ID;
+
+bool AMDGPUUniformIntrinsicCombine::runOnFunction(Function &F) {
+ if (skipFunction(F)) {
+ return false;
+ }
+
+ const UniformityInfo *UI =
+ &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+
+ return AMDGPUUniformIntrinsicCombineImpl(UI).run(F);
+}
+
+PreservedAnalyses
+AMDGPUUniformIntrinsicCombinePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+
+ const auto *UI = &AM.getResult<UniformityInfoAnalysis>(F);
+
+ // @todo check if it is required that this method must return bool, if so
+ // figure out what can be returned.
+ bool IsChanged = AMDGPUUniformIntrinsicCombineImpl(UI).run(F);
+
+ if (!IsChanged) {
+ return PreservedAnalyses::all();
+ }
+
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
+
+bool AMDGPUUniformIntrinsicCombineImpl::run(Function &F) {
+
+ // @todo check if it is required that this method must return bool, if so
+ // figure out what can be returned.
+ const bool IsChanged{false};
+
+ // Iterate over each instruction in the function to get the desired intrinsic
+ // inst to check for optimization.
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (auto *Call = dyn_cast<CallInst>(&I)) {
+ if (auto *Intrinsic = dyn_cast<IntrinsicInst>(Call)) {
+ optimizeUniformIntrinsicInst(*Intrinsic);
+ }
+ }
+ }
+ }
+
+ return IsChanged;
+}
+
+void AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
+ IntrinsicInst &II) const {
+ llvm::Intrinsic::ID IID = II.getIntrinsicID();
+
+ switch (IID) {
+ case Intrinsic::amdgcn_permlane64: {
+ Value *Src = II.getOperand(0);
+ if (UI->isUniform(Src)) {
+ return II.replaceAllUsesWith(Src);
+ }
+ break;
+ }
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readlane: {
+ Value *Srcv = II.getOperand(0);
+ if (UI->isUniform(Srcv)) {
+ return II.replaceAllUsesWith(Srcv);
+ }
+
+ // The rest of these may not be safe if the exec may not be the same between
+ // the def and use.
+ Value *Src = II.getArgOperand(0);
+ Instruction *SrcInst = dyn_cast<Instruction>(Src);
+ if (SrcInst && SrcInst->getParent() != II.getParent())
+ break;
+
+ // readfirstlane (readfirstlane x) -> readfirstlane x
+ // readlane (readfirstlane x), y -> readfirstlane x
+ if (match(Src,
+ PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
+ return II.replaceAllUsesWith(Src);
+ }
+
+ if (IID == Intrinsic::amdgcn_readfirstlane) {
+ // readfirstlane (readlane x, y) -> readlane x, y
+ if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
+ return II.replaceAllUsesWith(Src);
+ }
+ } else {
+ // readlane (readlane x, y), y -> readlane x, y
+ if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
+ PatternMatch::m_Value(),
+ PatternMatch::m_Specific(II.getArgOperand(1))))) {
+ return II.replaceAllUsesWith(Src);
+ }
+ }
+ break;
+ }
+ }
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombine, DEBUG_TYPE,
+ "AMDGPU uniformIntrinsic Combine", false, false)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombine, DEBUG_TYPE,
+ "AMDGPU uniformIntrinsic Combine", false, false)
+
+FunctionPass *llvm::createAMDGPUUniformIntrinsicCombinePass() {
+ return new AMDGPUUniformIntrinsicCombine();
+}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 408da0536237e..620037dca1c2b 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -61,6 +61,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUHSAMetadataStreamer.cpp
AMDGPUInsertDelayAlu.cpp
AMDGPUInstCombineIntrinsic.cpp
+ AMDGPUUniformIntrinsicCombine.cpp
AMDGPUInstrInfo.cpp
AMDGPUInstructionSelector.cpp
AMDGPUISelDAGToDAG.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
new file mode 100644
index 0000000000000..6f5279bb717c7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -0,0 +1,221 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes="instcombine,amdgpu-uniform-intrinsic-combine" -S < %s | FileCheck %s --check-prefixes=GFX,GFX10
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes="instcombine,amdgpu-uniform-intrinsic-combine" -S < %s | FileCheck %s --check-prefixes=GFX,GFX11
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes="instcombine,amdgpu-uniform-intrinsic-combine" -S < %s | FileCheck %s --check-prefixes=GFX,GFX12
+
+define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_constant(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.permlane64(i32 77)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_undef(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.permlane64(i32 undef)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_sgpr(ptr addrspace(1) %out, i32 %src) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_sgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.permlane64(i32 undef)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_vgpr(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_vgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
+;
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_vgpr_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_vgpr_expression(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
+;
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid2 = add i32 %tid, 1
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_constant(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readlane(i32 7, i32 5)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_undef(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readlane(i32 undef, i32 undef)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_sgpr(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_sgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC0]], i32 [[SRC1]])
+; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_vgpr(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_vgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tidx
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @readlane_vgpr_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_vgpr_expression(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
+; GFX-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %tidx2 = add i32 %tidx, 1
+ %tidy2 = add i32 %tidy, 2
+ %v = call i32 @llvm.amdgcn.readlane(i32 %tidx2, i32 %tidy2)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tidx
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_constant(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 7)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_undef(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 undef)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_sgpr(ptr addrspace(1) %out, i32 %src0) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_sgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
+; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_vgpr(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_vgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
+;
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_vgpr_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_vgpr_expression(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID2]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
+;
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid2 = add i32 %tid, 1
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid2)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10: {{.*}}
+; GFX11: {{.*}}
+; GFX12: {{.*}}
>From 91dd6d908bb6e8ca3ce13ea8260eeebd95e5d6b2 Mon Sep 17 00:00:00 2001
From: PankajDwivedi-25 <pankajkumar.divedi at amd.com>
Date: Fri, 22 Nov 2024 19:42:42 +0530
Subject: [PATCH 02/17] refactored and updated intrinsics handling
---
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 71 +++++++------------
1 file changed, 27 insertions(+), 44 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 1288b70697e63..00ea250586231 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -1,5 +1,4 @@
-//===-- AMDGPUUniformIntrinsicCombine.cpp
-//-----------------------------------------===//
+//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -9,13 +8,16 @@
//
/// \file
/// This pass combines uniform intrinsic instructions.
-/// Unifrom Intrinsic combine uses pattern match to identify and optimize
-/// redundent intrinsic instruction.
+/// Uniform Intrinsic Combine uses pattern match to identify and optimize
+/// redundant intrinsic instructions.
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
@@ -42,7 +44,7 @@ class AMDGPUUniformIntrinsicCombine : public FunctionPass {
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.setPreservesCFG();
AU.addRequired<UniformityInfoWrapperPass>();
AU.addRequired<TargetPassConfig>();
}
@@ -53,7 +55,7 @@ class AMDGPUUniformIntrinsicCombineImpl
private:
const UniformityInfo *UI;
- void optimizeUniformIntrinsicInst(IntrinsicInst &II) const;
+ bool optimizeUniformIntrinsicInst(IntrinsicInst &II) const;
public:
AMDGPUUniformIntrinsicCombineImpl() = delete;
@@ -86,8 +88,6 @@ AMDGPUUniformIntrinsicCombinePass::run(Function &F,
const auto *UI = &AM.getResult<UniformityInfoAnalysis>(F);
- // @todo check if it is required that this method must return bool, if so
- // figure out what can be returned.
bool IsChanged = AMDGPUUniformIntrinsicCombineImpl(UI).run(F);
if (!IsChanged) {
@@ -96,14 +96,16 @@ AMDGPUUniformIntrinsicCombinePass::run(Function &F,
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<UniformityInfoAnalysis>();
+ PA.preserve<TargetLibraryAnalysis>();
return PA;
}
bool AMDGPUUniformIntrinsicCombineImpl::run(Function &F) {
- // @todo check if it is required that this method must return bool, if so
- // figure out what can be returned.
- const bool IsChanged{false};
+ bool IsChanged{false};
// Iterate over each instruction in the function to get the desired intrinsic
// inst to check for optimization.
@@ -111,7 +113,7 @@ bool AMDGPUUniformIntrinsicCombineImpl::run(Function &F) {
for (Instruction &I : BB) {
if (auto *Call = dyn_cast<CallInst>(&I)) {
if (auto *Intrinsic = dyn_cast<IntrinsicInst>(Call)) {
- optimizeUniformIntrinsicInst(*Intrinsic);
+ IsChanged |= optimizeUniformIntrinsicInst(*Intrinsic);
}
}
}
@@ -120,55 +122,36 @@ bool AMDGPUUniformIntrinsicCombineImpl::run(Function &F) {
return IsChanged;
}
-void AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
+bool AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
IntrinsicInst &II) const {
llvm::Intrinsic::ID IID = II.getIntrinsicID();
switch (IID) {
- case Intrinsic::amdgcn_permlane64: {
- Value *Src = II.getOperand(0);
- if (UI->isUniform(Src)) {
- return II.replaceAllUsesWith(Src);
- }
- break;
- }
+ case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
- Value *Srcv = II.getOperand(0);
- if (UI->isUniform(Srcv)) {
- return II.replaceAllUsesWith(Srcv);
- }
-
- // The rest of these may not be safe if the exec may not be the same between
- // the def and use.
Value *Src = II.getArgOperand(0);
+ // The below part may not be safe if the exec is not same between the def
+ // and use. Is this part stilll required??
Instruction *SrcInst = dyn_cast<Instruction>(Src);
if (SrcInst && SrcInst->getParent() != II.getParent())
break;
// readfirstlane (readfirstlane x) -> readfirstlane x
+ // readfirstlane (readlane x, y) -> readlane x, y
// readlane (readfirstlane x), y -> readfirstlane x
- if (match(Src,
- PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
- return II.replaceAllUsesWith(Src);
- }
-
- if (IID == Intrinsic::amdgcn_readfirstlane) {
- // readfirstlane (readlane x, y) -> readlane x, y
- if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
- return II.replaceAllUsesWith(Src);
- }
- } else {
- // readlane (readlane x, y), y -> readlane x, y
- if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
- PatternMatch::m_Value(),
- PatternMatch::m_Specific(II.getArgOperand(1))))) {
- return II.replaceAllUsesWith(Src);
- }
+ // readlane (readlane x, y), z -> readlane x, y
+ // All these cases are identical and are dependent on the inner intrinsic
+ // results value.(i.e.irrespective of the which of these case is inner
+ // intrinsic will write the same value across all output lane indexes)
+ if (UI->isUniform(II.getOperandUse(0))) {
+ II.replaceAllUsesWith(Src);
+ return true;
}
break;
}
}
+ return false;
}
INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombine, DEBUG_TYPE,
>From c98a967fb653c2ecdff8c4125756221aaeb2592a Mon Sep 17 00:00:00 2001
From: PankajDwivedi-25 <pankajkumar.divedi at amd.com>
Date: Fri, 22 Nov 2024 20:39:15 +0530
Subject: [PATCH 03/17] removed redundant casting
---
.../Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 11 ++---------
1 file changed, 2 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 00ea250586231..b7321f8515750 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -111,10 +111,8 @@ bool AMDGPUUniformIntrinsicCombineImpl::run(Function &F) {
// inst to check for optimization.
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
- if (auto *Call = dyn_cast<CallInst>(&I)) {
- if (auto *Intrinsic = dyn_cast<IntrinsicInst>(Call)) {
- IsChanged |= optimizeUniformIntrinsicInst(*Intrinsic);
- }
+ if (auto *Intrinsic = dyn_cast<IntrinsicInst>(&I)) {
+ IsChanged |= optimizeUniformIntrinsicInst(*Intrinsic);
}
}
}
@@ -131,11 +129,6 @@ bool AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
Value *Src = II.getArgOperand(0);
- // The below part may not be safe if the exec is not same between the def
- // and use. Is this part stilll required??
- Instruction *SrcInst = dyn_cast<Instruction>(Src);
- if (SrcInst && SrcInst->getParent() != II.getParent())
- break;
// readfirstlane (readfirstlane x) -> readfirstlane x
// readfirstlane (readlane x, y) -> readlane x, y
>From ef78ece6dabd93729ec0d9c23e686bf8c25e161f Mon Sep 17 00:00:00 2001
From: PankajDwivedi-25 <pankajkumar.divedi at amd.com>
Date: Fri, 6 Dec 2024 01:23:42 +0530
Subject: [PATCH 04/17] refactored, added more test
---
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 29 +--
.../amdgpu-uniform-intrinsic-combine.ll | 222 ++++++++++++++++--
2 files changed, 208 insertions(+), 43 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index b7321f8515750..94f1366691929 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -7,9 +7,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// This pass combines uniform intrinsic instructions.
-/// Uniform Intrinsic Combine uses pattern match to identify and optimize
-/// redundant intrinsic instructions.
+/// This pass simplifies certain intrinsic calls when the arguments are uniform.
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
@@ -21,6 +19,7 @@
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
@@ -35,14 +34,11 @@ using namespace llvm::AMDGPU;
using namespace llvm::PatternMatch;
namespace {
-
class AMDGPUUniformIntrinsicCombine : public FunctionPass {
public:
static char ID;
AMDGPUUniformIntrinsicCombine() : FunctionPass(ID) {}
-
bool runOnFunction(Function &F) override;
-
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<UniformityInfoWrapperPass>();
@@ -54,46 +50,36 @@ class AMDGPUUniformIntrinsicCombineImpl
: public InstVisitor<AMDGPUUniformIntrinsicCombineImpl> {
private:
const UniformityInfo *UI;
-
bool optimizeUniformIntrinsicInst(IntrinsicInst &II) const;
public:
AMDGPUUniformIntrinsicCombineImpl() = delete;
-
AMDGPUUniformIntrinsicCombineImpl(const UniformityInfo *UI) : UI(UI) {}
-
bool run(Function &F);
};
-
} // namespace
char AMDGPUUniformIntrinsicCombine::ID = 0;
-
char &llvm::AMDGPUUniformIntrinsicCombineID = AMDGPUUniformIntrinsicCombine::ID;
bool AMDGPUUniformIntrinsicCombine::runOnFunction(Function &F) {
if (skipFunction(F)) {
return false;
}
-
const UniformityInfo *UI =
&getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
-
return AMDGPUUniformIntrinsicCombineImpl(UI).run(F);
}
PreservedAnalyses
AMDGPUUniformIntrinsicCombinePass::run(Function &F,
FunctionAnalysisManager &AM) {
-
const auto *UI = &AM.getResult<UniformityInfoAnalysis>(F);
-
bool IsChanged = AMDGPUUniformIntrinsicCombineImpl(UI).run(F);
if (!IsChanged) {
return PreservedAnalyses::all();
}
-
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<LoopAnalysis>();
@@ -104,19 +90,14 @@ AMDGPUUniformIntrinsicCombinePass::run(Function &F,
}
bool AMDGPUUniformIntrinsicCombineImpl::run(Function &F) {
-
bool IsChanged{false};
-
// Iterate over each instruction in the function to get the desired intrinsic
// inst to check for optimization.
- for (BasicBlock &BB : F) {
- for (Instruction &I : BB) {
- if (auto *Intrinsic = dyn_cast<IntrinsicInst>(&I)) {
- IsChanged |= optimizeUniformIntrinsicInst(*Intrinsic);
- }
+ for (Instruction &I : instructions(F)) {
+ if (auto *Intrinsic = dyn_cast<IntrinsicInst>(&I)) {
+ IsChanged |= optimizeUniformIntrinsicInst(*Intrinsic);
}
}
-
return IsChanged;
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index 6f5279bb717c7..8f4b70c632e44 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -25,18 +25,20 @@ define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @permlane64_sgpr(ptr addrspace(1) %out, i32 %src) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_sgpr(
+define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_uniform(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[SRC]])
+; GFX-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
; GFX-NEXT: ret void
;
- %v = call i32 @llvm.amdgcn.permlane64(i32 undef)
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %src)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @permlane64_vgpr(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_vgpr(
+define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
@@ -52,8 +54,8 @@ define amdgpu_kernel void @permlane64_vgpr(i32 addrspace(1)* %out) {
ret void
}
-define amdgpu_kernel void @permlane64_vgpr_expression(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_vgpr_expression(
+define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
@@ -92,8 +94,8 @@ define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @readlane_sgpr(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_sgpr(
+define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC0]], i32 [[SRC1]])
; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
@@ -104,8 +106,8 @@ define amdgpu_kernel void @readlane_sgpr(ptr addrspace(1) %out, i32 %src0, i32 %
ret void
}
-define amdgpu_kernel void @readlane_vgpr(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_vgpr(
+define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -123,8 +125,8 @@ define amdgpu_kernel void @readlane_vgpr(i32 addrspace(1)* %out) {
ret void
}
-define amdgpu_kernel void @readlane_vgpr_expression(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_vgpr_expression(
+define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -167,8 +169,8 @@ define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @readfirstlane_sgpr(ptr addrspace(1) %out, i32 %src0) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_sgpr(
+define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
@@ -179,8 +181,8 @@ define amdgpu_kernel void @readfirstlane_sgpr(ptr addrspace(1) %out, i32 %src0)
ret void
}
-define amdgpu_kernel void @readfirstlane_vgpr(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_vgpr(
+define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
@@ -196,8 +198,8 @@ define amdgpu_kernel void @readfirstlane_vgpr(i32 addrspace(1)* %out) {
ret void
}
-define amdgpu_kernel void @readfirstlane_vgpr_expression(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_vgpr_expression(
+define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_expression(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
@@ -215,6 +217,188 @@ define amdgpu_kernel void @readfirstlane_vgpr_expression(i32 addrspace(1)* %out)
ret void
}
+define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
+ %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
+ %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_readlane(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+
+define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr addrspace(1) %out_max) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_boundary(
+; GFX-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
+; GFX-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
+; GFX-NEXT: ret void
+;
+ %min_v = call i32 @llvm.amdgcn.permlane64(i32 -2147483648)
+ store i32 %min_v, ptr addrspace(1) %out_min
+ %max_v = call i32 @llvm.amdgcn.permlane64(i32 2147483647)
+ store i32 %max_v, ptr addrspace(1) %out_max
+ ret void
+}
+
+define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_cross_lane(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = add i32 %tidx, 5
+ %v = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_random(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %random = xor i32 123, 456
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 %random)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_invalid(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_invalid(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: ret void
+;
+ %undef_v = call i32 @llvm.amdgcn.permlane64(i32 undef)
+ store i32 %undef_v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_expression(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[IDX2:%.*]] = shl i32 [[IDX1]], 1
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %idx1 = call i32 @llvm.amdgcn.workitem.id.x()
+ %idx2 = mul i32 %idx1, 2
+ %v = call i32 @llvm.amdgcn.readlane(i32 %idx1, i32 %idx2)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+; Test case: a value that is uniform inside a loop with a divergent exit is used by an
+; intrinsic outside the loop. Temporal divergence applies, so the intrinsic call must be kept.
+
+define amdgpu_kernel void @test_divergent_exit(ptr addrspace(1) %out, i32 %max_iter, i32 %div_cond) {
+; GFX-LABEL: define amdgpu_kernel void @test_divergent_exit(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[MAX_ITER:%.*]], i32 [[DIV_COND:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[ENTRY:.*:]]
+; GFX-NEXT: [[ITER:%.*]] = alloca i32, align 4
+; GFX-NEXT: store i32 0, ptr [[ITER]], align 4
+; GFX-NEXT: br label %[[LOOP:.*]]
+; GFX: [[LOOP]]:
+; GFX-NEXT: [[ITER_VAL:%.*]] = load i32, ptr [[ITER]], align 4
+; GFX-NEXT: [[NEW_ITER:%.*]] = add i32 [[ITER_VAL]], 1
+; GFX-NEXT: store i32 [[NEW_ITER]], ptr [[ITER]], align 4
+; GFX-NEXT: [[COND1:%.*]] = icmp sgt i32 [[NEW_ITER]], [[MAX_ITER]]
+; GFX-NEXT: [[COND2:%.*]] = icmp eq i32 [[DIV_COND]], 0
+; GFX-NEXT: [[EXIT:%.*]] = or i1 [[COND1]], [[COND2]]
+; GFX-NEXT: br i1 [[EXIT]], label %[[EXIT_BLOCK:.*]], label %[[LOOP]]
+; GFX: [[EXIT_BLOCK]]:
+; GFX-NEXT: [[FINAL_VAL:%.*]] = load i32, ptr [[ITER]], align 4
+; GFX-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[FINAL_VAL]])
+; GFX-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+entry:
+ %iter = alloca i32, align 4
+ store i32 0, ptr %iter, align 4
+ br label %loop
+
+loop:
+ ; Increment loop counter
+ %iter_val = load i32, ptr %iter, align 4
+ %new_iter = add i32 %iter_val, 1
+ store i32 %new_iter, ptr %iter, align 4
+
+ ; Check exit conditions
+ %cond1 = icmp sgt i32 %new_iter, %max_iter
+ %cond2 = icmp eq i32 %div_cond, 0
+ %exit = or i1 %cond1, %cond2
+ br i1 %exit, label %exit_block, label %loop
+
+exit_block:
+ ; Use the uniform value in an intrinsic outside the loop
+ %final_val = load i32, ptr %iter, align 4
+ %result = call i32 @llvm.amdgcn.permlane64(i32 %final_val)
+ store i32 %result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.permlane64(i32)
+
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX10: {{.*}}
; GFX11: {{.*}}
>From d3ab82830861cffce2adbd10f3eb7259b3552f6c Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Thu, 9 Jan 2025 23:24:16 +0530
Subject: [PATCH 05/17] integrated in pipeline, more test added
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 10 ++--
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 11 +++-
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 25 +++++---
.../amdgpu-simplify-trivial-waterfall-loop.ll | 49 +++++++++++++++
.../amdgpu-uniform-intrinsic-combine.ll | 59 ++++++++++++++++---
5 files changed, 132 insertions(+), 22 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 2971a90623394..9928d300fc656 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -466,14 +466,14 @@ extern char &GCNRewritePartialRegUsesID;
void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
extern char &AMDGPUWaitSGPRHazardsLegacyID;
-void initializeAMDGPUUniformIntrinsicCombinePass(PassRegistry &);
-extern char &AMDGPUUniformIntrinsicCombineID;
-FunctionPass *createAMDGPUUniformIntrinsicCombinePass();
-
+void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &);
+extern char &AMDGPUUniformIntrinsicCombineLegacyPassID;
+FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass(
+ const AMDGPUTargetMachine *TM = nullptr);
struct AMDGPUUniformIntrinsicCombinePass
: public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
const AMDGPUTargetMachine &TM;
- AMDGPUUniformIntrinsicCombinePass(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
+ AMDGPUUniformIntrinsicCombinePass(const AMDGPUTargetMachine &TM) : TM(TM) {}
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 828c1702ae07a..6542a2d85ea4b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -476,6 +476,11 @@ static cl::opt<bool> HasClosedWorldAssumption(
"amdgpu-link-time-closed-world",
cl::desc("Whether has closed-world assumption at link time"),
cl::init(false), cl::Hidden);
+
+static cl::opt<bool> EnableUniformIntrinsicCombine(
+ "amdgpu-enable-uniform-intrinsic-combine",
+ cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
+ cl::init(true), cl::Hidden);
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
@@ -560,6 +565,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeGCNRegPressurePrinterPass(*PR);
initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
+ initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -829,13 +835,16 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
});
PB.registerPeepholeEPCallback(
- [](FunctionPassManager &FPM, OptimizationLevel Level) {
+ [this](FunctionPassManager &FPM, OptimizationLevel Level) {
if (Level == OptimizationLevel::O0)
return;
FPM.addPass(AMDGPUUseNativeCallsPass());
if (EnableLibCallSimplify)
FPM.addPass(AMDGPUSimplifyLibCallsPass());
+
+ if (EnableUniformIntrinsicCombine)
+ FPM.addPass(AMDGPUUniformIntrinsicCombinePass(*this));
});
PB.registerCGSCCOptimizerLateEPCallback(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 94f1366691929..ad2c5e223aa89 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -34,10 +34,15 @@ using namespace llvm::AMDGPU;
using namespace llvm::PatternMatch;
namespace {
-class AMDGPUUniformIntrinsicCombine : public FunctionPass {
+class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass {
public:
static char ID;
- AMDGPUUniformIntrinsicCombine() : FunctionPass(ID) {}
+ const AMDGPUTargetMachine *AMDGPUTM;
+ AMDGPUUniformIntrinsicCombineLegacy(const AMDGPUTargetMachine *TM)
+ : FunctionPass(ID), AMDGPUTM(TM) {
+ initializeAMDGPUUniformIntrinsicCombineLegacyPass(
+ *PassRegistry::getPassRegistry());
+ }
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -59,10 +64,11 @@ class AMDGPUUniformIntrinsicCombineImpl
};
} // namespace
-char AMDGPUUniformIntrinsicCombine::ID = 0;
-char &llvm::AMDGPUUniformIntrinsicCombineID = AMDGPUUniformIntrinsicCombine::ID;
+char AMDGPUUniformIntrinsicCombineLegacy::ID = 0;
+char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID =
+ AMDGPUUniformIntrinsicCombineLegacy::ID;
-bool AMDGPUUniformIntrinsicCombine::runOnFunction(Function &F) {
+bool AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) {
if (skipFunction(F)) {
return false;
}
@@ -128,13 +134,14 @@ bool AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
return false;
}
-INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombine, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
"AMDGPU uniformIntrinsic Combine", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
-INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombine, DEBUG_TYPE,
+INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
"AMDGPU uniformIntrinsic Combine", false, false)
-FunctionPass *llvm::createAMDGPUUniformIntrinsicCombinePass() {
- return new AMDGPUUniformIntrinsicCombine();
+FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass(
+ const AMDGPUTargetMachine *TM) {
+ return new AMDGPUUniformIntrinsicCombineLegacy(TM);
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
new file mode 100644
index 0000000000000..2a1fddff5f3c8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX,GFX10
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX,GFX11
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX,GFX12
+
+define amdgpu_kernel void @trivial_waterfall_loop(ptr addrspace(1) %out, i32 %src) {
+; CHECK-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
+; CHECK: store i32 %src, ptr addrspace(1) %out, align 4
+; CHECK-NOT: br label %loop
+; GFX-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
+; GFX-SAME: ptr addrspace(1) nocapture writeonly [[OUT:%.*]], i32 [[SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; GFX-NEXT: [[ENTRY:.*:]]
+; GFX-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+entry:
+ ; Initialize result to zero
+ %result = alloca i32, align 4
+ store i32 0, ptr %result, align 4
+ br label %loop
+
+loop:
+ ; Load the current result
+ %cur_result = load i32, ptr %result, align 4
+
+ ; Compute the next value
+ %next_value = add i32 %cur_result, %src
+
+ ; Apply the readfirstlane intrinsic for uniformity
+ %uniform_value = call i32 @llvm.amdgcn.readfirstlane(i32 %next_value)
+
+ ; Store the uniform result back
+ store i32 %uniform_value, ptr %result, align 4
+
+ ; This is a trivial loop that always exits after one iteration
+ br i1 true, label %exit, label %loop
+
+exit:
+ ; Store the result to the output pointer
+ %final_result = load i32, ptr %result, align 4
+ store i32 %final_result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10: {{.*}}
+; GFX11: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index 8f4b70c632e44..40c0d11c68e5e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes="instcombine,amdgpu-uniform-intrinsic-combine" -S < %s | FileCheck %s --check-prefixes=GFX,GFX10
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes="instcombine,amdgpu-uniform-intrinsic-combine" -S < %s | FileCheck %s --check-prefixes=GFX,GFX11
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes="instcombine,amdgpu-uniform-intrinsic-combine" -S < %s | FileCheck %s --check-prefixes=GFX,GFX12
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=instcombine,amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s --check-prefixes=GFX,GFX10
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=instcombine,amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s --check-prefixes=GFX,GFX11
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=instcombine,amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s --check-prefixes=GFX,GFX12
define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
; GFX-LABEL: define amdgpu_kernel void @permlane64_constant(
@@ -28,7 +28,6 @@ define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
; GFX-LABEL: define amdgpu_kernel void @permlane64_uniform(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[SRC]])
; GFX-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
; GFX-NEXT: ret void
;
@@ -97,7 +96,6 @@ define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC0]], i32 [[SRC1]])
; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
; GFX-NEXT: ret void
;
@@ -172,7 +170,6 @@ define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) {
; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
; GFX-NEXT: ret void
;
@@ -396,8 +393,56 @@ exit_block:
ret void
}
-declare i32 @llvm.amdgcn.permlane64(i32)
+; Test case: trivial waterfall loop that always exits after a single iteration.
+define amdgpu_kernel void @trivial_waterfall_loop(ptr addrspace(1) %out, i32 %src) {
+; CHECK-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
+; CHECK: store i32 %src, ptr addrspace(1) %out, align 4
+; CHECK-NOT: br label %loop
+; GFX-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[ENTRY:.*:]]
+; GFX-NEXT: [[RESULT:%.*]] = alloca i32, align 4
+; GFX-NEXT: store i32 0, ptr [[RESULT]], align 4
+; GFX-NEXT: br label %[[LOOP:.*]]
+; GFX: [[LOOP]]:
+; GFX-NEXT: [[CUR_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
+; GFX-NEXT: [[NEXT_VALUE:%.*]] = add i32 [[CUR_RESULT]], [[SRC]]
+; GFX-NEXT: [[UNIFORM_VALUE:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[NEXT_VALUE]])
+; GFX-NEXT: store i32 [[UNIFORM_VALUE]], ptr [[RESULT]], align 4
+; GFX-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP]]
+; GFX: [[EXIT]]:
+; GFX-NEXT: [[FINAL_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
+; GFX-NEXT: store i32 [[FINAL_RESULT]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+entry:
+ ; Initialize result to zero
+ %result = alloca i32, align 4
+ store i32 0, ptr %result, align 4
+ br label %loop
+loop:
+ ; Load the current result
+ %cur_result = load i32, ptr %result, align 4
+
+ ; Compute the next value
+ %next_value = add i32 %cur_result, %src
+
+ ; Apply the readfirstlane intrinsic for uniformity
+ %uniform_value = call i32 @llvm.amdgcn.readfirstlane(i32 %next_value)
+
+ ; Store the uniform result back
+ store i32 %uniform_value, ptr %result, align 4
+
+ ; This is a trivial loop that always exits after one iteration
+ br i1 true, label %exit, label %loop
+
+exit:
+ ; Store the result to the output pointer
+ %final_result = load i32, ptr %result, align 4
+ store i32 %final_result, ptr addrspace(1) %out, align 4
+ ret void
+}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX10: {{.*}}
>From f5edd781930a18956a7e1c4289d708b23713e890 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 15 Jan 2025 16:58:15 +0530
Subject: [PATCH 06/17] removed unused gfx checks
---
.../amdgpu-simplify-trivial-waterfall-loop.ll | 21 +-
.../amdgpu-uniform-intrinsic-combine.ll | 355 +++++++++---------
2 files changed, 179 insertions(+), 197 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
index 2a1fddff5f3c8..56ba117ce1d30 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
@@ -1,18 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX,GFX10
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX,GFX11
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX,GFX12
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX10
define amdgpu_kernel void @trivial_waterfall_loop(ptr addrspace(1) %out, i32 %src) {
-; CHECK-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
-; CHECK: store i32 %src, ptr addrspace(1) %out, align 4
-; CHECK-NOT: br label %loop
-; GFX-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
-; GFX-SAME: ptr addrspace(1) nocapture writeonly [[OUT:%.*]], i32 [[SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; GFX-NEXT: [[ENTRY:.*:]]
-; GFX-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
+; GFX10-SAME: ptr addrspace(1) nocapture writeonly [[OUT:%.*]], i32 [[SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; GFX10-NEXT: [[ENTRY:.*:]]
+; GFX10-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
entry:
; Initialize result to zero
@@ -43,7 +38,3 @@ exit:
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10: {{.*}}
-; GFX11: {{.*}}
-; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index 40c0d11c68e5e..ddedd435d35aa 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -1,14 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=instcombine,amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s --check-prefixes=GFX,GFX10
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=instcombine,amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s --check-prefixes=GFX,GFX11
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=instcombine,amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s --check-prefixes=GFX,GFX12
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=instcombine,amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s --check-prefixes=GFX10
define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_constant(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; GFX-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_constant(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX10-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 77)
store i32 %v, ptr addrspace(1) %out
@@ -16,9 +14,9 @@ define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_undef(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_undef(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -26,10 +24,10 @@ define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_uniform(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_uniform(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 %src)
store i32 %v, ptr addrspace(1) %out
@@ -37,14 +35,14 @@ define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
}
define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
-; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
+; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX10-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
@@ -54,15 +52,15 @@ define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
}
define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
-; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
+; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX10-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = add i32 %tid, 1
@@ -73,10 +71,10 @@ define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %o
}
define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_constant(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_constant(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 7, i32 5)
store i32 %v, ptr addrspace(1) %out
@@ -84,9 +82,9 @@ define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_undef(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_undef(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 undef, i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -94,10 +92,10 @@ define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
store i32 %v, ptr addrspace(1) %out
@@ -105,15 +103,15 @@ define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i3
}
define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
-; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX10-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -124,17 +122,17 @@ define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out)
}
define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
-; GFX-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
-; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
-; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX10-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
+; GFX10-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
+; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX10-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -147,10 +145,10 @@ define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out
}
define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_constant(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_constant(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 7)
store i32 %v, ptr addrspace(1) %out
@@ -158,9 +156,9 @@ define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_undef(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_undef(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -168,10 +166,10 @@ define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
store i32 %v, ptr addrspace(1) %out
@@ -179,14 +177,14 @@ define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i3
}
define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
-; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
+; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX10-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid)
@@ -196,15 +194,15 @@ define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out
}
define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_expression(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
-; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID2]] to i64
-; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_expression(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
+; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID2]] to i64
+; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX10-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = add i32 %tid, 1
@@ -215,10 +213,10 @@ define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
}
define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
%v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
@@ -227,13 +225,13 @@ define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %ou
}
define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX10-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX10-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -244,12 +242,12 @@ define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
+; GFX10-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
@@ -259,13 +257,13 @@ define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_readlane(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_readlane(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX10-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX10-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -277,11 +275,11 @@ define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr addrspace(1) %out_max) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_boundary(
-; GFX-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
-; GFX-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_boundary(
+; GFX10-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
+; GFX10-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
+; GFX10-NEXT: ret void
;
%min_v = call i32 @llvm.amdgcn.permlane64(i32 -2147483648)
store i32 %min_v, ptr addrspace(1) %out_min
@@ -291,13 +289,13 @@ define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr ad
}
define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_cross_lane(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_cross_lane(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = add i32 %tidx, 5
@@ -307,10 +305,10 @@ define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_random(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_random(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%random = xor i32 123, 456
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %random)
@@ -319,9 +317,9 @@ define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @permlane64_invalid(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_invalid(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_invalid(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: ret void
;
%undef_v = call i32 @llvm.amdgcn.permlane64(i32 undef)
store i32 %undef_v, ptr addrspace(1) %out
@@ -329,13 +327,13 @@ define amdgpu_kernel void @permlane64_invalid(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_expression(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[IDX2:%.*]] = shl i32 [[IDX1]], 1
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_expression(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[IDX2:%.*]] = shl i32 [[IDX1]], 1
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%idx1 = call i32 @llvm.amdgcn.workitem.id.x()
%idx2 = mul i32 %idx1, 2
@@ -348,25 +346,25 @@ define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
; used by an intrinsic outside the loop is not optimized due to temporal divergence.
define amdgpu_kernel void @test_divergent_exit(ptr addrspace(1) %out, i32 %max_iter, i32 %div_cond) {
-; GFX-LABEL: define amdgpu_kernel void @test_divergent_exit(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[MAX_ITER:%.*]], i32 [[DIV_COND:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[ENTRY:.*:]]
-; GFX-NEXT: [[ITER:%.*]] = alloca i32, align 4
-; GFX-NEXT: store i32 0, ptr [[ITER]], align 4
-; GFX-NEXT: br label %[[LOOP:.*]]
-; GFX: [[LOOP]]:
-; GFX-NEXT: [[ITER_VAL:%.*]] = load i32, ptr [[ITER]], align 4
-; GFX-NEXT: [[NEW_ITER:%.*]] = add i32 [[ITER_VAL]], 1
-; GFX-NEXT: store i32 [[NEW_ITER]], ptr [[ITER]], align 4
-; GFX-NEXT: [[COND1:%.*]] = icmp sgt i32 [[NEW_ITER]], [[MAX_ITER]]
-; GFX-NEXT: [[COND2:%.*]] = icmp eq i32 [[DIV_COND]], 0
-; GFX-NEXT: [[EXIT:%.*]] = or i1 [[COND1]], [[COND2]]
-; GFX-NEXT: br i1 [[EXIT]], label %[[EXIT_BLOCK:.*]], label %[[LOOP]]
-; GFX: [[EXIT_BLOCK]]:
-; GFX-NEXT: [[FINAL_VAL:%.*]] = load i32, ptr [[ITER]], align 4
-; GFX-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[FINAL_VAL]])
-; GFX-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @test_divergent_exit(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[MAX_ITER:%.*]], i32 [[DIV_COND:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[ENTRY:.*:]]
+; GFX10-NEXT: [[ITER:%.*]] = alloca i32, align 4
+; GFX10-NEXT: store i32 0, ptr [[ITER]], align 4
+; GFX10-NEXT: br label %[[LOOP:.*]]
+; GFX10: [[LOOP]]:
+; GFX10-NEXT: [[ITER_VAL:%.*]] = load i32, ptr [[ITER]], align 4
+; GFX10-NEXT: [[NEW_ITER:%.*]] = add i32 [[ITER_VAL]], 1
+; GFX10-NEXT: store i32 [[NEW_ITER]], ptr [[ITER]], align 4
+; GFX10-NEXT: [[COND1:%.*]] = icmp sgt i32 [[NEW_ITER]], [[MAX_ITER]]
+; GFX10-NEXT: [[COND2:%.*]] = icmp eq i32 [[DIV_COND]], 0
+; GFX10-NEXT: [[EXIT:%.*]] = or i1 [[COND1]], [[COND2]]
+; GFX10-NEXT: br i1 [[EXIT]], label %[[EXIT_BLOCK:.*]], label %[[LOOP]]
+; GFX10: [[EXIT_BLOCK]]:
+; GFX10-NEXT: [[FINAL_VAL:%.*]] = load i32, ptr [[ITER]], align 4
+; GFX10-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[FINAL_VAL]])
+; GFX10-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
entry:
%iter = alloca i32, align 4
@@ -395,25 +393,22 @@ exit_block:
; Define the kernel function
define amdgpu_kernel void @trivial_waterfall_loop(ptr addrspace(1) %out, i32 %src) {
-; CHECK-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
-; CHECK: store i32 %src, ptr addrspace(1) %out, align 4
-; CHECK-NOT: br label %loop
-; GFX-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[ENTRY:.*:]]
-; GFX-NEXT: [[RESULT:%.*]] = alloca i32, align 4
-; GFX-NEXT: store i32 0, ptr [[RESULT]], align 4
-; GFX-NEXT: br label %[[LOOP:.*]]
-; GFX: [[LOOP]]:
-; GFX-NEXT: [[CUR_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
-; GFX-NEXT: [[NEXT_VALUE:%.*]] = add i32 [[CUR_RESULT]], [[SRC]]
-; GFX-NEXT: [[UNIFORM_VALUE:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[NEXT_VALUE]])
-; GFX-NEXT: store i32 [[UNIFORM_VALUE]], ptr [[RESULT]], align 4
-; GFX-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP]]
-; GFX: [[EXIT]]:
-; GFX-NEXT: [[FINAL_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
-; GFX-NEXT: store i32 [[FINAL_RESULT]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[ENTRY:.*:]]
+; GFX10-NEXT: [[RESULT:%.*]] = alloca i32, align 4
+; GFX10-NEXT: store i32 0, ptr [[RESULT]], align 4
+; GFX10-NEXT: br label %[[LOOP:.*]]
+; GFX10: [[LOOP]]:
+; GFX10-NEXT: [[CUR_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
+; GFX10-NEXT: [[NEXT_VALUE:%.*]] = add i32 [[CUR_RESULT]], [[SRC]]
+; GFX10-NEXT: [[UNIFORM_VALUE:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[NEXT_VALUE]])
+; GFX10-NEXT: store i32 [[UNIFORM_VALUE]], ptr [[RESULT]], align 4
+; GFX10-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP]]
+; GFX10: [[EXIT]]:
+; GFX10-NEXT: [[FINAL_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
+; GFX10-NEXT: store i32 [[FINAL_RESULT]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
entry:
; Initialize result to zero
@@ -444,7 +439,3 @@ exit:
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10: {{.*}}
-; GFX11: {{.*}}
-; GFX12: {{.*}}
>From 242afb1dbe28c9178b1d13b40f595e660fea069b Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Fri, 7 Feb 2025 20:08:45 +0530
Subject: [PATCH 07/17] added pass to llc pipeline, more test added
---
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 11 +-
...amdgpu-miscellaneous-uniform-intrinsics.ll | 132 +++++++
.../amdgpu-simplify-uniform-waterfall.ll | 155 ++++++++
.../amdgpu-uniform-intrinsic-combine.ll | 374 +++++++-----------
5 files changed, 429 insertions(+), 245 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 6542a2d85ea4b..b276e42fc5c2d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1222,6 +1222,8 @@ void AMDGPUPassConfig::addIRPasses() {
if (isPassEnabled(EnableImageIntrinsicOptimizer))
addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
+ if (EnableUniformIntrinsicCombine)
+ addPass(createAMDGPUUniformIntrinsicCombineLegacyPass(&TM));
// This can be disabled by passing ::Disable here or on the command line
// with --expand-variadics-override=disable.
addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index ad2c5e223aa89..8779466831c45 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -114,16 +114,9 @@ bool AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
switch (IID) {
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
- case Intrinsic::amdgcn_readlane: {
+ case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_ballot: {
Value *Src = II.getArgOperand(0);
-
- // readfirstlane (readfirstlane x) -> readfirstlane x
- // readfirstlane (readlane x, y) -> readlane x, y
- // readlane (readfirstlane x), y -> readfirstlane x
- // readlane (readlane x, y), z -> readlane x, y
- // All these cases are identical and are dependent on the inner intrinsic
- // results value.(i.e.irrespective of the which of these case is inner
- // intrinsic will write the same value across all output lane indexes)
if (UI->isUniform(II.getOperandUse(0))) {
II.replaceAllUsesWith(Src);
return true;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
new file mode 100644
index 0000000000000..82f92d2ccb550
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck %s
+
+define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readfirstlane_with_readfirstlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
+ %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readfirstlane_with_readlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_bfe_u32 v1, v0, 10, 10
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s2, v1
+; CHECK-NEXT: v_readlane_b32 s2, v0, s2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readlane_with_firstlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s2, v0
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
+ %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readlane_readlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_bfe_u32 v1, v0, 10, 10
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s2, v1
+; CHECK-NEXT: v_readlane_b32 s2, v0, s2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
+; CHECK-LABEL: permlane64_uniform:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s2, s[4:5], 0x8
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %src)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
+; CHECK-LABEL: permlane64_nonuniform:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_permlane64_b32 v1, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
+; CHECK-LABEL: permlane64_nonuniform_expression:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: v_permlane64_b32 v1, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid2 = add i32 %tid, 1
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
new file mode 100644
index 0000000000000..63bc10c49a161
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
@@ -0,0 +1,155 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -O3 -S < %s | FileCheck %s --check-prefixes=PASS-CHECK
+
+; Test case: temporal-divergence candidate - a loop-carried value used by permlane64 outside
+; the loop. The exit depends only on kernel arguments, so at -O3 the intrinsic is folded away.
+
+define amdgpu_kernel void @test_divergent_exit(ptr addrspace(1) %out, i32 %max_iter, i32 %div_cond) {
+; PASS-CHECK-LABEL: define amdgpu_kernel void @test_divergent_exit(
+; PASS-CHECK-SAME: ptr addrspace(1) nocapture writeonly [[OUT:%.*]], i32 [[MAX_ITER:%.*]], i32 [[DIV_COND:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; PASS-CHECK-NEXT: [[ENTRY:.*]]:
+; PASS-CHECK-NEXT: [[COND2:%.*]] = icmp eq i32 [[DIV_COND]], 0
+; PASS-CHECK-NEXT: br label %[[LOOP:.*]]
+; PASS-CHECK: [[LOOP]]:
+; PASS-CHECK-NEXT: [[ITER_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEW_ITER:%.*]], %[[LOOP]] ]
+; PASS-CHECK-NEXT: [[NEW_ITER]] = add i32 [[ITER_VAL]], 1
+; PASS-CHECK-NEXT: [[COND1:%.*]] = icmp sgt i32 [[NEW_ITER]], [[MAX_ITER]]
+; PASS-CHECK-NEXT: [[EXIT:%.*]] = or i1 [[COND2]], [[COND1]]
+; PASS-CHECK-NEXT: br i1 [[EXIT]], label %[[EXIT_BLOCK:.*]], label %[[LOOP]]
+; PASS-CHECK: [[EXIT_BLOCK]]:
+; PASS-CHECK-NEXT: store i32 [[NEW_ITER]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+entry:
+ %iter = alloca i32, align 4
+ store i32 0, ptr %iter, align 4
+ br label %loop
+
+loop:
+ ; Increment loop counter
+ %iter_val = load i32, ptr %iter, align 4
+ %new_iter = add i32 %iter_val, 1
+ store i32 %new_iter, ptr %iter, align 4
+
+ ; Check exit conditions
+ %cond1 = icmp sgt i32 %new_iter, %max_iter
+ %cond2 = icmp eq i32 %div_cond, 0
+ %exit = or i1 %cond1, %cond2
+ br i1 %exit, label %exit_block, label %loop
+
+exit_block:
+ ; Use the uniform value in an intrinsic outside the loop
+ %final_val = load i32, ptr %iter, align 4
+ %result = call i32 @llvm.amdgcn.permlane64(i32 %final_val)
+ store i32 %result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define protected amdgpu_kernel void @trivial_waterfall() local_unnamed_addr #0 {
+; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall(
+; PASS-CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+; PASS-CHECK-NEXT: [[_PEEL_BEGIN:.*:]]
+; PASS-CHECK-NEXT: [[TMP5:%.*]] = tail call i32 asm sideeffect "", "=v,0"(i32 1) #[[ATTR5:[0-9]+]], !srcloc [[META0:![0-9]+]]
+; PASS-CHECK-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0
+; PASS-CHECK-NEXT: [[TMP7:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP6]])
+; PASS-CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
+; PASS-CHECK-NEXT: br i1 [[TMP8]], [[DOTLOOPEXIT:label %.*]], [[DOTPEEL_NEWPH:label %.*]]
+; PASS-CHECK: [[_PEEL_NEWPH:.*:]]
+; PASS-CHECK-NEXT: [[TMP4:%.*]] = tail call i32 asm sideeffect "", "=v,0"(i32 0) #[[ATTR5]], !srcloc [[META0]]
+; PASS-CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP4]], 0
+; PASS-CHECK-NEXT: [[TMP10:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP9]])
+; PASS-CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0
+; PASS-CHECK-NEXT: br i1 [[TMP11]], [[DOTLOOPEXIT]], [[DOTPEEL_NEWPH]], !llvm.loop [[LOOP1:![0-9]+]]
+; PASS-CHECK: [[_LOOPEXIT:.*:]]
+; PASS-CHECK-NEXT: ret void
+;
+ br label %1
+
+1: ; preds = %10, %0
+ %2 = phi i8 [ 0, %0 ], [ %12, %10 ]
+ %3 = and i8 %2, 1
+ %4 = xor i8 %3, 1
+ %5 = zext nneg i8 %4 to i32
+ %6 = tail call i32 asm sideeffect "", "=v,0"(i32 %5) #2, !srcloc !6
+ %7 = icmp ne i32 %6, 0
+ %8 = tail call i64 @llvm.amdgcn.ballot.i64(i1 %7)
+ %9 = icmp eq i64 %8, 0
+ br i1 %9, label %13, label %10
+
+10: ; preds = %1
+ %11 = icmp eq i8 %3, 0
+ %12 = select i1 %11, i8 1, i8 %2
+ br label %1, !llvm.loop !7
+
+13: ; preds = %1
+ ret void
+}
+
+define protected amdgpu_kernel void @waterfall() local_unnamed_addr #0 {
+; PASS-CHECK-LABEL: define protected amdgpu_kernel void @waterfall(
+; PASS-CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] {
+; PASS-CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; PASS-CHECK-NEXT: br label %[[BB2:.*]]
+; PASS-CHECK: [[BB2]]:
+; PASS-CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ false, [[TMP0:%.*]] ], [ [[DOTBE:%.*]], %[[DOTBACKEDGE:.*]] ]
+; PASS-CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[TMP3]], true
+; PASS-CHECK-NEXT: [[TMP5:%.*]] = zext i1 [[TMP4]] to i32
+; PASS-CHECK-NEXT: [[TMP6:%.*]] = tail call i32 asm sideeffect "", "=v,0"(i32 [[TMP5]]) #[[ATTR5]], !srcloc [[META0]]
+; PASS-CHECK-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; PASS-CHECK-NEXT: [[TMP8:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP7]])
+; PASS-CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
+; PASS-CHECK-NEXT: br i1 [[TMP9]], label %[[BB14:.*]], label %[[BB10:.*]]
+; PASS-CHECK: [[BB10]]:
+; PASS-CHECK-NEXT: br i1 [[TMP3]], label %[[DOTBACKEDGE]], label %[[BB11:.*]]
+; PASS-CHECK: [[BB11]]:
+; PASS-CHECK-NEXT: [[TMP12:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP1]])
+; PASS-CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP1]], [[TMP12]]
+; PASS-CHECK-NEXT: br label %[[DOTBACKEDGE]]
+; PASS-CHECK: [[_BACKEDGE:.*:]]
+; PASS-CHECK-NEXT: [[DOTBE]] = phi i1 [ true, %[[BB10]] ], [ [[TMP13]], %[[BB11]] ]
+; PASS-CHECK-NEXT: br label %[[BB2]], !llvm.loop [[LOOP4:![0-9]+]]
+; PASS-CHECK: [[BB14]]:
+; PASS-CHECK-NEXT: ret void
+;
+ %1 = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %2 = tail call noundef i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %1)
+ br label %3
+
+3: ; preds = %15, %0
+ %4 = phi i1 [ false, %0 ], [ %16, %15 ]
+ %5 = xor i1 %4, true
+ %6 = zext i1 %5 to i32
+ %7 = tail call i32 asm sideeffect "", "=v,0"(i32 %6) #3, !srcloc !6
+ %8 = icmp ne i32 %7, 0
+ %9 = tail call i64 @llvm.amdgcn.ballot.i64(i1 %8)
+ %10 = icmp eq i64 %9, 0
+ br i1 %10, label %17, label %11
+
+11: ; preds = %3
+ br i1 %4, label %15, label %12
+
+12: ; preds = %11
+ %13 = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 %2)
+ %14 = icmp eq i32 %2, %13
+ br label %15
+
+15: ; preds = %12, %11
+ %16 = phi i1 [ true, %11 ], [ %14, %12 ]
+ br label %3, !llvm.loop !7
+
+17: ; preds = %3
+ ret void
+}
+
+
+declare i64 @llvm.amdgcn.ballot.i64(i1) #1
+!6 = !{i64 690}
+!7 = distinct !{!7, !8}
+!8 = !{!"llvm.loop.mustprogress"}
+;.
+; PASS-CHECK: [[META0]] = !{i64 690}
+; PASS-CHECK: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; PASS-CHECK: [[META2]] = !{!"llvm.loop.mustprogress"}
+; PASS-CHECK: [[META3]] = !{!"llvm.loop.peeled.count", i32 1}
+; PASS-CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]]}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index ddedd435d35aa..c6a639a761f75 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -1,12 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=instcombine,amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s --check-prefixes=GFX10
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=instcombine,amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s --check-prefixes=PASS-CHECK
define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @permlane64_constant(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; GFX10-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_constant(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; PASS-CHECK-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 77)
store i32 %v, ptr addrspace(1) %out
@@ -14,9 +13,9 @@ define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @permlane64_undef(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_undef(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -24,10 +23,10 @@ define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
-; GFX10-LABEL: define amdgpu_kernel void @permlane64_uniform(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 %src)
store i32 %v, ptr addrspace(1) %out
@@ -35,14 +34,14 @@ define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
}
define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
-; GFX10-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
-; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
+; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
@@ -52,15 +51,15 @@ define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
}
define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
-; GFX10-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
-; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
-; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
+; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = add i32 %tid, 1
@@ -71,10 +70,10 @@ define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %o
}
define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readlane_constant(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_constant(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 7, i32 5)
store i32 %v, ptr addrspace(1) %out
@@ -82,9 +81,9 @@ define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readlane_undef(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_undef(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 undef, i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -92,10 +91,10 @@ define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX10-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
store i32 %v, ptr addrspace(1) %out
@@ -103,15 +102,15 @@ define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i3
}
define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
-; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -122,17 +121,17 @@ define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out)
}
define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX10-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
-; GFX10-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
-; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
-; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
-; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; PASS-CHECK-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
+; PASS-CHECK-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
+; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -145,10 +144,10 @@ define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out
}
define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_constant(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 7)
store i32 %v, ptr addrspace(1) %out
@@ -156,9 +155,9 @@ define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_undef(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_undef(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -166,10 +165,10 @@ define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) {
-; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
store i32 %v, ptr addrspace(1) %out
@@ -177,14 +176,14 @@ define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i3
}
define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
-; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
+; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid)
@@ -194,15 +193,15 @@ define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out
}
define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_expression(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
-; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
-; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID2]] to i64
-; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_expression(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
+; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TID2]] to i64
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = add i32 %tid, 1
@@ -213,10 +212,10 @@ define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
}
define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
%v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
@@ -225,13 +224,13 @@ define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %ou
}
define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX10-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX10-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -242,12 +241,12 @@ define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
-; GFX10-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
+; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
@@ -257,13 +256,13 @@ define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readlane_readlane(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX10-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX10-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_readlane(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -275,11 +274,11 @@ define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr addrspace(1) %out_max) {
-; GFX10-LABEL: define amdgpu_kernel void @permlane64_boundary(
-; GFX10-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
-; GFX10-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_boundary(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
+; PASS-CHECK-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
+; PASS-CHECK-NEXT: ret void
;
%min_v = call i32 @llvm.amdgcn.permlane64(i32 -2147483648)
store i32 %min_v, ptr addrspace(1) %out_min
@@ -289,13 +288,13 @@ define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr ad
}
define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readlane_cross_lane(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
-; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_cross_lane(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = add i32 %tidx, 5
@@ -305,10 +304,10 @@ define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_random(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%random = xor i32 123, 456
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %random)
@@ -317,9 +316,9 @@ define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @permlane64_invalid(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @permlane64_invalid(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_invalid(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: ret void
;
%undef_v = call i32 @llvm.amdgcn.permlane64(i32 undef)
store i32 %undef_v, ptr addrspace(1) %out
@@ -327,13 +326,13 @@ define amdgpu_kernel void @permlane64_invalid(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readlane_expression(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[IDX2:%.*]] = shl i32 [[IDX1]], 1
-; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
-; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_expression(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[IDX2:%.*]] = shl i32 [[IDX1]], 1
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%idx1 = call i32 @llvm.amdgcn.workitem.id.x()
%idx2 = mul i32 %idx1, 2
@@ -342,100 +341,3 @@ define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
ret void
}
-; Test case: Ensure that a loop with a divergent exit and a uniform value
-; used by an intrinsic outside the loop is not optimized due to temporal divergence.
-
-define amdgpu_kernel void @test_divergent_exit(ptr addrspace(1) %out, i32 %max_iter, i32 %div_cond) {
-; GFX10-LABEL: define amdgpu_kernel void @test_divergent_exit(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[MAX_ITER:%.*]], i32 [[DIV_COND:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[ENTRY:.*:]]
-; GFX10-NEXT: [[ITER:%.*]] = alloca i32, align 4
-; GFX10-NEXT: store i32 0, ptr [[ITER]], align 4
-; GFX10-NEXT: br label %[[LOOP:.*]]
-; GFX10: [[LOOP]]:
-; GFX10-NEXT: [[ITER_VAL:%.*]] = load i32, ptr [[ITER]], align 4
-; GFX10-NEXT: [[NEW_ITER:%.*]] = add i32 [[ITER_VAL]], 1
-; GFX10-NEXT: store i32 [[NEW_ITER]], ptr [[ITER]], align 4
-; GFX10-NEXT: [[COND1:%.*]] = icmp sgt i32 [[NEW_ITER]], [[MAX_ITER]]
-; GFX10-NEXT: [[COND2:%.*]] = icmp eq i32 [[DIV_COND]], 0
-; GFX10-NEXT: [[EXIT:%.*]] = or i1 [[COND1]], [[COND2]]
-; GFX10-NEXT: br i1 [[EXIT]], label %[[EXIT_BLOCK:.*]], label %[[LOOP]]
-; GFX10: [[EXIT_BLOCK]]:
-; GFX10-NEXT: [[FINAL_VAL:%.*]] = load i32, ptr [[ITER]], align 4
-; GFX10-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[FINAL_VAL]])
-; GFX10-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
-;
-entry:
- %iter = alloca i32, align 4
- store i32 0, ptr %iter, align 4
- br label %loop
-
-loop:
- ; Increment loop counter
- %iter_val = load i32, ptr %iter, align 4
- %new_iter = add i32 %iter_val, 1
- store i32 %new_iter, ptr %iter, align 4
-
- ; Check exit conditions
- %cond1 = icmp sgt i32 %new_iter, %max_iter
- %cond2 = icmp eq i32 %div_cond, 0
- %exit = or i1 %cond1, %cond2
- br i1 %exit, label %exit_block, label %loop
-
-exit_block:
- ; Use the uniform value in an intrinsic outside the loop
- %final_val = load i32, ptr %iter, align 4
- %result = call i32 @llvm.amdgcn.permlane64(i32 %final_val)
- store i32 %result, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; Define the kernel function
-define amdgpu_kernel void @trivial_waterfall_loop(ptr addrspace(1) %out, i32 %src) {
-; GFX10-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[ENTRY:.*:]]
-; GFX10-NEXT: [[RESULT:%.*]] = alloca i32, align 4
-; GFX10-NEXT: store i32 0, ptr [[RESULT]], align 4
-; GFX10-NEXT: br label %[[LOOP:.*]]
-; GFX10: [[LOOP]]:
-; GFX10-NEXT: [[CUR_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
-; GFX10-NEXT: [[NEXT_VALUE:%.*]] = add i32 [[CUR_RESULT]], [[SRC]]
-; GFX10-NEXT: [[UNIFORM_VALUE:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[NEXT_VALUE]])
-; GFX10-NEXT: store i32 [[UNIFORM_VALUE]], ptr [[RESULT]], align 4
-; GFX10-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP]]
-; GFX10: [[EXIT]]:
-; GFX10-NEXT: [[FINAL_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
-; GFX10-NEXT: store i32 [[FINAL_RESULT]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
-;
-entry:
- ; Initialize result to zero
- %result = alloca i32, align 4
- store i32 0, ptr %result, align 4
- br label %loop
-
-loop:
- ; Load the current result
- %cur_result = load i32, ptr %result, align 4
-
- ; Compute the next value
- %next_value = add i32 %cur_result, %src
-
- ; Apply the readfirstlane intrinsic for uniformity
- %uniform_value = call i32 @llvm.amdgcn.readfirstlane(i32 %next_value)
-
- ; Store the uniform result back
- store i32 %uniform_value, ptr %result, align 4
-
- ; This is a trivial loop that always exits after one iteration
- br i1 true, label %exit, label %loop
-
-exit:
- ; Store the result to the output pointer
- %final_result = load i32, ptr %result, align 4
- store i32 %final_result, ptr addrspace(1) %out, align 4
- ret void
-}
-
>From 69dd0aa497acc6b2e970f0b3922d96891bc4ea22 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 18 Feb 2025 09:58:43 +0530
Subject: [PATCH 08/17] handled ballot with icmp for trivial waterfall
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 4 +-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +-
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 53 ++++-
...amdgpu-miscellaneous-uniform-intrinsics.ll | 3 +-
.../amdgpu-simplify-trivial-waterfall-loop.ll | 40 ----
.../amdgpu-simplify-uniform-waterfall.ll | 185 ++++++++--------
.../amdgpu-uniform-intrinsic-combine.ll | 208 ++++++++++++++++--
.../amdgpu-uniform-temporal-divergence.ll | 90 ++++++++
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 23 ++
9 files changed, 442 insertions(+), 166 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 9928d300fc656..7e628e126c983 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -468,8 +468,8 @@ extern char &AMDGPUWaitSGPRHazardsLegacyID;
void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &);
extern char &AMDGPUUniformIntrinsicCombineLegacyPassID;
-FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass(
- const AMDGPUTargetMachine *TM = nullptr);
+FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass();
+
struct AMDGPUUniformIntrinsicCombinePass
: public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
const AMDGPUTargetMachine &TM;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b276e42fc5c2d..97c3d001883f9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1223,7 +1223,7 @@ void AMDGPUPassConfig::addIRPasses() {
addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
if (EnableUniformIntrinsicCombine)
- addPass(createAMDGPUUniformIntrinsicCombineLegacyPass(&TM));
+ addPass(createAMDGPUUniformIntrinsicCombineLegacyPass());
// This can be disabled by passing ::Disable here or on the command line
// with --expand-variadics-override=disable.
addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 8779466831c45..4e02c16c61b09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -37,9 +37,7 @@ namespace {
class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass {
public:
static char ID;
- const AMDGPUTargetMachine *AMDGPUTM;
- AMDGPUUniformIntrinsicCombineLegacy(const AMDGPUTargetMachine *TM)
- : FunctionPass(ID), AMDGPUTM(TM) {
+ AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) {
initializeAMDGPUUniformIntrinsicCombineLegacyPass(
*PassRegistry::getPassRegistry());
}
@@ -114,15 +112,55 @@ bool AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
switch (IID) {
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
- case Intrinsic::amdgcn_readlane:
- case Intrinsic::amdgcn_ballot: {
+ case Intrinsic::amdgcn_readlane: {
Value *Src = II.getArgOperand(0);
+ // Check if the argument is uniform
if (UI->isUniform(II.getOperandUse(0))) {
+ LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << "\n");
II.replaceAllUsesWith(Src);
return true;
}
break;
}
+ case Intrinsic::amdgcn_ballot: {
+ Value *Src = II.getArgOperand(0);
+ // Check if the argument is uniform and has a direct `icmp eq` use of the
+ // ballot result.
+ // %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cond)
+ // %is_done = icmp eq i64 %ballot, 0
+ // This means we are checking if *all lanes* in the ballot result are
+ // inactive.
+ if (UI->isUniform(II.getOperandUse(0))) {
+ LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << "\n");
+
+ // Look for a direct `icmp` against zero; NOTE(review): any predicate matches.
+ auto It = llvm::find_if(II.users(), [&](User *U) {
+ return match(U, m_ICmp(m_Specific(&II), m_Zero()));
+ });
+
+ // Check if a match was found
+ if (It != II.user_end()) {
+ // Extract the matching `icmp` instruction
+ ICmpInst *ICmp = dyn_cast<ICmpInst>(*It);
+ if (!ICmp)
+ break; // Safety check
+
+ IRBuilder<> Builder(ICmp);
+
+ // Zero-extend the i1 ballot argument to the `icmp` operand type.
+ Value *ConvertedSrc =
+ Builder.CreateZExtOrTrunc(Src, ICmp->getOperand(1)->getType());
+
+ LLVM_DEBUG(dbgs() << "Replacing ballot result in icmp: " << *ICmp
+ << " with " << *ConvertedSrc << "\n");
+
+ // Replace `%ballot` in `icmp` with `ConvertedSrc`
+ ICmp->setOperand(0, ConvertedSrc);
+ return true;
+ }
+ }
+ break;
+ }
}
return false;
}
@@ -134,7 +172,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
"AMDGPU uniformIntrinsic Combine", false, false)
-FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass(
- const AMDGPUTargetMachine *TM) {
- return new AMDGPUUniformIntrinsicCombineLegacy(TM);
+FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() {
+ return new AMDGPUUniformIntrinsicCombineLegacy();
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
index 82f92d2ccb550..f450b0e6763c4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck %s
define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
; CHECK-LABEL: readfirstlane_with_readfirstlane:
@@ -129,4 +129,3 @@ define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %o
store i32 %v, i32 addrspace(1)* %out_ptr
ret void
}
-
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
deleted file mode 100644
index 56ba117ce1d30..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
+++ /dev/null
@@ -1,40 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX10
-
-define amdgpu_kernel void @trivial_waterfall_loop(ptr addrspace(1) %out, i32 %src) {
-; GFX10-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
-; GFX10-SAME: ptr addrspace(1) nocapture writeonly [[OUT:%.*]], i32 [[SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; GFX10-NEXT: [[ENTRY:.*:]]
-; GFX10-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
-;
-entry:
- ; Initialize result to zero
- %result = alloca i32, align 4
- store i32 0, ptr %result, align 4
- br label %loop
-
-loop:
- ; Load the current result
- %cur_result = load i32, ptr %result, align 4
-
- ; Compute the next value
- %next_value = add i32 %cur_result, %src
-
- ; Apply the readfirstlane intrinsic for uniformity
- %uniform_value = call i32 @llvm.amdgcn.readfirstlane(i32 %next_value)
-
- ; Store the uniform result back
- store i32 %uniform_value, ptr %result, align 4
-
- ; This is a trivial loop that always exits after one iteration
- br i1 true, label %exit, label %loop
-
-exit:
- ; Store the result to the output pointer
- %final_result = load i32, ptr %result, align 4
- store i32 %final_result, ptr addrspace(1) %out, align 4
- ret void
-}
-
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
index 63bc10c49a161..f43d3163efd5f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
@@ -1,122 +1,109 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -O3 -S < %s | FileCheck %s --check-prefixes=PASS-CHECK
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,instcombine,early-cse,simplifycfg -S < %s | FileCheck %s -check-prefix=DCE-CHECK
-; Test case: Ensure that a loop with a divergent exit and a uniform value
-; used by an intrinsic outside the loop is not optimized due to temporal divergence.
-
-define amdgpu_kernel void @test_divergent_exit(ptr addrspace(1) %out, i32 %max_iter, i32 %div_cond) {
-; PASS-CHECK-LABEL: define amdgpu_kernel void @test_divergent_exit(
-; PASS-CHECK-SAME: ptr addrspace(1) nocapture writeonly [[OUT:%.*]], i32 [[MAX_ITER:%.*]], i32 [[DIV_COND:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+define protected amdgpu_kernel void @trivial_waterfall() {
+; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall(
+; PASS-CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; PASS-CHECK-NEXT: [[ENTRY:.*]]:
-; PASS-CHECK-NEXT: [[COND2:%.*]] = icmp eq i32 [[DIV_COND]], 0
-; PASS-CHECK-NEXT: br label %[[LOOP:.*]]
-; PASS-CHECK: [[LOOP]]:
-; PASS-CHECK-NEXT: [[ITER_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEW_ITER:%.*]], %[[LOOP]] ]
-; PASS-CHECK-NEXT: [[NEW_ITER]] = add i32 [[ITER_VAL]], 1
-; PASS-CHECK-NEXT: [[COND1:%.*]] = icmp sgt i32 [[NEW_ITER]], [[MAX_ITER]]
-; PASS-CHECK-NEXT: [[EXIT:%.*]] = or i1 [[COND2]], [[COND1]]
-; PASS-CHECK-NEXT: br i1 [[EXIT]], label %[[EXIT_BLOCK:.*]], label %[[LOOP]]
-; PASS-CHECK: [[EXIT_BLOCK]]:
-; PASS-CHECK-NEXT: store i32 [[NEW_ITER]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
+; PASS-CHECK: [[WHILE]]:
+; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ true, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[IF:.*]] ]
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]])
+; PASS-CHECK-NEXT: [[TMP0:%.*]] = zext i1 [[DONE]] to i64
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[TMP0]], 0
+; PASS-CHECK-NEXT: br i1 [[IS_DONE]], label %[[EXIT:.*]], label %[[IF]]
+; PASS-CHECK: [[IF]]:
+; PASS-CHECK-NEXT: [[IS_ONE:%.*]] = icmp eq i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[NEW_DONE]] = select i1 [[IS_ONE]], i1 false, i1 [[DONE]]
+; PASS-CHECK-NEXT: br label %[[WHILE]]
+; PASS-CHECK: [[EXIT]]:
; PASS-CHECK-NEXT: ret void
;
-entry:
- %iter = alloca i32, align 4
- store i32 0, ptr %iter, align 4
- br label %loop
-
-loop:
- ; Increment loop counter
- %iter_val = load i32, ptr %iter, align 4
- %new_iter = add i32 %iter_val, 1
- store i32 %new_iter, ptr %iter, align 4
-
- ; Check exit conditions
- %cond1 = icmp sgt i32 %new_iter, %max_iter
- %cond2 = icmp eq i32 %div_cond, 0
- %exit = or i1 %cond1, %cond2
- br i1 %exit, label %exit_block, label %loop
-
-exit_block:
- ; Use the uniform value in an intrinsic outside the loop
- %final_val = load i32, ptr %iter, align 4
- %result = call i32 @llvm.amdgcn.permlane64(i32 %final_val)
- store i32 %result, ptr addrspace(1) %out, align 4
- ret void
-}
-
-define protected amdgpu_kernel void @trivial_waterfall() local_unnamed_addr #0 {
-; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall(
-; PASS-CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] {
-; PASS-CHECK-NEXT: [[_PEEL_BEGIN:.*:]]
-; PASS-CHECK-NEXT: [[TMP5:%.*]] = tail call i32 asm sideeffect "", "=v,0"(i32 1) #[[ATTR5:[0-9]+]], !srcloc [[META0:![0-9]+]]
-; PASS-CHECK-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0
-; PASS-CHECK-NEXT: [[TMP7:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP6]])
-; PASS-CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
-; PASS-CHECK-NEXT: br i1 [[TMP8]], [[DOTLOOPEXIT:label %.*]], [[DOTPEEL_NEWPH:label %.*]]
-; PASS-CHECK: [[_PEEL_NEWPH:.*:]]
-; PASS-CHECK-NEXT: [[TMP4:%.*]] = tail call i32 asm sideeffect "", "=v,0"(i32 0) #[[ATTR5]], !srcloc [[META0]]
-; PASS-CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP4]], 0
-; PASS-CHECK-NEXT: [[TMP10:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP9]])
-; PASS-CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0
-; PASS-CHECK-NEXT: br i1 [[TMP11]], [[DOTLOOPEXIT]], [[DOTPEEL_NEWPH]], !llvm.loop [[LOOP1:![0-9]+]]
-; PASS-CHECK: [[_LOOPEXIT:.*:]]
-; PASS-CHECK-NEXT: ret void
+; DCE-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall(
+; DCE-CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; DCE-CHECK-NEXT: [[ENTRY:.*:]]
+; DCE-CHECK-NEXT: ret void
;
- br label %1
+entry:
+ br label %while
-1: ; preds = %10, %0
- %2 = phi i8 [ 0, %0 ], [ %12, %10 ]
- %3 = and i8 %2, 1
- %4 = xor i8 %3, 1
- %5 = zext nneg i8 %4 to i32
- %6 = tail call i32 asm sideeffect "", "=v,0"(i32 %5) #2, !srcloc !6
- %7 = icmp ne i32 %6, 0
- %8 = tail call i64 @llvm.amdgcn.ballot.i64(i1 %7)
- %9 = icmp eq i64 %8, 0
- br i1 %9, label %13, label %10
+while:
+ %done = phi i1 [ 1, %entry ], [ %new_done, %if ]
+ %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %done)
+ %is_done = icmp eq i64 %ballot, 0
+ br i1 %is_done, label %exit, label %if
-10: ; preds = %1
- %11 = icmp eq i8 %3, 0
- %12 = select i1 %11, i8 1, i8 %2
- br label %1, !llvm.loop !7
+if:
+ %is_one = icmp eq i1 %done, 1
+ %new_done = select i1 %is_one, i1 0, i1 %done
+ br label %while
-13: ; preds = %1
+exit:
ret void
}
-define protected amdgpu_kernel void @waterfall() local_unnamed_addr #0 {
+define protected amdgpu_kernel void @waterfall() {
; PASS-CHECK-LABEL: define protected amdgpu_kernel void @waterfall(
-; PASS-CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] {
-; PASS-CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; PASS-CHECK-SAME: ) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[ENTRY:.*]]:
+; PASS-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; PASS-CHECK-NEXT: [[TMP1:%.*]] = tail call noundef i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP0]])
; PASS-CHECK-NEXT: br label %[[BB2:.*]]
; PASS-CHECK: [[BB2]]:
-; PASS-CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ false, [[TMP0:%.*]] ], [ [[DOTBE:%.*]], %[[DOTBACKEDGE:.*]] ]
+; PASS-CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[TMP15:%.*]], %[[TMP14:.*]] ]
; PASS-CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[TMP3]], true
; PASS-CHECK-NEXT: [[TMP5:%.*]] = zext i1 [[TMP4]] to i32
-; PASS-CHECK-NEXT: [[TMP6:%.*]] = tail call i32 asm sideeffect "", "=v,0"(i32 [[TMP5]]) #[[ATTR5]], !srcloc [[META0]]
+; PASS-CHECK-NEXT: [[TMP6:%.*]] = tail call i32 asm sideeffect "", "=v,0"(i32 [[TMP5]]), !srcloc [[META0:![0-9]+]]
; PASS-CHECK-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
-; PASS-CHECK-NEXT: [[TMP8:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP7]])
-; PASS-CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
-; PASS-CHECK-NEXT: br i1 [[TMP9]], label %[[BB14:.*]], label %[[BB10:.*]]
+; PASS-CHECK-NEXT: [[TMP8:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP7]])
+; PASS-CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 0
+; PASS-CHECK-NEXT: br i1 [[TMP9]], label %[[BB16:.*]], label %[[BB10:.*]]
; PASS-CHECK: [[BB10]]:
-; PASS-CHECK-NEXT: br i1 [[TMP3]], label %[[DOTBACKEDGE]], label %[[BB11:.*]]
+; PASS-CHECK-NEXT: br i1 [[TMP3]], label %[[TMP14]], label %[[BB11:.*]]
; PASS-CHECK: [[BB11]]:
; PASS-CHECK-NEXT: [[TMP12:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP1]])
; PASS-CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP1]], [[TMP12]]
-; PASS-CHECK-NEXT: br label %[[DOTBACKEDGE]]
-; PASS-CHECK: [[_BACKEDGE:.*:]]
-; PASS-CHECK-NEXT: [[DOTBE]] = phi i1 [ true, %[[BB10]] ], [ [[TMP13]], %[[BB11]] ]
-; PASS-CHECK-NEXT: br label %[[BB2]], !llvm.loop [[LOOP4:![0-9]+]]
-; PASS-CHECK: [[BB14]]:
+; PASS-CHECK-NEXT: br label %[[TMP14]]
+; PASS-CHECK: [[TMP14]]:
+; PASS-CHECK-NEXT: [[TMP15]] = phi i1 [ true, %[[BB10]] ], [ [[TMP13]], %[[BB11]] ]
+; PASS-CHECK-NEXT: br label %[[BB2]], !llvm.loop [[LOOP1:![0-9]+]]
+; PASS-CHECK: [[BB16]]:
; PASS-CHECK-NEXT: ret void
;
+; DCE-CHECK-LABEL: define protected amdgpu_kernel void @waterfall(
+; DCE-CHECK-SAME: ) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[ENTRY:.*]]:
+; DCE-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; DCE-CHECK-NEXT: br label %[[BB1:.*]]
+; DCE-CHECK: [[BB1]]:
+; DCE-CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[TMP14:%.*]], %[[TMP13:.*]] ]
+; DCE-CHECK-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true
+; DCE-CHECK-NEXT: [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; DCE-CHECK-NEXT: [[TMP5:%.*]] = tail call i32 asm sideeffect "", "=v,0"(i32 [[TMP4]]) #[[ATTR4:[0-9]+]], !srcloc [[META0:![0-9]+]]
+; DCE-CHECK-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0
+; DCE-CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP6]])
+; DCE-CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
+; DCE-CHECK-NEXT: br i1 [[TMP8]], label %[[BB15:.*]], label %[[BB9:.*]]
+; DCE-CHECK: [[BB9]]:
+; DCE-CHECK-NEXT: br i1 [[TMP2]], label %[[TMP13]], label %[[BB10:.*]]
+; DCE-CHECK: [[BB10]]:
+; DCE-CHECK-NEXT: [[TMP11:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP0]])
+; DCE-CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP0]], [[TMP11]]
+; DCE-CHECK-NEXT: br label %[[TMP13]]
+; DCE-CHECK: [[TMP13]]:
+; DCE-CHECK-NEXT: [[TMP14]] = phi i1 [ true, %[[BB9]] ], [ [[TMP12]], %[[BB10]] ]
+; DCE-CHECK-NEXT: br label %[[BB1]], !llvm.loop [[LOOP1:![0-9]+]]
+; DCE-CHECK: [[BB15]]:
+; DCE-CHECK-NEXT: ret void
+;
+entry:
%1 = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%2 = tail call noundef i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %1)
br label %3
-3: ; preds = %15, %0
- %4 = phi i1 [ false, %0 ], [ %16, %15 ]
+3:
+ %4 = phi i1 [ false, %entry ], [ %16, %15 ]
%5 = xor i1 %4, true
%6 = zext i1 %5 to i32
%7 = tail call i32 asm sideeffect "", "=v,0"(i32 %6) #3, !srcloc !6
@@ -125,19 +112,19 @@ define protected amdgpu_kernel void @waterfall() local_unnamed_addr #0 {
%10 = icmp eq i64 %9, 0
br i1 %10, label %17, label %11
-11: ; preds = %3
+11:
br i1 %4, label %15, label %12
-12: ; preds = %11
+12:
%13 = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 %2)
%14 = icmp eq i32 %2, %13
br label %15
-15: ; preds = %12, %11
+15:
%16 = phi i1 [ true, %11 ], [ %14, %12 ]
br label %3, !llvm.loop !7
-17: ; preds = %3
+17:
ret void
}
@@ -148,8 +135,10 @@ declare i64 @llvm.amdgcn.ballot.i64(i1) #1
!8 = !{!"llvm.loop.mustprogress"}
;.
; PASS-CHECK: [[META0]] = !{i64 690}
-; PASS-CHECK: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; PASS-CHECK: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]]}
; PASS-CHECK: [[META2]] = !{!"llvm.loop.mustprogress"}
-; PASS-CHECK: [[META3]] = !{!"llvm.loop.peeled.count", i32 1}
-; PASS-CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]]}
+;.
+; DCE-CHECK: [[META0]] = !{i64 690}
+; DCE-CHECK: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]]}
+; DCE-CHECK: [[META2]] = !{!"llvm.loop.mustprogress"}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index c6a639a761f75..ee54aff64f25d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -1,11 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=instcombine,amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s --check-prefixes=PASS-CHECK
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s -check-prefix=DCE-CHECK
define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_constant(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 77)
; PASS-CHECK-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_constant(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; DCE-CHECK-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 77)
store i32 %v, ptr addrspace(1) %out
@@ -15,7 +22,14 @@ define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_undef(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 undef)
+; PASS-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_undef(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -25,8 +39,14 @@ define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[SRC]])
; PASS-CHECK-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 %src)
store i32 %v, ptr addrspace(1) %out
@@ -38,10 +58,17 @@ define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
-; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
+; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
@@ -56,10 +83,18 @@ define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %o
; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
-; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
+; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = add i32 %tid, 1
@@ -72,8 +107,14 @@ define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %o
define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_constant(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 7, i32 5)
; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_constant(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 7, i32 5)
store i32 %v, ptr addrspace(1) %out
@@ -83,7 +124,14 @@ define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_undef(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 undef, i32 undef)
+; PASS-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_undef(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 undef, i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -93,8 +141,14 @@ define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC0]], i32 [[SRC1]])
; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
store i32 %v, ptr addrspace(1) %out
@@ -107,10 +161,18 @@ define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out)
; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
-; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]]
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]]
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -128,10 +190,20 @@ define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out
; PASS-CHECK-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
; PASS-CHECK-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
-; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
-; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]]
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; DCE-CHECK-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
+; DCE-CHECK-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
+; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]]
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -146,8 +218,14 @@ define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out
define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 7)
; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 7)
store i32 %v, ptr addrspace(1) %out
@@ -157,7 +235,14 @@ define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_undef(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 undef)
+; PASS-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_undef(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -167,8 +252,14 @@ define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
store i32 %v, ptr addrspace(1) %out
@@ -180,10 +271,17 @@ define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
-; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
+; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid)
@@ -198,10 +296,18 @@ define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
-; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TID2]] to i64
-; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID2]]
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_expression(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
+; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID2]]
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = add i32 %tid, 1
@@ -214,8 +320,15 @@ define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 5)
+; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 5)
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
%v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
@@ -229,8 +342,17 @@ define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]])
; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -245,8 +367,16 @@ define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
+; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[V1]], i32 3)
; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
+; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
@@ -261,8 +391,17 @@ define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[V1]], i32 2)
; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_readlane(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -276,9 +415,17 @@ define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr addrspace(1) %out_max) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_boundary(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[MIN_V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 -2147483648)
; PASS-CHECK-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
+; PASS-CHECK-NEXT: [[MAX_V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 2147483647)
; PASS-CHECK-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_boundary(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
+; DCE-CHECK-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
+; DCE-CHECK-NEXT: ret void
;
%min_v = call i32 @llvm.amdgcn.permlane64(i32 -2147483648)
store i32 %min_v, ptr addrspace(1) %out_min
@@ -295,6 +442,14 @@ define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_cross_lane(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = add i32 %tidx, 5
@@ -306,8 +461,16 @@ define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[RANDOM]])
+; PASS-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456
+; DCE-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%random = xor i32 123, 456
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %random)
@@ -318,7 +481,14 @@ define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
define amdgpu_kernel void @permlane64_invalid(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_invalid(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[UNDEF_V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 undef)
+; PASS-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_invalid(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%undef_v = call i32 @llvm.amdgcn.permlane64(i32 undef)
store i32 %undef_v, ptr addrspace(1) %out
@@ -329,10 +499,18 @@ define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_expression(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; PASS-CHECK-NEXT: [[IDX2:%.*]] = shl i32 [[IDX1]], 1
+; PASS-CHECK-NEXT: [[IDX2:%.*]] = mul i32 [[IDX1]], 2
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_expression(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[IDX2:%.*]] = mul i32 [[IDX1]], 2
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%idx1 = call i32 @llvm.amdgcn.workitem.id.x()
%idx2 = mul i32 %idx1, 2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll
new file mode 100644
index 0000000000000..a467bba7973ef
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll
@@ -0,0 +1,90 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,instcombine,early-cse,simplifycfg -S < %s | FileCheck %s -check-prefix=DCE-CHECK
+
+; The readfirstlane call in this test should not be optimized away.
+define amdgpu_cs void @temporal_divergence(ptr addrspace(1) %out, i32 %n) {
+; PASS-CHECK-LABEL: define amdgpu_cs void @temporal_divergence(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; PASS-CHECK-NEXT: [[ENTRY:.*]]:
+; PASS-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[VAL:%.*]] = alloca i32, align 4
+; PASS-CHECK-NEXT: store i32 0, ptr [[VAL]], align 4
+; PASS-CHECK-NEXT: [[TID_MOD:%.*]] = urem i32 [[TID]], 2
+; PASS-CHECK-NEXT: [[IS_EVEN:%.*]] = icmp eq i32 [[TID_MOD]], 0
+; PASS-CHECK-NEXT: br i1 [[IS_EVEN]], label %[[EXIT_LOOP:.*]], label %[[LOOP:.*]]
+; PASS-CHECK: [[LOOP]]:
+; PASS-CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; PASS-CHECK-NEXT: [[VAL_LOADED:%.*]] = load i32, ptr [[VAL]], align 4
+; PASS-CHECK-NEXT: [[VAL_UPDATED:%.*]] = add i32 [[VAL_LOADED]], [[I]]
+; PASS-CHECK-NEXT: store i32 [[VAL_UPDATED]], ptr [[VAL]], align 4
+; PASS-CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
+; PASS-CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[I_NEXT]], [[N]]
+; PASS-CHECK-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[EXIT_LOOP]]
+; PASS-CHECK: [[EXIT_LOOP]]:
+; PASS-CHECK-NEXT: [[FINAL_VAL:%.*]] = load i32, ptr [[VAL]], align 4
+; PASS-CHECK-NEXT: [[FIRST_LANE_VAL:%.*]] = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[FINAL_VAL]])
+; PASS-CHECK-NEXT: store i32 [[FIRST_LANE_VAL]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_cs void @temporal_divergence(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; DCE-CHECK-NEXT: [[ENTRY:.*]]:
+; DCE-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[VAL:%.*]] = alloca i32, align 4
+; DCE-CHECK-NEXT: store i32 0, ptr [[VAL]], align 4
+; DCE-CHECK-NEXT: [[TID_MOD:%.*]] = and i32 [[TID]], 1
+; DCE-CHECK-NEXT: [[IS_EVEN:%.*]] = icmp eq i32 [[TID_MOD]], 0
+; DCE-CHECK-NEXT: br i1 [[IS_EVEN]], label %[[EXIT_LOOP:.*]], label %[[LOOP:.*]]
+; DCE-CHECK: [[LOOP]]:
+; DCE-CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; DCE-CHECK-NEXT: [[VAL_LOADED:%.*]] = load i32, ptr [[VAL]], align 4
+; DCE-CHECK-NEXT: [[VAL_UPDATED:%.*]] = add i32 [[VAL_LOADED]], [[I]]
+; DCE-CHECK-NEXT: store i32 [[VAL_UPDATED]], ptr [[VAL]], align 4
+; DCE-CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
+; DCE-CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[I_NEXT]], [[N]]
+; DCE-CHECK-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[EXIT_LOOP]]
+; DCE-CHECK: [[EXIT_LOOP]]:
+; DCE-CHECK-NEXT: [[FINAL_VAL:%.*]] = load i32, ptr [[VAL]], align 4
+; DCE-CHECK-NEXT: [[FIRST_LANE_VAL:%.*]] = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[FINAL_VAL]])
+; DCE-CHECK-NEXT: store i32 [[FIRST_LANE_VAL]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+entry:
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %val = alloca i32, align 4
+ store i32 0, ptr %val, align 4
+
+ ; Compute (tid % 2) to check if it is even
+ %tid_mod = urem i32 %tid, 2
+ %is_even = icmp eq i32 %tid_mod, 0
+
+ ; If tid is even, jump directly to exit.loop
+ br i1 %is_even, label %exit.loop, label %loop
+
+loop:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+ %val.loaded = load i32, ptr %val, align 4
+
+ ; Update value
+ %val.updated = add i32 %val.loaded, %i
+ store i32 %val.updated, ptr %val, align 4
+
+ ; Loop iteration
+ %i.next = add i32 %i, 1
+ %loop.cond = icmp ult i32 %i.next, %n
+ br i1 %loop.cond, label %loop, label %exit.loop
+
+exit.loop:
+ ; Read first lane's value
+ %final_val = load i32, ptr %val, align 4
+ %first_lane_val = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 %final_val)
+
+ ; Store result in memory
+ store i32 %first_lane_val, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
+
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index d7f54f3b8e9e2..bbd164993764d 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -31,6 +31,11 @@
; GCN-O0-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O0-NEXT: AMDGPU Printf lowering
; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU
+; GCN-O0-NEXT: FunctionPass Manager
+; GCN-O0-NEXT: Dominator Tree Construction
+; GCN-O0-NEXT: Cycle Info Analysis
+; GCN-O0-NEXT: Uniformity Analysis
+; GCN-O0-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O0-NEXT: Expand variadic functions
; GCN-O0-NEXT: AMDGPU Inline All Functions
; GCN-O0-NEXT: Inliner for always_inline functions
@@ -181,6 +186,11 @@
; GCN-O1-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O1-NEXT: AMDGPU Printf lowering
; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU
+; GCN-O1-NEXT: FunctionPass Manager
+; GCN-O1-NEXT: Dominator Tree Construction
+; GCN-O1-NEXT: Cycle Info Analysis
+; GCN-O1-NEXT: Uniformity Analysis
+; GCN-O1-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O1-NEXT: Expand variadic functions
; GCN-O1-NEXT: AMDGPU Inline All Functions
; GCN-O1-NEXT: Inliner for always_inline functions
@@ -466,6 +476,11 @@
; GCN-O1-OPTS-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O1-OPTS-NEXT: AMDGPU Printf lowering
; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU
+; GCN-O1-OPTS-NEXT: FunctionPass Manager
+; GCN-O1-OPTS-NEXT: Dominator Tree Construction
+; GCN-O1-OPTS-NEXT: Cycle Info Analysis
+; GCN-O1-OPTS-NEXT: Uniformity Analysis
+; GCN-O1-OPTS-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O1-OPTS-NEXT: Expand variadic functions
; GCN-O1-OPTS-NEXT: AMDGPU Inline All Functions
; GCN-O1-OPTS-NEXT: Inliner for always_inline functions
@@ -781,6 +796,10 @@
; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: AMDGPU Image Intrinsic Optimizer
+; GCN-O2-NEXT: Dominator Tree Construction
+; GCN-O2-NEXT: Cycle Info Analysis
+; GCN-O2-NEXT: Uniformity Analysis
+; GCN-O2-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O2-NEXT: Expand variadic functions
; GCN-O2-NEXT: AMDGPU Inline All Functions
; GCN-O2-NEXT: Inliner for always_inline functions
@@ -1100,6 +1119,10 @@
; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: AMDGPU Image Intrinsic Optimizer
+; GCN-O3-NEXT: Dominator Tree Construction
+; GCN-O3-NEXT: Cycle Info Analysis
+; GCN-O3-NEXT: Uniformity Analysis
+; GCN-O3-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O3-NEXT: Expand variadic functions
; GCN-O3-NEXT: AMDGPU Inline All Functions
; GCN-O3-NEXT: Inliner for always_inline functions
>From be7a5a2376b76bd4ae093f57bddef433ccf0ff5d Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 18 Feb 2025 19:28:51 +0530
Subject: [PATCH 09/17] updated test
---
.../amdgpu-simplify-uniform-waterfall.ll | 137 ++++++++----------
.../amdgpu-uniform-temporal-divergence.ll | 96 ++++--------
2 files changed, 89 insertions(+), 144 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
index f43d3163efd5f..f5d3aa176449a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
@@ -2,129 +2,123 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,instcombine,early-cse,simplifycfg -S < %s | FileCheck %s -check-prefix=DCE-CHECK
-define protected amdgpu_kernel void @trivial_waterfall() {
+define protected amdgpu_kernel void @trivial_waterfall(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall(
-; PASS-CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
; PASS-CHECK-NEXT: [[ENTRY:.*]]:
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
; PASS-CHECK: [[WHILE]]:
-; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ true, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[IF:.*]] ]
+; PASS-CHECK-NEXT: [[DONE1:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; PASS-CHECK-NEXT: [[DONE:%.*]] = xor i1 [[DONE1]], true
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]])
; PASS-CHECK-NEXT: [[TMP0:%.*]] = zext i1 [[DONE]] to i64
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[TMP0]], 0
; PASS-CHECK-NEXT: br i1 [[IS_DONE]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
-; PASS-CHECK-NEXT: [[IS_ONE:%.*]] = icmp eq i1 [[DONE]], true
-; PASS-CHECK-NEXT: [[NEW_DONE]] = select i1 [[IS_ONE]], i1 false, i1 [[DONE]]
+; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: br label %[[WHILE]]
; PASS-CHECK: [[EXIT]]:
; PASS-CHECK-NEXT: ret void
;
; DCE-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall(
-; DCE-CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
; DCE-CHECK-NEXT: [[ENTRY:.*:]]
+; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: ret void
;
entry:
br label %while
while:
- %done = phi i1 [ 1, %entry ], [ %new_done, %if ]
- %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %done)
+ %done = phi i1 [ 0, %entry ], [ 1, %if ]
+ %not_done = xor i1 %done, true
+ %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done)
%is_done = icmp eq i64 %ballot, 0
br i1 %is_done, label %exit, label %if
if:
- %is_one = icmp eq i1 %done, 1
- %new_done = select i1 %is_one, i1 0, i1 %done
+ store i32 5, ptr addrspace(1) %out
br label %while
exit:
ret void
}
-define protected amdgpu_kernel void @waterfall() {
+define protected amdgpu_kernel void @waterfall(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define protected amdgpu_kernel void @waterfall(
-; PASS-CHECK-SAME: ) #[[ATTR0]] {
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[ENTRY:.*]]:
; PASS-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
; PASS-CHECK-NEXT: [[TMP1:%.*]] = tail call noundef i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP0]])
-; PASS-CHECK-NEXT: br label %[[BB2:.*]]
-; PASS-CHECK: [[BB2]]:
-; PASS-CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[TMP15:%.*]], %[[TMP14:.*]] ]
+; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
+; PASS-CHECK: [[WHILE]]:
+; PASS-CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ]
; PASS-CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[TMP3]], true
-; PASS-CHECK-NEXT: [[TMP5:%.*]] = zext i1 [[TMP4]] to i32
-; PASS-CHECK-NEXT: [[TMP6:%.*]] = tail call i32 asm sideeffect "", "=v,0"(i32 [[TMP5]]), !srcloc [[META0:![0-9]+]]
-; PASS-CHECK-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
-; PASS-CHECK-NEXT: [[TMP8:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP7]])
+; PASS-CHECK-NEXT: [[TMP8:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP4]])
; PASS-CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 0
-; PASS-CHECK-NEXT: br i1 [[TMP9]], label %[[BB16:.*]], label %[[BB10:.*]]
-; PASS-CHECK: [[BB10]]:
-; PASS-CHECK-NEXT: br i1 [[TMP3]], label %[[TMP14]], label %[[BB11:.*]]
-; PASS-CHECK: [[BB11]]:
+; PASS-CHECK-NEXT: br i1 [[TMP9]], label %[[EXIT:.*]], label %[[IF:.*]]
+; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: [[TMP12:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP1]])
; PASS-CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP1]], [[TMP12]]
-; PASS-CHECK-NEXT: br label %[[TMP14]]
-; PASS-CHECK: [[TMP14]]:
-; PASS-CHECK-NEXT: [[TMP15]] = phi i1 [ true, %[[BB10]] ], [ [[TMP13]], %[[BB11]] ]
-; PASS-CHECK-NEXT: br label %[[BB2]], !llvm.loop [[LOOP1:![0-9]+]]
-; PASS-CHECK: [[BB16]]:
+; PASS-CHECK-NEXT: br i1 [[TMP13]], label %[[WORK:.*]], label %[[TAIL]]
+; PASS-CHECK: [[WORK]]:
+; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: br label %[[TAIL]]
+; PASS-CHECK: [[TAIL]]:
+; PASS-CHECK-NEXT: [[NEW_DONE]] = phi i1 [ true, %[[WORK]] ], [ false, %[[IF]] ]
+; PASS-CHECK-NEXT: br label %[[WHILE]]
+; PASS-CHECK: [[EXIT]]:
; PASS-CHECK-NEXT: ret void
;
; DCE-CHECK-LABEL: define protected amdgpu_kernel void @waterfall(
-; DCE-CHECK-SAME: ) #[[ATTR0]] {
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; DCE-CHECK-NEXT: [[ENTRY:.*]]:
; DCE-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; DCE-CHECK-NEXT: br label %[[BB1:.*]]
-; DCE-CHECK: [[BB1]]:
-; DCE-CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[TMP14:%.*]], %[[TMP13:.*]] ]
+; DCE-CHECK-NEXT: br label %[[WHILE:.*]]
+; DCE-CHECK: [[WHILE]]:
+; DCE-CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[TMP12:%.*]], %[[TAIL:.*]] ]
; DCE-CHECK-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true
-; DCE-CHECK-NEXT: [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
-; DCE-CHECK-NEXT: [[TMP5:%.*]] = tail call i32 asm sideeffect "", "=v,0"(i32 [[TMP4]]) #[[ATTR4:[0-9]+]], !srcloc [[META0:![0-9]+]]
-; DCE-CHECK-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0
-; DCE-CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP6]])
+; DCE-CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP3]])
; DCE-CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
-; DCE-CHECK-NEXT: br i1 [[TMP8]], label %[[BB15:.*]], label %[[BB9:.*]]
-; DCE-CHECK: [[BB9]]:
-; DCE-CHECK-NEXT: br i1 [[TMP2]], label %[[TMP13]], label %[[BB10:.*]]
-; DCE-CHECK: [[BB10]]:
+; DCE-CHECK-NEXT: br i1 [[TMP8]], label %[[EXIT:.*]], label %[[IF:.*]]
+; DCE-CHECK: [[IF]]:
; DCE-CHECK-NEXT: [[TMP11:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP0]])
-; DCE-CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP0]], [[TMP11]]
-; DCE-CHECK-NEXT: br label %[[TMP13]]
-; DCE-CHECK: [[TMP13]]:
-; DCE-CHECK-NEXT: [[TMP14]] = phi i1 [ true, %[[BB9]] ], [ [[TMP12]], %[[BB10]] ]
-; DCE-CHECK-NEXT: br label %[[BB1]], !llvm.loop [[LOOP1:![0-9]+]]
-; DCE-CHECK: [[BB15]]:
+; DCE-CHECK-NEXT: [[TMP12]] = icmp eq i32 [[TMP0]], [[TMP11]]
+; DCE-CHECK-NEXT: br i1 [[TMP12]], label %[[WORK:.*]], label %[[TAIL]]
+; DCE-CHECK: [[WORK]]:
+; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: br label %[[TAIL]]
+; DCE-CHECK: [[TAIL]]:
+; DCE-CHECK-NEXT: br label %[[WHILE]]
+; DCE-CHECK: [[EXIT]]:
; DCE-CHECK-NEXT: ret void
;
entry:
%1 = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
- %2 = tail call noundef i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %1)
- br label %3
+ %tid = tail call noundef i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %1)
+ br label %while
-3:
- %4 = phi i1 [ false, %entry ], [ %16, %15 ]
- %5 = xor i1 %4, true
- %6 = zext i1 %5 to i32
- %7 = tail call i32 asm sideeffect "", "=v,0"(i32 %6) #3, !srcloc !6
- %8 = icmp ne i32 %7, 0
- %9 = tail call i64 @llvm.amdgcn.ballot.i64(i1 %8)
- %10 = icmp eq i64 %9, 0
- br i1 %10, label %17, label %11
+while:
+ %done = phi i1 [ false, %entry ], [ %new_done, %tail ]
+ %not_done = xor i1 %done, true
+ %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done)
+ %is_done = icmp eq i64 %ballot, 0
+ br i1 %is_done, label %exit, label %if
-11:
- br i1 %4, label %15, label %12
+if:
+ %first_active_id = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 %tid)
+ %is_first_active_id = icmp eq i32 %tid, %first_active_id
+ br i1 %is_first_active_id, label %work, label %tail
-12:
- %13 = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 %2)
- %14 = icmp eq i32 %2, %13
- br label %15
+work:
+ store i32 5, ptr addrspace(1) %out
+ br label %tail
-15:
- %16 = phi i1 [ true, %11 ], [ %14, %12 ]
- br label %3, !llvm.loop !7
+tail:
+ %new_done = phi i1 [ true, %work ], [ false, %if ]
+ br label %while
-17:
+exit:
ret void
}
@@ -133,12 +127,3 @@ declare i64 @llvm.amdgcn.ballot.i64(i1) #1
!6 = !{i64 690}
!7 = distinct !{!7, !8}
!8 = !{!"llvm.loop.mustprogress"}
-;.
-; PASS-CHECK: [[META0]] = !{i64 690}
-; PASS-CHECK: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]]}
-; PASS-CHECK: [[META2]] = !{!"llvm.loop.mustprogress"}
-;.
-; DCE-CHECK: [[META0]] = !{i64 690}
-; DCE-CHECK: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]]}
-; DCE-CHECK: [[META2]] = !{!"llvm.loop.mustprogress"}
-;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll
index a467bba7973ef..3061669edc0b3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll
@@ -7,84 +7,44 @@ define amdgpu_cs void @temporal_divergence(ptr addrspace(1) %out, i32 %n) {
; PASS-CHECK-LABEL: define amdgpu_cs void @temporal_divergence(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; PASS-CHECK-NEXT: [[ENTRY:.*]]:
-; PASS-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; PASS-CHECK-NEXT: [[VAL:%.*]] = alloca i32, align 4
-; PASS-CHECK-NEXT: store i32 0, ptr [[VAL]], align 4
-; PASS-CHECK-NEXT: [[TID_MOD:%.*]] = urem i32 [[TID]], 2
-; PASS-CHECK-NEXT: [[IS_EVEN:%.*]] = icmp eq i32 [[TID_MOD]], 0
-; PASS-CHECK-NEXT: br i1 [[IS_EVEN]], label %[[EXIT_LOOP:.*]], label %[[LOOP:.*]]
-; PASS-CHECK: [[LOOP]]:
-; PASS-CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
-; PASS-CHECK-NEXT: [[VAL_LOADED:%.*]] = load i32, ptr [[VAL]], align 4
-; PASS-CHECK-NEXT: [[VAL_UPDATED:%.*]] = add i32 [[VAL_LOADED]], [[I]]
-; PASS-CHECK-NEXT: store i32 [[VAL_UPDATED]], ptr [[VAL]], align 4
+; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: br label %[[H:.*]]
+; PASS-CHECK: [[H]]:
+; PASS-CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[H]] ]
; PASS-CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
-; PASS-CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[I_NEXT]], [[N]]
-; PASS-CHECK-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[EXIT_LOOP]]
-; PASS-CHECK: [[EXIT_LOOP]]:
-; PASS-CHECK-NEXT: [[FINAL_VAL:%.*]] = load i32, ptr [[VAL]], align 4
-; PASS-CHECK-NEXT: [[FIRST_LANE_VAL:%.*]] = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[FINAL_VAL]])
-; PASS-CHECK-NEXT: store i32 [[FIRST_LANE_VAL]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: [[DIV_EXITX:%.*]] = icmp eq i32 [[TID]], 0
+; PASS-CHECK-NEXT: br i1 [[DIV_EXITX]], label %[[X:.*]], label %[[H]]
+; PASS-CHECK: [[X]]:
+; PASS-CHECK-NEXT: [[UNI_JOIN:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[I_NEXT]])
+; PASS-CHECK-NEXT: [[JOIN_USER:%.*]] = add i32 [[I_NEXT]], 5
; PASS-CHECK-NEXT: ret void
;
; DCE-CHECK-LABEL: define amdgpu_cs void @temporal_divergence(
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; DCE-CHECK-NEXT: [[ENTRY:.*]]:
-; DCE-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; DCE-CHECK-NEXT: [[VAL:%.*]] = alloca i32, align 4
-; DCE-CHECK-NEXT: store i32 0, ptr [[VAL]], align 4
-; DCE-CHECK-NEXT: [[TID_MOD:%.*]] = and i32 [[TID]], 1
-; DCE-CHECK-NEXT: [[IS_EVEN:%.*]] = icmp eq i32 [[TID_MOD]], 0
-; DCE-CHECK-NEXT: br i1 [[IS_EVEN]], label %[[EXIT_LOOP:.*]], label %[[LOOP:.*]]
-; DCE-CHECK: [[LOOP]]:
-; DCE-CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
-; DCE-CHECK-NEXT: [[VAL_LOADED:%.*]] = load i32, ptr [[VAL]], align 4
-; DCE-CHECK-NEXT: [[VAL_UPDATED:%.*]] = add i32 [[VAL_LOADED]], [[I]]
-; DCE-CHECK-NEXT: store i32 [[VAL_UPDATED]], ptr [[VAL]], align 4
-; DCE-CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
-; DCE-CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[I_NEXT]], [[N]]
-; DCE-CHECK-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[EXIT_LOOP]]
-; DCE-CHECK: [[EXIT_LOOP]]:
-; DCE-CHECK-NEXT: [[FINAL_VAL:%.*]] = load i32, ptr [[VAL]], align 4
-; DCE-CHECK-NEXT: [[FIRST_LANE_VAL:%.*]] = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[FINAL_VAL]])
-; DCE-CHECK-NEXT: store i32 [[FIRST_LANE_VAL]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: [[ENTRY:.*:]]
+; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: br label %[[H:.*]]
+; DCE-CHECK: [[H]]:
+; DCE-CHECK-NEXT: [[DIV_EXITX:%.*]] = icmp eq i32 [[TID]], 0
+; DCE-CHECK-NEXT: br i1 [[DIV_EXITX]], label %[[X:.*]], label %[[H]]
+; DCE-CHECK: [[X]]:
; DCE-CHECK-NEXT: ret void
;
entry:
- %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
- %val = alloca i32, align 4
- store i32 0, ptr %val, align 4
-
- ; Compute (tid % 2) to check if it is even
- %tid_mod = urem i32 %tid, 2
- %is_even = icmp eq i32 %tid_mod, 0
-
- ; If tid is even, jump directly to exit.loop
- br i1 %is_even, label %exit.loop, label %loop
-
-loop:
- %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
- %val.loaded = load i32, ptr %val, align 4
-
- ; Update value
- %val.updated = add i32 %val.loaded, %i
- store i32 %val.updated, ptr %val, align 4
-
- ; Loop iteration
- %i.next = add i32 %i, 1
- %loop.cond = icmp ult i32 %i.next, %n
- br i1 %loop.cond, label %loop, label %exit.loop
-
-exit.loop:
- ; Read first lane's value
- %final_val = load i32, ptr %val, align 4
- %first_lane_val = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 %final_val)
-
- ; Store result in memory
- store i32 %first_lane_val, ptr addrspace(1) %out, align 4
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ br label %H
+
+H:
+ %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ]
+ %uni.inc = add i32 %uni.merge.h, 1
+ %div.exitx = icmp eq i32 %tid, 0
+ br i1 %div.exitx, label %X, label %H ; divergent branch
+
+X:
+ %uni.join = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %uni.inc)
+ %join.user = add i32 %uni.join, 5
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
-
>From 16bc3a86b131d2154af353fff9c3d69a5f5555f5 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 19 Feb 2025 17:50:35 +0530
Subject: [PATCH 10/17] Fix: use isDivergentUse instead of isUniform
---
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 15 ++++----
.../amdgpu-uniform-temporal-divergence.ll | 37 +++++++++++--------
2 files changed, 30 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 4e02c16c61b09..e083ca26b7c6b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -114,8 +114,8 @@ bool AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
Value *Src = II.getArgOperand(0);
- // Check if the argument is uniform
- if (UI->isUniform(II.getOperandUse(0))) {
+ // Check if the argument use is uniform
+ if (!UI->isDivergentUse(II.getOperandUse(0))) {
LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << "\n");
II.replaceAllUsesWith(Src);
return true;
@@ -124,13 +124,14 @@ bool AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
}
case Intrinsic::amdgcn_ballot: {
Value *Src = II.getArgOperand(0);
- // Check if the argument is uniform and has a direct `icmp eq` use of the
- // ballot result.
+ // Check if the argument use is uniform and has a direct `icmp eq` use of
+ // the ballot result. If exists pull the ballot argument to the use place.
// %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cond)
// %is_done = icmp eq i64 %ballot, 0
- // This means we are checking if *all lanes* in the ballot result are
- // inactive.
- if (UI->isUniform(II.getOperandUse(0))) {
+ // transformed IR should look like.
+ // %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cond)
+ // %is_done = icmp eq i64 %cond, 0
+ if (!UI->isDivergentUse(II.getOperandUse(0))) {
LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << "\n");
// Look for a direct `icmp eq` use of the ballot result.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll
index 3061669edc0b3..2fde3e3759f47 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,instcombine,early-cse,simplifycfg -S < %s | FileCheck %s -check-prefix=DCE-CHECK
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,instcombine,early-cse,simplifycfg -S < %s | FileCheck %s -check-prefix=COMB-CHECK
; This should not be optimized
define amdgpu_cs void @temporal_divergence(ptr addrspace(1) %out, i32 %n) {
@@ -10,25 +10,31 @@ define amdgpu_cs void @temporal_divergence(ptr addrspace(1) %out, i32 %n) {
; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: br label %[[H:.*]]
; PASS-CHECK: [[H]]:
-; PASS-CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[H]] ]
-; PASS-CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
+; PASS-CHECK-NEXT: [[UNI_MERGE_H:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[UNI_INC:%.*]], %[[H]] ]
+; PASS-CHECK-NEXT: [[UNI_INC]] = add i32 [[UNI_MERGE_H]], 1
; PASS-CHECK-NEXT: [[DIV_EXITX:%.*]] = icmp eq i32 [[TID]], 0
; PASS-CHECK-NEXT: br i1 [[DIV_EXITX]], label %[[X:.*]], label %[[H]]
; PASS-CHECK: [[X]]:
-; PASS-CHECK-NEXT: [[UNI_JOIN:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[I_NEXT]])
-; PASS-CHECK-NEXT: [[JOIN_USER:%.*]] = add i32 [[I_NEXT]], 5
+; PASS-CHECK-NEXT: [[UNI_JOIN:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[UNI_INC]])
+; PASS-CHECK-NEXT: [[JOIN_USER:%.*]] = add i32 [[UNI_JOIN]], 5
+; PASS-CHECK-NEXT: store i32 [[JOIN_USER]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
;
-; DCE-CHECK-LABEL: define amdgpu_cs void @temporal_divergence(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; DCE-CHECK-NEXT: [[ENTRY:.*:]]
-; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; DCE-CHECK-NEXT: br label %[[H:.*]]
-; DCE-CHECK: [[H]]:
-; DCE-CHECK-NEXT: [[DIV_EXITX:%.*]] = icmp eq i32 [[TID]], 0
-; DCE-CHECK-NEXT: br i1 [[DIV_EXITX]], label %[[X:.*]], label %[[H]]
-; DCE-CHECK: [[X]]:
-; DCE-CHECK-NEXT: ret void
+; COMB-CHECK-LABEL: define amdgpu_cs void @temporal_divergence(
+; COMB-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; COMB-CHECK-NEXT: [[ENTRY:.*]]:
+; COMB-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; COMB-CHECK-NEXT: br label %[[H:.*]]
+; COMB-CHECK: [[H]]:
+; COMB-CHECK-NEXT: [[UNI_MERGE_H:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[UNI_INC:%.*]], %[[H]] ]
+; COMB-CHECK-NEXT: [[UNI_INC]] = add i32 [[UNI_MERGE_H]], 1
+; COMB-CHECK-NEXT: [[DIV_EXITX:%.*]] = icmp eq i32 [[TID]], 0
+; COMB-CHECK-NEXT: br i1 [[DIV_EXITX]], label %[[X:.*]], label %[[H]]
+; COMB-CHECK: [[X]]:
+; COMB-CHECK-NEXT: [[UNI_JOIN:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[UNI_INC]])
+; COMB-CHECK-NEXT: [[JOIN_USER:%.*]] = add i32 [[UNI_JOIN]], 5
+; COMB-CHECK-NEXT: store i32 [[JOIN_USER]], ptr addrspace(1) [[OUT]], align 4
+; COMB-CHECK-NEXT: ret void
;
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -43,6 +49,7 @@ H:
X:
%uni.join = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %uni.inc)
%join.user = add i32 %uni.join, 5
+ store i32 %join.user, ptr addrspace(1) %out
ret void
}
>From b3c4e936667c486f0b7d4f79abce2b69920724f7 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 19 Feb 2025 19:39:01 +0530
Subject: [PATCH 11/17] addressed reviews
---
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 88 +++++++++----------
1 file changed, 44 insertions(+), 44 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index e083ca26b7c6b..1525c9a491f65 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -95,6 +95,16 @@ AMDGPUUniformIntrinsicCombinePass::run(Function &F,
bool AMDGPUUniformIntrinsicCombineImpl::run(Function &F) {
bool IsChanged{false};
+ Module *M = F.getParent();
+
+ // If none of the relevant intrinsics are declared, return early.
+ // if (!M->getFunction(Intrinsic::getName(Intrinsic::amdgcn_permlane64)) &&
+ // !M->getFunction(Intrinsic::getName(Intrinsic::amdgcn_readfirstlane)) &&
+ // !M->getFunction(Intrinsic::getName(Intrinsic::amdgcn_readlane)) &&
+ // !M->getFunction(Intrinsic::getName(Intrinsic::amdgcn_ballot))) {
+ // return false;
+ // }
+
// Iterate over each instruction in the function to get the desired intrinsic
// inst to check for optimization.
for (Instruction &I : instructions(F)) {
@@ -114,53 +124,43 @@ bool AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
Value *Src = II.getArgOperand(0);
- // Check if the argument use is uniform
- if (!UI->isDivergentUse(II.getOperandUse(0))) {
- LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << "\n");
- II.replaceAllUsesWith(Src);
- return true;
- }
- break;
+ // Check if the argument use is divergent
+ if (UI->isDivergentUse(II.getOperandUse(0)))
+ return false;
+ LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << "\n");
+ II.replaceAllUsesWith(Src);
+ return true;
}
case Intrinsic::amdgcn_ballot: {
Value *Src = II.getArgOperand(0);
- // Check if the argument use is uniform and has a direct `icmp eq` use of
- // the ballot result. If exists pull the ballot argument to the use place.
- // %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cond)
- // %is_done = icmp eq i64 %ballot, 0
- // transformed IR should look like.
- // %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cond)
- // %is_done = icmp eq i64 %cond, 0
- if (!UI->isDivergentUse(II.getOperandUse(0))) {
- LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << "\n");
-
- // Look for a direct `icmp eq` use of the ballot result.
- auto It = llvm::find_if(II.users(), [&](User *U) {
- return match(U, m_ICmp(m_Specific(&II), m_Zero()));
- });
-
- // Check if a match was found
- if (It != II.user_end()) {
- // Extract the matching `icmp` instruction
- ICmpInst *ICmp = dyn_cast<ICmpInst>(*It);
- if (!ICmp)
- break; // Safety check
-
- IRBuilder<> Builder(ICmp);
-
- // Convert ballot argument to match `icmp` operand type (i64)
- Value *ConvertedSrc =
- Builder.CreateZExtOrTrunc(Src, ICmp->getOperand(1)->getType());
-
- LLVM_DEBUG(dbgs() << "Replacing ballot result in icmp: " << *ICmp
- << " with " << *ConvertedSrc << "\n");
-
- // Replace `%ballot` in `icmp` with `ConvertedSrc`
- ICmp->setOperand(0, ConvertedSrc);
- return true;
- }
- }
- break;
+ if (UI->isDivergentUse(II.getOperandUse(0)))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << "\n");
+
+ // Look for a direct `icmp eq` use of the ballot result.
+ // FIXME: replace all the uses?
+ auto It = llvm::find_if(II.users(), [&](User *U) {
+ return match(U, m_ICmp(m_Specific(&II), m_Zero()));
+ });
+
+ // Check if a match was found
+ if (It == II.user_end())
+ return false;
+
+ // Extract the matching `icmp` instruction
+ ICmpInst *ICmp = dyn_cast<ICmpInst>(*It);
+ IRBuilder<> Builder(ICmp);
+
+ // Convert ballot argument to match `icmp` operand type (i64)
+ Value *ConvertedSrc = Builder.CreateZExtOrTrunc(Src, II.getType());
+
+ LLVM_DEBUG(dbgs() << "Replacing ballot result in icmp: " << *ICmp
+ << " with " << *ConvertedSrc << "\n");
+
+ // Replace `%ballot` in `icmp` with `ConvertedSrc`
+ ICmp->setOperand(0, ConvertedSrc);
+ return true;
}
}
return false;
>From 94a4787dc4ec1393ee6342e359e1bd71258c0ace Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Thu, 20 Feb 2025 18:46:37 +0530
Subject: [PATCH 12/17] pull the ballot argument to all the matching users
---
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 47 ++++++++-----------
.../amdgpu-simplify-uniform-waterfall.ll | 47 +++++++++++++++++++
2 files changed, 67 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 1525c9a491f65..d4f91bb4c2572 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -96,12 +96,12 @@ AMDGPUUniformIntrinsicCombinePass::run(Function &F,
bool AMDGPUUniformIntrinsicCombineImpl::run(Function &F) {
bool IsChanged{false};
Module *M = F.getParent();
-
+
// If none of the relevant intrinsics are declared, return early.
- // if (!M->getFunction(Intrinsic::getName(Intrinsic::amdgcn_permlane64)) &&
- // !M->getFunction(Intrinsic::getName(Intrinsic::amdgcn_readfirstlane)) &&
- // !M->getFunction(Intrinsic::getName(Intrinsic::amdgcn_readlane)) &&
- // !M->getFunction(Intrinsic::getName(Intrinsic::amdgcn_ballot))) {
+ // if (!Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_permlane64, {}) &&
+ // !Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_readfirstlane, {}) &&
+ // !Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_readlane, {}) &&
+ // !Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_ballot, {})) {
// return false;
// }
@@ -139,28 +139,21 @@ bool AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << "\n");
// Look for a direct `icmp eq` use of the ballot result.
- // FIXME: replace all the uses?
- auto It = llvm::find_if(II.users(), [&](User *U) {
- return match(U, m_ICmp(m_Specific(&II), m_Zero()));
- });
-
- // Check if a match was found
- if (It == II.user_end())
- return false;
-
- // Extract the matching `icmp` instruction
- ICmpInst *ICmp = dyn_cast<ICmpInst>(*It);
- IRBuilder<> Builder(ICmp);
-
- // Convert ballot argument to match `icmp` operand type (i64)
- Value *ConvertedSrc = Builder.CreateZExtOrTrunc(Src, II.getType());
-
- LLVM_DEBUG(dbgs() << "Replacing ballot result in icmp: " << *ICmp
- << " with " << *ConvertedSrc << "\n");
-
- // Replace `%ballot` in `icmp` with `ConvertedSrc`
- ICmp->setOperand(0, ConvertedSrc);
- return true;
+ bool Changed = false;
+ for (User *U : make_early_inc_range(II.users())) {
+ if (match(U, m_ICmp(m_Specific(&II), m_Zero()))) {
+ ICmpInst *ICmp = dyn_cast<ICmpInst>(U);
+ IRBuilder<> Builder(ICmp);
+ Value *ConvertedSrc = Builder.CreateZExtOrTrunc(Src, II.getType());
+
+ LLVM_DEBUG(dbgs() << "Replacing ballot result in icmp: " << *ICmp
+ << " with " << *ConvertedSrc << "\n");
+
+ ICmp->setOperand(0, ConvertedSrc);
+ Changed = true;
+ }
+ }
+ return Changed;
}
}
return false;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
index f5d3aa176449a..69346d14a79fb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
@@ -122,6 +122,53 @@ exit:
ret void
}
+define protected amdgpu_kernel void @trivial_waterfall_multiple_icmp(ptr addrspace(1) %out) {
+; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_multiple_icmp(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[ENTRY:.*]]:
+; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
+; PASS-CHECK: [[WHILE]]:
+; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
+; PASS-CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[NOT_DONE]] to i64
+; PASS-CHECK-NEXT: [[IS_DONE_1:%.*]] = icmp eq i64 [[TMP1]], 0
+; PASS-CHECK-NEXT: [[TMP0:%.*]] = zext i1 [[NOT_DONE]] to i64
+; PASS-CHECK-NEXT: [[IS_DONE_3:%.*]] = icmp eq i64 [[TMP0]], 0
+; PASS-CHECK-NEXT: br i1 [[IS_DONE_1]], label %[[EXIT:.*]], label %[[IF]]
+; PASS-CHECK: [[IF]]:
+; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[NOT_DONE]] to i64
+; PASS-CHECK-NEXT: [[IS_DONE_4:%.*]] = icmp eq i64 [[TMP2]], 0
+; PASS-CHECK-NEXT: br label %[[WHILE]]
+; PASS-CHECK: [[EXIT]]:
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_multiple_icmp(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[ENTRY:.*:]]
+; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+entry:
+ br label %while
+
+while:
+ %done = phi i1 [ 0, %entry ], [ 1, %if ]
+ %not_done = xor i1 %done, true
+ %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done)
+ %is_done_1 = icmp eq i64 %ballot, 0
+ %is_done_2 = icmp eq i64 %ballot, 0
+ br i1 %is_done_1, label %exit, label %if
+
+if:
+ store i32 5, ptr addrspace(1) %out
+ %is_done_3 = icmp eq i64 %ballot, 0
+ br label %while
+
+exit:
+ ret void
+}
declare i64 @llvm.amdgcn.ballot.i64(i1) #1
!6 = !{i64 690}
>From 7d0af28baa2ba03104d761b92d9fc3761c4a69e5 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Fri, 21 Feb 2025 16:27:20 +0530
Subject: [PATCH 13/17] Match and replace icmp ballot,0 with XOR
---
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 37 ++++------
.../amdgpu-simplify-uniform-waterfall.ll | 70 +++++++++----------
2 files changed, 47 insertions(+), 60 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index d4f91bb4c2572..6074eaeac3bf0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -95,19 +95,10 @@ AMDGPUUniformIntrinsicCombinePass::run(Function &F,
bool AMDGPUUniformIntrinsicCombineImpl::run(Function &F) {
bool IsChanged{false};
- Module *M = F.getParent();
-
- // If none of the relevant intrinsics are declared, return early.
- // if (!Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_permlane64, {}) &&
- // !Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_readfirstlane, {}) &&
- // !Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_readlane, {}) &&
- // !Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_ballot, {})) {
- // return false;
- // }
// Iterate over each instruction in the function to get the desired intrinsic
// inst to check for optimization.
- for (Instruction &I : instructions(F)) {
+ for (Instruction &I : make_early_inc_range(instructions(F))) {
if (auto *Intrinsic = dyn_cast<IntrinsicInst>(&I)) {
IsChanged |= optimizeUniformIntrinsicInst(*Intrinsic);
}
@@ -135,22 +126,24 @@ bool AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
Value *Src = II.getArgOperand(0);
if (UI->isDivergentUse(II.getOperandUse(0)))
return false;
-
LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << "\n");
- // Look for a direct `icmp eq` use of the ballot result.
bool Changed = false;
for (User *U : make_early_inc_range(II.users())) {
- if (match(U, m_ICmp(m_Specific(&II), m_Zero()))) {
- ICmpInst *ICmp = dyn_cast<ICmpInst>(U);
- IRBuilder<> Builder(ICmp);
- Value *ConvertedSrc = Builder.CreateZExtOrTrunc(Src, II.getType());
-
- LLVM_DEBUG(dbgs() << "Replacing ballot result in icmp: " << *ICmp
- << " with " << *ConvertedSrc << "\n");
-
- ICmp->setOperand(0, ConvertedSrc);
- Changed = true;
+ if (auto *ICmp = dyn_cast<ICmpInst>(U)) {
+ Value *Op0 = ICmp->getOperand(0);
+ Value *Op1 = ICmp->getOperand(1);
+
+ if (ICmp->getPredicate() == ICmpInst::ICMP_EQ &&
+ ((Op0 == &II && match(Op1, m_Zero())) ||
+ (Op1 == &II && match(Op0, m_Zero())))) {
+
+ IRBuilder<> Builder(ICmp);
+ Value *Xor = Builder.CreateXor(Src, Builder.getTrue());
+ LLVM_DEBUG(dbgs() << "Replacing with XOR: " << *Xor << "\n");
+ ICmp->replaceAllUsesWith(Xor);
+ Changed = true;
+ }
}
}
return Changed;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
index 69346d14a79fb..6898b35920ef0 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,instcombine,early-cse,simplifycfg -S < %s | FileCheck %s -check-prefix=DCE-CHECK
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,early-cse,instcombine,simplifycfg -S < %s | FileCheck %s -check-prefix=DCE-CHECK
define protected amdgpu_kernel void @trivial_waterfall(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall(
@@ -8,12 +8,12 @@ define protected amdgpu_kernel void @trivial_waterfall(ptr addrspace(1) %out) {
; PASS-CHECK-NEXT: [[ENTRY:.*]]:
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
; PASS-CHECK: [[WHILE]]:
-; PASS-CHECK-NEXT: [[DONE1:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
-; PASS-CHECK-NEXT: [[DONE:%.*]] = xor i1 [[DONE1]], true
-; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]])
-; PASS-CHECK-NEXT: [[TMP0:%.*]] = zext i1 [[DONE]] to i64
-; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[TMP0]], 0
-; PASS-CHECK-NEXT: br i1 [[IS_DONE]], label %[[EXIT:.*]], label %[[IF]]
+; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
+; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
+; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: br label %[[WHILE]]
@@ -49,18 +49,18 @@ define protected amdgpu_kernel void @waterfall(ptr addrspace(1) %out) {
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[ENTRY:.*]]:
; PASS-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; PASS-CHECK-NEXT: [[TMP1:%.*]] = tail call noundef i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP0]])
+; PASS-CHECK-NEXT: [[TID:%.*]] = tail call noundef i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP0]])
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
; PASS-CHECK: [[WHILE]]:
-; PASS-CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ]
-; PASS-CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[TMP3]], true
-; PASS-CHECK-NEXT: [[TMP8:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP4]])
-; PASS-CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 0
-; PASS-CHECK-NEXT: br i1 [[TMP9]], label %[[EXIT:.*]], label %[[IF:.*]]
+; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ]
+; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
+; PASS-CHECK-NEXT: br i1 [[IS_DONE]], label %[[EXIT:.*]], label %[[IF:.*]]
; PASS-CHECK: [[IF]]:
-; PASS-CHECK-NEXT: [[TMP12:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP1]])
-; PASS-CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP1]], [[TMP12]]
-; PASS-CHECK-NEXT: br i1 [[TMP13]], label %[[WORK:.*]], label %[[TAIL]]
+; PASS-CHECK-NEXT: [[FIRST_ACTIVE_ID:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
+; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[TID]], [[FIRST_ACTIVE_ID]]
+; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]]
; PASS-CHECK: [[WORK]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: br label %[[TAIL]]
@@ -76,15 +76,15 @@ define protected amdgpu_kernel void @waterfall(ptr addrspace(1) %out) {
; DCE-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
; DCE-CHECK-NEXT: br label %[[WHILE:.*]]
; DCE-CHECK: [[WHILE]]:
-; DCE-CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[TMP12:%.*]], %[[TAIL:.*]] ]
-; DCE-CHECK-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true
-; DCE-CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP3]])
-; DCE-CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
-; DCE-CHECK-NEXT: br i1 [[TMP8]], label %[[EXIT:.*]], label %[[IF:.*]]
+; DCE-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[IS_FIRST_ACTIVE_ID:%.*]], %[[TAIL:.*]] ]
+; DCE-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; DCE-CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[NOT_DONE]])
+; DCE-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i32 [[TMP1]], 0
+; DCE-CHECK-NEXT: br i1 [[IS_DONE]], label %[[EXIT:.*]], label %[[IF:.*]]
; DCE-CHECK: [[IF]]:
-; DCE-CHECK-NEXT: [[TMP11:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP0]])
-; DCE-CHECK-NEXT: [[TMP12]] = icmp eq i32 [[TMP0]], [[TMP11]]
-; DCE-CHECK-NEXT: br i1 [[TMP12]], label %[[WORK:.*]], label %[[TAIL]]
+; DCE-CHECK-NEXT: [[FIRST_ACTIVE_ID:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP0]])
+; DCE-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID]] = icmp eq i32 [[TMP0]], [[FIRST_ACTIVE_ID]]
+; DCE-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]]
; DCE-CHECK: [[WORK]]:
; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: br label %[[TAIL]]
@@ -122,8 +122,8 @@ exit:
ret void
}
-define protected amdgpu_kernel void @trivial_waterfall_multiple_icmp(ptr addrspace(1) %out) {
-; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_multiple_icmp(
+define protected amdgpu_kernel void @trivial_waterfall_swap_op(ptr addrspace(1) %out) {
+; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_swap_op(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[ENTRY:.*]]:
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
@@ -131,20 +131,16 @@ define protected amdgpu_kernel void @trivial_waterfall_multiple_icmp(ptr addrspa
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
-; PASS-CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[NOT_DONE]] to i64
-; PASS-CHECK-NEXT: [[IS_DONE_1:%.*]] = icmp eq i64 [[TMP1]], 0
-; PASS-CHECK-NEXT: [[TMP0:%.*]] = zext i1 [[NOT_DONE]] to i64
-; PASS-CHECK-NEXT: [[IS_DONE_3:%.*]] = icmp eq i64 [[TMP0]], 0
-; PASS-CHECK-NEXT: br i1 [[IS_DONE_1]], label %[[EXIT:.*]], label %[[IF]]
+; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 0, [[BALLOT]]
+; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[NOT_DONE]] to i64
-; PASS-CHECK-NEXT: [[IS_DONE_4:%.*]] = icmp eq i64 [[TMP2]], 0
; PASS-CHECK-NEXT: br label %[[WHILE]]
; PASS-CHECK: [[EXIT]]:
; PASS-CHECK-NEXT: ret void
;
-; DCE-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_multiple_icmp(
+; DCE-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_swap_op(
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; DCE-CHECK-NEXT: [[ENTRY:.*:]]
; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -157,13 +153,11 @@ while:
%done = phi i1 [ 0, %entry ], [ 1, %if ]
%not_done = xor i1 %done, true
%ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done)
- %is_done_1 = icmp eq i64 %ballot, 0
- %is_done_2 = icmp eq i64 %ballot, 0
- br i1 %is_done_1, label %exit, label %if
+ %is_done = icmp eq i64 0, %ballot
+ br i1 %is_done, label %exit, label %if
if:
store i32 5, ptr addrspace(1) %out
- %is_done_3 = icmp eq i64 %ballot, 0
br label %while
exit:
>From 809ffb5146318635501b72f8f89564ba4898bc4a Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Mon, 24 Feb 2025 15:43:13 +0530
Subject: [PATCH 14/17] Rebase: resolve merge
---
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 -
...amdgpu-miscellaneous-uniform-intrinsics.ll | 131 ------------------
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 23 ---
3 files changed, 156 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 97c3d001883f9..6542a2d85ea4b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1222,8 +1222,6 @@ void AMDGPUPassConfig::addIRPasses() {
if (isPassEnabled(EnableImageIntrinsicOptimizer))
addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
- if (EnableUniformIntrinsicCombine)
- addPass(createAMDGPUUniformIntrinsicCombineLegacyPass());
// This can be disabled by passing ::Disable here or on the command line
// with --expand-variadics-override=disable.
addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
deleted file mode 100644
index f450b0e6763c4..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
+++ /dev/null
@@ -1,131 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck %s
-
-define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
-; CHECK-LABEL: readfirstlane_with_readfirstlane:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; CHECK-NEXT: s_endpgm
- %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
- %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
- store i32 %v2, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
-; CHECK-LABEL: readfirstlane_with_readlane:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: v_bfe_u32 v1, v0, 10, 10
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT: v_readfirstlane_b32 s2, v1
-; CHECK-NEXT: v_readlane_b32 s2, v0, s2
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; CHECK-NEXT: s_endpgm
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %tidy = call i32 @llvm.amdgcn.workitem.id.y()
- %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
- %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
- store i32 %v2, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
-; CHECK-LABEL: readlane_with_firstlane:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT: v_readfirstlane_b32 s2, v0
-; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; CHECK-NEXT: s_endpgm
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
- %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3)
- store i32 %v2, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
-; CHECK-LABEL: readlane_readlane:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: v_bfe_u32 v1, v0, 10, 10
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT: v_readfirstlane_b32 s2, v1
-; CHECK-NEXT: v_readlane_b32 s2, v0, s2
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; CHECK-NEXT: s_endpgm
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %tidy = call i32 @llvm.amdgcn.workitem.id.y()
- %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
- %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2)
- store i32 %v2, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
-; CHECK-LABEL: permlane64_uniform:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s2, s[4:5], 0x8
-; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; CHECK-NEXT: s_endpgm
- %v = call i32 @llvm.amdgcn.permlane64(i32 %src)
- store i32 %v, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
-; CHECK-LABEL: permlane64_nonuniform:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT: v_permlane64_b32 v1, v0
-; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; CHECK-NEXT: s_endpgm
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
- %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- store i32 %v, i32 addrspace(1)* %out_ptr
- ret void
-}
-
-define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
-; CHECK-LABEL: permlane64_nonuniform_expression:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v0
-; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CHECK-NEXT: v_permlane64_b32 v1, v1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; CHECK-NEXT: s_endpgm
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid2 = add i32 %tid, 1
- %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2)
- %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- store i32 %v, i32 addrspace(1)* %out_ptr
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index bbd164993764d..d7f54f3b8e9e2 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -31,11 +31,6 @@
; GCN-O0-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O0-NEXT: AMDGPU Printf lowering
; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU
-; GCN-O0-NEXT: FunctionPass Manager
-; GCN-O0-NEXT: Dominator Tree Construction
-; GCN-O0-NEXT: Cycle Info Analysis
-; GCN-O0-NEXT: Uniformity Analysis
-; GCN-O0-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O0-NEXT: Expand variadic functions
; GCN-O0-NEXT: AMDGPU Inline All Functions
; GCN-O0-NEXT: Inliner for always_inline functions
@@ -186,11 +181,6 @@
; GCN-O1-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O1-NEXT: AMDGPU Printf lowering
; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU
-; GCN-O1-NEXT: FunctionPass Manager
-; GCN-O1-NEXT: Dominator Tree Construction
-; GCN-O1-NEXT: Cycle Info Analysis
-; GCN-O1-NEXT: Uniformity Analysis
-; GCN-O1-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O1-NEXT: Expand variadic functions
; GCN-O1-NEXT: AMDGPU Inline All Functions
; GCN-O1-NEXT: Inliner for always_inline functions
@@ -476,11 +466,6 @@
; GCN-O1-OPTS-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O1-OPTS-NEXT: AMDGPU Printf lowering
; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU
-; GCN-O1-OPTS-NEXT: FunctionPass Manager
-; GCN-O1-OPTS-NEXT: Dominator Tree Construction
-; GCN-O1-OPTS-NEXT: Cycle Info Analysis
-; GCN-O1-OPTS-NEXT: Uniformity Analysis
-; GCN-O1-OPTS-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O1-OPTS-NEXT: Expand variadic functions
; GCN-O1-OPTS-NEXT: AMDGPU Inline All Functions
; GCN-O1-OPTS-NEXT: Inliner for always_inline functions
@@ -796,10 +781,6 @@
; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: AMDGPU Image Intrinsic Optimizer
-; GCN-O2-NEXT: Dominator Tree Construction
-; GCN-O2-NEXT: Cycle Info Analysis
-; GCN-O2-NEXT: Uniformity Analysis
-; GCN-O2-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O2-NEXT: Expand variadic functions
; GCN-O2-NEXT: AMDGPU Inline All Functions
; GCN-O2-NEXT: Inliner for always_inline functions
@@ -1119,10 +1100,6 @@
; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: AMDGPU Image Intrinsic Optimizer
-; GCN-O3-NEXT: Dominator Tree Construction
-; GCN-O3-NEXT: Cycle Info Analysis
-; GCN-O3-NEXT: Uniformity Analysis
-; GCN-O3-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O3-NEXT: Expand variadic functions
; GCN-O3-NEXT: AMDGPU Inline All Functions
; GCN-O3-NEXT: Inliner for always_inline functions
>From 11f6c0b286ebeae09a7446009aab031de99ffd06 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Mon, 24 Feb 2025 16:21:16 +0530
Subject: [PATCH 15/17] remove undef test
---
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +-
.../amdgpu-uniform-intrinsic-combine.ll | 68 -------------------
2 files changed, 1 insertion(+), 69 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 6542a2d85ea4b..39dcec68b6836 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -476,7 +476,7 @@ static cl::opt<bool> HasClosedWorldAssumption(
"amdgpu-link-time-closed-world",
cl::desc("Whether has closed-world assumption at link time"),
cl::init(false), cl::Hidden);
-
+
static cl::opt<bool> EnableUniformIntrinsicCombine(
"amdgpu-enable-uniform-intrinsic-combine",
cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index ee54aff64f25d..e182319a1faad 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -19,23 +19,6 @@ define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
-; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_undef(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 undef)
-; PASS-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_undef(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
-;
- %v = call i32 @llvm.amdgcn.permlane64(i32 undef)
- store i32 %v, ptr addrspace(1) %out
- ret void
-}
-
define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
@@ -121,23 +104,6 @@ define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_undef(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 undef, i32 undef)
-; PASS-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_undef(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
-;
- %v = call i32 @llvm.amdgcn.readlane(i32 undef, i32 undef)
- store i32 %v, ptr addrspace(1) %out
- ret void
-}
-
define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
@@ -232,23 +198,6 @@ define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_undef(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 undef)
-; PASS-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_undef(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
-;
- %v = call i32 @llvm.amdgcn.readfirstlane(i32 undef)
- store i32 %v, ptr addrspace(1) %out
- ret void
-}
-
define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
@@ -478,23 +427,6 @@ define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @permlane64_invalid(ptr addrspace(1) %out) {
-; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_invalid(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[UNDEF_V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 undef)
-; PASS-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_invalid(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
-;
- %undef_v = call i32 @llvm.amdgcn.permlane64(i32 undef)
- store i32 %undef_v, ptr addrspace(1) %out
- ret void
-}
-
define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_expression(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
>From 13023a4d82dfcd9a5a1a23cd2108e095922ab2aa Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 25 Feb 2025 12:44:06 +0530
Subject: [PATCH 16/17] added pass to llc pipeline
---
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +
...amdgpu-miscellaneous-uniform-intrinsics.ll | 131 ++++++++++++++++++
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 23 +++
3 files changed, 156 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 39dcec68b6836..046a72c74383a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1222,6 +1222,8 @@ void AMDGPUPassConfig::addIRPasses() {
if (isPassEnabled(EnableImageIntrinsicOptimizer))
addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
+ if (EnableUniformIntrinsicCombine)
+ addPass(createAMDGPUUniformIntrinsicCombineLegacyPass());
// This can be disabled by passing ::Disable here or on the command line
// with --expand-variadics-override=disable.
addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
new file mode 100644
index 0000000000000..f450b0e6763c4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck %s
+
+define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readfirstlane_with_readfirstlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
+ %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readfirstlane_with_readlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_bfe_u32 v1, v0, 10, 10
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s2, v1
+; CHECK-NEXT: v_readlane_b32 s2, v0, s2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readlane_with_firstlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s2, v0
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
+ %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readlane_readlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_bfe_u32 v1, v0, 10, 10
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s2, v1
+; CHECK-NEXT: v_readlane_b32 s2, v0, s2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
+; CHECK-LABEL: permlane64_uniform:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s2, s[4:5], 0x8
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %src)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
+; CHECK-LABEL: permlane64_nonuniform:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_permlane64_b32 v1, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
+; CHECK-LABEL: permlane64_nonuniform_expression:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: v_permlane64_b32 v1, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid2 = add i32 %tid, 1
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index d7f54f3b8e9e2..bbd164993764d 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -31,6 +31,11 @@
; GCN-O0-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O0-NEXT: AMDGPU Printf lowering
; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU
+; GCN-O0-NEXT: FunctionPass Manager
+; GCN-O0-NEXT: Dominator Tree Construction
+; GCN-O0-NEXT: Cycle Info Analysis
+; GCN-O0-NEXT: Uniformity Analysis
+; GCN-O0-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O0-NEXT: Expand variadic functions
; GCN-O0-NEXT: AMDGPU Inline All Functions
; GCN-O0-NEXT: Inliner for always_inline functions
@@ -181,6 +186,11 @@
; GCN-O1-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O1-NEXT: AMDGPU Printf lowering
; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU
+; GCN-O1-NEXT: FunctionPass Manager
+; GCN-O1-NEXT: Dominator Tree Construction
+; GCN-O1-NEXT: Cycle Info Analysis
+; GCN-O1-NEXT: Uniformity Analysis
+; GCN-O1-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O1-NEXT: Expand variadic functions
; GCN-O1-NEXT: AMDGPU Inline All Functions
; GCN-O1-NEXT: Inliner for always_inline functions
@@ -466,6 +476,11 @@
; GCN-O1-OPTS-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O1-OPTS-NEXT: AMDGPU Printf lowering
; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU
+; GCN-O1-OPTS-NEXT: FunctionPass Manager
+; GCN-O1-OPTS-NEXT: Dominator Tree Construction
+; GCN-O1-OPTS-NEXT: Cycle Info Analysis
+; GCN-O1-OPTS-NEXT: Uniformity Analysis
+; GCN-O1-OPTS-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O1-OPTS-NEXT: Expand variadic functions
; GCN-O1-OPTS-NEXT: AMDGPU Inline All Functions
; GCN-O1-OPTS-NEXT: Inliner for always_inline functions
@@ -781,6 +796,10 @@
; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: AMDGPU Image Intrinsic Optimizer
+; GCN-O2-NEXT: Dominator Tree Construction
+; GCN-O2-NEXT: Cycle Info Analysis
+; GCN-O2-NEXT: Uniformity Analysis
+; GCN-O2-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O2-NEXT: Expand variadic functions
; GCN-O2-NEXT: AMDGPU Inline All Functions
; GCN-O2-NEXT: Inliner for always_inline functions
@@ -1100,6 +1119,10 @@
; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: AMDGPU Image Intrinsic Optimizer
+; GCN-O3-NEXT: Dominator Tree Construction
+; GCN-O3-NEXT: Cycle Info Analysis
+; GCN-O3-NEXT: Uniformity Analysis
+; GCN-O3-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O3-NEXT: Expand variadic functions
; GCN-O3-NEXT: AMDGPU Inline All Functions
; GCN-O3-NEXT: Inliner for always_inline functions
>From b4e8b93dc9ec5ee25c4d216924561d1d3f65b396 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 25 Feb 2025 15:27:12 +0530
Subject: [PATCH 17/17] Update test checks
---
.../GlobalISel/llvm.amdgcn.ballot.i32.ll | 42 +-
.../GlobalISel/llvm.amdgcn.ballot.i64.ll | 46 +-
llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll | 540 +++++++++++++++++-
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 39 +-
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 39 +-
.../CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll | 57 +-
.../AMDGPU/llvm.amdgcn.permlane64.ptr.ll | 80 +--
.../AMDGPU/llvm.amdgcn.readfirstlane.ll | 31 +-
.../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 35 +-
.../spill-vgpr-to-agpr-update-regscavenger.ll | 23 +-
.../AMDGPU/splitkit-getsubrangeformask.ll | 221 ++++---
11 files changed, 819 insertions(+), 334 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 927a31d3992b0..612842758f38a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -162,16 +162,17 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB10_3
-; CHECK-NEXT: .LBB10_2: ; %true
+; CHECK-NEXT: s_cbranch_scc1 .LBB10_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_3:
%c = trunc i32 %v to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -259,17 +260,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
-; CHECK-NEXT: s_cselect_b32 s0, 1, 0
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB14_3
-; CHECK-NEXT: .LBB14_2: ; %true
+; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_3:
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -374,16 +371,15 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, 1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB18_3
-; CHECK-NEXT: .LBB18_2: ; %true
+; CHECK-NEXT: s_cbranch_scc1 .LBB18_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index 0bbb40b8db43a..358306a43e08c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -165,16 +165,17 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB10_3
-; CHECK-NEXT: .LBB10_2: ; %true
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_and_b32 s0, s0, 1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB10_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_3:
%c = trunc i32 %v to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -262,17 +263,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
-; CHECK-NEXT: s_cselect_b32 s0, 1, 0
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB14_3
-; CHECK-NEXT: .LBB14_2: ; %true
+; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_3:
%c = icmp ult i32 %v, 12
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -377,16 +374,15 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, 1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB18_3
-; CHECK-NEXT: .LBB18_2: ; %true
+; CHECK-NEXT: s_and_b32 s0, s0, 1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB18_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 34ee90c68569f..bab319baf1e2c 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -6,6 +6,34 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL %s
define double @v_sqrt_f64(double %x) {
+; SDAG-LABEL: v_sqrt_f64:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38,6 +66,34 @@ define double @v_sqrt_f64(double %x) {
}
define double @v_sqrt_f64_fneg(double %x) {
+; SDAG-LABEL: v_sqrt_f64_fneg:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 9
+; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64_fneg:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -71,6 +127,34 @@ define double @v_sqrt_f64_fneg(double %x) {
}
define double @v_sqrt_f64_fabs(double %x) {
+; SDAG-LABEL: v_sqrt_f64_fabs:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64_fabs:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -104,6 +188,34 @@ define double @v_sqrt_f64_fabs(double %x) {
}
define double @v_sqrt_f64_fneg_fabs(double %x) {
+; SDAG-LABEL: v_sqrt_f64_fneg_fabs:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 9
+; SDAG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64_fneg_fabs:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -138,6 +250,34 @@ define double @v_sqrt_f64_fneg_fabs(double %x) {
}
define double @v_sqrt_f64_ninf(double %x) {
+; SDAG-LABEL: v_sqrt_f64_ninf:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64_ninf:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -170,6 +310,34 @@ define double @v_sqrt_f64_ninf(double %x) {
}
define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true" {
+; SDAG-LABEL: v_sqrt_f64_no_infs_attribute:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64_no_infs_attribute:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -202,6 +370,34 @@ define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true"
}
define double @v_sqrt_f64_nnan(double %x) {
+; SDAG-LABEL: v_sqrt_f64_nnan:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64_nnan:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -257,8 +453,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) {
; SDAG-NEXT: v_mov_b32_e32 v4, 0x260
; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
-; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SDAG-NEXT: v_readfirstlane_b32 s0, v0
; SDAG-NEXT: v_readfirstlane_b32 s1, v1
; SDAG-NEXT: ; return to shader part epilog
@@ -326,8 +522,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) {
; SDAG-NEXT: v_mov_b32_e32 v4, 0x260
; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
-; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SDAG-NEXT: v_readfirstlane_b32 s0, v0
; SDAG-NEXT: v_readfirstlane_b32 s1, v1
; SDAG-NEXT: ; return to shader part epilog
@@ -395,8 +591,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) {
; SDAG-NEXT: v_mov_b32_e32 v4, 0x260
; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
-; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SDAG-NEXT: v_readfirstlane_b32 s0, v0
; SDAG-NEXT: v_readfirstlane_b32 s1, v1
; SDAG-NEXT: ; return to shader part epilog
@@ -464,8 +660,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
; SDAG-NEXT: v_mov_b32_e32 v4, 0x260
; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
-; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SDAG-NEXT: v_readfirstlane_b32 s0, v0
; SDAG-NEXT: v_readfirstlane_b32 s1, v1
; SDAG-NEXT: ; return to shader part epilog
@@ -510,6 +706,34 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
}
define double @v_sqrt_f64_nsz(double %x) {
+; SDAG-LABEL: v_sqrt_f64_nsz:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64_nsz:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -542,6 +766,34 @@ define double @v_sqrt_f64_nsz(double %x) {
}
define double @v_sqrt_f64_nnan_ninf(double %x) {
+; SDAG-LABEL: v_sqrt_f64_nnan_ninf:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64_nnan_ninf:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -574,6 +826,34 @@ define double @v_sqrt_f64_nnan_ninf(double %x) {
}
define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
+; SDAG-LABEL: v_sqrt_f64_nnan_ninf_nsz:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64_nnan_ninf_nsz:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -606,6 +886,34 @@ define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
}
define double @v_sqrt_f64_afn(double %x) {
+; SDAG-LABEL: v_sqrt_f64_afn:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64_afn:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -638,6 +946,34 @@ define double @v_sqrt_f64_afn(double %x) {
}
define double @v_sqrt_f64_afn_nsz(double %x) {
+; SDAG-LABEL: v_sqrt_f64_afn_nsz:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64_afn_nsz:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -770,6 +1106,34 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
}
define double @v_sqrt_f64_afn_nnan(double %x) {
+; SDAG-LABEL: v_sqrt_f64_afn_nnan:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64_afn_nnan:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -802,6 +1166,34 @@ define double @v_sqrt_f64_afn_nnan(double %x) {
}
define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
+; SDAG-LABEL: v_sqrt_f64_fabs_afn_ninf:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64_fabs_afn_ninf:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -835,6 +1227,34 @@ define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
}
define double @v_sqrt_f64_afn_nnan_ninf(double %x) {
+; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -967,6 +1387,34 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
}
define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
+; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -999,6 +1447,34 @@ define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
}
define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
+; SDAG-LABEL: v_sqrt_f64__approx_func_fp_math:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64__approx_func_fp_math:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1031,6 +1507,34 @@ define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
}
define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
+; SDAG-LABEL: v_sqrt_f64__enough_unsafe_attrs:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64__enough_unsafe_attrs:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1063,6 +1567,34 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
}
define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
+; SDAG-LABEL: v_sqrt_f64__unsafe_attr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-LABEL: v_sqrt_f64__unsafe_attr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index d26f0df49b0a8..db71ebdd8b75b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -156,15 +156,16 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0
-; CHECK-NEXT: s_cbranch_vccz .LBB10_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB10_3
-; CHECK-NEXT: .LBB10_2: ; %true
+; CHECK-NEXT: s_bitcmp1_b32 s0, 0
+; CHECK-NEXT: s_cselect_b32 s0, -1, 0
+; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0
+; CHECK-NEXT: s_cbranch_vccnz .LBB10_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_3:
%c = trunc i32 %v to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -245,14 +246,14 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12
-; CHECK-NEXT: s_cbranch_vccz .LBB14_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB14_3
-; CHECK-NEXT: .LBB14_2: ; %true
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_3:
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -353,14 +354,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
-; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB18_3
-; CHECK-NEXT: .LBB18_2: ; %true
+; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0
+; CHECK-NEXT: s_cbranch_vccnz .LBB18_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
index c7597e98a6d58..f794049723af1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -159,15 +159,16 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0
-; CHECK-NEXT: s_cbranch_vccz .LBB10_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB10_3
-; CHECK-NEXT: .LBB10_2: ; %true
+; CHECK-NEXT: s_bitcmp1_b32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT: s_cbranch_vccnz .LBB10_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_3:
%c = trunc i32 %v to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -248,14 +249,14 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 vcc, s0, 12
-; CHECK-NEXT: s_cbranch_vccz .LBB14_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB14_3
-; CHECK-NEXT: .LBB14_2: ; %true
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_3:
%c = icmp ult i32 %v, 12
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -356,14 +357,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
-; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB18_3
-; CHECK-NEXT: .LBB18_2: ; %true
+; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT: s_cbranch_vccnz .LBB18_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
index f23f9595446eb..aae95e63293e3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
@@ -6,32 +6,46 @@ declare i32 @llvm.amdgcn.permlane64(i32)
declare i32 @llvm.amdgcn.workitem.id.x()
define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) {
-; GFX11-LABEL: test_s:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_permlane64_b32 v0, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: test_s:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_s:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane64(i32 %src0)
store i32 %v, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @test_i(ptr addrspace(1) %out) {
-; GFX11-LABEL: test_i:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_permlane64_b32 v0, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: test_i:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x63
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_i:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane64(i32 99)
store i32 %v, ptr addrspace(1) %out
ret void
@@ -52,6 +66,3 @@ define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 {
store i32 %v, ptr addrspace(1) %out
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX11-GISEL: {{.*}}
-; GFX11-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
index f7c37caf41eab..8f3fb1eb69d86 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
@@ -6,12 +6,9 @@ define amdgpu_kernel void @test_p0(ptr addrspace(1) %out, ptr %src0) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2
-; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
%v = call ptr @llvm.amdgcn.permlane64.p0(ptr %src0)
store ptr %v, ptr addrspace(1) %out
@@ -22,21 +19,14 @@ define amdgpu_kernel void @test_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0) {
; GFX11-SDAG-LABEL: test_v3p0:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x2
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
; GFX11-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x54
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s7
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v8, s6
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v7, s0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v1
-; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v4
-; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5
-; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v8
-; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v0
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v7
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, s7
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: global_store_b64 v6, v[4:5], s[4:5] offset:16
; GFX11-SDAG-NEXT: global_store_b128 v6, v[0:3], s[4:5]
@@ -53,10 +43,8 @@ define amdgpu_kernel void @test_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0
; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
-; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
%v = call ptr addrspace(3) @llvm.amdgcn.permlane64.v3p0(ptr addrspace(3) %src0)
store ptr addrspace(3) %v, ptr addrspace(1) %out
@@ -69,16 +57,10 @@ define amdgpu_kernel void @test_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0
-; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3
-; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[4:5]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-SDAG-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX11-SDAG-NEXT: s_endpgm
%v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlane64.v3p3(<3 x ptr addrspace(3)> %src0)
store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out
@@ -92,10 +74,8 @@ define amdgpu_kernel void @test_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0
; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
-; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
%v = call ptr addrspace(5) @llvm.amdgcn.permlane64.p5(ptr addrspace(5) %src0)
store ptr addrspace(5) %v, ptr addrspace(1) %out
@@ -108,16 +88,10 @@ define amdgpu_kernel void @test_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0
-; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3
-; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[4:5]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-SDAG-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX11-SDAG-NEXT: s_endpgm
%v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlane64.v3p5(<3 x ptr addrspace(5)> %src0)
store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out
@@ -131,10 +105,8 @@ define amdgpu_kernel void @test_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0
; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
-; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
%v = call ptr addrspace(6) @llvm.amdgcn.permlane64.p6(ptr addrspace(6) %src0)
store ptr addrspace(6) %v, ptr addrspace(1) %out
@@ -147,16 +119,10 @@ define amdgpu_kernel void @test_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0
-; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3
-; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[4:5]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-SDAG-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX11-SDAG-NEXT: s_endpgm
%v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlane64.v3p6(<3 x ptr addrspace(6)> %src0)
store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 0605a158b974f..bf4b37bd59ebd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -295,9 +295,8 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 32
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -323,10 +322,8 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
-; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -408,12 +405,12 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
@@ -442,12 +439,12 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index edb6ebcee1325..0e0f80045575c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -9,7 +9,7 @@ declare double @llvm.amdgcn.readlane.f64(double, i32) #0
define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 {
; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i32:
; CHECK-SDAG: ; %bb.0:
-; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_load_dword s0, s[8:9], 0x0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; use s0
@@ -18,7 +18,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1
;
; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i32:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_load_dword s0, s[8:9], 0x0
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: ; use s0
@@ -215,9 +215,8 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 32
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -243,10 +242,8 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
; CHECK-GISEL-LABEL: test_readlane_imm_sreg_f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
-; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -596,12 +593,12 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64:
@@ -630,12 +627,12 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
index 4384d1e32cf53..e27b9219db8b5 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
@@ -20,38 +20,33 @@ define void @test() {
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: .LBB0_3: ; %bb.3
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: ; implicit-def: $sgpr4
-; CHECK-NEXT: v_mov_b32_e32 v0, s4
-; CHECK-NEXT: v_readfirstlane_b32 s6, v0
; CHECK-NEXT: s_mov_b64 s[4:5], -1
-; CHECK-NEXT: s_mov_b32 s7, 0
-; CHECK-NEXT: s_cmp_eq_u32 s6, s7
; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
; CHECK-NEXT: v_writelane_b32 v1, s4, 0
; CHECK-NEXT: v_writelane_b32 v1, s5, 1
-; CHECK-NEXT: s_mov_b64 s[10:11], exec
-; CHECK-NEXT: s_mov_b64 exec, -1
+; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
+; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: s_cbranch_scc1 .LBB0_5
; CHECK-NEXT: ; %bb.4: ; %bb.4
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: v_writelane_b32 v1, s4, 0
; CHECK-NEXT: v_writelane_b32 v1, s5, 1
-; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: .LBB0_5: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: v_readlane_b32 s4, v1, 0
; CHECK-NEXT: v_readlane_b32 s5, v1, 1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index c611c4b502817..7c8fda381a587 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -21,7 +21,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr5
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr18
+ ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub0:sgpr_64 = COPY $sgpr18
; CHECK-NEXT: undef [[COPY7:%[0-9]+]].sub0:sgpr_64 = COPY $sgpr19
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr20
; CHECK-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr21
@@ -31,8 +31,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr10
; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr8
; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4)
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %125:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: KILL undef %125:sgpr_128
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %117:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: KILL undef %117:sgpr_128
; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 4, implicit-def dead $scc
; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc
; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc
@@ -42,142 +42,135 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ASHR_I32_2:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_2]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 29, implicit-def dead $scc
; CHECK-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]].sub0, [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.71, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) undef`, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 64, 0 :: (invariant load (s128) from %ir.88, addrspace 4)
; CHECK-NEXT: KILL undef %74:sreg_64
; CHECK-NEXT: KILL [[S_ADD_U32_]].sub0, [[S_ADD_U32_]].sub1
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %112:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %87:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: KILL undef %89:sgpr_128
- ; CHECK-NEXT: KILL undef %118:sgpr_128
+ ; CHECK-NEXT: KILL undef %112:sgpr_128
+ ; CHECK-NEXT: KILL undef %87:sgpr_128
; CHECK-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]].sub0, [[S_LSHL_B32_]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]].sub0, [[S_LSHL_B32_1]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_2:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.87, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.93, addrspace 4)
- ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1
+ ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.77, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.83, addrspace 4)
; CHECK-NEXT: KILL [[S_ADD_U32_2]].sub0, [[S_ADD_U32_2]].sub1
- ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %169:sreg_32, implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1
+ ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %301:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %272:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %356:sgpr_128, undef %357:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %367:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %351:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %362:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %327:sgpr_128, undef %328:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %338:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY6]], 64, 0 :: (invariant load (s128) from %ir.93, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 0, 0 :: (invariant load (s128) from %ir.32, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 0, 0 :: (invariant load (s128) from %ir.103, addrspace 4)
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %322:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %333:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4)
- ; CHECK-NEXT: undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_13:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_1]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 0, 0 :: (invariant load (s128) from %ir.98, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %383:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %354:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 224, 0 :: (invariant load (s128) from %ir.114, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.130, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.134, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.162, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 224, 0 :: (invariant load (s128) from %ir.119, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 576, 0 :: (invariant load (s128) from %ir.135, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_10:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM3]], -297, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -313, implicit-def dead $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.125, addrspace 4)
; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_13:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_15]], 168, 0 :: (invariant load (s32) from %ir.257, align 8, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.142, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 576, 0 :: (invariant load (s128) from %ir.147, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1
; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.155, addrspace 4)
; CHECK-NEXT: [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1
; CHECK-NEXT: [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]]
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.163, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 0, 0 :: (invariant load (s128) from %ir.168, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_16]], 168, 0 :: (invariant load (s64) from %ir.266, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.205, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.211, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.190, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.196, addrspace 4)
; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.216, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.221, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.201, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.206, addrspace 4)
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc
; CHECK-NEXT: [[COPY16:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
; CHECK-NEXT: [[COPY16:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_]]
@@ -187,28 +180,28 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_17]], 168, 0 :: (invariant load (s64) from %ir.277, addrspace 4)
; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc
; CHECK-NEXT: [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0
; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]]
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %469:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) undef`, addrspace 4)
- ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1
- ; CHECK-NEXT: KILL undef %469:sreg_64
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 160, 0 :: (invariant load (s128) from %ir.241, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %436:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) undef`, addrspace 4)
+ ; CHECK-NEXT: KILL [[S_ADD_U32_13]].sub0, [[S_ADD_U32_13]].sub1
; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3
+ ; CHECK-NEXT: KILL undef %436:sreg_64
; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4)
- ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 160, 0 :: (invariant load (s128) from %ir.249, addrspace 4)
+ ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.288, align 8, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]]
@@ -224,22 +217,22 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4)
- ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.329, addrspace 4)
- ; CHECK-NEXT: undef [[S_ADD_U32_24:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_24:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.335, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_19]], 96, 0 :: (invariant load (s128) from %ir.306, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_20]], 96, 0 :: (invariant load (s128) from %ir.312, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_21]], 96, 0 :: (invariant load (s128) from %ir.318, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]]
; CHECK-NEXT: KILL [[V_MOV_B32_e32_]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]]
- ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]]
; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
@@ -263,11 +256,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_OR_B32_e64_10:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_9]], [[V_SUBREV_U32_e64_4]], implicit $exec
; CHECK-NEXT: [[V_SUBREV_U32_e64_6:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 37, [[BUFFER_LOAD_FORMAT_X_IDXEN6]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_11:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_10]], [[V_SUBREV_U32_e64_5]], implicit $exec
- ; CHECK-NEXT: [[V_SUBREV_U32_e64_7:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 38, [[BUFFER_LOAD_FORMAT_X_IDXEN7]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_SUBREV_U32_e64_7:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 38, [[BUFFER_LOAD_FORMAT_X_IDXEN8]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_12:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_11]], [[V_SUBREV_U32_e64_6]], implicit $exec
; CHECK-NEXT: [[V_SUBREV_U32_e64_8:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 39, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_13:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_12]], [[V_SUBREV_U32_e64_7]], implicit $exec
- ; CHECK-NEXT: [[V_SUBREV_U32_e64_9:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 50, [[BUFFER_LOAD_FORMAT_X_IDXEN8]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_SUBREV_U32_e64_9:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 50, [[BUFFER_LOAD_FORMAT_X_IDXEN7]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_14:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_13]], [[V_SUBREV_U32_e64_8]], implicit $exec
; CHECK-NEXT: [[V_SUBREV_U32_e64_10:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 51, [[BUFFER_LOAD_FORMAT_X_IDXEN9]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_15:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_14]], [[V_SUBREV_U32_e64_9]], implicit $exec
@@ -351,13 +344,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec
- ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %542:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) undef`, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %509:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) undef`, addrspace 4)
; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec
; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc
; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec
; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
- ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %556:vgpr_32, undef %558:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+ ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %523:vgpr_32, undef %525:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
; CHECK-NEXT: S_ENDPGM 0
.expVert:
%0 = extractelement <31 x i32> %userData, i64 2
More information about the llvm-commits
mailing list