[llvm] [AMDGPU] Introduce "amdgpu-uniform-intrinsic-combine" pass to combine uniform AMDGPU lane Intrinsics. (PR #116953)
Pankaj Dwivedi via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 12 04:20:27 PDT 2025
https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/116953
>From 7df7c752094c9ff632107e5726f20d8a6b4f1fc4 Mon Sep 17 00:00:00 2001
From: PankajDwivedi-25 <pankajkumar.divedi at amd.com>
Date: Thu, 21 Nov 2024 12:35:56 +0530
Subject: [PATCH 1/3] [AMDGPU] combine uniform AMDGPU lane Intrinsics
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 5 +
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 +
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 8 +
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 157 ++++
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
.../amdgpu-simplify-uniform-waterfall.ll | 508 +++++++++++
.../amdgpu-uniform-intrinsic-combine.ll | 790 ++++++++++++++++++
.../amdgpu-uniform-temporal-divergence.ll | 57 ++
8 files changed, 1527 insertions(+)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 0f2c33585884f..ce2b4a5f6f2e9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -562,6 +562,11 @@ class AMDGPURewriteAGPRCopyMFMAPass
void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &);
extern char &AMDGPURewriteAGPRCopyMFMALegacyID;
+struct AMDGPUUniformIntrinsicCombinePass
+ : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 9449e70930913..a6074eaf78fd0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -30,6 +30,7 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
+MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 9afe7590fe4ef..e02d5cb2ed5ee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -526,6 +526,11 @@ static cl::opt<bool> HasClosedWorldAssumption(
cl::desc("Whether has closed-world assumption at link time"),
cl::init(false), cl::Hidden);
+static cl::opt<bool> EnableUniformIntrinsicCombine(
+ "amdgpu-enable-uniform-intrinsic-combine",
+ cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
+ cl::init(true), cl::Hidden);
+
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -898,6 +903,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
if (EarlyInlineAll && !EnableFunctionCalls)
PM.addPass(AMDGPUAlwaysInlinePass());
+
+ if (EnableUniformIntrinsicCombine)
+ PM.addPass(AMDGPUUniformIntrinsicCombinePass());
});
PB.registerPeepholeEPCallback(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
new file mode 100644
index 0000000000000..5014979c0c678
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -0,0 +1,157 @@
+//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass simplifies certain intrinsic calls when the arguments are uniform.
+/// also, this pass relies on the fact that uniformity analysis remains safe
+/// across valid transformations in LLVM. A transformation does not alter
+/// program behavior across threads: each instruction in the original IR
+/// continues to have a well-defined counterpart in the transformed IR, both
+/// statically and dynamically.
+///
+/// Valid transformations respect three invariants:
+/// 1. Use-def relationships are preserved. If one instruction produces a value
+/// and another consumes it, that dependency must remain intact.
+/// 2. Uniformity classification is preserved. Certain values are always uniform
+/// (constants, kernel arguments, convergent operations), while others are
+/// always divergent (atomics, most function calls). Transformations may turn
+/// divergent computations into uniform ones, but never the reverse.
+/// 3. Uniformity must hold not only at the point of value computation but also
+/// at all later uses of that value, consistently across the same set of
+/// threads.
+///
+/// Together, these invariants ensure that transformations in this pass are
+/// correctness-preserving and remain safe for uniformity analysis.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+using namespace llvm::PatternMatch;
+
+/// Optimizes uniform intrinsics.
+static bool optimizeUniformIntrinsic(IntrinsicInst &II,
+ const UniformityInfo &UI) {
+ llvm::Intrinsic::ID IID = II.getIntrinsicID();
+
+ switch (IID) {
+ case Intrinsic::amdgcn_permlane64:
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readlane: {
+ Value *Src = II.getArgOperand(0);
+ // Check if the argument use is divergent
+ if (UI.isDivergentUse(II.getOperandUse(0)))
+ return false;
+ LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n');
+ II.replaceAllUsesWith(Src);
+ II.eraseFromParent();
+ return true;
+ }
+ case Intrinsic::amdgcn_ballot: {
+ Value *Src = II.getArgOperand(0);
+ if (UI.isDivergentUse(II.getOperandUse(0)))
+ return false;
+ LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n');
+
+ // If there are no ICmp users, return early.
+ if (none_of(II.users(), [](User *U) { return isa<ICmpInst>(U); }))
+ return false;
+
+ bool Changed = false;
+ for (User *U : make_early_inc_range(II.users())) {
+ if (auto *ICmp = dyn_cast<ICmpInst>(U)) {
+ Value *Op0 = ICmp->getOperand(0);
+ Value *Op1 = ICmp->getOperand(1);
+ ICmpInst::Predicate Pred = ICmp->getPredicate();
+ Value *OtherOp = Op0 == &II ? Op1 : Op0;
+
+ if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) {
+ // Case (icmp eq %ballot, 0) --> xor %ballot_arg, 1
+ Instruction *NotOp =
+ BinaryOperator::CreateNot(Src, "", ICmp->getIterator());
+ LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
+ ICmp->replaceAllUsesWith(NotOp);
+ ICmp->eraseFromParent();
+ Changed = true;
+ } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
+ // (icmp ne %ballot, 0) --> %ballot_arg
+ LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
+ << *Src << '\n');
+ ICmp->replaceAllUsesWith(Src);
+ ICmp->eraseFromParent();
+ Changed = true;
+ }
+ }
+ }
+ // Erase the intrinsic if it has no remaining uses.
+ if (II.use_empty())
+ II.eraseFromParent();
+ return Changed;
+ }
+ default:
+ llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic");
+ }
+ return false;
+}
+
+/// Iterate over the Intrinsics use in the Module to optimise.
+static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
+ bool IsChanged = false;
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ for (Function &F : M) {
+ switch (F.getIntrinsicID()) {
+ case Intrinsic::amdgcn_permlane64:
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_ballot:
+ break;
+ default:
+ continue;
+ }
+
+ for (User *U : F.users()) {
+ auto *II = cast<IntrinsicInst>(U);
+ Function *ParentF = II->getFunction();
+ if (ParentF->isDeclaration())
+ continue;
+
+ const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
+ IsChanged |= optimizeUniformIntrinsic(*II, UI);
+ }
+ }
+ return IsChanged;
+}
+
+PreservedAnalyses
+AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (!runUniformIntrinsicCombine(M, AM))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<UniformityInfoAnalysis>();
+ return PA;
+}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index aae56eef73edd..13f727b68c0d9 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -64,6 +64,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUHSAMetadataStreamer.cpp
AMDGPUInsertDelayAlu.cpp
AMDGPUInstCombineIntrinsic.cpp
+ AMDGPUUniformIntrinsicCombine.cpp
AMDGPUInstrInfo.cpp
AMDGPUInstructionSelector.cpp
AMDGPUISelDAGToDAG.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
new file mode 100644
index 0000000000000..30bd3f18c5e97
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
@@ -0,0 +1,508 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -amdgpu-enable-uniform-intrinsic-combine=0 -O3 -S < %s | FileCheck %s -check-prefix=CURRENT-CHECK
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,early-cse,instcombine,simplifycfg -S < %s | FileCheck %s -check-prefix=DCE-CHECK
+
+define protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CURRENT-CHECK-NEXT: [[ENTRY:.*:]]
+; CURRENT-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true)
+; CURRENT-CHECK-NEXT: [[IS_DONE_PEEL:%.*]] = icmp eq i32 [[TMP0]], 0
+; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_PEEL]], label %[[EXIT:.*]], label %[[IF_PEEL:.*]]
+; CURRENT-CHECK: [[IF_PEEL]]:
+; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: br label %[[EXIT]]
+; CURRENT-CHECK: [[EXIT]]:
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; PASS-CHECK-NEXT: [[ENTRY:.*]]:
+; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
+; PASS-CHECK: [[WHILE]]:
+; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
+; PASS-CHECK: [[IF]]:
+; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: br label %[[WHILE]]
+; PASS-CHECK: [[EXIT]]:
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; DCE-CHECK-NEXT: [[ENTRY:.*]]:
+; DCE-CHECK-NEXT: br label %[[WHILE:.*]]
+; DCE-CHECK: [[WHILE]]:
+; DCE-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; DCE-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
+; DCE-CHECK: [[IF]]:
+; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: br label %[[WHILE]]
+; DCE-CHECK: [[EXIT]]:
+; DCE-CHECK-NEXT: ret void
+;
+entry:
+ br label %while
+
+while:
+ %done = phi i1 [ 0, %entry ], [ 1, %if ]
+ %not_done = xor i1 %done, true
+ %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done)
+ %is_done = icmp eq i64 %ballot, 0 ; in this case is_done = !not_done
+ br i1 %is_done, label %exit, label %if
+
+if:
+ store i32 5, ptr addrspace(1) %out
+ br label %while
+
+exit:
+ ret void
+}
+
+define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op(ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CURRENT-CHECK-NEXT: [[ENTRY:.*:]]
+; CURRENT-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true)
+; CURRENT-CHECK-NEXT: [[IS_DONE_PEEL:%.*]] = icmp eq i32 [[TMP0]], 0
+; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_PEEL]], label %[[EXIT:.*]], label %[[IF_PEEL:.*]]
+; CURRENT-CHECK: [[IF_PEEL]]:
+; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: br label %[[EXIT]]
+; CURRENT-CHECK: [[EXIT]]:
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[ENTRY:.*]]:
+; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
+; PASS-CHECK: [[WHILE]]:
+; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
+; PASS-CHECK: [[IF]]:
+; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: br label %[[WHILE]]
+; PASS-CHECK: [[EXIT]]:
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[ENTRY:.*]]:
+; DCE-CHECK-NEXT: br label %[[WHILE:.*]]
+; DCE-CHECK: [[WHILE]]:
+; DCE-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; DCE-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
+; DCE-CHECK: [[IF]]:
+; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: br label %[[WHILE]]
+; DCE-CHECK: [[EXIT]]:
+; DCE-CHECK-NEXT: ret void
+;
+entry:
+ br label %while
+
+while:
+ %done = phi i1 [ 0, %entry ], [ 1, %if ]
+ %not_done = xor i1 %done, true
+ %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done)
+ %is_done = icmp eq i64 0, %ballot ; in this case is_done = !not_done
+ br i1 %is_done, label %exit, label %if
+
+if:
+ store i32 5, ptr addrspace(1) %out
+ br label %while
+
+exit:
+ ret void
+}
+
+define protected amdgpu_kernel void @trivial_waterfall_ne_zero(ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+; CURRENT-CHECK-NEXT: [[ENTRY:.*:]]
+; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: br label %[[WHILE:.*]]
+; CURRENT-CHECK: [[WHILE]]:
+; CURRENT-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true)
+; CURRENT-CHECK-NEXT: [[IS_DONE_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_NOT]], label %[[WHILE]], label %[[EXIT:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CURRENT-CHECK: [[EXIT]]:
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[ENTRY:.*]]:
+; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
+; PASS-CHECK: [[WHILE]]:
+; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
+; PASS-CHECK: [[IF]]:
+; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: br label %[[WHILE]]
+; PASS-CHECK: [[EXIT]]:
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[ENTRY:.*]]:
+; DCE-CHECK-NEXT: br label %[[WHILE:.*]]
+; DCE-CHECK: [[WHILE]]:
+; DCE-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; DCE-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
+; DCE-CHECK: [[IF]]:
+; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: br label %[[WHILE]]
+; DCE-CHECK: [[EXIT]]:
+; DCE-CHECK-NEXT: ret void
+;
+entry:
+ br label %while
+
+while:
+ %done = phi i1 [ 0, %entry ], [ 1, %if ]
+ %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %done)
+ %is_done = icmp ne i64 0, %ballot ; in this case is_done = done
+ br i1 %is_done, label %exit, label %if
+
+if:
+ store i32 5, ptr addrspace(1) %out
+ br label %while
+
+exit:
+ ret void
+}
+
+define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap(ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CURRENT-CHECK-NEXT: [[ENTRY:.*:]]
+; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: br label %[[WHILE:.*]]
+; CURRENT-CHECK: [[WHILE]]:
+; CURRENT-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true)
+; CURRENT-CHECK-NEXT: [[IS_DONE_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_NOT]], label %[[WHILE]], label %[[EXIT:.*]], !llvm.loop [[LOOP2:![0-9]+]]
+; CURRENT-CHECK: [[EXIT]]:
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[ENTRY:.*]]:
+; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
+; PASS-CHECK: [[WHILE]]:
+; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
+; PASS-CHECK: [[IF]]:
+; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: br label %[[WHILE]]
+; PASS-CHECK: [[EXIT]]:
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[ENTRY:.*]]:
+; DCE-CHECK-NEXT: br label %[[WHILE:.*]]
+; DCE-CHECK: [[WHILE]]:
+; DCE-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; DCE-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
+; DCE-CHECK: [[IF]]:
+; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: br label %[[WHILE]]
+; DCE-CHECK: [[EXIT]]:
+; DCE-CHECK-NEXT: ret void
+;
+entry:
+ br label %while
+
+while:
+ %done = phi i1 [ 0, %entry ], [ 1, %if ]
+ %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %done)
+ %is_done = icmp ne i64 %ballot, 0 ; in this case is_done = done
+ br i1 %is_done, label %exit, label %if
+
+if:
+ store i32 5, ptr addrspace(1) %out
+ br label %while
+
+exit:
+ ret void
+}
+
+define protected amdgpu_kernel void @trivial_uniform_waterfall(ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_uniform_waterfall(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CURRENT-CHECK-NEXT: [[ENTRY:.*:]]
+; CURRENT-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true)
+; CURRENT-CHECK-NEXT: [[IS_DONE_PEEL:%.*]] = icmp eq i32 [[TMP0]], 0
+; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_PEEL]], label %[[EXIT:.*]], label %[[WORK_PEEL:.*]]
+; CURRENT-CHECK: [[WORK_PEEL]]:
+; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: br label %[[EXIT]]
+; CURRENT-CHECK: [[EXIT]]:
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_uniform_waterfall(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[ENTRY:.*]]:
+; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
+; PASS-CHECK: [[WHILE]]:
+; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ]
+; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
+; PASS-CHECK: [[IF]]:
+; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, 0
+; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]]
+; PASS-CHECK: [[WORK]]:
+; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: br label %[[TAIL]]
+; PASS-CHECK: [[TAIL]]:
+; PASS-CHECK-NEXT: [[NEW_DONE]] = phi i1 [ true, %[[WORK]] ], [ false, %[[IF]] ]
+; PASS-CHECK-NEXT: br label %[[WHILE]]
+; PASS-CHECK: [[EXIT]]:
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define protected amdgpu_kernel void @trivial_uniform_waterfall(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[ENTRY:.*]]:
+; DCE-CHECK-NEXT: br label %[[WHILE:.*]]
+; DCE-CHECK: [[WHILE]]:
+; DCE-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[WORK:.*]] ]
+; DCE-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[WORK]]
+; DCE-CHECK: [[WORK]]:
+; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: br label %[[WHILE]]
+; DCE-CHECK: [[EXIT]]:
+; DCE-CHECK-NEXT: ret void
+;
+entry:
+ br label %while
+
+while:
+ %done = phi i1 [ false, %entry ], [ %new_done, %tail ]
+ %not_done = xor i1 %done, true
+ %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done)
+ %is_done = icmp eq i64 %ballot, 0
+ br i1 %is_done, label %exit, label %if
+
+if:
+ %first_active_id = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 0)
+ %is_first_active_id = icmp eq i32 0, %first_active_id
+ br i1 %is_first_active_id, label %work, label %tail
+
+work:
+ store i32 5, ptr addrspace(1) %out
+ br label %tail
+
+tail:
+ %new_done = phi i1 [ true, %work ], [ false, %if ]
+ br label %while
+
+exit:
+ ret void
+}
+
+define protected amdgpu_kernel void @uniform_waterfall(ptr addrspace(1) %out, i32 %mymask) {
+; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @uniform_waterfall(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]], i32 [[MYMASK:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CURRENT-CHECK-NEXT: [[ENTRY:.*:]]
+; CURRENT-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true)
+; CURRENT-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i32 [[TMP0]], 0
+; CURRENT-CHECK-NEXT: br i1 [[IS_DONE]], label %[[EXIT:.*]], label %[[WORK_PEEL:.*]]
+; CURRENT-CHECK: [[WORK_PEEL]]:
+; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: br label %[[EXIT]]
+; CURRENT-CHECK: [[EXIT]]:
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define protected amdgpu_kernel void @uniform_waterfall(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[MYMASK:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[ENTRY:.*]]:
+; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
+; PASS-CHECK: [[WHILE]]:
+; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ]
+; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
+; PASS-CHECK: [[IF]]:
+; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[MYMASK]]
+; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]]
+; PASS-CHECK: [[WORK]]:
+; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: br label %[[TAIL]]
+; PASS-CHECK: [[TAIL]]:
+; PASS-CHECK-NEXT: [[NEW_DONE]] = phi i1 [ true, %[[WORK]] ], [ false, %[[IF]] ]
+; PASS-CHECK-NEXT: br label %[[WHILE]]
+; PASS-CHECK: [[EXIT]]:
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define protected amdgpu_kernel void @uniform_waterfall(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[MYMASK:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[ENTRY:.*]]:
+; DCE-CHECK-NEXT: br label %[[WHILE:.*]]
+; DCE-CHECK: [[WHILE]]:
+; DCE-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[WORK:.*]] ]
+; DCE-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[WORK]]
+; DCE-CHECK: [[WORK]]:
+; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: br label %[[WHILE]]
+; DCE-CHECK: [[EXIT]]:
+; DCE-CHECK-NEXT: ret void
+;
+entry:
+ br label %while
+
+while:
+ %done = phi i1 [ false, %entry ], [ %new_done, %tail ]
+ %not_done = xor i1 %done, true
+ %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done)
+ %is_done = icmp eq i64 %ballot, 0
+ br i1 %is_done, label %exit, label %if
+
+if:
+ %first_active_id = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 %mymask)
+ %is_first_active_id = icmp eq i32 %mymask, %first_active_id
+ br i1 %is_first_active_id, label %work, label %tail
+
+work:
+ store i32 5, ptr addrspace(1) %out
+ br label %tail
+
+tail:
+ %new_done = phi i1 [ true, %work ], [ false, %if ]
+ br label %while
+
+exit:
+ ret void
+}
+
+define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CURRENT-CHECK-NEXT: [[ENTRY:.*:]]
+; CURRENT-CHECK-NEXT: [[BALLOT_PEEL:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true)
+; CURRENT-CHECK-NEXT: [[IS_DONE_PEEL:%.*]] = icmp eq i32 [[BALLOT_PEEL]], 0
+; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_PEEL]], label %[[EXIT:.*]], label %[[IF_PEEL:.*]]
+; CURRENT-CHECK: [[IF_PEEL]]:
+; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: br label %[[EXIT]]
+; CURRENT-CHECK: [[EXIT]]:
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[ENTRY:.*]]:
+; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
+; PASS-CHECK: [[WHILE]]:
+; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
+; PASS-CHECK: [[IF]]:
+; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: br label %[[WHILE]]
+; PASS-CHECK: [[EXIT]]:
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[ENTRY:.*]]:
+; DCE-CHECK-NEXT: br label %[[WHILE:.*]]
+; DCE-CHECK: [[WHILE]]:
+; DCE-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; DCE-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
+; DCE-CHECK: [[IF]]:
+; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: br label %[[WHILE]]
+; DCE-CHECK: [[EXIT]]:
+; DCE-CHECK-NEXT: ret void
+;
+entry:
+ br label %while
+
+while:
+ %done = phi i1 [ 0, %entry ], [ 1, %if ]
+ %not_done = xor i1 %done, true
+ %ballot = tail call i32 @llvm.amdgcn.ballot.i32(i1 %not_done)
+ %is_done = icmp eq i32 %ballot, 0 ; in this case is_done = !not_done
+ br i1 %is_done, label %exit, label %if
+
+if:
+ store i32 5, ptr addrspace(1) %out
+ br label %while
+
+exit:
+ ret void
+}
+
+define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CURRENT-CHECK-NEXT: [[ENTRY:.*:]]
+; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: br label %[[WHILE:.*]]
+; CURRENT-CHECK: [[WHILE]]:
+; CURRENT-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true)
+; CURRENT-CHECK-NEXT: [[IS_DONE_NOT:%.*]] = icmp eq i32 [[BALLOT]], 0
+; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_NOT]], label %[[WHILE]], label %[[EXIT:.*]], !llvm.loop [[LOOP3:![0-9]+]]
+; CURRENT-CHECK: [[EXIT]]:
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[ENTRY:.*]]:
+; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
+; PASS-CHECK: [[WHILE]]:
+; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
+; PASS-CHECK: [[IF]]:
+; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: br label %[[WHILE]]
+; PASS-CHECK: [[EXIT]]:
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[ENTRY:.*]]:
+; DCE-CHECK-NEXT: br label %[[WHILE:.*]]
+; DCE-CHECK: [[WHILE]]:
+; DCE-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; DCE-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
+; DCE-CHECK: [[IF]]:
+; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: br label %[[WHILE]]
+; DCE-CHECK: [[EXIT]]:
+; DCE-CHECK-NEXT: ret void
+;
+entry:
+ br label %while
+
+while:
+ %done = phi i1 [ 0, %entry ], [ 1, %if ]
+ %ballot = tail call i32 @llvm.amdgcn.ballot.i32(i1 %done)
+ %is_done = icmp ne i32 0, %ballot ; in this case is_done = done
+ br i1 %is_done, label %exit, label %if
+
+if:
+ store i32 5, ptr addrspace(1) %out
+ br label %while
+
+exit:
+ ret void
+}
+
+declare i64 @llvm.amdgcn.ballot.i64(i1) #1
+!6 = !{i64 690}
+!7 = distinct !{!7, !8}
+!8 = !{!"llvm.loop.mustprogress"}
+;.
+; CURRENT-CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CURRENT-CHECK: [[META1]] = !{!"llvm.loop.peeled.count", i32 1}
+; CURRENT-CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
+; CURRENT-CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
new file mode 100644
index 0000000000000..1b56e77094ac5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -0,0 +1,790 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -amdgpu-enable-uniform-intrinsic-combine=0 -O3 -S < %s | FileCheck %s -check-prefix=CURRENT-CHECK
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s -check-prefix=DCE-CHECK
+
+define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_constant(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CURRENT-CHECK-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_constant(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; PASS-CHECK-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_constant(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; DCE-CHECK-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.permlane64(i32 77)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]], i32 [[SRC:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CURRENT-CHECK-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %src)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+; CURRENT-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
+; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TID]] to i64
+; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
+; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CURRENT-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CURRENT-CHECK-NEXT: [[TID2:%.*]] = add nuw nsw i32 [[TID]], 1
+; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
+; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TID]] to i64
+; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
+; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid2 = add i32 %tid, 1
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_constant(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CURRENT-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_constant(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_constant(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readlane(i32 7, i32 5)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CURRENT-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TIDX]] to i64
+; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]]
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]]
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tidx
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CURRENT-CHECK-NEXT: [[TIDX2:%.*]] = add nuw nsw i32 [[TIDX]], 1
+; CURRENT-CHECK-NEXT: [[TIDY2:%.*]] = add nuw nsw i32 [[TIDY]], 2
+; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
+; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TIDX]] to i64
+; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; PASS-CHECK-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
+; PASS-CHECK-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]]
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; DCE-CHECK-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
+; DCE-CHECK-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
+; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]]
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %tidx2 = add i32 %tidx, 1
+ %tidy2 = add i32 %tidy, 2
+ %v = call i32 @llvm.amdgcn.readlane(i32 %tidx2, i32 %tidy2)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tidx
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CURRENT-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 7)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]], i32 [[SRC0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CURRENT-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CURRENT-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
+; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TID]] to i64
+; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
+; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_expression(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CURRENT-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CURRENT-CHECK-NEXT: [[TID2:%.*]] = add nuw nsw i32 [[TID]], 1
+; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
+; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TID2]] to i64
+; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_expression(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID2]]
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_expression(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
+; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID2]]
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid2 = add i32 %tid, 1
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid2)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
+ %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CURRENT-CHECK-NEXT: [[V1:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; CURRENT-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CURRENT-CHECK-NEXT: [[V1:%.*]] = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
+; CURRENT-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
+; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
+; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
+ %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_readlane(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CURRENT-CHECK-NEXT: [[V1:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; CURRENT-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_readlane(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_readlane(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+
+define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr addrspace(1) %out_max) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_boundary(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT_MIN:%.*]], ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT_MAX:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CURRENT-CHECK-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
+; CURRENT-CHECK-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_boundary(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
+; PASS-CHECK-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_boundary(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
+; DCE-CHECK-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %min_v = call i32 @llvm.amdgcn.permlane64(i32 -2147483648)
+ store i32 %min_v, ptr addrspace(1) %out_min
+ %max_v = call i32 @llvm.amdgcn.permlane64(i32 2147483647)
+ store i32 %max_v, ptr addrspace(1) %out_max
+ ret void
+}
+
+define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_cross_lane(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = add nuw nsw i32 [[TIDX]], 5
+; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_cross_lane(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_cross_lane(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = add i32 %tidx, 5
+ %v = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CURRENT-CHECK-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456
+; PASS-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456
+; DCE-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %random = xor i32 123, 456
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 %random)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_expression(
+; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CURRENT-CHECK-NEXT: [[IDX1:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CURRENT-CHECK-NEXT: [[IDX2:%.*]] = shl nuw nsw i32 [[IDX1]], 1
+; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
+; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_expression(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[IDX2:%.*]] = mul i32 [[IDX1]], 2
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_expression(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[IDX2:%.*]] = mul i32 [[IDX1]], 2
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
+;
+ %idx1 = call i32 @llvm.amdgcn.workitem.id.x()
+ %idx2 = mul i32 %idx1, 2
+ %v = call i32 @llvm.amdgcn.readlane(i32 %idx1, i32 %idx2)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ballot_i32(i32 %v, ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @ballot_i32(
+; CURRENT-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) writeonly captures(none) initializes((0, 1)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CURRENT-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
+; CURRENT-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[C]])
+; CURRENT-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[BALLOT]], 0
+; CURRENT-CHECK-NEXT: store i1 [[BALLOT_NE_ZERO]], ptr addrspace(1) [[OUT]], align 1
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i32(
+; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
+; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @ballot_i32(
+; DCE-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
+; DCE-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
+; DCE-CHECK-NEXT: ret void
+;
+ %c = trunc i32 %v to i1
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_ne_zero = icmp ne i32 %ballot, 0
+ store i1 %ballot_ne_zero, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ballot_i64(i32 %v, ptr addrspace(1) %out) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @ballot_i64(
+; CURRENT-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) writeonly captures(none) initializes((0, 1)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CURRENT-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
+; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[C]])
+; CURRENT-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[TMP1]], 0
+; CURRENT-CHECK-NEXT: store i1 [[BALLOT_NE_ZERO]], ptr addrspace(1) [[OUT]], align 1
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i64(
+; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
+; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @ballot_i64(
+; DCE-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
+; DCE-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
+; DCE-CHECK-NEXT: ret void
+;
+ %c = trunc i32 %v to i1
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_ne_zero = icmp ne i64 %ballot, 0
+ store i1 %ballot_ne_zero, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_i16(i16 %src0, i32 %src1) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i16(
+; CURRENT-CHECK-SAME: i16 [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
+; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i16(
+; PASS-CHECK-SAME: i16 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: call void asm sideeffect "
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i16(
+; DCE-CHECK-SAME: i16 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: call void asm sideeffect "
+; DCE-CHECK-NEXT: ret void
+;
+ %readlane = call i16 @llvm.amdgcn.readlane.i16(i16 %src0, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(i16 %readlane)
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_i64(i64 %src0, i32 %src1) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i64(
+; CURRENT-CHECK-SAME: i64 [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] {
+; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i64(
+; PASS-CHECK-SAME: i64 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: call void asm sideeffect "
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i64(
+; DCE-CHECK-SAME: i64 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: call void asm sideeffect "
+; DCE-CHECK-NEXT: ret void
+;
+ %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %src0, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(i64 %readlane)
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_bf16(bfloat %src0, i32 %src1) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_bf16(
+; CURRENT-CHECK-SAME: bfloat [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] {
+; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_bf16(
+; PASS-CHECK-SAME: bfloat [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: call void asm sideeffect "
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_bf16(
+; DCE-CHECK-SAME: bfloat [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: call void asm sideeffect "
+; DCE-CHECK-NEXT: ret void
+;
+ %readlane = call bfloat @llvm.amdgcn.readlane.bf16(bfloat %src0, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(bfloat %readlane)
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_f16(half %src0, i32 %src1) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f16(
+; CURRENT-CHECK-SAME: half [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] {
+; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f16(
+; PASS-CHECK-SAME: half [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: call void asm sideeffect "
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f16(
+; DCE-CHECK-SAME: half [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: call void asm sideeffect "
+; DCE-CHECK-NEXT: ret void
+;
+ %readlane = call half @llvm.amdgcn.readlane.f16(half %src0, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(half %readlane)
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_f32(float %src0, i32 %src1) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f32(
+; CURRENT-CHECK-SAME: float [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] {
+; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f32(
+; PASS-CHECK-SAME: float [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: call void asm sideeffect "
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f32(
+; DCE-CHECK-SAME: float [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: call void asm sideeffect "
+; DCE-CHECK-NEXT: ret void
+;
+ %readlane = call float @llvm.amdgcn.readlane.f32(float %src0, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(float %readlane)
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_f64(double %src0, i32 %src1) {
+; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f64(
+; CURRENT-CHECK-SAME: double [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] {
+; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f64(
+; PASS-CHECK-SAME: double [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: call void asm sideeffect "
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f64(
+; DCE-CHECK-SAME: double [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: call void asm sideeffect "
+; DCE-CHECK-NEXT: ret void
+;
+ %readlane = call double @llvm.amdgcn.readlane.f64(double %src0, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(double %readlane)
+ ret void
+}
+; All such cases can be optimised, given generic way to query getDeclarationIfExists()
+define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) {
+; CURRENT-CHECK-LABEL: define void @test_readlane_v8i16(
+; CURRENT-CHECK-SAME: ptr addrspace(1) readnone captures(none) [[OUT:%.*]], <8 x i16> [[SRC:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] {
+; CURRENT-CHECK-NEXT: [[X:%.*]] = tail call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> [[SRC]], i32 [[SRC1]])
+; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
+; CURRENT-CHECK-NEXT: ret void
+;
+; PASS-CHECK-LABEL: define void @test_readlane_v8i16(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], <8 x i16> [[SRC:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[X:%.*]] = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> [[SRC]], i32 [[SRC1]])
+; PASS-CHECK-NEXT: call void asm sideeffect "
+; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define void @test_readlane_v8i16(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], <8 x i16> [[SRC:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[X:%.*]] = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> [[SRC]], i32 [[SRC1]])
+; DCE-CHECK-NEXT: call void asm sideeffect "
+; DCE-CHECK-NEXT: ret void
+;
+ %x = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<8 x i16> %x)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll
new file mode 100644
index 0000000000000..2fde3e3759f47
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,instcombine,early-cse,simplifycfg -S < %s | FileCheck %s -check-prefix=COMB-CHECK
+
+; This should not be optimized
+define amdgpu_cs void @temporal_divergence(ptr addrspace(1) %out, i32 %n) {
+; PASS-CHECK-LABEL: define amdgpu_cs void @temporal_divergence(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; PASS-CHECK-NEXT: [[ENTRY:.*]]:
+; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: br label %[[H:.*]]
+; PASS-CHECK: [[H]]:
+; PASS-CHECK-NEXT: [[UNI_MERGE_H:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[UNI_INC:%.*]], %[[H]] ]
+; PASS-CHECK-NEXT: [[UNI_INC]] = add i32 [[UNI_MERGE_H]], 1
+; PASS-CHECK-NEXT: [[DIV_EXITX:%.*]] = icmp eq i32 [[TID]], 0
+; PASS-CHECK-NEXT: br i1 [[DIV_EXITX]], label %[[X:.*]], label %[[H]]
+; PASS-CHECK: [[X]]:
+; PASS-CHECK-NEXT: [[UNI_JOIN:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[UNI_INC]])
+; PASS-CHECK-NEXT: [[JOIN_USER:%.*]] = add i32 [[UNI_JOIN]], 5
+; PASS-CHECK-NEXT: store i32 [[JOIN_USER]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
+;
+; COMB-CHECK-LABEL: define amdgpu_cs void @temporal_divergence(
+; COMB-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; COMB-CHECK-NEXT: [[ENTRY:.*]]:
+; COMB-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; COMB-CHECK-NEXT: br label %[[H:.*]]
+; COMB-CHECK: [[H]]:
+; COMB-CHECK-NEXT: [[UNI_MERGE_H:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[UNI_INC:%.*]], %[[H]] ]
+; COMB-CHECK-NEXT: [[UNI_INC]] = add i32 [[UNI_MERGE_H]], 1
+; COMB-CHECK-NEXT: [[DIV_EXITX:%.*]] = icmp eq i32 [[TID]], 0
+; COMB-CHECK-NEXT: br i1 [[DIV_EXITX]], label %[[X:.*]], label %[[H]]
+; COMB-CHECK: [[X]]:
+; COMB-CHECK-NEXT: [[UNI_JOIN:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[UNI_INC]])
+; COMB-CHECK-NEXT: [[JOIN_USER:%.*]] = add i32 [[UNI_JOIN]], 5
+; COMB-CHECK-NEXT: store i32 [[JOIN_USER]], ptr addrspace(1) [[OUT]], align 4
+; COMB-CHECK-NEXT: ret void
+;
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ br label %H
+
+H:
+ %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ]
+ %uni.inc = add i32 %uni.merge.h, 1
+ %div.exitx = icmp eq i32 %tid, 0
+ br i1 %div.exitx, label %X, label %H ; divergent branch
+
+X:
+ %uni.join = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %uni.inc)
+ %join.user = add i32 %uni.join, 5
+ store i32 %join.user, ptr addrspace(1) %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
>From 52b2692a42d6a93e0f53f7d95aecb592acc1fc3a Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Fri, 12 Sep 2025 15:37:31 +0530
Subject: [PATCH 2/3] update description
---
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 27 ++++++-------------
1 file changed, 8 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 5014979c0c678..65cb02a99b109 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -8,25 +8,14 @@
//
/// \file
/// This pass simplifies certain intrinsic calls when the arguments are uniform.
-/// also, this pass relies on the fact that uniformity analysis remains safe
-/// across valid transformations in LLVM. A transformation does not alter
-/// program behavior across threads: each instruction in the original IR
-/// continues to have a well-defined counterpart in the transformed IR, both
-/// statically and dynamically.
-///
-/// Valid transformations respect three invariants:
-/// 1. Use-def relationships are preserved. If one instruction produces a value
-/// and another consumes it, that dependency must remain intact.
-/// 2. Uniformity classification is preserved. Certain values are always uniform
-/// (constants, kernel arguments, convergent operations), while others are
-/// always divergent (atomics, most function calls). Transformations may turn
-/// divergent computations into uniform ones, but never the reverse.
-/// 3. Uniformity must hold not only at the point of value computation but also
-/// at all later uses of that value, consistently across the same set of
-/// threads.
-///
-/// Together, these invariants ensure that transformations in this pass are
-/// correctness-preserving and remain safe for uniformity analysis.
+/// It's true that this pass has transforms that can lead to a situation where
+/// some instruction whose operand was previously recognized as statically
+/// uniform is later on no longer recognized as statically uniform. However, the
+/// semantics of how programs execute don't (and must not, for this precise
+/// reason[0]) care about static uniformity, they only ever care about dynamic
+/// uniformity. And every instruction that's downstream and cares about dynamic
+/// uniformity must be convergent (and isel will introduce v_readfirstlane for
+/// them if their operands can't be proven statically uniform).
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
>From 5cddc510f53208a4c3ebc539b3f3ad18d65cc5e9 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Fri, 12 Sep 2025 16:50:12 +0530
Subject: [PATCH 3/3] store newly inserted inst and its uniformity
---
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 32 ++++++++++++++-----
1 file changed, 24 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 65cb02a99b109..2e2b049af169b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -41,9 +41,22 @@ using namespace llvm;
using namespace llvm::AMDGPU;
using namespace llvm::PatternMatch;
+/// Map for newly created IR instructions and their uniformity.
+using NewUniformityMap = DenseMap<const Value *, bool>;
+
+/// Wrapper for querying uniformity info that first checks new instructions.
+static bool isDivergentUseWithNew(const Use &U, const UniformityInfo &UI,
+ const NewUniformityMap &NewUMap) {
+ Value *V = U.get();
+ if (auto It = NewUMap.find(V); It != NewUMap.end())
+ return !It->second; // divergent if marked false
+ return UI.isDivergentUse(U);
+}
+
/// Optimizes uniform intrinsics.
static bool optimizeUniformIntrinsic(IntrinsicInst &II,
- const UniformityInfo &UI) {
+ const UniformityInfo &UI,
+ NewUniformityMap &NewUMap) {
llvm::Intrinsic::ID IID = II.getIntrinsicID();
switch (IID) {
@@ -51,8 +64,7 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
Value *Src = II.getArgOperand(0);
- // Check if the argument use is divergent
- if (UI.isDivergentUse(II.getOperandUse(0)))
+ if (isDivergentUseWithNew(II.getOperandUse(0), UI, NewUMap))
return false;
LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n');
II.replaceAllUsesWith(Src);
@@ -61,7 +73,7 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
}
case Intrinsic::amdgcn_ballot: {
Value *Src = II.getArgOperand(0);
- if (UI.isDivergentUse(II.getOperandUse(0)))
+ if (isDivergentUseWithNew(II.getOperandUse(0), UI, NewUMap))
return false;
LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n');
@@ -78,15 +90,17 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
Value *OtherOp = Op0 == &II ? Op1 : Op0;
if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) {
- // Case (icmp eq %ballot, 0) --> xor %ballot_arg, 1
+ // Case: (icmp eq %ballot, 0) -> xor %ballot_arg, 1
Instruction *NotOp =
BinaryOperator::CreateNot(Src, "", ICmp->getIterator());
+ // Record uniformity: Src is uniform, and NOT preserves uniformity.
+ NewUMap[NotOp] = true;
LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
ICmp->replaceAllUsesWith(NotOp);
ICmp->eraseFromParent();
Changed = true;
} else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
- // (icmp ne %ballot, 0) --> %ballot_arg
+ // Case: (icmp ne %ballot, 0) -> %ballot_arg
LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
<< *Src << '\n');
ICmp->replaceAllUsesWith(Src);
@@ -106,9 +120,11 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
return false;
}
-/// Iterate over the Intrinsics use in the Module to optimise.
+/// Iterate over intrinsics in the module to optimise.
static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
bool IsChanged = false;
+ NewUniformityMap NewUMap;
+
FunctionAnalysisManager &FAM =
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
for (Function &F : M) {
@@ -129,7 +145,7 @@ static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
continue;
const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
- IsChanged |= optimizeUniformIntrinsic(*II, UI);
+ IsChanged |= optimizeUniformIntrinsic(*II, UI, NewUMap);
}
}
return IsChanged;
More information about the llvm-commits
mailing list