[llvm] [WIP][AMDGPU] Enable "amdgpu-uniform-intrinsic-combine" pass in pipeline. (PR #128687)
Pankaj Dwivedi via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 10 02:00:13 PDT 2025
https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/128687
>From 3035f5fe687f8df430f3866607bf89a01587fbb8 Mon Sep 17 00:00:00 2001
From: PankajDwivedi-25 <pankajkumar.divedi at amd.com>
Date: Thu, 21 Nov 2024 12:35:56 +0530
Subject: [PATCH 01/14] [AMDGPU] combine uniform AMDGPU lane Intrinsics
---
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 37 ++++++++++++++++---
1 file changed, 32 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 50c78d8c67251..58cf7ef5f9053 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -78,29 +78,39 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
}
case Intrinsic::amdgcn_ballot: {
Value *Src = II.getArgOperand(0);
+<<<<<<< HEAD
if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
return false;
LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n');
- bool Changed = false;
- for (User *U : make_early_inc_range(II.users())) {
- if (auto *ICmp = dyn_cast<ICmpInst>(U)) {
- Value *Op0 = ICmp->getOperand(0);
- Value *Op1 = ICmp->getOperand(1);
+ if (UI.isDivergentUse(II.getOperandUse(0)))
+ return false;
+ LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n');
+
ICmpInst::Predicate Pred = ICmp->getPredicate();
Value *OtherOp = Op0 == &II ? Op1 : Op0;
if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) {
+<<<<<<< HEAD
// Case: (icmp eq %ballot, 0) -> xor %ballot_arg, 1
Instruction *NotOp =
BinaryOperator::CreateNot(Src, "", ICmp->getIterator());
Tracker[NotOp] = true; // NOT preserves uniformity
+=======
+ // Case (icmp eq %ballot, 0) --> xor %ballot_arg, 1
+ Instruction *NotOp =
+ BinaryOperator::CreateNot(Src, "", ICmp->getIterator());
+>>>>>>> ee370a5a77ca ([AMDGPU] combine uniform AMDGPU lane Intrinsics)
LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
ICmp->replaceAllUsesWith(NotOp);
ICmp->eraseFromParent();
Changed = true;
} else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
+<<<<<<< HEAD
// Case: (icmp ne %ballot, 0) -> %ballot_arg
+=======
+ // (icmp ne %ballot, 0) --> %ballot_arg
+>>>>>>> ee370a5a77ca ([AMDGPU] combine uniform AMDGPU lane Intrinsics)
LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
<< *Src << '\n');
ICmp->replaceAllUsesWith(Src);
@@ -120,11 +130,17 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
return false;
}
+<<<<<<< HEAD
/// Iterates over intrinsic declarations in the module to optimize their uses.
static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
bool IsChanged = false;
ValueMap<const Value *, bool> Tracker;
+=======
+/// Iterate over the Intrinsics use in the Module to optimise.
+static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
+ bool IsChanged = false;
+>>>>>>> ee370a5a77ca ([AMDGPU] combine uniform AMDGPU lane Intrinsics)
FunctionAnalysisManager &FAM =
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
for (Function &F : M) {
@@ -138,11 +154,22 @@ static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
continue;
}
+<<<<<<< HEAD
for (User *U : make_early_inc_range(F.users())) {
auto *II = cast<IntrinsicInst>(U);
Function *ParentF = II->getFunction();
const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
+=======
+ for (User *U : F.users()) {
+ auto *II = cast<IntrinsicInst>(U);
+ Function *ParentF = II->getFunction();
+ if (ParentF->isDeclaration())
+ continue;
+
+ const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
+ IsChanged |= optimizeUniformIntrinsic(*II, UI);
+>>>>>>> ee370a5a77ca ([AMDGPU] combine uniform AMDGPU lane Intrinsics)
}
}
return IsChanged;
>From 27b2b79679e25b031b386ce8305cc8f9cc78c1e3 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Fri, 12 Sep 2025 16:50:12 +0530
Subject: [PATCH 02/14] store newly inserted inst and its uniformity
---
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 33 ++++++++++++++++---
1 file changed, 29 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 58cf7ef5f9053..05d07ef6b0846 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -62,14 +62,11 @@ isDivergentUseWithNew(const Use &U, const UniformityInfo &UI,
static bool optimizeUniformIntrinsic(IntrinsicInst &II,
const UniformityInfo &UI,
ValueMap<const Value *, bool> &Tracker) {
- llvm::Intrinsic::ID IID = II.getIntrinsicID();
switch (IID) {
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
- case Intrinsic::amdgcn_readlane: {
- Value *Src = II.getArgOperand(0);
- if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
+>>>>>>> f6cc7aa1522d (store newly inserted inst and its uniformity)
return false;
LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n');
II.replaceAllUsesWith(Src);
@@ -78,12 +75,16 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
}
case Intrinsic::amdgcn_ballot: {
Value *Src = II.getArgOperand(0);
+<<<<<<< HEAD
<<<<<<< HEAD
if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
return false;
LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n');
if (UI.isDivergentUse(II.getOperandUse(0)))
+=======
+ if (isDivergentUseWithNew(II.getOperandUse(0), UI, NewUMap))
+>>>>>>> f6cc7aa1522d (store newly inserted inst and its uniformity)
return false;
LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n');
@@ -91,6 +92,7 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
Value *OtherOp = Op0 == &II ? Op1 : Op0;
if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) {
+<<<<<<< HEAD
<<<<<<< HEAD
// Case: (icmp eq %ballot, 0) -> xor %ballot_arg, 1
Instruction *NotOp =
@@ -101,16 +103,27 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
Instruction *NotOp =
BinaryOperator::CreateNot(Src, "", ICmp->getIterator());
>>>>>>> ee370a5a77ca ([AMDGPU] combine uniform AMDGPU lane Intrinsics)
+=======
+ // Case: (icmp eq %ballot, 0) -> xor %ballot_arg, 1
+ Instruction *NotOp =
+ BinaryOperator::CreateNot(Src, "", ICmp->getIterator());
+ // Record uniformity: Src is uniform, and NOT preserves uniformity.
+ NewUMap[NotOp] = true;
+>>>>>>> f6cc7aa1522d (store newly inserted inst and its uniformity)
LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
ICmp->replaceAllUsesWith(NotOp);
ICmp->eraseFromParent();
Changed = true;
} else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
+<<<<<<< HEAD
<<<<<<< HEAD
// Case: (icmp ne %ballot, 0) -> %ballot_arg
=======
// (icmp ne %ballot, 0) --> %ballot_arg
>>>>>>> ee370a5a77ca ([AMDGPU] combine uniform AMDGPU lane Intrinsics)
+=======
+ // Case: (icmp ne %ballot, 0) -> %ballot_arg
+>>>>>>> f6cc7aa1522d (store newly inserted inst and its uniformity)
LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
<< *Src << '\n');
ICmp->replaceAllUsesWith(Src);
@@ -130,6 +143,7 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
return false;
}
+<<<<<<< HEAD
<<<<<<< HEAD
/// Iterates over intrinsic declarations in the module to optimize their uses.
static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
@@ -141,6 +155,13 @@ static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
bool IsChanged = false;
>>>>>>> ee370a5a77ca ([AMDGPU] combine uniform AMDGPU lane Intrinsics)
+=======
+/// Iterate over intrinsics in the module to optimise.
+static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
+ bool IsChanged = false;
+ NewUniformityMap NewUMap;
+
+>>>>>>> f6cc7aa1522d (store newly inserted inst and its uniformity)
FunctionAnalysisManager &FAM =
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
for (Function &F : M) {
@@ -168,8 +189,12 @@ static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
continue;
const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
+<<<<<<< HEAD
IsChanged |= optimizeUniformIntrinsic(*II, UI);
>>>>>>> ee370a5a77ca ([AMDGPU] combine uniform AMDGPU lane Intrinsics)
+=======
+ IsChanged |= optimizeUniformIntrinsic(*II, UI, NewUMap);
+>>>>>>> f6cc7aa1522d (store newly inserted inst and its uniformity)
}
}
return IsChanged;
>From 3d85fbecd2b28a5fb01da2b470969c139fc955a8 Mon Sep 17 00:00:00 2001
From: PankajDwivedi-25 <pankajkumar.divedi at amd.com>
Date: Thu, 21 Nov 2024 12:35:56 +0530
Subject: [PATCH 03/14] [WIP][AMDGPU] combine uniform AMDGPU lane Intrinsics
---
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 +
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 4 -
.../amdgpu-uniform-intrinsic-combine.ll | 810 +++---------------
3 files changed, 121 insertions(+), 694 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index a6074eaf78fd0..a5f9813e9e5e3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -69,6 +69,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
AMDGPUUnifyDivergentExitNodesPass())
FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
+FUNCTION_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass(*this))
#undef FUNCTION_PASS
#ifndef FUNCTION_ANALYSIS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 05d07ef6b0846..c3358542c2bfa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -27,13 +27,9 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index aa11574517520..e2db2d75c6831 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -4,72 +4,45 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s -check-prefix=DCE-CHECK
define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_constant(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; CURRENT-CHECK-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_constant(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; PASS-CHECK-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_constant(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; DCE-CHECK-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
+; GFX-LABEL: define amdgpu_kernel void @permlane64_constant(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 77)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]], i32 [[SRC:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CURRENT-CHECK-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
+define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_undef(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: ret void
;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
-;
- %v = call i32 @llvm.amdgcn.permlane64(i32 %src)
+ %v = call i32 @llvm.amdgcn.permlane64(i32 undef)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
-; CURRENT-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
-; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TID]] to i64
-; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
-; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
-; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; PASS-CHECK-NEXT: ret void
+define amdgpu_kernel void @permlane64_sgpr(ptr addrspace(1) %out, i32 %src) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_sgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: ret void
;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
-; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
-; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; DCE-CHECK-NEXT: ret void
+ %v = call i32 @llvm.amdgcn.permlane64(i32 undef)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_vgpr(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_vgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
@@ -78,34 +51,16 @@ define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
ret void
}
-define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
-; CURRENT-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CURRENT-CHECK-NEXT: [[TID2:%.*]] = add nuw nsw i32 [[TID]], 1
-; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
-; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TID]] to i64
-; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; PASS-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
-; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
-; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
-; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; DCE-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
-; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
-; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
-; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; DCE-CHECK-NEXT: ret void
+define amdgpu_kernel void @permlane64_vgpr_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_vgpr_expression(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = add i32 %tid, 1
@@ -116,75 +71,48 @@ define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %o
}
define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_constant(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CURRENT-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_constant(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_constant(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
+; GFX-LABEL: define amdgpu_kernel void @readlane_constant(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 7, i32 5)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CURRENT-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
+define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_undef(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: ret void
;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
+ %v = call i32 @llvm.amdgcn.readlane(i32 undef, i32 undef)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_sgpr(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_sgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC0]], i32 [[SRC1]])
+; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
-; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
-; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TIDX]] to i64
-; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]]
-; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]]
-; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; DCE-CHECK-NEXT: ret void
+define amdgpu_kernel void @readlane_vgpr(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_vgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -194,40 +122,18 @@ define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out)
ret void
}
-define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
-; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
-; CURRENT-CHECK-NEXT: [[TIDX2:%.*]] = add nuw nsw i32 [[TIDX]], 1
-; CURRENT-CHECK-NEXT: [[TIDY2:%.*]] = add nuw nsw i32 [[TIDY]], 2
-; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
-; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TIDX]] to i64
-; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; PASS-CHECK-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
-; PASS-CHECK-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
-; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
-; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]]
-; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; DCE-CHECK-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
-; DCE-CHECK-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
-; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
-; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]]
-; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; DCE-CHECK-NEXT: ret void
+define amdgpu_kernel void @readlane_vgpr_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_vgpr_expression(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
+; GFX-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -240,72 +146,47 @@ define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out
}
define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CURRENT-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_constant(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 7)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]], i32 [[SRC0:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CURRENT-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
+define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_undef(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: ret void
;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 undef)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_sgpr(ptr addrspace(1) %out, i32 %src0) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_sgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
+; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
-; CURRENT-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
-; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TID]] to i64
-; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
-; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
-; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
-; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
-; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; DCE-CHECK-NEXT: ret void
+define amdgpu_kernel void @readfirstlane_vgpr(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_vgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid)
@@ -314,34 +195,16 @@ define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out
ret void
}
-define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_expression(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
-; CURRENT-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CURRENT-CHECK-NEXT: [[TID2:%.*]] = add nuw nsw i32 [[TID]], 1
-; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
-; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TID2]] to i64
-; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_expression(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; PASS-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
-; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
-; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID2]]
-; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_expression(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; DCE-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
-; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
-; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID2]]
-; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; DCE-CHECK-NEXT: ret void
+define amdgpu_kernel void @readfirstlane_vgpr_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_vgpr_expression(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID2]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = add i32 %tid, 1
@@ -351,440 +214,7 @@ define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
ret void
}
-define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
-;
- %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
- %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
- store i32 %v2, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
-; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
-; CURRENT-CHECK-NEXT: [[V1:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; CURRENT-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
-;
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %tidy = call i32 @llvm.amdgcn.workitem.id.y()
- %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
- %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
- store i32 %v2, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
-; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CURRENT-CHECK-NEXT: [[V1:%.*]] = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
-; CURRENT-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
-; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
-; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
-;
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
- %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3)
- store i32 %v2, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_readlane(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
-; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
-; CURRENT-CHECK-NEXT: [[V1:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; CURRENT-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_readlane(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_readlane(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
-;
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %tidy = call i32 @llvm.amdgcn.workitem.id.y()
- %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
- %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2)
- store i32 %v2, ptr addrspace(1) %out
- ret void
-}
-
-
-define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr addrspace(1) %out_max) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_boundary(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT_MIN:%.*]], ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT_MAX:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CURRENT-CHECK-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
-; CURRENT-CHECK-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_boundary(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
-; PASS-CHECK-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_boundary(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
-; DCE-CHECK-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
-; DCE-CHECK-NEXT: ret void
-;
- %min_v = call i32 @llvm.amdgcn.permlane64(i32 -2147483648)
- store i32 %min_v, ptr addrspace(1) %out_min
- %max_v = call i32 @llvm.amdgcn.permlane64(i32 2147483647)
- store i32 %max_v, ptr addrspace(1) %out_max
- ret void
-}
-
-define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_cross_lane(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
-; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = add nuw nsw i32 [[TIDX]], 5
-; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_cross_lane(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; PASS-CHECK-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
-; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_cross_lane(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; DCE-CHECK-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
-; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
-;
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %tidy = add i32 %tidx, 5
- %v = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
- store i32 %v, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CURRENT-CHECK-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456
-; PASS-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456
-; DCE-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
-;
- %random = xor i32 123, 456
- %v = call i32 @llvm.amdgcn.readfirstlane(i32 %random)
- store i32 %v, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_expression(
-; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
-; CURRENT-CHECK-NEXT: [[IDX1:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CURRENT-CHECK-NEXT: [[IDX2:%.*]] = shl nuw nsw i32 [[IDX1]], 1
-; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
-; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_expression(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; PASS-CHECK-NEXT: [[IDX2:%.*]] = mul i32 [[IDX1]], 2
-; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
-; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_expression(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; DCE-CHECK-NEXT: [[IDX2:%.*]] = mul i32 [[IDX1]], 2
-; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
-; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
-;
- %idx1 = call i32 @llvm.amdgcn.workitem.id.x()
- %idx2 = mul i32 %idx1, 2
- %v = call i32 @llvm.amdgcn.readlane(i32 %idx1, i32 %idx2)
- store i32 %v, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @ballot_i32(i32 %v, ptr addrspace(1) %out) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @ballot_i32(
-; CURRENT-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) writeonly captures(none) initializes((0, 1)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
-; CURRENT-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
-; CURRENT-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[C]])
-; CURRENT-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[BALLOT]], 0
-; CURRENT-CHECK-NEXT: store i1 [[BALLOT_NE_ZERO]], ptr addrspace(1) [[OUT]], align 1
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i32(
-; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
-; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @ballot_i32(
-; DCE-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
-; DCE-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
-; DCE-CHECK-NEXT: ret void
-;
- %c = trunc i32 %v to i1
- %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
- %ballot_ne_zero = icmp ne i32 %ballot, 0
- store i1 %ballot_ne_zero, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @ballot_i64(i32 %v, ptr addrspace(1) %out) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @ballot_i64(
-; CURRENT-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) writeonly captures(none) initializes((0, 1)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
-; CURRENT-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
-; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[C]])
-; CURRENT-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[TMP1]], 0
-; CURRENT-CHECK-NEXT: store i1 [[BALLOT_NE_ZERO]], ptr addrspace(1) [[OUT]], align 1
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i64(
-; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
-; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @ballot_i64(
-; DCE-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
-; DCE-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
-; DCE-CHECK-NEXT: ret void
-;
- %c = trunc i32 %v to i1
- %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
- %ballot_ne_zero = icmp ne i64 %ballot, 0
- store i1 %ballot_ne_zero, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @test_readlane_i16(i16 %src0, i32 %src1) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i16(
-; CURRENT-CHECK-SAME: i16 [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
-; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i16(
-; PASS-CHECK-SAME: i16 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: call void asm sideeffect "
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i16(
-; DCE-CHECK-SAME: i16 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: call void asm sideeffect "
-; DCE-CHECK-NEXT: ret void
-;
- %readlane = call i16 @llvm.amdgcn.readlane.i16(i16 %src0, i32 %src1)
- call void asm sideeffect "; use $0", "s"(i16 %readlane)
- ret void
-}
-
-define amdgpu_kernel void @test_readlane_i64(i64 %src0, i32 %src1) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i64(
-; CURRENT-CHECK-SAME: i64 [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] {
-; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i64(
-; PASS-CHECK-SAME: i64 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: call void asm sideeffect "
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i64(
-; DCE-CHECK-SAME: i64 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: call void asm sideeffect "
-; DCE-CHECK-NEXT: ret void
-;
- %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %src0, i32 %src1)
- call void asm sideeffect "; use $0", "s"(i64 %readlane)
- ret void
-}
-
-define amdgpu_kernel void @test_readlane_bf16(bfloat %src0, i32 %src1) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_bf16(
-; CURRENT-CHECK-SAME: bfloat [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] {
-; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_bf16(
-; PASS-CHECK-SAME: bfloat [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: call void asm sideeffect "
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_bf16(
-; DCE-CHECK-SAME: bfloat [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: call void asm sideeffect "
-; DCE-CHECK-NEXT: ret void
-;
- %readlane = call bfloat @llvm.amdgcn.readlane.bf16(bfloat %src0, i32 %src1)
- call void asm sideeffect "; use $0", "s"(bfloat %readlane)
- ret void
-}
-
-define amdgpu_kernel void @test_readlane_f16(half %src0, i32 %src1) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f16(
-; CURRENT-CHECK-SAME: half [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] {
-; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f16(
-; PASS-CHECK-SAME: half [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: call void asm sideeffect "
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f16(
-; DCE-CHECK-SAME: half [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: call void asm sideeffect "
-; DCE-CHECK-NEXT: ret void
-;
- %readlane = call half @llvm.amdgcn.readlane.f16(half %src0, i32 %src1)
- call void asm sideeffect "; use $0", "s"(half %readlane)
- ret void
-}
-
-define amdgpu_kernel void @test_readlane_f32(float %src0, i32 %src1) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f32(
-; CURRENT-CHECK-SAME: float [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] {
-; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f32(
-; PASS-CHECK-SAME: float [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: call void asm sideeffect "
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f32(
-; DCE-CHECK-SAME: float [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: call void asm sideeffect "
-; DCE-CHECK-NEXT: ret void
-;
- %readlane = call float @llvm.amdgcn.readlane.f32(float %src0, i32 %src1)
- call void asm sideeffect "; use $0", "s"(float %readlane)
- ret void
-}
-
-define amdgpu_kernel void @test_readlane_f64(double %src0, i32 %src1) {
-; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f64(
-; CURRENT-CHECK-SAME: double [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] {
-; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f64(
-; PASS-CHECK-SAME: double [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: call void asm sideeffect "
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f64(
-; DCE-CHECK-SAME: double [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: call void asm sideeffect "
-; DCE-CHECK-NEXT: ret void
-;
- %readlane = call double @llvm.amdgcn.readlane.f64(double %src0, i32 %src1)
- call void asm sideeffect "; use $0", "s"(double %readlane)
- ret void
-}
-; All such cases can be optimised, given generic way to query getDeclarationIfExists()
-define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) {
-; CURRENT-CHECK-LABEL: define void @test_readlane_v8i16(
-; CURRENT-CHECK-SAME: ptr addrspace(1) readnone captures(none) [[OUT:%.*]], <8 x i16> [[SRC:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] {
-; CURRENT-CHECK-NEXT: [[X:%.*]] = tail call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> [[SRC]], i32 [[SRC1]])
-; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
-; CURRENT-CHECK-NEXT: ret void
-;
-; PASS-CHECK-LABEL: define void @test_readlane_v8i16(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], <8 x i16> [[SRC:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[X:%.*]] = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> [[SRC]], i32 [[SRC1]])
-; PASS-CHECK-NEXT: call void asm sideeffect "
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define void @test_readlane_v8i16(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], <8 x i16> [[SRC:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: [[X:%.*]] = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> [[SRC]], i32 [[SRC1]])
-; DCE-CHECK-NEXT: call void asm sideeffect "
-; DCE-CHECK-NEXT: ret void
-;
- %x = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> %src, i32 %src1)
- call void asm sideeffect "; use $0", "s"(<8 x i16> %x)
- ret void
-}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10: {{.*}}
+; GFX11: {{.*}}
+; GFX12: {{.*}}
>From 41eeb1ea60af2eb8a3fb397cfae1251932eaee87 Mon Sep 17 00:00:00 2001
From: PankajDwivedi-25 <pankajkumar.divedi at amd.com>
Date: Fri, 22 Nov 2024 19:42:42 +0530
Subject: [PATCH 04/14] refactored and updated intrinsics handling
---
llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index c3358542c2bfa..417c8869145f9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -27,6 +27,9 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
>From 8f2e60dd6a8a9818c63084e84b42478431f5110c Mon Sep 17 00:00:00 2001
From: PankajDwivedi-25 <pankajkumar.divedi at amd.com>
Date: Fri, 6 Dec 2024 01:23:42 +0530
Subject: [PATCH 05/14] refactored, added more test
---
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 1 +
.../amdgpu-uniform-intrinsic-combine.ll | 222 ++++++++++++++++--
2 files changed, 204 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 417c8869145f9..05d07ef6b0846 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -33,6 +33,7 @@
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index e2db2d75c6831..53e5c486ccc8b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -24,18 +24,20 @@ define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @permlane64_sgpr(ptr addrspace(1) %out, i32 %src) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_sgpr(
+define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_uniform(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[SRC]])
+; GFX-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
; GFX-NEXT: ret void
;
- %v = call i32 @llvm.amdgcn.permlane64(i32 undef)
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %src)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @permlane64_vgpr(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_vgpr(
+define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
@@ -51,8 +53,8 @@ define amdgpu_kernel void @permlane64_vgpr(i32 addrspace(1)* %out) {
ret void
}
-define amdgpu_kernel void @permlane64_vgpr_expression(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_vgpr_expression(
+define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
@@ -91,8 +93,8 @@ define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @readlane_sgpr(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_sgpr(
+define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC0]], i32 [[SRC1]])
; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
@@ -103,8 +105,8 @@ define amdgpu_kernel void @readlane_sgpr(ptr addrspace(1) %out, i32 %src0, i32 %
ret void
}
-define amdgpu_kernel void @readlane_vgpr(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_vgpr(
+define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -122,8 +124,8 @@ define amdgpu_kernel void @readlane_vgpr(i32 addrspace(1)* %out) {
ret void
}
-define amdgpu_kernel void @readlane_vgpr_expression(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_vgpr_expression(
+define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -166,8 +168,8 @@ define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @readfirstlane_sgpr(ptr addrspace(1) %out, i32 %src0) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_sgpr(
+define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
@@ -178,8 +180,8 @@ define amdgpu_kernel void @readfirstlane_sgpr(ptr addrspace(1) %out, i32 %src0)
ret void
}
-define amdgpu_kernel void @readfirstlane_vgpr(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_vgpr(
+define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
@@ -195,8 +197,8 @@ define amdgpu_kernel void @readfirstlane_vgpr(i32 addrspace(1)* %out) {
ret void
}
-define amdgpu_kernel void @readfirstlane_vgpr_expression(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_vgpr_expression(
+define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_expression(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
@@ -214,6 +216,188 @@ define amdgpu_kernel void @readfirstlane_vgpr_expression(i32 addrspace(1)* %out)
ret void
}
+define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
+ %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
+ %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_readlane(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+
+define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr addrspace(1) %out_max) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_boundary(
+; GFX-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
+; GFX-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
+; GFX-NEXT: ret void
+;
+ %min_v = call i32 @llvm.amdgcn.permlane64(i32 -2147483648)
+ store i32 %min_v, ptr addrspace(1) %out_min
+ %max_v = call i32 @llvm.amdgcn.permlane64(i32 2147483647)
+ store i32 %max_v, ptr addrspace(1) %out_max
+ ret void
+}
+
+define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_cross_lane(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = add i32 %tidx, 5
+ %v = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_random(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %random = xor i32 123, 456
+ %v = call i32 @llvm.amdgcn.readfirstlane(i32 %random)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_invalid(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_invalid(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: ret void
+;
+ %undef_v = call i32 @llvm.amdgcn.permlane64(i32 undef)
+ store i32 %undef_v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_expression(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[IDX2:%.*]] = shl i32 [[IDX1]], 1
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+ %idx1 = call i32 @llvm.amdgcn.workitem.id.x()
+ %idx2 = mul i32 %idx1, 2
+ %v = call i32 @llvm.amdgcn.readlane(i32 %idx1, i32 %idx2)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+; Test case: Ensure that a loop with a divergent exit and a uniform value
+; used by an intrinsic outside the loop is not optimized due to temporal divergence.
+
+define amdgpu_kernel void @test_divergent_exit(ptr addrspace(1) %out, i32 %max_iter, i32 %div_cond) {
+; GFX-LABEL: define amdgpu_kernel void @test_divergent_exit(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[MAX_ITER:%.*]], i32 [[DIV_COND:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[ENTRY:.*:]]
+; GFX-NEXT: [[ITER:%.*]] = alloca i32, align 4
+; GFX-NEXT: store i32 0, ptr [[ITER]], align 4
+; GFX-NEXT: br label %[[LOOP:.*]]
+; GFX: [[LOOP]]:
+; GFX-NEXT: [[ITER_VAL:%.*]] = load i32, ptr [[ITER]], align 4
+; GFX-NEXT: [[NEW_ITER:%.*]] = add i32 [[ITER_VAL]], 1
+; GFX-NEXT: store i32 [[NEW_ITER]], ptr [[ITER]], align 4
+; GFX-NEXT: [[COND1:%.*]] = icmp sgt i32 [[NEW_ITER]], [[MAX_ITER]]
+; GFX-NEXT: [[COND2:%.*]] = icmp eq i32 [[DIV_COND]], 0
+; GFX-NEXT: [[EXIT:%.*]] = or i1 [[COND1]], [[COND2]]
+; GFX-NEXT: br i1 [[EXIT]], label %[[EXIT_BLOCK:.*]], label %[[LOOP]]
+; GFX: [[EXIT_BLOCK]]:
+; GFX-NEXT: [[FINAL_VAL:%.*]] = load i32, ptr [[ITER]], align 4
+; GFX-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[FINAL_VAL]])
+; GFX-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+entry:
+ %iter = alloca i32, align 4
+ store i32 0, ptr %iter, align 4
+ br label %loop
+
+loop:
+ ; Increment loop counter
+ %iter_val = load i32, ptr %iter, align 4
+ %new_iter = add i32 %iter_val, 1
+ store i32 %new_iter, ptr %iter, align 4
+
+ ; Check exit conditions
+ %cond1 = icmp sgt i32 %new_iter, %max_iter
+ %cond2 = icmp eq i32 %div_cond, 0
+ %exit = or i1 %cond1, %cond2
+ br i1 %exit, label %exit_block, label %loop
+
+exit_block:
+ ; Use the uniform value in an intrinsic outside the loop
+ %final_val = load i32, ptr %iter, align 4
+ %result = call i32 @llvm.amdgcn.permlane64(i32 %final_val)
+ store i32 %result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.permlane64(i32)
+
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX10: {{.*}}
; GFX11: {{.*}}
>From e1a9136eb87ea34bec5f611e3d7199052d8717b1 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Thu, 9 Jan 2025 23:24:16 +0530
Subject: [PATCH 06/14] integrated in pipeline, more test added
---
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 10 +++-
.../amdgpu-simplify-trivial-waterfall-loop.ll | 49 +++++++++++++++++
.../amdgpu-uniform-intrinsic-combine.ll | 53 +++++++++++++++++--
3 files changed, 107 insertions(+), 5 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 4958a200de4e0..6347259fb3c4e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -525,6 +525,11 @@ static cl::opt<bool> HasClosedWorldAssumption(
"amdgpu-link-time-closed-world",
cl::desc("Whether has closed-world assumption at link time"),
cl::init(false), cl::Hidden);
+
+static cl::opt<bool> EnableUniformIntrinsicCombine(
+ "amdgpu-enable-uniform-intrinsic-combine",
+ cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
+ cl::init(true), cl::Hidden);
static cl::opt<bool> EnableUniformIntrinsicCombine(
"amdgpu-enable-uniform-intrinsic-combine",
@@ -890,13 +895,16 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
});
PB.registerPeepholeEPCallback(
- [](FunctionPassManager &FPM, OptimizationLevel Level) {
+ [this](FunctionPassManager &FPM, OptimizationLevel Level) {
if (Level == OptimizationLevel::O0)
return;
FPM.addPass(AMDGPUUseNativeCallsPass());
if (EnableLibCallSimplify)
FPM.addPass(AMDGPUSimplifyLibCallsPass());
+
+ if (EnableUniformIntrinsicCombine)
+ FPM.addPass(AMDGPUUniformIntrinsicCombinePass(*this));
});
PB.registerCGSCCOptimizerLateEPCallback(
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
new file mode 100644
index 0000000000000..2a1fddff5f3c8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX,GFX10
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX,GFX11
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX,GFX12
+
+define amdgpu_kernel void @trivial_waterfall_loop(ptr addrspace(1) %out, i32 %src) {
+; CHECK-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
+; CHECK: store i32 %src, ptr addrspace(1) %out, align 4
+; CHECK-NOT: br label %loop
+; GFX-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
+; GFX-SAME: ptr addrspace(1) nocapture writeonly [[OUT:%.*]], i32 [[SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; GFX-NEXT: [[ENTRY:.*:]]
+; GFX-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+entry:
+ ; Initialize result to zero
+ %result = alloca i32, align 4
+ store i32 0, ptr %result, align 4
+ br label %loop
+
+loop:
+ ; Load the current result
+ %cur_result = load i32, ptr %result, align 4
+
+ ; Compute the next value
+ %next_value = add i32 %cur_result, %src
+
+ ; Apply the readfirstlane intrinsic for uniformity
+ %uniform_value = call i32 @llvm.amdgcn.readfirstlane(i32 %next_value)
+
+ ; Store the uniform result back
+ store i32 %uniform_value, ptr %result, align 4
+
+ ; This is a trivial loop that always exits after one iteration
+ br i1 true, label %exit, label %loop
+
+exit:
+ ; Store the result to the output pointer
+ %final_result = load i32, ptr %result, align 4
+ store i32 %final_result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10: {{.*}}
+; GFX11: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index 53e5c486ccc8b..43b3bc8b01b79 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -27,7 +27,6 @@ define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
; GFX-LABEL: define amdgpu_kernel void @permlane64_uniform(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[SRC]])
; GFX-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
; GFX-NEXT: ret void
;
@@ -96,7 +95,6 @@ define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC0]], i32 [[SRC1]])
; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
; GFX-NEXT: ret void
;
@@ -171,7 +169,6 @@ define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) {
; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
; GFX-NEXT: ret void
;
@@ -395,8 +392,56 @@ exit_block:
ret void
}
-declare i32 @llvm.amdgcn.permlane64(i32)
+; Trivial waterfall loop: readfirstlane of an already-uniform value inside a
+define amdgpu_kernel void @trivial_waterfall_loop(ptr addrspace(1) %out, i32 %src) {
+; CHECK-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
+; CHECK: store i32 %src, ptr addrspace(1) %out, align 4
+; CHECK-NOT: br label %loop
+; GFX-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; GFX-NEXT: [[ENTRY:.*:]]
+; GFX-NEXT: [[RESULT:%.*]] = alloca i32, align 4
+; GFX-NEXT: store i32 0, ptr [[RESULT]], align 4
+; GFX-NEXT: br label %[[LOOP:.*]]
+; GFX: [[LOOP]]:
+; GFX-NEXT: [[CUR_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
+; GFX-NEXT: [[NEXT_VALUE:%.*]] = add i32 [[CUR_RESULT]], [[SRC]]
+; GFX-NEXT: [[UNIFORM_VALUE:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[NEXT_VALUE]])
+; GFX-NEXT: store i32 [[UNIFORM_VALUE]], ptr [[RESULT]], align 4
+; GFX-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP]]
+; GFX: [[EXIT]]:
+; GFX-NEXT: [[FINAL_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
+; GFX-NEXT: store i32 [[FINAL_RESULT]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT: ret void
+;
+entry:
+ ; Initialize result to zero
+ %result = alloca i32, align 4
+ store i32 0, ptr %result, align 4
+ br label %loop
+loop:
+ ; Load the current result
+ %cur_result = load i32, ptr %result, align 4
+
+ ; Compute the next value
+ %next_value = add i32 %cur_result, %src
+
+ ; Apply the readfirstlane intrinsic for uniformity
+ %uniform_value = call i32 @llvm.amdgcn.readfirstlane(i32 %next_value)
+
+ ; Store the uniform result back
+ store i32 %uniform_value, ptr %result, align 4
+
+ ; This is a trivial loop that always exits after one iteration
+ br i1 true, label %exit, label %loop
+
+exit:
+ ; Store the result to the output pointer
+ %final_result = load i32, ptr %result, align 4
+ store i32 %final_result, ptr addrspace(1) %out, align 4
+ ret void
+}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX10: {{.*}}
>From bd3ad38d32a576fe45ebec0da99dc1a4a8eab6db Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 15 Jan 2025 16:58:15 +0530
Subject: [PATCH 07/14] removed unused gfx checks
---
.../amdgpu-simplify-trivial-waterfall-loop.ll | 21 +-
.../amdgpu-uniform-intrinsic-combine.ll | 351 +++++++++---------
2 files changed, 178 insertions(+), 194 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
index 2a1fddff5f3c8..56ba117ce1d30 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
@@ -1,18 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX,GFX10
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX,GFX11
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX,GFX12
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX10
define amdgpu_kernel void @trivial_waterfall_loop(ptr addrspace(1) %out, i32 %src) {
-; CHECK-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
-; CHECK: store i32 %src, ptr addrspace(1) %out, align 4
-; CHECK-NOT: br label %loop
-; GFX-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
-; GFX-SAME: ptr addrspace(1) nocapture writeonly [[OUT:%.*]], i32 [[SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; GFX-NEXT: [[ENTRY:.*:]]
-; GFX-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
+; GFX10-SAME: ptr addrspace(1) nocapture writeonly [[OUT:%.*]], i32 [[SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; GFX10-NEXT: [[ENTRY:.*:]]
+; GFX10-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
entry:
; Initialize result to zero
@@ -43,7 +38,3 @@ exit:
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10: {{.*}}
-; GFX11: {{.*}}
-; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index 43b3bc8b01b79..0a92ac1524a8a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -4,10 +4,10 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s -check-prefix=DCE-CHECK
define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_constant(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; GFX-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_constant(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX10-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 77)
store i32 %v, ptr addrspace(1) %out
@@ -15,9 +15,9 @@ define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_undef(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_undef(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -25,10 +25,10 @@ define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_uniform(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_uniform(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 %src)
store i32 %v, ptr addrspace(1) %out
@@ -36,14 +36,14 @@ define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
}
define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
-; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
+; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX10-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
@@ -53,15 +53,15 @@ define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
}
define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
-; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
+; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX10-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = add i32 %tid, 1
@@ -72,10 +72,10 @@ define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %o
}
define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_constant(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_constant(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 7, i32 5)
store i32 %v, ptr addrspace(1) %out
@@ -83,9 +83,9 @@ define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_undef(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_undef(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 undef, i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -93,10 +93,10 @@ define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
store i32 %v, ptr addrspace(1) %out
@@ -104,15 +104,15 @@ define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i3
}
define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
-; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX10-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -123,17 +123,17 @@ define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out)
}
define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
-; GFX-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
-; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
-; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX10-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
+; GFX10-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
+; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX10-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -146,10 +146,10 @@ define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out
}
define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_constant(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_constant(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 7)
store i32 %v, ptr addrspace(1) %out
@@ -157,9 +157,9 @@ define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_undef(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_undef(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -167,10 +167,10 @@ define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
store i32 %v, ptr addrspace(1) %out
@@ -178,14 +178,14 @@ define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i3
}
define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
-; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
+; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX10-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid)
@@ -195,15 +195,15 @@ define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out
}
define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_expression(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
-; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID2]] to i64
-; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_expression(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
+; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID2]] to i64
+; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX10-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = add i32 %tid, 1
@@ -214,10 +214,10 @@ define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
}
define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
%v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
@@ -226,13 +226,13 @@ define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %ou
}
define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX10-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX10-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -243,12 +243,12 @@ define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
+; GFX10-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
@@ -258,13 +258,13 @@ define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_readlane(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_readlane(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX10-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX10-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -276,11 +276,11 @@ define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr addrspace(1) %out_max) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_boundary(
-; GFX-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
-; GFX-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_boundary(
+; GFX10-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
+; GFX10-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
+; GFX10-NEXT: ret void
;
%min_v = call i32 @llvm.amdgcn.permlane64(i32 -2147483648)
store i32 %min_v, ptr addrspace(1) %out_min
@@ -290,13 +290,13 @@ define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr ad
}
define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_cross_lane(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_cross_lane(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = add i32 %tidx, 5
@@ -306,10 +306,10 @@ define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readfirstlane_random(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_random(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%random = xor i32 123, 456
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %random)
@@ -318,9 +318,9 @@ define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @permlane64_invalid(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @permlane64_invalid(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @permlane64_invalid(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: ret void
;
%undef_v = call i32 @llvm.amdgcn.permlane64(i32 undef)
store i32 %undef_v, ptr addrspace(1) %out
@@ -328,13 +328,13 @@ define amdgpu_kernel void @permlane64_invalid(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
-; GFX-LABEL: define amdgpu_kernel void @readlane_expression(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX-NEXT: [[IDX2:%.*]] = shl i32 [[IDX1]], 1
-; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
-; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @readlane_expression(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX10-NEXT: [[IDX2:%.*]] = shl i32 [[IDX1]], 1
+; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
+; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
%idx1 = call i32 @llvm.amdgcn.workitem.id.x()
%idx2 = mul i32 %idx1, 2
@@ -347,25 +347,25 @@ define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
; used by an intrinsic outside the loop is not optimized due to temporal divergence.
define amdgpu_kernel void @test_divergent_exit(ptr addrspace(1) %out, i32 %max_iter, i32 %div_cond) {
-; GFX-LABEL: define amdgpu_kernel void @test_divergent_exit(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[MAX_ITER:%.*]], i32 [[DIV_COND:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[ENTRY:.*:]]
-; GFX-NEXT: [[ITER:%.*]] = alloca i32, align 4
-; GFX-NEXT: store i32 0, ptr [[ITER]], align 4
-; GFX-NEXT: br label %[[LOOP:.*]]
-; GFX: [[LOOP]]:
-; GFX-NEXT: [[ITER_VAL:%.*]] = load i32, ptr [[ITER]], align 4
-; GFX-NEXT: [[NEW_ITER:%.*]] = add i32 [[ITER_VAL]], 1
-; GFX-NEXT: store i32 [[NEW_ITER]], ptr [[ITER]], align 4
-; GFX-NEXT: [[COND1:%.*]] = icmp sgt i32 [[NEW_ITER]], [[MAX_ITER]]
-; GFX-NEXT: [[COND2:%.*]] = icmp eq i32 [[DIV_COND]], 0
-; GFX-NEXT: [[EXIT:%.*]] = or i1 [[COND1]], [[COND2]]
-; GFX-NEXT: br i1 [[EXIT]], label %[[EXIT_BLOCK:.*]], label %[[LOOP]]
-; GFX: [[EXIT_BLOCK]]:
-; GFX-NEXT: [[FINAL_VAL:%.*]] = load i32, ptr [[ITER]], align 4
-; GFX-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[FINAL_VAL]])
-; GFX-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @test_divergent_exit(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[MAX_ITER:%.*]], i32 [[DIV_COND:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[ENTRY:.*:]]
+; GFX10-NEXT: [[ITER:%.*]] = alloca i32, align 4
+; GFX10-NEXT: store i32 0, ptr [[ITER]], align 4
+; GFX10-NEXT: br label %[[LOOP:.*]]
+; GFX10: [[LOOP]]:
+; GFX10-NEXT: [[ITER_VAL:%.*]] = load i32, ptr [[ITER]], align 4
+; GFX10-NEXT: [[NEW_ITER:%.*]] = add i32 [[ITER_VAL]], 1
+; GFX10-NEXT: store i32 [[NEW_ITER]], ptr [[ITER]], align 4
+; GFX10-NEXT: [[COND1:%.*]] = icmp sgt i32 [[NEW_ITER]], [[MAX_ITER]]
+; GFX10-NEXT: [[COND2:%.*]] = icmp eq i32 [[DIV_COND]], 0
+; GFX10-NEXT: [[EXIT:%.*]] = or i1 [[COND1]], [[COND2]]
+; GFX10-NEXT: br i1 [[EXIT]], label %[[EXIT_BLOCK:.*]], label %[[LOOP]]
+; GFX10: [[EXIT_BLOCK]]:
+; GFX10-NEXT: [[FINAL_VAL:%.*]] = load i32, ptr [[ITER]], align 4
+; GFX10-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[FINAL_VAL]])
+; GFX10-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
entry:
%iter = alloca i32, align 4
@@ -394,25 +394,22 @@ exit_block:
; Define the kernel function
define amdgpu_kernel void @trivial_waterfall_loop(ptr addrspace(1) %out, i32 %src) {
-; CHECK-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
-; CHECK: store i32 %src, ptr addrspace(1) %out, align 4
-; CHECK-NOT: br label %loop
-; GFX-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
-; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
-; GFX-NEXT: [[ENTRY:.*:]]
-; GFX-NEXT: [[RESULT:%.*]] = alloca i32, align 4
-; GFX-NEXT: store i32 0, ptr [[RESULT]], align 4
-; GFX-NEXT: br label %[[LOOP:.*]]
-; GFX: [[LOOP]]:
-; GFX-NEXT: [[CUR_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
-; GFX-NEXT: [[NEXT_VALUE:%.*]] = add i32 [[CUR_RESULT]], [[SRC]]
-; GFX-NEXT: [[UNIFORM_VALUE:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[NEXT_VALUE]])
-; GFX-NEXT: store i32 [[UNIFORM_VALUE]], ptr [[RESULT]], align 4
-; GFX-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP]]
-; GFX: [[EXIT]]:
-; GFX-NEXT: [[FINAL_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
-; GFX-NEXT: store i32 [[FINAL_RESULT]], ptr addrspace(1) [[OUT]], align 4
-; GFX-NEXT: ret void
+; GFX10-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
+; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[ENTRY:.*:]]
+; GFX10-NEXT: [[RESULT:%.*]] = alloca i32, align 4
+; GFX10-NEXT: store i32 0, ptr [[RESULT]], align 4
+; GFX10-NEXT: br label %[[LOOP:.*]]
+; GFX10: [[LOOP]]:
+; GFX10-NEXT: [[CUR_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
+; GFX10-NEXT: [[NEXT_VALUE:%.*]] = add i32 [[CUR_RESULT]], [[SRC]]
+; GFX10-NEXT: [[UNIFORM_VALUE:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[NEXT_VALUE]])
+; GFX10-NEXT: store i32 [[UNIFORM_VALUE]], ptr [[RESULT]], align 4
+; GFX10-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP]]
+; GFX10: [[EXIT]]:
+; GFX10-NEXT: [[FINAL_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
+; GFX10-NEXT: store i32 [[FINAL_RESULT]], ptr addrspace(1) [[OUT]], align 4
+; GFX10-NEXT: ret void
;
entry:
; Initialize result to zero
@@ -443,7 +440,3 @@ exit:
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10: {{.*}}
-; GFX11: {{.*}}
-; GFX12: {{.*}}
>From 0ce7930002e2140a09d235a69b0cf5c71597d385 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Fri, 7 Feb 2025 20:08:45 +0530
Subject: [PATCH 08/14] added pass to llc pipeline, more test added
---
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 7 +
...amdgpu-miscellaneous-uniform-intrinsics.ll | 132 +++++++
.../amdgpu-uniform-intrinsic-combine.ll | 371 +++++++-----------
4 files changed, 278 insertions(+), 234 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 6347259fb3c4e..1d84497600d65 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1318,6 +1318,8 @@ void AMDGPUPassConfig::addIRPasses() {
isPassEnabled(EnableImageIntrinsicOptimizer))
addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
+ if (EnableUniformIntrinsicCombine)
+ addPass(createAMDGPUUniformIntrinsicCombineLegacyPass(&TM));
// This can be disabled by passing ::Disable here or on the command line
// with --expand-variadics-override=disable.
addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 05d07ef6b0846..bd2dca541a276 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -66,7 +66,14 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
switch (IID) {
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
+<<<<<<< HEAD
>>>>>>> f6cc7aa1522d (store newly inserted inst and its uniformity)
+=======
+ case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_ballot: {
+ Value *Src = II.getArgOperand(0);
+ if (isDivergentUseWithNew(II.getOperandUse(0), UI, NewUMap))
+>>>>>>> b854d2fb61ff (added pass to llc pipeline, more test added)
return false;
LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n');
II.replaceAllUsesWith(Src);
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
new file mode 100644
index 0000000000000..82f92d2ccb550
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+
+define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readfirstlane_with_readfirstlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
+ %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readfirstlane_with_readlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_bfe_u32 v1, v0, 10, 10
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s2, v1
+; CHECK-NEXT: v_readlane_b32 s2, v0, s2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readlane_with_firstlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s2, v0
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
+ %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readlane_readlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_bfe_u32 v1, v0, 10, 10
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s2, v1
+; CHECK-NEXT: v_readlane_b32 s2, v0, s2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
+; CHECK-LABEL: permlane64_uniform:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s2, s[4:5], 0x8
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %src)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
+; CHECK-LABEL: permlane64_nonuniform:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_permlane64_b32 v1, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
+; CHECK-LABEL: permlane64_nonuniform_expression:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: v_permlane64_b32 v1, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid2 = add i32 %tid, 1
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index 0a92ac1524a8a..d95d14c912dd4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -4,10 +4,10 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s -check-prefix=DCE-CHECK
define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @permlane64_constant(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; GFX10-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_constant(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; PASS-CHECK-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 77)
store i32 %v, ptr addrspace(1) %out
@@ -15,9 +15,9 @@ define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @permlane64_undef(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_undef(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -25,10 +25,10 @@ define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
-; GFX10-LABEL: define amdgpu_kernel void @permlane64_uniform(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 %src)
store i32 %v, ptr addrspace(1) %out
@@ -36,14 +36,14 @@ define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
}
define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
-; GFX10-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
-; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
+; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
@@ -53,15 +53,15 @@ define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
}
define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
-; GFX10-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
-; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
-; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
+; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = add i32 %tid, 1
@@ -72,10 +72,10 @@ define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %o
}
define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readlane_constant(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_constant(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 7, i32 5)
store i32 %v, ptr addrspace(1) %out
@@ -83,9 +83,9 @@ define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readlane_undef(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_undef(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 undef, i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -93,10 +93,10 @@ define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX10-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
store i32 %v, ptr addrspace(1) %out
@@ -104,15 +104,15 @@ define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i3
}
define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
-; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -123,17 +123,17 @@ define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out)
}
define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX10-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
-; GFX10-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
-; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
-; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
-; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; PASS-CHECK-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
+; PASS-CHECK-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
+; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -146,10 +146,10 @@ define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out
}
define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_constant(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 7)
store i32 %v, ptr addrspace(1) %out
@@ -157,9 +157,9 @@ define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_undef(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_undef(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -167,10 +167,10 @@ define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) {
-; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
store i32 %v, ptr addrspace(1) %out
@@ -178,14 +178,14 @@ define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i3
}
define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
-; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
+; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid)
@@ -195,15 +195,15 @@ define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out
}
define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_expression(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
-; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
-; GFX10-NEXT: [[TMP1:%.*]] = sext i32 [[TID2]] to i64
-; GFX10-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
-; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_expression(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
+; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TID2]] to i64
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = add i32 %tid, 1
@@ -214,10 +214,10 @@ define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
}
define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
%v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
@@ -226,13 +226,13 @@ define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %ou
}
define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX10-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX10-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -243,12 +243,12 @@ define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
-; GFX10-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
+; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
@@ -258,13 +258,13 @@ define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readlane_readlane(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; GFX10-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX10-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_readlane(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -276,11 +276,11 @@ define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr addrspace(1) %out_max) {
-; GFX10-LABEL: define amdgpu_kernel void @permlane64_boundary(
-; GFX10-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
-; GFX10-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_boundary(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
+; PASS-CHECK-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
+; PASS-CHECK-NEXT: ret void
;
%min_v = call i32 @llvm.amdgcn.permlane64(i32 -2147483648)
store i32 %min_v, ptr addrspace(1) %out_min
@@ -290,13 +290,13 @@ define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr ad
}
define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readlane_cross_lane(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
-; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_cross_lane(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = add i32 %tidx, 5
@@ -306,10 +306,10 @@ define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readfirstlane_random(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%random = xor i32 123, 456
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %random)
@@ -318,9 +318,9 @@ define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @permlane64_invalid(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @permlane64_invalid(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_invalid(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: ret void
;
%undef_v = call i32 @llvm.amdgcn.permlane64(i32 undef)
store i32 %undef_v, ptr addrspace(1) %out
@@ -328,13 +328,13 @@ define amdgpu_kernel void @permlane64_invalid(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
-; GFX10-LABEL: define amdgpu_kernel void @readlane_expression(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX10-NEXT: [[IDX2:%.*]] = shl i32 [[IDX1]], 1
-; GFX10-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
-; GFX10-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
+; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_expression(
+; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; PASS-CHECK-NEXT: [[IDX2:%.*]] = shl i32 [[IDX1]], 1
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: ret void
;
%idx1 = call i32 @llvm.amdgcn.workitem.id.x()
%idx2 = mul i32 %idx1, 2
@@ -343,100 +343,3 @@ define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
ret void
}
-; Test case: Ensure that a loop with a divergent exit and a uniform value
-; used by an intrinsic outside the loop is not optimized due to temporal divergence.
-
-define amdgpu_kernel void @test_divergent_exit(ptr addrspace(1) %out, i32 %max_iter, i32 %div_cond) {
-; GFX10-LABEL: define amdgpu_kernel void @test_divergent_exit(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[MAX_ITER:%.*]], i32 [[DIV_COND:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[ENTRY:.*:]]
-; GFX10-NEXT: [[ITER:%.*]] = alloca i32, align 4
-; GFX10-NEXT: store i32 0, ptr [[ITER]], align 4
-; GFX10-NEXT: br label %[[LOOP:.*]]
-; GFX10: [[LOOP]]:
-; GFX10-NEXT: [[ITER_VAL:%.*]] = load i32, ptr [[ITER]], align 4
-; GFX10-NEXT: [[NEW_ITER:%.*]] = add i32 [[ITER_VAL]], 1
-; GFX10-NEXT: store i32 [[NEW_ITER]], ptr [[ITER]], align 4
-; GFX10-NEXT: [[COND1:%.*]] = icmp sgt i32 [[NEW_ITER]], [[MAX_ITER]]
-; GFX10-NEXT: [[COND2:%.*]] = icmp eq i32 [[DIV_COND]], 0
-; GFX10-NEXT: [[EXIT:%.*]] = or i1 [[COND1]], [[COND2]]
-; GFX10-NEXT: br i1 [[EXIT]], label %[[EXIT_BLOCK:.*]], label %[[LOOP]]
-; GFX10: [[EXIT_BLOCK]]:
-; GFX10-NEXT: [[FINAL_VAL:%.*]] = load i32, ptr [[ITER]], align 4
-; GFX10-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[FINAL_VAL]])
-; GFX10-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
-;
-entry:
- %iter = alloca i32, align 4
- store i32 0, ptr %iter, align 4
- br label %loop
-
-loop:
- ; Increment loop counter
- %iter_val = load i32, ptr %iter, align 4
- %new_iter = add i32 %iter_val, 1
- store i32 %new_iter, ptr %iter, align 4
-
- ; Check exit conditions
- %cond1 = icmp sgt i32 %new_iter, %max_iter
- %cond2 = icmp eq i32 %div_cond, 0
- %exit = or i1 %cond1, %cond2
- br i1 %exit, label %exit_block, label %loop
-
-exit_block:
- ; Use the uniform value in an intrinsic outside the loop
- %final_val = load i32, ptr %iter, align 4
- %result = call i32 @llvm.amdgcn.permlane64(i32 %final_val)
- store i32 %result, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; Define the kernel function
-define amdgpu_kernel void @trivial_waterfall_loop(ptr addrspace(1) %out, i32 %src) {
-; GFX10-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
-; GFX10-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[ENTRY:.*:]]
-; GFX10-NEXT: [[RESULT:%.*]] = alloca i32, align 4
-; GFX10-NEXT: store i32 0, ptr [[RESULT]], align 4
-; GFX10-NEXT: br label %[[LOOP:.*]]
-; GFX10: [[LOOP]]:
-; GFX10-NEXT: [[CUR_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
-; GFX10-NEXT: [[NEXT_VALUE:%.*]] = add i32 [[CUR_RESULT]], [[SRC]]
-; GFX10-NEXT: [[UNIFORM_VALUE:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[NEXT_VALUE]])
-; GFX10-NEXT: store i32 [[UNIFORM_VALUE]], ptr [[RESULT]], align 4
-; GFX10-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP]]
-; GFX10: [[EXIT]]:
-; GFX10-NEXT: [[FINAL_RESULT:%.*]] = load i32, ptr [[RESULT]], align 4
-; GFX10-NEXT: store i32 [[FINAL_RESULT]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
-;
-entry:
- ; Initialize result to zero
- %result = alloca i32, align 4
- store i32 0, ptr %result, align 4
- br label %loop
-
-loop:
- ; Load the current result
- %cur_result = load i32, ptr %result, align 4
-
- ; Compute the next value
- %next_value = add i32 %cur_result, %src
-
- ; Apply the readfirstlane intrinsic for uniformity
- %uniform_value = call i32 @llvm.amdgcn.readfirstlane(i32 %next_value)
-
- ; Store the uniform result back
- store i32 %uniform_value, ptr %result, align 4
-
- ; This is a trivial loop that always exits after one iteration
- br i1 true, label %exit, label %loop
-
-exit:
- ; Store the result to the output pointer
- %final_result = load i32, ptr %result, align 4
- store i32 %final_result, ptr addrspace(1) %out, align 4
- ret void
-}
-
>From e9c77b5cff1a6805d233d23f890d123c37da7991 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 18 Feb 2025 09:58:43 +0530
Subject: [PATCH 09/14] handled ballot with icmp for trivial waterfall
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 4 +
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +-
.../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 4 +
...amdgpu-miscellaneous-uniform-intrinsics.ll | 3 +-
.../amdgpu-simplify-trivial-waterfall-loop.ll | 40 ----
.../amdgpu-uniform-intrinsic-combine.ll | 205 ++++++++++++++++--
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 23 ++
7 files changed, 224 insertions(+), 57 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index ce2b4a5f6f2e9..c28d1c6925415 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -562,6 +562,10 @@ class AMDGPURewriteAGPRCopyMFMAPass
void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &);
extern char &AMDGPURewriteAGPRCopyMFMALegacyID;
+void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &);
+extern char &AMDGPUUniformIntrinsicCombineLegacyPassID;
+FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass();
+
struct AMDGPUUniformIntrinsicCombinePass
: public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 1d84497600d65..ea6177c573a41 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1319,7 +1319,7 @@ void AMDGPUPassConfig::addIRPasses() {
addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
if (EnableUniformIntrinsicCombine)
- addPass(createAMDGPUUniformIntrinsicCombineLegacyPass(&TM));
+ addPass(createAMDGPUUniformIntrinsicCombineLegacyPass());
// This can be disabled by passing ::Disable here or on the command line
// with --expand-variadics-override=disable.
addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index bd2dca541a276..cdd6df940ecf6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -67,10 +67,14 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
<<<<<<< HEAD
+<<<<<<< HEAD
>>>>>>> f6cc7aa1522d (store newly inserted inst and its uniformity)
=======
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_ballot: {
+=======
+ case Intrinsic::amdgcn_readlane: {
+>>>>>>> cdb629c19b74 (handled ballot with icmp for trivial waterfall)
Value *Src = II.getArgOperand(0);
if (isDivergentUseWithNew(II.getOperandUse(0), UI, NewUMap))
>>>>>>> b854d2fb61ff (added pass to llc pipeline, more test added)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
index 82f92d2ccb550..f450b0e6763c4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck %s
define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
; CHECK-LABEL: readfirstlane_with_readfirstlane:
@@ -129,4 +129,3 @@ define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %o
store i32 %v, i32 addrspace(1)* %out_ptr
ret void
}
-
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
deleted file mode 100644
index 56ba117ce1d30..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-trivial-waterfall-loop.ll
+++ /dev/null
@@ -1,40 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes="default<O1>" -S < %s | FileCheck %s --check-prefixes=GFX10
-
-define amdgpu_kernel void @trivial_waterfall_loop(ptr addrspace(1) %out, i32 %src) {
-; GFX10-LABEL: define amdgpu_kernel void @trivial_waterfall_loop(
-; GFX10-SAME: ptr addrspace(1) nocapture writeonly [[OUT:%.*]], i32 [[SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; GFX10-NEXT: [[ENTRY:.*:]]
-; GFX10-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
-; GFX10-NEXT: ret void
-;
-entry:
- ; Initialize result to zero
- %result = alloca i32, align 4
- store i32 0, ptr %result, align 4
- br label %loop
-
-loop:
- ; Load the current result
- %cur_result = load i32, ptr %result, align 4
-
- ; Compute the next value
- %next_value = add i32 %cur_result, %src
-
- ; Apply the readfirstlane intrinsic for uniformity
- %uniform_value = call i32 @llvm.amdgcn.readfirstlane(i32 %next_value)
-
- ; Store the uniform result back
- store i32 %uniform_value, ptr %result, align 4
-
- ; This is a trivial loop that always exits after one iteration
- br i1 true, label %exit, label %loop
-
-exit:
- ; Store the result to the output pointer
- %final_result = load i32, ptr %result, align 4
- store i32 %final_result, ptr addrspace(1) %out, align 4
- ret void
-}
-
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index d95d14c912dd4..1004a957e60c2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -6,8 +6,14 @@
define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_constant(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 77)
; PASS-CHECK-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_constant(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; DCE-CHECK-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 77)
store i32 %v, ptr addrspace(1) %out
@@ -17,7 +23,14 @@ define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_undef(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 undef)
+; PASS-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_undef(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -27,8 +40,14 @@ define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[SRC]])
; PASS-CHECK-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.permlane64(i32 %src)
store i32 %v, ptr addrspace(1) %out
@@ -40,10 +59,17 @@ define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
-; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
+; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
@@ -58,10 +84,18 @@ define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %o
; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
-; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
+; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = add i32 %tid, 1
@@ -74,8 +108,14 @@ define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %o
define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_constant(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 7, i32 5)
; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_constant(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 7, i32 5)
store i32 %v, ptr addrspace(1) %out
@@ -85,7 +125,14 @@ define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_undef(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 undef, i32 undef)
+; PASS-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_undef(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 undef, i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -95,8 +142,14 @@ define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC0]], i32 [[SRC1]])
; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
store i32 %v, ptr addrspace(1) %out
@@ -109,10 +162,18 @@ define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out)
; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
-; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]]
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]]
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -130,10 +191,20 @@ define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out
; PASS-CHECK-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
; PASS-CHECK-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
-; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
-; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]]
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; DCE-CHECK-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
+; DCE-CHECK-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
+; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]]
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -148,8 +219,14 @@ define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out
define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 7)
; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 7)
store i32 %v, ptr addrspace(1) %out
@@ -159,7 +236,14 @@ define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_undef(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 undef)
+; PASS-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_undef(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 undef)
store i32 %v, ptr addrspace(1) %out
@@ -169,8 +253,14 @@ define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
store i32 %v, ptr addrspace(1) %out
@@ -182,10 +272,17 @@ define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
-; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
-; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
+; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]]
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid)
@@ -200,10 +297,18 @@ define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
-; PASS-CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TID2]] to i64
-; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID2]]
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_expression(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
+; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID2]]
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = add i32 %tid, 1
@@ -216,8 +321,15 @@ define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) {
define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 5)
+; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 5)
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
%v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
@@ -231,8 +343,17 @@ define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]])
; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -247,8 +368,16 @@ define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
+; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[V1]], i32 3)
; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_with_firstlane(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]])
+; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
@@ -263,8 +392,17 @@ define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[V1]], i32 2)
; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_readlane(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -278,9 +416,17 @@ define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr addrspace(1) %out_max) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_boundary(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[MIN_V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 -2147483648)
; PASS-CHECK-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
+; PASS-CHECK-NEXT: [[MAX_V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 2147483647)
; PASS-CHECK-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_boundary(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4
+; DCE-CHECK-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4
+; DCE-CHECK-NEXT: ret void
;
%min_v = call i32 @llvm.amdgcn.permlane64(i32 -2147483648)
store i32 %min_v, ptr addrspace(1) %out_min
@@ -297,6 +443,14 @@ define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_cross_lane(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = add i32 %tidx, 5
@@ -308,8 +462,16 @@ define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) {
define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[RANDOM]])
+; PASS-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456
+; DCE-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%random = xor i32 123, 456
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %random)
@@ -320,7 +482,14 @@ define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
define amdgpu_kernel void @permlane64_invalid(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_invalid(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; PASS-CHECK-NEXT: [[UNDEF_V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 undef)
+; PASS-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_invalid(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%undef_v = call i32 @llvm.amdgcn.permlane64(i32 undef)
store i32 %undef_v, ptr addrspace(1) %out
@@ -331,10 +500,18 @@ define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_expression(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; PASS-CHECK-NEXT: [[IDX2:%.*]] = shl i32 [[IDX1]], 1
+; PASS-CHECK-NEXT: [[IDX2:%.*]] = mul i32 [[IDX1]], 2
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
+;
+; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_expression(
+; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; DCE-CHECK-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; DCE-CHECK-NEXT: [[IDX2:%.*]] = mul i32 [[IDX1]], 2
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]])
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: ret void
;
%idx1 = call i32 @llvm.amdgcn.workitem.id.x()
%idx2 = mul i32 %idx1, 2
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 6e5212580ba2e..1ba47329a0d61 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -31,6 +31,11 @@
; GCN-O0-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O0-NEXT: AMDGPU Printf lowering
; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU
+; GCN-O0-NEXT: FunctionPass Manager
+; GCN-O0-NEXT: Dominator Tree Construction
+; GCN-O0-NEXT: Cycle Info Analysis
+; GCN-O0-NEXT: Uniformity Analysis
+; GCN-O0-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O0-NEXT: Expand variadic functions
; GCN-O0-NEXT: AMDGPU Inline All Functions
; GCN-O0-NEXT: Inliner for always_inline functions
@@ -179,6 +184,11 @@
; GCN-O1-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O1-NEXT: AMDGPU Printf lowering
; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU
+; GCN-O1-NEXT: FunctionPass Manager
+; GCN-O1-NEXT: Dominator Tree Construction
+; GCN-O1-NEXT: Cycle Info Analysis
+; GCN-O1-NEXT: Uniformity Analysis
+; GCN-O1-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O1-NEXT: Expand variadic functions
; GCN-O1-NEXT: AMDGPU Inline All Functions
; GCN-O1-NEXT: Inliner for always_inline functions
@@ -466,6 +476,11 @@
; GCN-O1-OPTS-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O1-OPTS-NEXT: AMDGPU Printf lowering
; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU
+; GCN-O1-OPTS-NEXT: FunctionPass Manager
+; GCN-O1-OPTS-NEXT: Dominator Tree Construction
+; GCN-O1-OPTS-NEXT: Cycle Info Analysis
+; GCN-O1-OPTS-NEXT: Uniformity Analysis
+; GCN-O1-OPTS-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O1-OPTS-NEXT: Expand variadic functions
; GCN-O1-OPTS-NEXT: AMDGPU Inline All Functions
; GCN-O1-OPTS-NEXT: Inliner for always_inline functions
@@ -783,6 +798,10 @@
; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: AMDGPU Image Intrinsic Optimizer
+; GCN-O2-NEXT: Dominator Tree Construction
+; GCN-O2-NEXT: Cycle Info Analysis
+; GCN-O2-NEXT: Uniformity Analysis
+; GCN-O2-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O2-NEXT: Expand variadic functions
; GCN-O2-NEXT: AMDGPU Inline All Functions
; GCN-O2-NEXT: Inliner for always_inline functions
@@ -1104,6 +1123,10 @@
; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: AMDGPU Image Intrinsic Optimizer
+; GCN-O3-NEXT: Dominator Tree Construction
+; GCN-O3-NEXT: Cycle Info Analysis
+; GCN-O3-NEXT: Uniformity Analysis
+; GCN-O3-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O3-NEXT: Expand variadic functions
; GCN-O3-NEXT: AMDGPU Inline All Functions
; GCN-O3-NEXT: Inliner for always_inline functions
>From c1e3d566dea4794f381bd8b6eebd8c1c4ea1e079 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Mon, 24 Feb 2025 15:43:13 +0530
Subject: [PATCH 10/14] Rebase: resolve merge
---
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 -
...amdgpu-miscellaneous-uniform-intrinsics.ll | 131 ------------------
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 23 ---
3 files changed, 156 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ea6177c573a41..6347259fb3c4e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1318,8 +1318,6 @@ void AMDGPUPassConfig::addIRPasses() {
isPassEnabled(EnableImageIntrinsicOptimizer))
addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
- if (EnableUniformIntrinsicCombine)
- addPass(createAMDGPUUniformIntrinsicCombineLegacyPass());
// This can be disabled by passing ::Disable here or on the command line
// with --expand-variadics-override=disable.
addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
deleted file mode 100644
index f450b0e6763c4..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
+++ /dev/null
@@ -1,131 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck %s
-
-define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
-; CHECK-LABEL: readfirstlane_with_readfirstlane:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; CHECK-NEXT: s_endpgm
- %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
- %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
- store i32 %v2, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
-; CHECK-LABEL: readfirstlane_with_readlane:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: v_bfe_u32 v1, v0, 10, 10
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT: v_readfirstlane_b32 s2, v1
-; CHECK-NEXT: v_readlane_b32 s2, v0, s2
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; CHECK-NEXT: s_endpgm
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %tidy = call i32 @llvm.amdgcn.workitem.id.y()
- %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
- %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
- store i32 %v2, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
-; CHECK-LABEL: readlane_with_firstlane:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT: v_readfirstlane_b32 s2, v0
-; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; CHECK-NEXT: s_endpgm
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
- %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3)
- store i32 %v2, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
-; CHECK-LABEL: readlane_readlane:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: v_bfe_u32 v1, v0, 10, 10
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT: v_readfirstlane_b32 s2, v1
-; CHECK-NEXT: v_readlane_b32 s2, v0, s2
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; CHECK-NEXT: s_endpgm
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %tidy = call i32 @llvm.amdgcn.workitem.id.y()
- %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
- %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2)
- store i32 %v2, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
-; CHECK-LABEL: permlane64_uniform:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s2, s[4:5], 0x8
-; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; CHECK-NEXT: s_endpgm
- %v = call i32 @llvm.amdgcn.permlane64(i32 %src)
- store i32 %v, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
-; CHECK-LABEL: permlane64_nonuniform:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT: v_permlane64_b32 v1, v0
-; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; CHECK-NEXT: s_endpgm
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
- %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- store i32 %v, i32 addrspace(1)* %out_ptr
- ret void
-}
-
-define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
-; CHECK-LABEL: permlane64_nonuniform_expression:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v0
-; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CHECK-NEXT: v_permlane64_b32 v1, v1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; CHECK-NEXT: s_endpgm
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid2 = add i32 %tid, 1
- %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2)
- %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- store i32 %v, i32 addrspace(1)* %out_ptr
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 1ba47329a0d61..6e5212580ba2e 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -31,11 +31,6 @@
; GCN-O0-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O0-NEXT: AMDGPU Printf lowering
; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU
-; GCN-O0-NEXT: FunctionPass Manager
-; GCN-O0-NEXT: Dominator Tree Construction
-; GCN-O0-NEXT: Cycle Info Analysis
-; GCN-O0-NEXT: Uniformity Analysis
-; GCN-O0-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O0-NEXT: Expand variadic functions
; GCN-O0-NEXT: AMDGPU Inline All Functions
; GCN-O0-NEXT: Inliner for always_inline functions
@@ -184,11 +179,6 @@
; GCN-O1-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O1-NEXT: AMDGPU Printf lowering
; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU
-; GCN-O1-NEXT: FunctionPass Manager
-; GCN-O1-NEXT: Dominator Tree Construction
-; GCN-O1-NEXT: Cycle Info Analysis
-; GCN-O1-NEXT: Uniformity Analysis
-; GCN-O1-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O1-NEXT: Expand variadic functions
; GCN-O1-NEXT: AMDGPU Inline All Functions
; GCN-O1-NEXT: Inliner for always_inline functions
@@ -476,11 +466,6 @@
; GCN-O1-OPTS-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O1-OPTS-NEXT: AMDGPU Printf lowering
; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU
-; GCN-O1-OPTS-NEXT: FunctionPass Manager
-; GCN-O1-OPTS-NEXT: Dominator Tree Construction
-; GCN-O1-OPTS-NEXT: Cycle Info Analysis
-; GCN-O1-OPTS-NEXT: Uniformity Analysis
-; GCN-O1-OPTS-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O1-OPTS-NEXT: Expand variadic functions
; GCN-O1-OPTS-NEXT: AMDGPU Inline All Functions
; GCN-O1-OPTS-NEXT: Inliner for always_inline functions
@@ -798,10 +783,6 @@
; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: AMDGPU Image Intrinsic Optimizer
-; GCN-O2-NEXT: Dominator Tree Construction
-; GCN-O2-NEXT: Cycle Info Analysis
-; GCN-O2-NEXT: Uniformity Analysis
-; GCN-O2-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O2-NEXT: Expand variadic functions
; GCN-O2-NEXT: AMDGPU Inline All Functions
; GCN-O2-NEXT: Inliner for always_inline functions
@@ -1123,10 +1104,6 @@
; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: AMDGPU Image Intrinsic Optimizer
-; GCN-O3-NEXT: Dominator Tree Construction
-; GCN-O3-NEXT: Cycle Info Analysis
-; GCN-O3-NEXT: Uniformity Analysis
-; GCN-O3-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O3-NEXT: Expand variadic functions
; GCN-O3-NEXT: AMDGPU Inline All Functions
; GCN-O3-NEXT: Inliner for always_inline functions
>From dc3d0f8e11e4e5dcaf86c0975f5affe501f07003 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Mon, 24 Feb 2025 16:21:16 +0530
Subject: [PATCH 11/14] remove undef test
---
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +-
.../amdgpu-uniform-intrinsic-combine.ll | 68 -------------------
2 files changed, 1 insertion(+), 69 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 6347259fb3c4e..ed7acec2df8d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -525,7 +525,7 @@ static cl::opt<bool> HasClosedWorldAssumption(
"amdgpu-link-time-closed-world",
cl::desc("Whether has closed-world assumption at link time"),
cl::init(false), cl::Hidden);
-
+
static cl::opt<bool> EnableUniformIntrinsicCombine(
"amdgpu-enable-uniform-intrinsic-combine",
cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index 1004a957e60c2..14ca6e62b68ec 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -20,23 +20,6 @@ define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
-; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_undef(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 undef)
-; PASS-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_undef(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
-;
- %v = call i32 @llvm.amdgcn.permlane64(i32 undef)
- store i32 %v, ptr addrspace(1) %out
- ret void
-}
-
define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
@@ -122,23 +105,6 @@ define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_undef(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 undef, i32 undef)
-; PASS-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_undef(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
-;
- %v = call i32 @llvm.amdgcn.readlane(i32 undef, i32 undef)
- store i32 %v, ptr addrspace(1) %out
- ret void
-}
-
define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
@@ -233,23 +199,6 @@ define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
-; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_undef(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 undef)
-; PASS-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_undef(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
-;
- %v = call i32 @llvm.amdgcn.readfirstlane(i32 undef)
- store i32 %v, ptr addrspace(1) %out
- ret void
-}
-
define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
@@ -479,23 +428,6 @@ define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @permlane64_invalid(ptr addrspace(1) %out) {
-; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_invalid(
-; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: [[UNDEF_V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 undef)
-; PASS-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
-; PASS-CHECK-NEXT: ret void
-;
-; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_invalid(
-; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 undef, ptr addrspace(1) [[OUT]], align 4
-; DCE-CHECK-NEXT: ret void
-;
- %undef_v = call i32 @llvm.amdgcn.permlane64(i32 undef)
- store i32 %undef_v, ptr addrspace(1) %out
- ret void
-}
-
define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_expression(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
>From d2f1e4e6ab3bedd59201c0ead0898babbd42e884 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 25 Feb 2025 12:44:06 +0530
Subject: [PATCH 12/14] added pass to llc pipeline
---
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +
...amdgpu-miscellaneous-uniform-intrinsics.ll | 131 ++++++++++++++++++
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 23 +++
3 files changed, 156 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ed7acec2df8d5..dd5c6a9ff3d36 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1318,6 +1318,8 @@ void AMDGPUPassConfig::addIRPasses() {
isPassEnabled(EnableImageIntrinsicOptimizer))
addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
+ if (EnableUniformIntrinsicCombine)
+ addPass(createAMDGPUUniformIntrinsicCombineLegacyPass());
// This can be disabled by passing ::Disable here or on the command line
// with --expand-variadics-override=disable.
addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
new file mode 100644
index 0000000000000..f450b0e6763c4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsics.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck %s
+
+define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readfirstlane_with_readfirstlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
+ %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readfirstlane_with_readlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_bfe_u32 v1, v0, 10, 10
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s2, v1
+; CHECK-NEXT: v_readlane_b32 s2, v0, s2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readlane_with_firstlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s2, v0
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
+ %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readlane_readlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_bfe_u32 v1, v0, 10, 10
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s2, v1
+; CHECK-NEXT: v_readlane_b32 s2, v0, s2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
+; CHECK-LABEL: permlane64_uniform:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s2, s[4:5], 0x8
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %src)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
+; CHECK-LABEL: permlane64_nonuniform:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_permlane64_b32 v1, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
+; CHECK-LABEL: permlane64_nonuniform_expression:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: v_permlane64_b32 v1, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid2 = add i32 %tid, 1
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 6e5212580ba2e..1ba47329a0d61 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -31,6 +31,11 @@
; GCN-O0-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O0-NEXT: AMDGPU Printf lowering
; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU
+; GCN-O0-NEXT: FunctionPass Manager
+; GCN-O0-NEXT: Dominator Tree Construction
+; GCN-O0-NEXT: Cycle Info Analysis
+; GCN-O0-NEXT: Uniformity Analysis
+; GCN-O0-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O0-NEXT: Expand variadic functions
; GCN-O0-NEXT: AMDGPU Inline All Functions
; GCN-O0-NEXT: Inliner for always_inline functions
@@ -179,6 +184,11 @@
; GCN-O1-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O1-NEXT: AMDGPU Printf lowering
; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU
+; GCN-O1-NEXT: FunctionPass Manager
+; GCN-O1-NEXT: Dominator Tree Construction
+; GCN-O1-NEXT: Cycle Info Analysis
+; GCN-O1-NEXT: Uniformity Analysis
+; GCN-O1-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O1-NEXT: Expand variadic functions
; GCN-O1-NEXT: AMDGPU Inline All Functions
; GCN-O1-NEXT: Inliner for always_inline functions
@@ -466,6 +476,11 @@
; GCN-O1-OPTS-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O1-OPTS-NEXT: AMDGPU Printf lowering
; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU
+; GCN-O1-OPTS-NEXT: FunctionPass Manager
+; GCN-O1-OPTS-NEXT: Dominator Tree Construction
+; GCN-O1-OPTS-NEXT: Cycle Info Analysis
+; GCN-O1-OPTS-NEXT: Uniformity Analysis
+; GCN-O1-OPTS-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O1-OPTS-NEXT: Expand variadic functions
; GCN-O1-OPTS-NEXT: AMDGPU Inline All Functions
; GCN-O1-OPTS-NEXT: Inliner for always_inline functions
@@ -783,6 +798,10 @@
; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: AMDGPU Image Intrinsic Optimizer
+; GCN-O2-NEXT: Dominator Tree Construction
+; GCN-O2-NEXT: Cycle Info Analysis
+; GCN-O2-NEXT: Uniformity Analysis
+; GCN-O2-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O2-NEXT: Expand variadic functions
; GCN-O2-NEXT: AMDGPU Inline All Functions
; GCN-O2-NEXT: Inliner for always_inline functions
@@ -1104,6 +1123,10 @@
; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: AMDGPU Image Intrinsic Optimizer
+; GCN-O3-NEXT: Dominator Tree Construction
+; GCN-O3-NEXT: Cycle Info Analysis
+; GCN-O3-NEXT: Uniformity Analysis
+; GCN-O3-NEXT: AMDGPU uniformIntrinsic Combine
; GCN-O3-NEXT: Expand variadic functions
; GCN-O3-NEXT: AMDGPU Inline All Functions
; GCN-O3-NEXT: Inliner for always_inline functions
>From 1ff34b8d66276363e76bb91d22f1a5911d80a901 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 25 Feb 2025 15:27:12 +0530
Subject: [PATCH 13/14] Update test checks
---
.../GlobalISel/llvm.amdgcn.ballot.i32.ll | 42 +++--
.../GlobalISel/llvm.amdgcn.ballot.i64.ll | 46 +++---
llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll | 114 ++++++-------
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 39 ++---
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 39 ++---
.../AMDGPU/llvm.amdgcn.permlane64.ptr.ll | 44 ++---
.../AMDGPU/llvm.amdgcn.readfirstlane.ll | 8 -
.../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 4 +-
.../spill-vgpr-to-agpr-update-regscavenger.ll | 23 ++-
.../AMDGPU/splitkit-getsubrangeformask.ll | 153 +++++++++---------
10 files changed, 236 insertions(+), 276 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 51714035352a3..8023a3589bbc2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -162,16 +162,17 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB10_3
-; CHECK-NEXT: .LBB10_2: ; %true
+; CHECK-NEXT: s_cbranch_scc1 .LBB10_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_3:
%c = trunc i32 %v to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -259,17 +260,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
-; CHECK-NEXT: s_cselect_b32 s0, 1, 0
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB14_3
-; CHECK-NEXT: .LBB14_2: ; %true
+; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_3:
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -374,16 +371,15 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, 1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB18_3
-; CHECK-NEXT: .LBB18_2: ; %true
+; CHECK-NEXT: s_cbranch_scc1 .LBB18_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index 7b01f13b9ef1c..7a8bff465f26b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -165,16 +165,17 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB10_3
-; CHECK-NEXT: .LBB10_2: ; %true
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_and_b32 s0, s0, 1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB10_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_3:
%c = trunc i32 %v to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -262,17 +263,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
-; CHECK-NEXT: s_cselect_b32 s0, 1, 0
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB14_3
-; CHECK-NEXT: .LBB14_2: ; %true
+; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_3:
%c = icmp ult i32 %v, 12
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -377,16 +374,15 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, 1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB18_3
-; CHECK-NEXT: .LBB18_2: ; %true
+; CHECK-NEXT: s_and_b32 s0, s0, 1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB18_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 1b74ddfb883e8..9925bd64f4c4c 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -179,7 +179,6 @@ define double @v_sqrt_f64_fneg(double %x) {
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-GISEL-LABEL: v_sqrt_f64_fneg:
-; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
@@ -234,62 +233,6 @@ define double @v_sqrt_f64_fneg(double %x) {
; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%x.neg = fneg double %x
- %result = call double @llvm.sqrt.f64(double %x.neg)
- ret double %result
-}
-
-define double @v_sqrt_f64_fabs(double %x) {
-; GFX6-SDAG-LABEL: v_sqrt_f64_fabs:
-; GFX6-SDAG: ; %bb.0:
-; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
-; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
-; GFX6-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
-; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
-; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-SDAG-LABEL: v_sqrt_f64_fabs:
-; GFX8-SDAG: ; %bb.0:
-; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
-; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
-; GFX8-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
-; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
-; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -350,6 +293,63 @@ define double @v_sqrt_f64_fabs(double %x) {
; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
+=======
+; SDAG-LABEL: v_sqrt_f64_fabs:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_fabs:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+>>>>>>> b4e8b93dc9ec (Update test checks)
%x.fabs = call double @llvm.fabs.f64(double %x)
%result = call double @llvm.sqrt.f64(double %x.fabs)
ret double %result
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index e00e1f13b2b77..64aab60679479 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -156,15 +156,16 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0
-; CHECK-NEXT: s_cbranch_vccz .LBB10_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB10_3
-; CHECK-NEXT: .LBB10_2: ; %true
+; CHECK-NEXT: s_bitcmp1_b32 s0, 0
+; CHECK-NEXT: s_cselect_b32 s0, -1, 0
+; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0
+; CHECK-NEXT: s_cbranch_vccnz .LBB10_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_3:
%c = trunc i32 %v to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -245,14 +246,14 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12
-; CHECK-NEXT: s_cbranch_vccz .LBB14_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB14_3
-; CHECK-NEXT: .LBB14_2: ; %true
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_3:
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -353,14 +354,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
-; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB18_3
-; CHECK-NEXT: .LBB18_2: ; %true
+; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0
+; CHECK-NEXT: s_cbranch_vccnz .LBB18_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
index b4adf7f641550..6e6817595ed88 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -159,15 +159,16 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0
-; CHECK-NEXT: s_cbranch_vccz .LBB10_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB10_3
-; CHECK-NEXT: .LBB10_2: ; %true
+; CHECK-NEXT: s_bitcmp1_b32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT: s_cbranch_vccnz .LBB10_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_3:
%c = trunc i32 %v to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -248,14 +249,14 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 vcc, s0, 12
-; CHECK-NEXT: s_cbranch_vccz .LBB14_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB14_3
-; CHECK-NEXT: .LBB14_2: ; %true
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_3:
%c = icmp ult i32 %v, 12
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -356,14 +357,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
-; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB18_3
-; CHECK-NEXT: .LBB18_2: ; %true
+; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT: s_cbranch_vccnz .LBB18_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
index b0149f7de5e85..0ac614d91de27 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
@@ -6,12 +6,9 @@ define amdgpu_kernel void @test_p0(ptr addrspace(1) %out, ptr %src0) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2
-; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
%v = call ptr @llvm.amdgcn.permlane64.p0(ptr %src0)
store ptr %v, ptr addrspace(1) %out
@@ -22,21 +19,14 @@ define amdgpu_kernel void @test_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0) {
; GFX11-SDAG-LABEL: test_v3p0:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x2
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
; GFX11-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x54
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s7
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v8, s6
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v7, s0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v1
-; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v4
-; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5
-; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v8
-; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v0
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v7
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, s7
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: global_store_b64 v6, v[4:5], s[4:5] offset:16
; GFX11-SDAG-NEXT: global_store_b128 v6, v[0:3], s[4:5]
@@ -53,10 +43,8 @@ define amdgpu_kernel void @test_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0
; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
-; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
%v = call ptr addrspace(3) @llvm.amdgcn.permlane64.v3p0(ptr addrspace(3) %src0)
store ptr addrspace(3) %v, ptr addrspace(1) %out
@@ -91,10 +79,8 @@ define amdgpu_kernel void @test_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0
; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
-; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
%v = call ptr addrspace(5) @llvm.amdgcn.permlane64.p5(ptr addrspace(5) %src0)
store ptr addrspace(5) %v, ptr addrspace(1) %out
@@ -129,10 +115,8 @@ define amdgpu_kernel void @test_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0
; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
-; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
%v = call ptr addrspace(6) @llvm.amdgcn.permlane64.p6(ptr addrspace(6) %src0)
store ptr addrspace(6) %v, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index d1ba892d7f7e1..bcde85c5df128 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -640,14 +640,6 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
-;
-; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
-; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
-; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 7ff5eb46def38..c71a5e0d0e618 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -9,7 +9,7 @@ declare double @llvm.amdgcn.readlane.f64(double, i32) #0
define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 {
; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i32:
; CHECK-SDAG: ; %bb.0:
-; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_load_dword s0, s[8:9], 0x0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; use s0
@@ -18,7 +18,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1
;
; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i32:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_load_dword s0, s[8:9], 0x0
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: ; use s0
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
index 586579fcaeb93..ef96944abef0e 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
@@ -20,38 +20,33 @@ define void @test() {
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: .LBB0_3: ; %bb.3
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: ; implicit-def: $sgpr4
-; CHECK-NEXT: v_mov_b32_e32 v0, s4
-; CHECK-NEXT: v_readfirstlane_b32 s6, v0
; CHECK-NEXT: s_mov_b64 s[4:5], -1
-; CHECK-NEXT: s_mov_b32 s7, 0
-; CHECK-NEXT: s_cmp_eq_u32 s6, s7
; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
; CHECK-NEXT: v_writelane_b32 v1, s4, 0
; CHECK-NEXT: v_writelane_b32 v1, s5, 1
-; CHECK-NEXT: s_mov_b64 s[10:11], exec
-; CHECK-NEXT: s_mov_b64 exec, -1
+; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
+; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: s_cbranch_scc1 .LBB0_5
; CHECK-NEXT: ; %bb.4: ; %bb.4
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: v_writelane_b32 v1, s4, 0
; CHECK-NEXT: v_writelane_b32 v1, s5, 1
-; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: .LBB0_5: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: v_readlane_b32 s4, v1, 0
; CHECK-NEXT: v_readlane_b32 s5, v1, 1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index 5aafb0f576fb4..83dc3f83d03a9 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -21,7 +21,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr5
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr18
+ ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub0:sgpr_64 = COPY $sgpr18
; CHECK-NEXT: undef [[COPY7:%[0-9]+]].sub0:sgpr_64 = COPY $sgpr19
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr20
; CHECK-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr21
@@ -31,8 +31,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr10
; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr8
; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4)
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %125:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: KILL undef %125:sgpr_128
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %117:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: KILL undef %117:sgpr_128
; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 4, implicit-def dead $scc
; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc
; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc
@@ -42,7 +42,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ASHR_I32_2:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_2]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 29, implicit-def dead $scc
; CHECK-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]].sub0, [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4)
@@ -51,39 +51,32 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %112:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %87:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: KILL undef %89:sgpr_128
- ; CHECK-NEXT: KILL undef %118:sgpr_128
+ ; CHECK-NEXT: KILL undef %112:sgpr_128
+ ; CHECK-NEXT: KILL undef %87:sgpr_128
; CHECK-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]].sub0, [[S_LSHL_B32_]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]].sub0, [[S_LSHL_B32_1]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_2:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.87, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.93, addrspace 4)
- ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1
+ ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.77, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.83, addrspace 4)
; CHECK-NEXT: KILL [[S_ADD_U32_2]].sub0, [[S_ADD_U32_2]].sub1
- ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %169:sreg_32, implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1
+ ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
@@ -114,7 +107,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %384:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
@@ -135,49 +128,50 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_13:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_15]], 168, 0 :: (invariant load (s32) from %ir.257, align 8, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.142, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 576, 0 :: (invariant load (s128) from %ir.147, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1
; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.155, addrspace 4)
; CHECK-NEXT: [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1
; CHECK-NEXT: [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]]
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.163, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 0, 0 :: (invariant load (s128) from %ir.168, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_16]], 168, 0 :: (invariant load (s64) from %ir.266, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.205, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.211, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.190, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.196, addrspace 4)
; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.216, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.221, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.201, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.206, addrspace 4)
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc
; CHECK-NEXT: [[COPY16:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
; CHECK-NEXT: [[COPY16:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_]]
@@ -187,11 +181,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_17]], 168, 0 :: (invariant load (s64) from %ir.277, addrspace 4)
; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc
; CHECK-NEXT: [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0
@@ -202,13 +196,14 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1
; CHECK-NEXT: KILL undef %470:sreg_64
; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3
+ ; CHECK-NEXT: KILL undef %436:sreg_64
; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4)
- ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 160, 0 :: (invariant load (s128) from %ir.249, addrspace 4)
+ ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.288, align 8, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]]
@@ -224,15 +219,15 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4)
- ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.329, addrspace 4)
- ; CHECK-NEXT: undef [[S_ADD_U32_24:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_24:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.335, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_19]], 96, 0 :: (invariant load (s128) from %ir.306, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_20]], 96, 0 :: (invariant load (s128) from %ir.312, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_21]], 96, 0 :: (invariant load (s128) from %ir.318, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
@@ -263,11 +258,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_OR_B32_e64_10:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_9]], [[V_SUBREV_U32_e64_4]], implicit $exec
; CHECK-NEXT: [[V_SUBREV_U32_e64_6:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 37, [[BUFFER_LOAD_FORMAT_X_IDXEN6]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_11:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_10]], [[V_SUBREV_U32_e64_5]], implicit $exec
- ; CHECK-NEXT: [[V_SUBREV_U32_e64_7:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 38, [[BUFFER_LOAD_FORMAT_X_IDXEN7]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_SUBREV_U32_e64_7:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 38, [[BUFFER_LOAD_FORMAT_X_IDXEN8]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_12:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_11]], [[V_SUBREV_U32_e64_6]], implicit $exec
; CHECK-NEXT: [[V_SUBREV_U32_e64_8:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 39, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_13:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_12]], [[V_SUBREV_U32_e64_7]], implicit $exec
- ; CHECK-NEXT: [[V_SUBREV_U32_e64_9:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 50, [[BUFFER_LOAD_FORMAT_X_IDXEN8]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_SUBREV_U32_e64_9:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 50, [[BUFFER_LOAD_FORMAT_X_IDXEN7]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_14:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_13]], [[V_SUBREV_U32_e64_8]], implicit $exec
; CHECK-NEXT: [[V_SUBREV_U32_e64_10:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 51, [[BUFFER_LOAD_FORMAT_X_IDXEN9]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_15:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_14]], [[V_SUBREV_U32_e64_9]], implicit $exec
>From a5ea2cde1f0652ac617fc855f4ecac4c2af4d391 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 25 Feb 2025 20:43:29 +0530
Subject: [PATCH 14/14] expectedly failing test, illegal COPY introduction by
isel
---
llvm/test/CodeGen/AMDGPU/bf16.ll | 1 +
llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 1 +
llvm/test/CodeGen/AMDGPU/fminimum3.ll | 1 +
llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll | 1 +
llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll | 1 +
5 files changed, 5 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 4b14dc63eeb84..103d48b17dd9c 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -1,3 +1,4 @@
+; XFAIL: *
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN
; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 069a47ec97bfe..2075176a65433 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -1,3 +1,4 @@
+; XFAIL: *
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index d8746b58b16b7..f210c19130d9e 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -1,3 +1,4 @@
+; XFAIL: *
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
index 8cf91aa900662..8c7f8e1ed0fd1 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
@@ -1,3 +1,4 @@
+; XFAIL: *
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
index c561924ae71bf..9f806bad68e5b 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
@@ -1,3 +1,4 @@
+; XFAIL: *
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SDAG,GCN-IEEE,SDAG-IEEE %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GISEL,GCN-IEEE,GISEL-IEEE %s
More information about the llvm-commits
mailing list