[llvm] [SLP][AMDGPU] Vectorize operands of non-trivially-vectorizable intrinsic calls (PR #189784)
Syadus Sefat via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 5 18:56:25 PDT 2026
https://github.com/mssefat updated https://github.com/llvm/llvm-project/pull/189784
>From c5fbe3c0fdef42860e2dc715364d62f8a27937f6 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Tue, 31 Mar 2026 19:28:41 -0500
Subject: [PATCH 1/6] [SLP][AMDGPU] Vectorize operands of
non-trivially-vectorizable intrinsic calls
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 49 +++
...otriviallyvectorizableintrinsicoperands.ll | 360 ++++++++++++++++++
2 files changed, 409 insertions(+)
create mode 100644 llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f7c78db5a83ac..9301a938160aa 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -243,6 +243,39 @@ static const int MinScheduleRegionSize = 16;
/// Maximum allowed number of operands in the PHI nodes.
static const unsigned MaxPHINumOperands = 128;
+/// For instructions that are not trivially vectorizable, try to vectorize their
+/// operands.
+/// FIXME: Extend for all non-vectorized functions.
+static Value *getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
+ auto *CI = dyn_cast<CallInst>(V);
+ if (!CI)
+ return nullptr;
+ Intrinsic::ID ID = CI->getIntrinsicID();
+ // Only consider intrinsic calls.
+ // FIXME: We may want to relax this condition in future.
+ if (ID == Intrinsic::not_intrinsic)
+ return nullptr;
+ // Skip trivially vectorizable intrinsics.
+ if (isTriviallyVectorizable(ID))
+ return nullptr;
+ // Only look through unary intrinsic calls.
+ if (CI->arg_size() != 1)
+ return nullptr;
+ // Check if it is speculatable, no memory access and will return
+ if (!CI->hasFnAttr(Attribute::Speculatable) || !CI->doesNotAccessMemory() ||
+ !CI->willReturn())
+ return nullptr;
+ auto *Operand = dyn_cast<Instruction>(CI->getArgOperand(0));
+ if (!Operand)
+ return nullptr;
+ // Operand type should match the result type we ignore type changing
+ // intrinsics.
+ if (Operand->getType() != CI->getType())
+ return nullptr;
+
+ return Operand;
+}
+
/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
@@ -29477,6 +29510,22 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
PostProcessCmps.insert(cast<CmpInst>(&*It));
}
+ DenseMap<Intrinsic::ID, SmallSetVector<Value *, 4>> IntrinsicSeedOps;
+ for (Instruction &I : *BB) {
+ if (R.isDeleted(&I))
+ continue;
+ // Collect operands of non-trivially vectorizable intrinsic calls (e.g.,
+ // llvm.amdgcn.exp2) and group by intrinsic ID, so their operands can be
+ // vectorized independently.
+ // FIXME: Extend for all non-vectorized functions.
+ if (Value *Op = getNonTriviallyVectorizableIntrinsicCallOperand(&I))
+ IntrinsicSeedOps[cast<CallInst>(&I)->getIntrinsicID()].insert(Op);
+ }
+ // Try to vectorize per intrinsic call ID.
+ for (auto &[ID, Ops] : IntrinsicSeedOps) {
+ Changed |= tryToVectorizeList(Ops.getArrayRef(), R);
+ }
+
return Changed;
}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
new file mode 100644
index 0000000000000..0ca33e0e6b09f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
@@ -0,0 +1,360 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S -mcpu=gfx1250 -mtriple=amdgcn-amd-amdhsa -o - %s | FileCheck %s --check-prefix GCN
+
+define amdgpu_kernel void @test_with_wmma( ptr addrspace(1) %input, ptr addrspace(1) %output, float %scaled_max, <16 x i32> %A, <16 x i32> %B, i32 %scale_idx) {
+; GCN-LABEL: define amdgpu_kernel void @test_with_wmma(
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[OUTPUT:%.*]], float [[SCALED_MAX:%.*]], <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i32 [[SCALE_IDX:%.*]]) #[[ATTR0:[0-9]+]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[INPUT]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = fmul contract <2 x float> [[TMP0]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[SCALED_MAX]], i32 0
+; GCN-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; GCN-NEXT: [[TMP4:%.*]] = fsub contract <2 x float> [[TMP1]], [[TMP3]]
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP5]])
+; GCN-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP6]])
+; GCN-NEXT: [[VEC0:%.*]] = insertelement <2 x float> poison, float [[EXP0]], i64 0
+; GCN-NEXT: [[VEC1:%.*]] = insertelement <2 x float> [[VEC0]], float [[EXP1]], i64 1
+; GCN-NEXT: [[VEC_I32:%.*]] = bitcast <2 x float> [[VEC1]] to <2 x i32>
+; GCN-NEXT: [[SCALE0:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 0
+; GCN-NEXT: [[SCALE1:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 1
+; GCN-NEXT: [[WMMA0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
+; GCN-NEXT: [[WMMA1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[WMMA0]], i32 0, i32 0, i32 [[SCALE1]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
+; GCN-NEXT: store <8 x float> [[WMMA1]], ptr addrspace(1) [[OUTPUT]], align 32
+; GCN-NEXT: ret void
+;
+entry:
+
+ %in0 = load float, ptr addrspace(1) %input, align 4
+ %ptr1 = getelementptr float, ptr addrspace(1) %input, i64 1
+ %in1 = load float, ptr addrspace(1) %ptr1, align 4
+
+ %mul0 = fmul contract float %in0, 0x3FC0527DC0000000
+ %mul1 = fmul contract float %in1, 0x3FC0527DC0000000
+
+ %sub0 = fsub contract float %mul0, %scaled_max
+ %sub1 = fsub contract float %mul1, %scaled_max
+
+ %exp0 = tail call float @llvm.amdgcn.exp2.f32(float %sub0)
+ %exp1 = tail call float @llvm.amdgcn.exp2.f32(float %sub1)
+
+ %vec0 = insertelement <2 x float> poison, float %exp0, i64 0
+ %vec1 = insertelement <2 x float> %vec0, float %exp1, i64 1
+
+ %vec_i32 = bitcast <2 x float> %vec1 to <2 x i32>
+
+ %scale0 = extractelement <2 x i32> %vec_i32, i64 0
+ %scale1 = extractelement <2 x i32> %vec_i32, i64 1
+
+ %wmma0 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(
+ i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> zeroinitializer,
+ i32 0, i32 0, i32 %scale0, i32 0, i32 0, i32 %scale_idx, i1 false, i1 false)
+
+ %wmma1 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(
+ i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %wmma0,
+ i32 0, i32 0, i32 %scale1, i32 0, i32 0, i32 %scale_idx, i1 false, i1 false)
+
+ store <8 x float> %wmma1, ptr addrspace(1) %output, align 32
+ ret void
+}
+
+define amdgpu_kernel void @test_amdgcn_exp_log(ptr addrspace(1) %input, ptr addrspace(1) %scales, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @test_amdgcn_exp_log(
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[INPUT]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[SCALES]], align 4
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x float> [[TMP0]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x float> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP4]])
+; GCN-NEXT: [[LOG0:%.*]] = tail call float @llvm.amdgcn.log.f32(float [[EXP0]])
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP5]])
+; GCN-NEXT: [[LOG1:%.*]] = tail call float @llvm.amdgcn.log.f32(float [[EXP1]])
+; GCN-NEXT: [[SUM:%.*]] = fadd fast float [[LOG0]], [[LOG1]]
+; GCN-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; GCN-NEXT: ret void
+;
+entry:
+ %in0 = load float, ptr addrspace(1) %input, align 4
+ %ptr1 = getelementptr float, ptr addrspace(1) %input, i64 1
+ %in1 = load float, ptr addrspace(1) %ptr1, align 4
+ %scale0 = load float, ptr addrspace(1) %scales, align 4
+ %sptr1 = getelementptr float, ptr addrspace(1) %scales, i64 1
+ %scale1 = load float, ptr addrspace(1) %sptr1, align 4
+ %mul0 = fmul contract float %in0, 0x3FC0527DC0000000
+ %mul1 = fmul contract float %in1, 0x3FC0527DC0000000
+ %sub0 = fsub contract float %mul0, %scale0
+ %sub1 = fsub contract float %mul1, %scale1
+ %exp0 = tail call float @llvm.amdgcn.exp2.f32(float %sub0)
+ %log0 = tail call float @llvm.amdgcn.log.f32(float %exp0)
+ %exp1 = tail call float @llvm.amdgcn.exp2.f32(float %sub1)
+ %log1 = tail call float @llvm.amdgcn.log.f32(float %exp1)
+ %sum = fadd fast float %log0, %log1
+ store float %sum, ptr addrspace(1) %output, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_amdgcn_exp_f16(ptr addrspace(1) %input, ptr addrspace(1) %scales, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @test_amdgcn_exp_f16(
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP4]])
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP5]])
+; GCN-NEXT: [[SUM:%.*]] = fadd fast half [[EXP0]], [[EXP1]]
+; GCN-NEXT: store half [[SUM]], ptr addrspace(1) [[OUTPUT]], align 2
+; GCN-NEXT: ret void
+;
+entry:
+ %in0 = load half, ptr addrspace(1) %input, align 2
+ %ptr1 = getelementptr half, ptr addrspace(1) %input, i64 1
+ %in1 = load half, ptr addrspace(1) %ptr1, align 2
+ %scale0 = load half, ptr addrspace(1) %scales, align 2
+ %sptr1 = getelementptr half, ptr addrspace(1) %scales, i64 1
+ %scale1 = load half, ptr addrspace(1) %sptr1, align 2
+ %mul0 = fmul contract half %in0, 0xH3E14
+ %mul1 = fmul contract half %in1, 0xH3E14
+ %sub0 = fsub contract half %mul0, %scale0
+ %sub1 = fsub contract half %mul1, %scale1
+ %exp0 = tail call half @llvm.amdgcn.exp2.f16(half %sub0)
+ %exp1 = tail call half @llvm.amdgcn.exp2.f16(half %sub1)
+ %sum = fadd fast half %exp0, %exp1
+ store half %sum, ptr addrspace(1) %output, align 2
+ ret void
+}
+
+define amdgpu_kernel void @kernel_f16(ptr addrspace(1) %input, ptr addrspace(1) %scales, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @kernel_f16(
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP4]])
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP5]])
+; GCN-NEXT: [[LOG0:%.*]] = tail call half @llvm.amdgcn.log.f16(half [[EXP0]])
+; GCN-NEXT: [[LOG1:%.*]] = tail call half @llvm.amdgcn.log.f16(half [[EXP1]])
+; GCN-NEXT: [[SUM:%.*]] = fadd fast half [[LOG0]], [[LOG1]]
+; GCN-NEXT: store half [[SUM]], ptr addrspace(1) [[OUTPUT]], align 2
+; GCN-NEXT: ret void
+;
+entry:
+ %in0 = load half, ptr addrspace(1) %input, align 2
+ %ptr1 = getelementptr half, ptr addrspace(1) %input, i64 1
+ %in1 = load half, ptr addrspace(1) %ptr1, align 2
+ %scale0 = load half, ptr addrspace(1) %scales, align 2
+ %sptr1 = getelementptr half, ptr addrspace(1) %scales, i64 1
+ %scale1 = load half, ptr addrspace(1) %sptr1, align 2
+ %mul0 = fmul contract half %in0, 0xH3E14
+ %mul1 = fmul contract half %in1, 0xH3E14
+ %sub0 = fsub contract half %mul0, %scale0
+ %sub1 = fsub contract half %mul1, %scale1
+ %exp0 = tail call half @llvm.amdgcn.exp2.f16(half %sub0)
+ %exp1 = tail call half @llvm.amdgcn.exp2.f16(half %sub1)
+ %log0 = tail call half @llvm.amdgcn.log.f16(half %exp0)
+ %log1 = tail call half @llvm.amdgcn.log.f16(half %exp1)
+ %sum = fadd fast half %log0, %log1
+ store half %sum, ptr addrspace(1) %output, align 2
+ ret void
+}
+
+define amdgpu_kernel void @look_through_reuse_shuffle(
+; GCN-LABEL: define amdgpu_kernel void @look_through_reuse_shuffle(
+; GCN-SAME: ptr addrspace(1) noalias [[INPUT:%.*]], ptr addrspace(1) noalias [[SCALES:%.*]], ptr addrspace(1) noalias [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT: [[TMP2:%.*]] = fadd contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP3:%.*]] = fmul contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP4]])
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP5]])
+; GCN-NEXT: [[TMP6:%.*]] = insertelement <4 x half> poison, half [[EXP0]], i32 0
+; GCN-NEXT: [[TMP7:%.*]] = insertelement <4 x half> [[TMP6]], half [[EXP1]], i32 1
+; GCN-NEXT: [[TMP8:%.*]] = shufflevector <4 x half> [[TMP7]], <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; GCN-NEXT: store <4 x half> [[TMP8]], ptr addrspace(1) [[OUTPUT]], align 2
+; GCN-NEXT: ret void
+;
+ ptr addrspace(1) noalias %input, ptr addrspace(1) noalias %scales,
+ ptr addrspace(1) noalias %output) {
+entry:
+ %iptr1 = getelementptr half, ptr addrspace(1) %input, i64 1
+ %sptr1 = getelementptr half, ptr addrspace(1) %scales, i64 1
+ %optr1 = getelementptr half, ptr addrspace(1) %output, i64 1
+ %optr2 = getelementptr half, ptr addrspace(1) %output, i64 2
+ %optr3 = getelementptr half, ptr addrspace(1) %output, i64 3
+
+ %in0 = load half, ptr addrspace(1) %input, align 2
+ %in1 = load half, ptr addrspace(1) %iptr1, align 2
+ %s0 = load half, ptr addrspace(1) %scales, align 2
+ %s1 = load half, ptr addrspace(1) %sptr1, align 2
+
+ %add0 = fadd contract half %in0, 0xH3E14
+ %add1 = fadd contract half %in1, 0xH3E14
+
+ %mul0 = fmul contract half %add0, %s0
+ %mul1 = fmul contract half %add1, %s1
+
+ %exp0 = tail call half @llvm.amdgcn.exp2.f16(half %mul0)
+ %exp1 = tail call half @llvm.amdgcn.exp2.f16(half %mul1)
+
+ store half %exp0, ptr addrspace(1) %output, align 2
+ store half %exp1, ptr addrspace(1) %optr1, align 2
+ store half %exp1, ptr addrspace(1) %optr2, align 2
+ store half %exp1, ptr addrspace(1) %optr3, align 2
+ ret void
+}
+
+define amdgpu_kernel void @wider_exp2_f32(ptr addrspace(1) %input, ptr addrspace(1) %scales, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @wider_exp2_f32(
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[PTR2:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 2
+; GCN-NEXT: [[SPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[SCALES]], i64 2
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[INPUT]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[SCALES]], align 4
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x float> [[TMP0]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x float> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR2]], align 4
+; GCN-NEXT: [[TMP5:%.*]] = load <2 x float>, ptr addrspace(1) [[SPTR2]], align 4
+; GCN-NEXT: [[TMP6:%.*]] = fmul contract <2 x float> [[TMP4]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT: [[TMP7:%.*]] = fsub contract <2 x float> [[TMP6]], [[TMP5]]
+; GCN-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP8]])
+; GCN-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP9]])
+; GCN-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP7]], i32 0
+; GCN-NEXT: [[EXP2:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP10]])
+; GCN-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP7]], i32 1
+; GCN-NEXT: [[EXP3:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP11]])
+; GCN-NEXT: [[SUM01:%.*]] = fadd fast float [[EXP0]], [[EXP1]]
+; GCN-NEXT: [[SUM23:%.*]] = fadd fast float [[EXP2]], [[EXP3]]
+; GCN-NEXT: [[SUM:%.*]] = fadd fast float [[SUM01]], [[SUM23]]
+; GCN-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; GCN-NEXT: ret void
+;
+entry:
+ %in0 = load float, ptr addrspace(1) %input, align 4
+ %ptr1 = getelementptr float, ptr addrspace(1) %input, i64 1
+ %in1 = load float, ptr addrspace(1) %ptr1, align 4
+ %ptr2 = getelementptr float, ptr addrspace(1) %input, i64 2
+ %in2 = load float, ptr addrspace(1) %ptr2, align 4
+ %ptr3 = getelementptr float, ptr addrspace(1) %input, i64 3
+ %in3 = load float, ptr addrspace(1) %ptr3, align 4
+
+ %scale0 = load float, ptr addrspace(1) %scales, align 4
+ %sptr1 = getelementptr float, ptr addrspace(1) %scales, i64 1
+ %scale1 = load float, ptr addrspace(1) %sptr1, align 4
+ %sptr2 = getelementptr float, ptr addrspace(1) %scales, i64 2
+ %scale2 = load float, ptr addrspace(1) %sptr2, align 4
+ %sptr3 = getelementptr float, ptr addrspace(1) %scales, i64 3
+ %scale3 = load float, ptr addrspace(1) %sptr3, align 4
+
+ %mul0 = fmul contract float %in0, 0x3FC0527DC0000000
+ %mul1 = fmul contract float %in1, 0x3FC0527DC0000000
+ %mul2 = fmul contract float %in2, 0x3FC0527DC0000000
+ %mul3 = fmul contract float %in3, 0x3FC0527DC0000000
+
+ %sub0 = fsub contract float %mul0, %scale0
+ %sub1 = fsub contract float %mul1, %scale1
+ %sub2 = fsub contract float %mul2, %scale2
+ %sub3 = fsub contract float %mul3, %scale3
+
+ %exp0 = tail call float @llvm.amdgcn.exp2.f32(float %sub0)
+ %exp1 = tail call float @llvm.amdgcn.exp2.f32(float %sub1)
+ %exp2 = tail call float @llvm.amdgcn.exp2.f32(float %sub2)
+ %exp3 = tail call float @llvm.amdgcn.exp2.f32(float %sub3)
+
+ %sum01 = fadd fast float %exp0, %exp1
+ %sum23 = fadd fast float %exp2, %exp3
+ %sum = fadd fast float %sum01, %sum23
+
+ store float %sum, ptr addrspace(1) %output, align 4
+ ret void
+}
+
+define amdgpu_kernel void @wider_exp2_half(ptr addrspace(1) %input, ptr addrspace(1) %scales, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @wider_exp2_half(
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[PTR2:%.*]] = getelementptr half, ptr addrspace(1) [[INPUT]], i64 2
+; GCN-NEXT: [[SPTR2:%.*]] = getelementptr half, ptr addrspace(1) [[SCALES]], i64 2
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR2]], align 2
+; GCN-NEXT: [[TMP5:%.*]] = load <2 x half>, ptr addrspace(1) [[SPTR2]], align 2
+; GCN-NEXT: [[TMP6:%.*]] = fmul contract <2 x half> [[TMP4]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP7:%.*]] = fsub contract <2 x half> [[TMP6]], [[TMP5]]
+; GCN-NEXT: [[TMP8:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP8]])
+; GCN-NEXT: [[TMP9:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP9]])
+; GCN-NEXT: [[TMP10:%.*]] = extractelement <2 x half> [[TMP7]], i32 0
+; GCN-NEXT: [[EXP2:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP10]])
+; GCN-NEXT: [[TMP11:%.*]] = extractelement <2 x half> [[TMP7]], i32 1
+; GCN-NEXT: [[EXP3:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP11]])
+; GCN-NEXT: [[SUM01:%.*]] = fadd fast half [[EXP0]], [[EXP1]]
+; GCN-NEXT: [[SUM23:%.*]] = fadd fast half [[EXP2]], [[EXP3]]
+; GCN-NEXT: [[SUM:%.*]] = fadd fast half [[SUM01]], [[SUM23]]
+; GCN-NEXT: store half [[SUM]], ptr addrspace(1) [[OUTPUT]], align 2
+; GCN-NEXT: ret void
+;
+entry:
+ ;; Same 4-wide pattern as wider_exp2_f32. Cost model gives half higher
+ ;; LookThrough cost, so we get 2-wide vectorization (or none on stricter targets).
+ %in0 = load half, ptr addrspace(1) %input, align 2
+ %ptr1 = getelementptr half, ptr addrspace(1) %input, i64 1
+ %in1 = load half, ptr addrspace(1) %ptr1, align 2
+ %ptr2 = getelementptr half, ptr addrspace(1) %input, i64 2
+ %in2 = load half, ptr addrspace(1) %ptr2, align 2
+ %ptr3 = getelementptr half, ptr addrspace(1) %input, i64 3
+ %in3 = load half, ptr addrspace(1) %ptr3, align 2
+
+ %scale0 = load half, ptr addrspace(1) %scales, align 2
+ %sptr1 = getelementptr half, ptr addrspace(1) %scales, i64 1
+ %scale1 = load half, ptr addrspace(1) %sptr1, align 2
+ %sptr2 = getelementptr half, ptr addrspace(1) %scales, i64 2
+ %scale2 = load half, ptr addrspace(1) %sptr2, align 2
+ %sptr3 = getelementptr half, ptr addrspace(1) %scales, i64 3
+ %scale3 = load half, ptr addrspace(1) %sptr3, align 2
+
+ %mul0 = fmul contract half %in0, 0xH3E14
+ %mul1 = fmul contract half %in1, 0xH3E14
+ %mul2 = fmul contract half %in2, 0xH3E14
+ %mul3 = fmul contract half %in3, 0xH3E14
+
+ %sub0 = fsub contract half %mul0, %scale0
+ %sub1 = fsub contract half %mul1, %scale1
+ %sub2 = fsub contract half %mul2, %scale2
+ %sub3 = fsub contract half %mul3, %scale3
+
+ %exp0 = tail call half @llvm.amdgcn.exp2.f16(half %sub0)
+ %exp1 = tail call half @llvm.amdgcn.exp2.f16(half %sub1)
+ %exp2 = tail call half @llvm.amdgcn.exp2.f16(half %sub2)
+ %exp3 = tail call half @llvm.amdgcn.exp2.f16(half %sub3)
+
+ %sum01 = fadd fast half %exp0, %exp1
+ %sum23 = fadd fast half %exp2, %exp3
+ %sum = fadd fast half %sum01, %sum23
+
+ store half %sum, ptr addrspace(1) %output, align 2
+ ret void
+}
+
+declare half @llvm.amdgcn.exp2.f16(half)
+declare float @llvm.amdgcn.exp2.f32(float)
+declare <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 immarg, <16 x i32>, i32 immarg, <16 x i32>, i16 immarg, <8 x float>, i32 immarg, i32 immarg, i32, i32 immarg, i32 immarg, i32, i1 immarg, i1 immarg)
>From 7c6b48e692269b6e3006957511117c13d7b054c9 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Wed, 1 Apr 2026 17:50:49 -0500
Subject: [PATCH 2/6] Fixed comment!
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9301a938160aa..3cb7e8e35779b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -258,7 +258,7 @@ static Value *getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
// Skip trivially vectorizable intrinsics.
if (isTriviallyVectorizable(ID))
return nullptr;
- // Only look through unary intrinsic calls.
+ // Only consider unary intrinsic calls.
if (CI->arg_size() != 1)
return nullptr;
// Check if it is speculatable, no memory access and will return
>From 54f51671b21e84100ef4a3f9356920e9a4ad358a Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Fri, 3 Apr 2026 01:16:28 -0500
Subject: [PATCH 3/6] Fixed checks and enabled for all operands
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 62 ++++++++++---------
.../Transforms/SLPVectorizer/RISCV/revec.ll | 7 +--
2 files changed, 37 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3cb7e8e35779b..514acaa49fffd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -246,35 +246,32 @@ static const unsigned MaxPHINumOperands = 128;
/// For instructions that are not trivially vectorizable, try to vectorize their
/// operands.
/// FIXME: Extend for all non-vectorized functions.
-static Value *getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
+SmallVector<Value *, 4>
+getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
+
+ SmallVector<Value *, 4> Operands;
auto *CI = dyn_cast<CallInst>(V);
- if (!CI)
- return nullptr;
+
+ if (!CI || isAssumeLikeIntrinsic(CI))
+ return {};
Intrinsic::ID ID = CI->getIntrinsicID();
// Only consider intrinsic calls.
// FIXME: We may want to relax this condition in future.
- if (ID == Intrinsic::not_intrinsic)
- return nullptr;
- // Skip trivially vectorizable intrinsics.
- if (isTriviallyVectorizable(ID))
- return nullptr;
- // Only consider unary intrinsic calls.
- if (CI->arg_size() != 1)
- return nullptr;
- // Check if it is speculatable, no memory access and will return
- if (!CI->hasFnAttr(Attribute::Speculatable) || !CI->doesNotAccessMemory() ||
- !CI->willReturn())
- return nullptr;
- auto *Operand = dyn_cast<Instruction>(CI->getArgOperand(0));
- if (!Operand)
- return nullptr;
- // Operand type should match the result type we ignore type changing
- // intrinsics.
- if (Operand->getType() != CI->getType())
- return nullptr;
+ if (ID == Intrinsic::not_intrinsic || isTriviallyVectorizable(ID))
+ return {};
- return Operand;
-}
+ // Skip memory intrinsics (e.g., masked.load, masked.gather etc.)
+ if (CI->mayReadOrWriteMemory())
+ return {};
+
+ for (Value *ArgOp : CI->args()) {
+ if (auto *I = dyn_cast<Instruction>(ArgOp)) {
+ Operands.emplace_back(I);
+ }
+ }
+
+ return Operands;
+}
/// Predicate for the element types that the SLP vectorizer supports.
///
@@ -29510,7 +29507,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
PostProcessCmps.insert(cast<CmpInst>(&*It));
}
- DenseMap<Intrinsic::ID, SmallSetVector<Value *, 4>> IntrinsicSeedOps;
+ SmallMapVector<Intrinsic::ID, SmallSetVector<Value *, 4>, 4> IntrinsicSeedOps;
for (Instruction &I : *BB) {
if (R.isDeleted(&I))
continue;
@@ -29518,12 +29515,21 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// llvm.amdgcn.exp2) and group by intrinsic ID, so their operands can be
// vectorized independently.
// FIXME: Extend for all non-vectorized functions.
- if (Value *Op = getNonTriviallyVectorizableIntrinsicCallOperand(&I))
- IntrinsicSeedOps[cast<CallInst>(&I)->getIntrinsicID()].insert(Op);
+ SmallVector<Value *, 4> Ops =
+ getNonTriviallyVectorizableIntrinsicCallOperand(&I);
+ if (!Ops.empty())
+ IntrinsicSeedOps[cast<CallInst>(&I)->getIntrinsicID()].insert_range(Ops);
}
// Try to vectorize per intrinsic call ID.
for (auto &[ID, Ops] : IntrinsicSeedOps) {
- Changed |= tryToVectorizeList(Ops.getArrayRef(), R);
+ // Sub-group by opcode so we do not get bailed early
+ SmallMapVector<unsigned, SmallVector<Value *, 4>, 4> OpcodeGroups;
+ for (Value *Op : Ops) {
+ if (auto *I = dyn_cast<Instruction>(Op))
+ OpcodeGroups[I->getOpcode()].push_back(Op);
+ }
+ for (auto &[Opc, Group] : OpcodeGroups)
+ Changed |= tryToVectorizeList(Group, R);
}
return Changed;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index e13dfce8c29f3..016726e5ae371 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -177,11 +177,10 @@ define ptr @test4() {
; NONPOWEROF2-NEXT: [[TMP8:%.*]] = phi <6 x float> [ poison, [[TMP6:%.*]] ], [ [[TMP5]], [[TMP0:%.*]] ]
; NONPOWEROF2-NEXT: br label [[TMP9:%.*]]
; NONPOWEROF2: 10:
-; NONPOWEROF2-NEXT: [[TMP10:%.*]] = shufflevector <6 x float> [[TMP8]], <6 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; NONPOWEROF2-NEXT: [[TMP11:%.*]] = fmul <3 x float> zeroinitializer, [[TMP10]]
-; NONPOWEROF2-NEXT: [[TMP12:%.*]] = shufflevector <6 x float> [[TMP8]], <6 x float> poison, <3 x i32> <i32 3, i32 4, i32 5>
-; NONPOWEROF2-NEXT: [[TMP13:%.*]] = fmul <3 x float> zeroinitializer, [[TMP12]]
+; NONPOWEROF2-NEXT: [[TMP12:%.*]] = fmul <6 x float> zeroinitializer, [[TMP8]]
+; NONPOWEROF2-NEXT: [[TMP11:%.*]] = shufflevector <6 x float> [[TMP12]], <6 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
; NONPOWEROF2-NEXT: [[TMP14:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP11]])
+; NONPOWEROF2-NEXT: [[TMP13:%.*]] = shufflevector <6 x float> [[TMP12]], <6 x float> poison, <3 x i32> <i32 3, i32 4, i32 5>
; NONPOWEROF2-NEXT: [[TMP15:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP13]])
; NONPOWEROF2-NEXT: [[TMP16:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP14]])
; NONPOWEROF2-NEXT: [[TMP17:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP15]])
>From 5e656dc3b1977cea7719411328cabe68f83dffd5 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Fri, 3 Apr 2026 01:32:01 -0500
Subject: [PATCH 4/6] Fixed format
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 19 +++++++++----------
1 file changed, 9 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 514acaa49fffd..36e94de1cc18e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -271,7 +271,7 @@ getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
}
return Operands;
-}
+}
/// Predicate for the element types that the SLP vectorizer supports.
///
@@ -12276,8 +12276,9 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
}
}
if (NewLoopNest.size() > CurrentLoopNest.size())
- CurrentLoopNest.append(std::next(NewLoopNest.begin(), CurrentLoopNest.size()),
- NewLoopNest.end());
+ CurrentLoopNest.append(
+ std::next(NewLoopNest.begin(), CurrentLoopNest.size()),
+ NewLoopNest.end());
}
}
}
@@ -18475,10 +18476,9 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
assert(SLPReVec && "Only supported by REVEC.");
SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
}
- InstructionCost CastCost =
- TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
- TTI::CastContextHint::None,
- TTI::TCK_RecipThroughput);
+ InstructionCost CastCost = TTI->getCastInstrCost(
+ Opcode, DstTy, SrcTy, TTI::CastContextHint::None,
+ TTI::TCK_RecipThroughput);
CastCost = ScaleCost(CastCost, Root, /*Scalar=*/nullptr, ReductionRoot);
Cost += CastCost;
}
@@ -18647,9 +18647,8 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
default:
break;
}
- InstructionCost CastCost =
- TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
- TTI::TCK_RecipThroughput);
+ InstructionCost CastCost = TTI->getCastInstrCost(
+ Opcode, DstVecTy, SrcVecTy, CCH, TTI::TCK_RecipThroughput);
CastCost = ScaleCost(CastCost, *VectorizableTree.front().get(),
/*Scalar=*/nullptr, ReductionRoot);
Cost += CastCost;
>From 25404e5d9db380e46d70d8831f51c65f2c94a721 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Fri, 3 Apr 2026 15:19:24 -0500
Subject: [PATCH 5/6] Addressed reviews
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 50 +++++++++----------
1 file changed, 24 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 36e94de1cc18e..05dcc1d134f0f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -261,7 +261,7 @@ getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
return {};
// Skip memory intrinsics (e.g., masked.load, masked.gather etc.)
- if (CI->mayReadOrWriteMemory())
+ if (!SLPReVec && CI->getType()->isVectorTy())
return {};
for (Value *ArgOp : CI->args()) {
@@ -12276,9 +12276,8 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
}
}
if (NewLoopNest.size() > CurrentLoopNest.size())
- CurrentLoopNest.append(
- std::next(NewLoopNest.begin(), CurrentLoopNest.size()),
- NewLoopNest.end());
+ CurrentLoopNest.append(std::next(NewLoopNest.begin(), CurrentLoopNest.size()),
+ NewLoopNest.end());
}
}
}
@@ -18476,9 +18475,10 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
assert(SLPReVec && "Only supported by REVEC.");
SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
}
- InstructionCost CastCost = TTI->getCastInstrCost(
- Opcode, DstTy, SrcTy, TTI::CastContextHint::None,
- TTI::TCK_RecipThroughput);
+ InstructionCost CastCost =
+ TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
+ TTI::CastContextHint::None,
+ TTI::TCK_RecipThroughput);
CastCost = ScaleCost(CastCost, Root, /*Scalar=*/nullptr, ReductionRoot);
Cost += CastCost;
}
@@ -18647,8 +18647,9 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
default:
break;
}
- InstructionCost CastCost = TTI->getCastInstrCost(
- Opcode, DstVecTy, SrcVecTy, CCH, TTI::TCK_RecipThroughput);
+ InstructionCost CastCost =
+ TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
+ TTI::TCK_RecipThroughput);
CastCost = ScaleCost(CastCost, *VectorizableTree.front().get(),
/*Scalar=*/nullptr, ReductionRoot);
Cost += CastCost;
@@ -29506,30 +29507,27 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
PostProcessCmps.insert(cast<CmpInst>(&*It));
}
- SmallMapVector<Intrinsic::ID, SmallSetVector<Value *, 4>, 4> IntrinsicSeedOps;
+ // Collect operands of non-trivially vectorizable intrinsic calls (e.g.,
+ // llvm.amdgcn.exp2) and group by intrinsic ID, so their operands can be
+ // vectorized independently.
+ // FIXME: Extend for all non-vectorized functions.
+ SmallMapVector<std::pair<Intrinsic::ID, unsigned>, SmallVector<Value *, 4>, 4>
+ OpcodeGroups;
+
for (Instruction &I : *BB) {
if (R.isDeleted(&I))
continue;
- // Collect operands of non-trivially vectorizable intrinsic calls (e.g.,
- // llvm.amdgcn.exp2) and group by intrinsic ID, so their operands can be
- // vectorized independently.
- // FIXME: Extend for all non-vectorized functions.
SmallVector<Value *, 4> Ops =
getNonTriviallyVectorizableIntrinsicCallOperand(&I);
- if (!Ops.empty())
- IntrinsicSeedOps[cast<CallInst>(&I)->getIntrinsicID()].insert_range(Ops);
- }
- // Try to vectorize per intrinsic call ID.
- for (auto &[ID, Ops] : IntrinsicSeedOps) {
- // Sub-group by opcode so we do not get bailed early
- SmallMapVector<unsigned, SmallVector<Value *, 4>, 4> OpcodeGroups;
- for (Value *Op : Ops) {
- if (auto *I = dyn_cast<Instruction>(Op))
- OpcodeGroups[I->getOpcode()].push_back(Op);
+ if (!Ops.empty()) {
+ Intrinsic::ID ID = cast<CallInst>(&I)->getIntrinsicID();
+ for (Value *Op : Ops)
+ if (auto *OpI = dyn_cast<Instruction>(Op))
+ OpcodeGroups[{ID, OpI->getOpcode()}].push_back(Op);
}
- for (auto &[Opc, Group] : OpcodeGroups)
- Changed |= tryToVectorizeList(Group, R);
}
+ for (auto &[_, OpGroup] : OpcodeGroups)
+ Changed |= tryToVectorizeList(OpGroup, R);
return Changed;
}
>From daca0bb67c57abfdb91ed34143421e843778dd63 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Sun, 5 Apr 2026 20:53:37 -0500
Subject: [PATCH 6/6] Addressed reviewers' comments, added new tests.
---
.../llvm/Transforms/Vectorize/SLPVectorizer.h | 5 +
.../Transforms/Vectorize/SLPVectorizer.cpp | 55 ++++--
...otriviallyvectorizableintrinsicoperands.ll | 184 ++++++++++++++++++
3 files changed, 226 insertions(+), 18 deletions(-)
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 877c83291170b..7c35dd70e2a77 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -152,6 +152,11 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
/// a vectorization chain.
bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
+ /// Tries to vectorize the operands of non-trivially-vectorizable intrinsic
+ /// calls in the block.
+ bool vectorizeIntrinsicSeedsInBlock(BasicBlock *BB,
+ slpvectorizer::BoUpSLP &R);
+
std::optional<bool> vectorizeStoreChain(ArrayRef<Value *> Chain,
slpvectorizer::BoUpSLP &R,
unsigned Idx, unsigned MinVF,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 05dcc1d134f0f..db28c2ba18fe2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -249,22 +249,20 @@ static const unsigned MaxPHINumOperands = 128;
SmallVector<Value *, 4>
getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
- SmallVector<Value *, 4> Operands;
- auto *CI = dyn_cast<CallInst>(V);
-
- if (!CI || isAssumeLikeIntrinsic(CI))
+ auto *II = dyn_cast<IntrinsicInst>(V);
+ if (!II || isAssumeLikeIntrinsic(II))
return {};
- Intrinsic::ID ID = CI->getIntrinsicID();
- // Only consider intrinsic calls.
- // FIXME: We may want to relax this condition in future.
- if (ID == Intrinsic::not_intrinsic || isTriviallyVectorizable(ID))
+
+ if (isTriviallyVectorizable(II->getIntrinsicID()))
return {};
// Skip memory intrinsics (e.g., masked.load, masked.gather etc.)
- if (!SLPReVec && CI->getType()->isVectorTy())
+ if (!SLPReVec && II->getType()->isVectorTy())
return {};
- for (Value *ArgOp : CI->args()) {
+ // FIXME: Add non-instructions operands to the list.
+ SmallVector<Value *, 4> Operands;
+ for (Value *ArgOp : II->args()) {
if (auto *I = dyn_cast<Instruction>(ArgOp)) {
Operands.emplace_back(I);
}
@@ -25298,6 +25296,9 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// Vectorize trees that end at reductions.
Changed |= vectorizeChainsInBlock(BB, R);
+ // Vectorize operands of non-trivially-vectorizable intrinsic calls.
+ Changed |= vectorizeIntrinsicSeedsInBlock(BB, R);
+
// Vectorize the index computations of getelementptr instructions. This
// is primarily intended to catch gather-like idioms ending at
// non-consecutive loads.
@@ -29507,13 +29508,21 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
PostProcessCmps.insert(cast<CmpInst>(&*It));
}
+ return Changed;
+}
+
+bool SLPVectorizerPass::vectorizeIntrinsicSeedsInBlock(BasicBlock *BB,
+ BoUpSLP &R) {
+ bool Changed = false;
// Collect operands of non-trivially vectorizable intrinsic calls (e.g.,
// llvm.amdgcn.exp2) and group by intrinsic ID, so their operands can be
// vectorized independently.
// FIXME: Extend for all non-vectorized functions.
- SmallMapVector<std::pair<Intrinsic::ID, unsigned>, SmallVector<Value *, 4>, 4>
- OpcodeGroups;
-
+ SmallMapVector<std::pair<Intrinsic::ID, unsigned>, // (ID, OpIndex)
+ SmallMapVector<unsigned, // Opcode
+ SmallVector<Value *, 4>, 4>,
+ 4>
+ IntrinsicSeedOps;
for (Instruction &I : *BB) {
if (R.isDeleted(&I))
continue;
@@ -29521,14 +29530,24 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
getNonTriviallyVectorizableIntrinsicCallOperand(&I);
if (!Ops.empty()) {
Intrinsic::ID ID = cast<CallInst>(&I)->getIntrinsicID();
- for (Value *Op : Ops)
- if (auto *OpI = dyn_cast<Instruction>(Op))
- OpcodeGroups[{ID, OpI->getOpcode()}].push_back(Op);
+ for (auto [OpIdx, Op] : enumerate(Ops)) {
+ if (auto *OpI = dyn_cast<Instruction>(Op)) {
+ IntrinsicSeedOps[{ID, OpIdx}][OpI->getOpcode()].push_back(Op);
+ }
+ }
}
}
- for (auto &[_, OpGroup] : OpcodeGroups)
- Changed |= tryToVectorizeList(OpGroup, R);
+ for (auto &[_, OpcodeMap] : IntrinsicSeedOps)
+ for (auto &[_, Group] : OpcodeMap) {
+ // Don't include instructions that were deleted by previous
+ // vectorization.
+ auto Candidates = make_filter_range(Group, [&](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return I && !R.isDeleted(I);
+ });
+ Changed |= tryToVectorizeList(SmallVector<Value *, 4>(Candidates), R);
+ }
return Changed;
}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
index 0ca33e0e6b09f..963b7808335ac 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
@@ -355,6 +355,190 @@ entry:
ret void
}
+define amdgpu_kernel void @kernel_div_scale(ptr addrspace(1) %num, ptr addrspace(1) %den, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @kernel_div_scale(
+; GCN-SAME: ptr addrspace(1) [[NUM:%.*]], ptr addrspace(1) [[DEN:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[NPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[NUM]], i64 2
+; GCN-NEXT: [[N2:%.*]] = load float, ptr addrspace(1) [[NPTR2]], align 4
+; GCN-NEXT: [[DPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[DEN]], i64 2
+; GCN-NEXT: [[D2:%.*]] = load float, ptr addrspace(1) [[DPTR2]], align 4
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[NUM]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], splat (float 2.000000e+00)
+; GCN-NEXT: [[MUL_N2:%.*]] = fmul float [[N2]], 2.000000e+00
+; GCN-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(1) [[DEN]], align 4
+; GCN-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], splat (float 4.000000e+00)
+; GCN-NEXT: [[MUL_D2:%.*]] = fmul float [[D2]], 4.000000e+00
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; GCN-NEXT: [[DS0:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[TMP4]], float [[TMP5]], i1 false)
+; GCN-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; GCN-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; GCN-NEXT: [[DS1:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[TMP6]], float [[TMP7]], i1 false)
+; GCN-NEXT: [[DS2:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[MUL_N2]], float [[MUL_D2]], i1 false)
+; GCN-NEXT: [[R0:%.*]] = extractvalue { float, i1 } [[DS0]], 0
+; GCN-NEXT: [[R1:%.*]] = extractvalue { float, i1 } [[DS1]], 0
+; GCN-NEXT: [[R2:%.*]] = extractvalue { float, i1 } [[DS2]], 0
+; GCN-NEXT: [[SUM01:%.*]] = fadd float [[R0]], [[R1]]
+; GCN-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[R2]]
+; GCN-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; GCN-NEXT: ret void
+;
+entry:
+ %n0 = load float, ptr addrspace(1) %num, align 4
+ %nptr1 = getelementptr float, ptr addrspace(1) %num, i64 1
+ %n1 = load float, ptr addrspace(1) %nptr1, align 4
+ %nptr2 = getelementptr float, ptr addrspace(1) %num, i64 2
+ %n2 = load float, ptr addrspace(1) %nptr2, align 4
+ %d0 = load float, ptr addrspace(1) %den, align 4
+ %dptr1 = getelementptr float, ptr addrspace(1) %den, i64 1
+ %d1 = load float, ptr addrspace(1) %dptr1, align 4
+ %dptr2 = getelementptr float, ptr addrspace(1) %den, i64 2
+ %d2 = load float, ptr addrspace(1) %dptr2, align 4
+ %mul_n0 = fmul float %n0, 2.0
+ %mul_n1 = fmul float %n1, 2.0
+ %mul_n2 = fmul float %n2, 2.0
+ %mul_d0 = fmul float %d0, 4.0
+ %mul_d1 = fmul float %d1, 4.0
+ %mul_d2 = fmul float %d2, 4.0
+ %ds0 = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %mul_n0, float %mul_d0, i1 false)
+ %ds1 = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %mul_n1, float %mul_d1, i1 false)
+ %ds2 = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %mul_n2, float %mul_d2, i1 false)
+ %r0 = extractvalue { float, i1 } %ds0, 0
+ %r1 = extractvalue { float, i1 } %ds1, 0
+ %r2 = extractvalue { float, i1 } %ds2, 0
+ %sum01 = fadd float %r0, %r1
+ %sum = fadd float %sum01, %r2
+ store float %sum, ptr addrspace(1) %output, align 4
+ ret void
+}
+
+define amdgpu_kernel void @kernel_fmed3(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @kernel_fmed3(
+; GCN-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[APTR2:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 2
+; GCN-NEXT: [[A2:%.*]] = load float, ptr addrspace(1) [[APTR2]], align 4
+; GCN-NEXT: [[BPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 2
+; GCN-NEXT: [[B2:%.*]] = load float, ptr addrspace(1) [[BPTR2]], align 4
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[A]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[B]], align 4
+; GCN-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[TMP0]], [[TMP1]]
+; GCN-NEXT: [[ADD2:%.*]] = fadd float [[A2]], [[B2]]
+; GCN-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; GCN-NEXT: [[MED0:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP3]], float [[TMP3]], float 1.000000e+00)
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; GCN-NEXT: [[MED1:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP4]], float [[TMP4]], float 1.000000e+00)
+; GCN-NEXT: [[MED2:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD2]], float [[ADD2]], float 1.000000e+00)
+; GCN-NEXT: [[SUM01:%.*]] = fadd float [[MED0]], [[MED1]]
+; GCN-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[MED2]]
+; GCN-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; GCN-NEXT: ret void
+;
+entry:
+ %a0 = load float, ptr addrspace(1) %a, align 4
+ %aptr1 = getelementptr float, ptr addrspace(1) %a, i64 1
+ %a1 = load float, ptr addrspace(1) %aptr1, align 4
+ %aptr2 = getelementptr float, ptr addrspace(1) %a, i64 2
+ %a2 = load float, ptr addrspace(1) %aptr2, align 4
+
+ %b0 = load float, ptr addrspace(1) %b, align 4
+ %bptr1 = getelementptr float, ptr addrspace(1) %b, i64 1
+ %b1 = load float, ptr addrspace(1) %bptr1, align 4
+ %bptr2 = getelementptr float, ptr addrspace(1) %b, i64 2
+ %b2 = load float, ptr addrspace(1) %bptr2, align 4
+
+ %add0 = fadd float %a0, %b0
+ %add1 = fadd float %a1, %b1
+ %add2 = fadd float %a2, %b2
+
+ %med0 = call float @llvm.amdgcn.fmed3.f32(float %add0, float %add0, float 1.0)
+ %med1 = call float @llvm.amdgcn.fmed3.f32(float %add1, float %add1, float 1.0)
+ %med2 = call float @llvm.amdgcn.fmed3.f32(float %add2, float %add2, float 1.0)
+
+ %sum01 = fadd float %med0, %med1
+ %sum = fadd float %sum01, %med2
+ store float %sum, ptr addrspace(1) %output, align 4
+ ret void
+}
+
+define amdgpu_kernel void @kernel_fmed3_1(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @kernel_fmed3_1(
+; GCN-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[A0:%.*]] = load float, ptr addrspace(1) [[A]], align 4
+; GCN-NEXT: [[APTR1:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 1
+; GCN-NEXT: [[A1:%.*]] = load float, ptr addrspace(1) [[APTR1]], align 4
+; GCN-NEXT: [[APTR2:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 2
+; GCN-NEXT: [[A2:%.*]] = load float, ptr addrspace(1) [[APTR2]], align 4
+; GCN-NEXT: [[APTR3:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 3
+; GCN-NEXT: [[A3:%.*]] = load float, ptr addrspace(1) [[APTR3]], align 4
+; GCN-NEXT: [[BPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 2
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[B]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = fadd <2 x float> splat (float 5.000000e+00), [[TMP0]]
+; GCN-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(1) [[BPTR2]], align 4
+; GCN-NEXT: [[TMP3:%.*]] = fadd <2 x float> splat (float 5.000000e+00), [[TMP2]]
+; GCN-NEXT: [[TMP4:%.*]] = fadd <2 x float> splat (float 1.000000e+00), [[TMP0]]
+; GCN-NEXT: [[TMP5:%.*]] = fadd <2 x float> splat (float 1.000000e+00), [[TMP2]]
+; GCN-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; GCN-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; GCN-NEXT: [[MED0:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP6]], float [[TMP7]], float 1.000000e+00)
+; GCN-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; GCN-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; GCN-NEXT: [[MED1:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP8]], float [[TMP9]], float 1.000000e+00)
+; GCN-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; GCN-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
+; GCN-NEXT: [[MED2:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP10]], float [[TMP11]], float 1.000000e+00)
+; GCN-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; GCN-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
+; GCN-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP12]], float [[TMP13]], float 1.000000e+00)
+; GCN-NEXT: [[SUM01:%.*]] = fadd float [[MED0]], [[MED1]]
+; GCN-NEXT: [[SUM02:%.*]] = fadd float [[MED2]], [[MED3]]
+; GCN-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[SUM02]]
+; GCN-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; GCN-NEXT: ret void
+;
+entry:
+ %a0 = load float, ptr addrspace(1) %a, align 4
+ %aptr1 = getelementptr float, ptr addrspace(1) %a, i64 1
+ %a1 = load float, ptr addrspace(1) %aptr1, align 4
+ %aptr2 = getelementptr float, ptr addrspace(1) %a, i64 2
+ %a2 = load float, ptr addrspace(1) %aptr2, align 4
+ %aptr3 = getelementptr float, ptr addrspace(1) %a, i64 3
+ %a3 = load float, ptr addrspace(1) %aptr3, align 4
+
+ %b0 = load float, ptr addrspace(1) %b, align 4
+ %bptr1 = getelementptr float, ptr addrspace(1) %b, i64 1
+ %b1 = load float, ptr addrspace(1) %bptr1, align 4
+ %bptr2 = getelementptr float, ptr addrspace(1) %b, i64 2
+ %b2 = load float, ptr addrspace(1) %bptr2, align 4
+ %bptr3 = getelementptr float, ptr addrspace(1) %b, i64 3
+ %b3 = load float, ptr addrspace(1) %bptr3, align 4
+
+ %add0 = fadd float 5.0, %b0
+ %add1 = fadd float 5.0, %b1
+ %add2 = fadd float 5.0, %b2
+ %add3 = fadd float 5.0, %b3
+
+ %sub0 = fadd float 1.0, %b0
+ %sub1 = fadd float 1.0, %b1
+ %sub2 = fadd float 1.0, %b2
+ %sub3 = fadd float 1.0, %b3
+
+ %med0 = call float @llvm.amdgcn.fmed3.f32(float %add0, float %sub0, float 1.0)
+ %med1 = call float @llvm.amdgcn.fmed3.f32(float %add1, float %sub1, float 1.0)
+ %med2 = call float @llvm.amdgcn.fmed3.f32(float %add2, float %sub2, float 1.0)
+ %med3 = call float @llvm.amdgcn.fmed3.f32(float %add3, float %sub3, float 1.0)
+
+ %sum01 = fadd float %med0, %med1
+ %sum02 = fadd float %med2, %med3
+ %sum = fadd float %sum01, %sum02
+ store float %sum, ptr addrspace(1) %output, align 4
+ ret void
+}
+
+declare float @llvm.amdgcn.fmed3.f32(float, float, float)
+declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1)
declare half @llvm.amdgcn.exp2.f16(half)
declare float @llvm.amdgcn.exp2.f32(float)
declare <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 immarg, <16 x i32>, i32 immarg, <16 x i32>, i16 immarg, <8 x float>, i32 immarg, i32 immarg, i32, i32 immarg, i32 immarg, i32, i1 immarg, i1 immarg)
More information about the llvm-commits
mailing list