[llvm] [SLP][AMDGPU] Vectorize operands of non-trivially-vectorizable intrinsic calls (PR #189784)

Syadus Sefat via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 2 23:25:27 PDT 2026


https://github.com/mssefat updated https://github.com/llvm/llvm-project/pull/189784

>From c5fbe3c0fdef42860e2dc715364d62f8a27937f6 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Tue, 31 Mar 2026 19:28:41 -0500
Subject: [PATCH 1/3] [SLP][AMDGPU] Vectorize operands of
 non-trivially-vectorizable intrinsic calls

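A minimal sketch of the kind of scalar pattern this targets (illustrative
names, adapted from the new AMDGPU test below; whether the seeds actually get
vectorized still depends on the SLP cost model). The fmul/fsub chains feeding
the non-trivially-vectorizable llvm.amdgcn.exp2 calls are collected as SLP
seeds and can be rewritten as <2 x float> operations, while the exp2 calls
themselves stay scalar and read their inputs via extractelement:

declare float @llvm.amdgcn.exp2.f32(float)

define void @sketch(ptr addrspace(1) %out, float %in0, float %in1, float %max) {
  %mul0 = fmul contract float %in0, 2.0
  %mul1 = fmul contract float %in1, 2.0
  %sub0 = fsub contract float %mul0, %max
  %sub1 = fsub contract float %mul1, %max
  ; The scalar chains above become vectorization seeds; the calls below are
  ; left scalar.
  %exp0 = tail call float @llvm.amdgcn.exp2.f32(float %sub0)
  %exp1 = tail call float @llvm.amdgcn.exp2.f32(float %sub1)
  store float %exp0, ptr addrspace(1) %out, align 4
  %p1 = getelementptr float, ptr addrspace(1) %out, i64 1
  store float %exp1, ptr addrspace(1) %p1, align 4
  ret void
}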
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  49 +++
 ...otriviallyvectorizableintrinsicoperands.ll | 360 ++++++++++++++++++
 2 files changed, 409 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f7c78db5a83ac..9301a938160aa 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -243,6 +243,39 @@ static const int MinScheduleRegionSize = 16;
 /// Maximum allowed number of operands in the PHI nodes.
 static const unsigned MaxPHINumOperands = 128;
 
+/// For instructions that are not trivially vectorizable, try to vectorize their
+/// operands.
+/// FIXME: Extend for all non-vectorized functions.
+static Value *getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
+  auto *CI = dyn_cast<CallInst>(V);
+  if (!CI)
+    return nullptr;
+  Intrinsic::ID ID = CI->getIntrinsicID();
+  // Only consider intrinsic calls.
+  // FIXME: We may want to relax this condition in future.
+  if (ID == Intrinsic::not_intrinsic)
+    return nullptr;
+  // Skip trivially vectorizable intrinsics.
+  if (isTriviallyVectorizable(ID))
+    return nullptr;
+  // Only look through unary intrinsic calls.
+  if (CI->arg_size() != 1)
+    return nullptr;
+  // Check that it is speculatable, does not access memory, and will return.
+  if (!CI->hasFnAttr(Attribute::Speculatable) || !CI->doesNotAccessMemory() ||
+      !CI->willReturn())
+    return nullptr;
+  auto *Operand = dyn_cast<Instruction>(CI->getArgOperand(0));
+  if (!Operand)
+    return nullptr;
+  // The operand type should match the result type; we ignore type-changing
+  // intrinsics.
+  if (Operand->getType() != CI->getType())
+    return nullptr;
+
+  return Operand;
+}
+
 /// Predicate for the element types that the SLP vectorizer supports.
 ///
 /// The most important thing to filter here are types which are invalid in LLVM
@@ -29477,6 +29510,22 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       PostProcessCmps.insert(cast<CmpInst>(&*It));
   }
 
+  DenseMap<Intrinsic::ID, SmallSetVector<Value *, 4>> IntrinsicSeedOps;
+  for (Instruction &I : *BB) {
+    if (R.isDeleted(&I))
+      continue;
+    // Collect operands of non-trivially vectorizable intrinsic calls (e.g.,
+    // llvm.amdgcn.exp2), grouped by intrinsic ID, so they can be vectorized
+    // independently.
+    // FIXME: Extend for all non-vectorized functions.
+    if (Value *Op = getNonTriviallyVectorizableIntrinsicCallOperand(&I))
+      IntrinsicSeedOps[cast<CallInst>(&I)->getIntrinsicID()].insert(Op);
+  }
+  // Try to vectorize per intrinsic call ID.
+  for (auto &[ID, Ops] : IntrinsicSeedOps) {
+    Changed |= tryToVectorizeList(Ops.getArrayRef(), R);
+  }
+
   return Changed;
 }
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
new file mode 100644
index 0000000000000..0ca33e0e6b09f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
@@ -0,0 +1,360 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S -mcpu=gfx1250 -mtriple=amdgcn-amd-amdhsa -o - %s | FileCheck %s --check-prefix GCN
+
+define amdgpu_kernel void @test_with_wmma(ptr addrspace(1) %input, ptr addrspace(1) %output, float %scaled_max, <16 x i32> %A, <16 x i32> %B, i32 %scale_idx) {
+; GCN-LABEL: define amdgpu_kernel void @test_with_wmma(
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[OUTPUT:%.*]], float [[SCALED_MAX:%.*]], <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i32 [[SCALE_IDX:%.*]]) #[[ATTR0:[0-9]+]] {
+; GCN-NEXT:  [[ENTRY:.*:]]
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[INPUT]], align 4
+; GCN-NEXT:    [[TMP1:%.*]] = fmul contract <2 x float> [[TMP0]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[SCALED_MAX]], i32 0
+; GCN-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; GCN-NEXT:    [[TMP4:%.*]] = fsub contract <2 x float> [[TMP1]], [[TMP3]]
+; GCN-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; GCN-NEXT:    [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP5]])
+; GCN-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; GCN-NEXT:    [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP6]])
+; GCN-NEXT:    [[VEC0:%.*]] = insertelement <2 x float> poison, float [[EXP0]], i64 0
+; GCN-NEXT:    [[VEC1:%.*]] = insertelement <2 x float> [[VEC0]], float [[EXP1]], i64 1
+; GCN-NEXT:    [[VEC_I32:%.*]] = bitcast <2 x float> [[VEC1]] to <2 x i32>
+; GCN-NEXT:    [[SCALE0:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 0
+; GCN-NEXT:    [[SCALE1:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 1
+; GCN-NEXT:    [[WMMA0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
+; GCN-NEXT:    [[WMMA1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[WMMA0]], i32 0, i32 0, i32 [[SCALE1]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
+; GCN-NEXT:    store <8 x float> [[WMMA1]], ptr addrspace(1) [[OUTPUT]], align 32
+; GCN-NEXT:    ret void
+;
+entry:
+
+  %in0 = load float, ptr addrspace(1) %input, align 4
+  %ptr1 = getelementptr float, ptr addrspace(1) %input, i64 1
+  %in1 = load float, ptr addrspace(1) %ptr1, align 4
+
+  %mul0 = fmul contract float %in0, 0x3FC0527DC0000000
+  %mul1 = fmul contract float %in1, 0x3FC0527DC0000000
+
+  %sub0 = fsub contract float %mul0, %scaled_max
+  %sub1 = fsub contract float %mul1, %scaled_max
+
+  %exp0 = tail call float @llvm.amdgcn.exp2.f32(float %sub0)
+  %exp1 = tail call float @llvm.amdgcn.exp2.f32(float %sub1)
+
+  %vec0 = insertelement <2 x float> poison, float %exp0, i64 0
+  %vec1 = insertelement <2 x float> %vec0, float %exp1, i64 1
+
+  %vec_i32 = bitcast <2 x float> %vec1 to <2 x i32>
+
+  %scale0 = extractelement <2 x i32> %vec_i32, i64 0
+  %scale1 = extractelement <2 x i32> %vec_i32, i64 1
+
+  %wmma0 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(
+  i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> zeroinitializer,
+  i32 0, i32 0, i32 %scale0, i32 0, i32 0, i32 %scale_idx, i1 false, i1 false)
+
+  %wmma1 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(
+  i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %wmma0,
+  i32 0, i32 0, i32 %scale1, i32 0, i32 0, i32 %scale_idx, i1 false, i1 false)
+
+  store <8 x float> %wmma1, ptr addrspace(1) %output, align 32
+  ret void
+}
+
+define amdgpu_kernel void @test_amdgcn_exp_log(ptr addrspace(1) %input, ptr addrspace(1) %scales, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @test_amdgcn_exp_log(
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT:  [[ENTRY:.*:]]
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[INPUT]], align 4
+; GCN-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[SCALES]], align 4
+; GCN-NEXT:    [[TMP2:%.*]] = fmul contract <2 x float> [[TMP0]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT:    [[TMP3:%.*]] = fsub contract <2 x float> [[TMP2]], [[TMP1]]
+; GCN-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; GCN-NEXT:    [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP4]])
+; GCN-NEXT:    [[LOG0:%.*]] = tail call float @llvm.amdgcn.log.f32(float [[EXP0]])
+; GCN-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; GCN-NEXT:    [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP5]])
+; GCN-NEXT:    [[LOG1:%.*]] = tail call float @llvm.amdgcn.log.f32(float [[EXP1]])
+; GCN-NEXT:    [[SUM:%.*]] = fadd fast float [[LOG0]], [[LOG1]]
+; GCN-NEXT:    store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; GCN-NEXT:    ret void
+;
+entry:
+  %in0 = load float, ptr addrspace(1) %input, align 4
+  %ptr1 = getelementptr float, ptr addrspace(1) %input, i64 1
+  %in1 = load float, ptr addrspace(1) %ptr1, align 4
+  %scale0 = load float, ptr addrspace(1) %scales, align 4
+  %sptr1 = getelementptr float, ptr addrspace(1) %scales, i64 1
+  %scale1 = load float, ptr addrspace(1) %sptr1, align 4
+  %mul0 = fmul contract float %in0, 0x3FC0527DC0000000
+  %mul1 = fmul contract float %in1, 0x3FC0527DC0000000
+  %sub0 = fsub contract float %mul0, %scale0
+  %sub1 = fsub contract float %mul1, %scale1
+  %exp0 = tail call float @llvm.amdgcn.exp2.f32(float %sub0)
+  %log0 = tail call float @llvm.amdgcn.log.f32(float %exp0)
+  %exp1 = tail call float @llvm.amdgcn.exp2.f32(float %sub1)
+  %log1 = tail call float @llvm.amdgcn.log.f32(float %exp1)
+  %sum = fadd fast float %log0, %log1
+  store float %sum, ptr addrspace(1) %output, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_amdgcn_exp_f16(ptr addrspace(1) %input, ptr addrspace(1) %scales, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @test_amdgcn_exp_f16(
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT:  [[ENTRY:.*:]]
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT:    [[TMP2:%.*]] = fmul contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT:    [[TMP3:%.*]] = fsub contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT:    [[TMP4:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT:    [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP4]])
+; GCN-NEXT:    [[TMP5:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT:    [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP5]])
+; GCN-NEXT:    [[SUM:%.*]] = fadd fast half [[EXP0]], [[EXP1]]
+; GCN-NEXT:    store half [[SUM]], ptr addrspace(1) [[OUTPUT]], align 2
+; GCN-NEXT:    ret void
+;
+entry:
+  %in0 = load half, ptr addrspace(1) %input, align 2
+  %ptr1 = getelementptr half, ptr addrspace(1) %input, i64 1
+  %in1 = load half, ptr addrspace(1) %ptr1, align 2
+  %scale0 = load half, ptr addrspace(1) %scales, align 2
+  %sptr1 = getelementptr half, ptr addrspace(1) %scales, i64 1
+  %scale1 = load half, ptr addrspace(1) %sptr1, align 2
+  %mul0 = fmul contract half %in0, 0xH3E14
+  %mul1 = fmul contract half %in1, 0xH3E14
+  %sub0 = fsub contract half %mul0, %scale0
+  %sub1 = fsub contract half %mul1, %scale1
+  %exp0 = tail call half @llvm.amdgcn.exp2.f16(half %sub0)
+  %exp1 = tail call half @llvm.amdgcn.exp2.f16(half %sub1)
+  %sum = fadd fast half %exp0, %exp1
+  store half %sum, ptr addrspace(1) %output, align 2
+  ret void
+}
+
+define amdgpu_kernel void @kernel_f16(ptr addrspace(1) %input, ptr addrspace(1) %scales, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @kernel_f16(
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT:  [[ENTRY:.*:]]
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT:    [[TMP2:%.*]] = fmul contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT:    [[TMP3:%.*]] = fsub contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT:    [[TMP4:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT:    [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP4]])
+; GCN-NEXT:    [[TMP5:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT:    [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP5]])
+; GCN-NEXT:    [[LOG0:%.*]] = tail call half @llvm.amdgcn.log.f16(half [[EXP0]])
+; GCN-NEXT:    [[LOG1:%.*]] = tail call half @llvm.amdgcn.log.f16(half [[EXP1]])
+; GCN-NEXT:    [[SUM:%.*]] = fadd fast half [[LOG0]], [[LOG1]]
+; GCN-NEXT:    store half [[SUM]], ptr addrspace(1) [[OUTPUT]], align 2
+; GCN-NEXT:    ret void
+;
+entry:
+  %in0 = load half, ptr addrspace(1) %input, align 2
+  %ptr1 = getelementptr half, ptr addrspace(1) %input, i64 1
+  %in1 = load half, ptr addrspace(1) %ptr1, align 2
+  %scale0 = load half, ptr addrspace(1) %scales, align 2
+  %sptr1 = getelementptr half, ptr addrspace(1) %scales, i64 1
+  %scale1 = load half, ptr addrspace(1) %sptr1, align 2
+  %mul0 = fmul contract half %in0, 0xH3E14
+  %mul1 = fmul contract half %in1, 0xH3E14
+  %sub0 = fsub contract half %mul0, %scale0
+  %sub1 = fsub contract half %mul1, %scale1
+  %exp0 = tail call half @llvm.amdgcn.exp2.f16(half %sub0)
+  %exp1 = tail call half @llvm.amdgcn.exp2.f16(half %sub1)
+  %log0 = tail call half @llvm.amdgcn.log.f16(half %exp0)
+  %log1 = tail call half @llvm.amdgcn.log.f16(half %exp1)
+  %sum = fadd fast half %log0, %log1
+  store half %sum, ptr addrspace(1) %output, align 2
+  ret void
+}
+
+define amdgpu_kernel void @look_through_reuse_shuffle(
+; GCN-LABEL: define amdgpu_kernel void @look_through_reuse_shuffle(
+; GCN-SAME: ptr addrspace(1) noalias [[INPUT:%.*]], ptr addrspace(1) noalias [[SCALES:%.*]], ptr addrspace(1) noalias [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT:  [[ENTRY:.*:]]
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT:    [[TMP2:%.*]] = fadd contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT:    [[TMP3:%.*]] = fmul contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT:    [[TMP4:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT:    [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP4]])
+; GCN-NEXT:    [[TMP5:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT:    [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP5]])
+; GCN-NEXT:    [[TMP6:%.*]] = insertelement <4 x half> poison, half [[EXP0]], i32 0
+; GCN-NEXT:    [[TMP7:%.*]] = insertelement <4 x half> [[TMP6]], half [[EXP1]], i32 1
+; GCN-NEXT:    [[TMP8:%.*]] = shufflevector <4 x half> [[TMP7]], <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; GCN-NEXT:    store <4 x half> [[TMP8]], ptr addrspace(1) [[OUTPUT]], align 2
+; GCN-NEXT:    ret void
+;
+  ptr addrspace(1) noalias %input, ptr addrspace(1) noalias %scales,
+  ptr addrspace(1) noalias %output) {
+entry:
+  %iptr1 = getelementptr half, ptr addrspace(1) %input, i64 1
+  %sptr1 = getelementptr half, ptr addrspace(1) %scales, i64 1
+  %optr1 = getelementptr half, ptr addrspace(1) %output, i64 1
+  %optr2 = getelementptr half, ptr addrspace(1) %output, i64 2
+  %optr3 = getelementptr half, ptr addrspace(1) %output, i64 3
+
+  %in0 = load half, ptr addrspace(1) %input, align 2
+  %in1 = load half, ptr addrspace(1) %iptr1, align 2
+  %s0 = load half, ptr addrspace(1) %scales, align 2
+  %s1 = load half, ptr addrspace(1) %sptr1, align 2
+
+  %add0 = fadd contract half %in0, 0xH3E14
+  %add1 = fadd contract half %in1, 0xH3E14
+
+  %mul0 = fmul contract half %add0, %s0
+  %mul1 = fmul contract half %add1, %s1
+
+  %exp0 = tail call half @llvm.amdgcn.exp2.f16(half %mul0)
+  %exp1 = tail call half @llvm.amdgcn.exp2.f16(half %mul1)
+
+  store half %exp0, ptr addrspace(1) %output, align 2
+  store half %exp1, ptr addrspace(1) %optr1, align 2
+  store half %exp1, ptr addrspace(1) %optr2, align 2
+  store half %exp1, ptr addrspace(1) %optr3, align 2
+  ret void
+}
+
+define amdgpu_kernel void @wider_exp2_f32(ptr addrspace(1) %input, ptr addrspace(1) %scales, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @wider_exp2_f32(
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT:  [[ENTRY:.*:]]
+; GCN-NEXT:    [[PTR2:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 2
+; GCN-NEXT:    [[SPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[SCALES]], i64 2
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[INPUT]], align 4
+; GCN-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[SCALES]], align 4
+; GCN-NEXT:    [[TMP2:%.*]] = fmul contract <2 x float> [[TMP0]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT:    [[TMP3:%.*]] = fsub contract <2 x float> [[TMP2]], [[TMP1]]
+; GCN-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR2]], align 4
+; GCN-NEXT:    [[TMP5:%.*]] = load <2 x float>, ptr addrspace(1) [[SPTR2]], align 4
+; GCN-NEXT:    [[TMP6:%.*]] = fmul contract <2 x float> [[TMP4]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT:    [[TMP7:%.*]] = fsub contract <2 x float> [[TMP6]], [[TMP5]]
+; GCN-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; GCN-NEXT:    [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP8]])
+; GCN-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; GCN-NEXT:    [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP9]])
+; GCN-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP7]], i32 0
+; GCN-NEXT:    [[EXP2:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP10]])
+; GCN-NEXT:    [[TMP11:%.*]] = extractelement <2 x float> [[TMP7]], i32 1
+; GCN-NEXT:    [[EXP3:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP11]])
+; GCN-NEXT:    [[SUM01:%.*]] = fadd fast float [[EXP0]], [[EXP1]]
+; GCN-NEXT:    [[SUM23:%.*]] = fadd fast float [[EXP2]], [[EXP3]]
+; GCN-NEXT:    [[SUM:%.*]] = fadd fast float [[SUM01]], [[SUM23]]
+; GCN-NEXT:    store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; GCN-NEXT:    ret void
+;
+entry:
+  %in0 = load float, ptr addrspace(1) %input, align 4
+  %ptr1 = getelementptr float, ptr addrspace(1) %input, i64 1
+  %in1 = load float, ptr addrspace(1) %ptr1, align 4
+  %ptr2 = getelementptr float, ptr addrspace(1) %input, i64 2
+  %in2 = load float, ptr addrspace(1) %ptr2, align 4
+  %ptr3 = getelementptr float, ptr addrspace(1) %input, i64 3
+  %in3 = load float, ptr addrspace(1) %ptr3, align 4
+
+  %scale0 = load float, ptr addrspace(1) %scales, align 4
+  %sptr1 = getelementptr float, ptr addrspace(1) %scales, i64 1
+  %scale1 = load float, ptr addrspace(1) %sptr1, align 4
+  %sptr2 = getelementptr float, ptr addrspace(1) %scales, i64 2
+  %scale2 = load float, ptr addrspace(1) %sptr2, align 4
+  %sptr3 = getelementptr float, ptr addrspace(1) %scales, i64 3
+  %scale3 = load float, ptr addrspace(1) %sptr3, align 4
+
+  %mul0 = fmul contract float %in0, 0x3FC0527DC0000000
+  %mul1 = fmul contract float %in1, 0x3FC0527DC0000000
+  %mul2 = fmul contract float %in2, 0x3FC0527DC0000000
+  %mul3 = fmul contract float %in3, 0x3FC0527DC0000000
+
+  %sub0 = fsub contract float %mul0, %scale0
+  %sub1 = fsub contract float %mul1, %scale1
+  %sub2 = fsub contract float %mul2, %scale2
+  %sub3 = fsub contract float %mul3, %scale3
+
+  %exp0 = tail call float @llvm.amdgcn.exp2.f32(float %sub0)
+  %exp1 = tail call float @llvm.amdgcn.exp2.f32(float %sub1)
+  %exp2 = tail call float @llvm.amdgcn.exp2.f32(float %sub2)
+  %exp3 = tail call float @llvm.amdgcn.exp2.f32(float %sub3)
+
+  %sum01 = fadd fast float %exp0, %exp1
+  %sum23 = fadd fast float %exp2, %exp3
+  %sum = fadd fast float %sum01, %sum23
+
+  store float %sum, ptr addrspace(1) %output, align 4
+  ret void
+}
+
+define amdgpu_kernel void @wider_exp2_half(ptr addrspace(1) %input, ptr addrspace(1) %scales, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @wider_exp2_half(
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT:  [[ENTRY:.*:]]
+; GCN-NEXT:    [[PTR2:%.*]] = getelementptr half, ptr addrspace(1) [[INPUT]], i64 2
+; GCN-NEXT:    [[SPTR2:%.*]] = getelementptr half, ptr addrspace(1) [[SCALES]], i64 2
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT:    [[TMP2:%.*]] = fmul contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT:    [[TMP3:%.*]] = fsub contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR2]], align 2
+; GCN-NEXT:    [[TMP5:%.*]] = load <2 x half>, ptr addrspace(1) [[SPTR2]], align 2
+; GCN-NEXT:    [[TMP6:%.*]] = fmul contract <2 x half> [[TMP4]], splat (half 0xH3E14)
+; GCN-NEXT:    [[TMP7:%.*]] = fsub contract <2 x half> [[TMP6]], [[TMP5]]
+; GCN-NEXT:    [[TMP8:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT:    [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP8]])
+; GCN-NEXT:    [[TMP9:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT:    [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP9]])
+; GCN-NEXT:    [[TMP10:%.*]] = extractelement <2 x half> [[TMP7]], i32 0
+; GCN-NEXT:    [[EXP2:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP10]])
+; GCN-NEXT:    [[TMP11:%.*]] = extractelement <2 x half> [[TMP7]], i32 1
+; GCN-NEXT:    [[EXP3:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP11]])
+; GCN-NEXT:    [[SUM01:%.*]] = fadd fast half [[EXP0]], [[EXP1]]
+; GCN-NEXT:    [[SUM23:%.*]] = fadd fast half [[EXP2]], [[EXP3]]
+; GCN-NEXT:    [[SUM:%.*]] = fadd fast half [[SUM01]], [[SUM23]]
+; GCN-NEXT:    store half [[SUM]], ptr addrspace(1) [[OUTPUT]], align 2
+; GCN-NEXT:    ret void
+;
+entry:
+  ;; Same 4-wide pattern as wider_exp2_f32. The cost model gives half a higher
+  ;; look-through cost, so we get 2-wide vectorization (or none on stricter targets).
+  %in0 = load half, ptr addrspace(1) %input, align 2
+  %ptr1 = getelementptr half, ptr addrspace(1) %input, i64 1
+  %in1 = load half, ptr addrspace(1) %ptr1, align 2
+  %ptr2 = getelementptr half, ptr addrspace(1) %input, i64 2
+  %in2 = load half, ptr addrspace(1) %ptr2, align 2
+  %ptr3 = getelementptr half, ptr addrspace(1) %input, i64 3
+  %in3 = load half, ptr addrspace(1) %ptr3, align 2
+
+  %scale0 = load half, ptr addrspace(1) %scales, align 2
+  %sptr1 = getelementptr half, ptr addrspace(1) %scales, i64 1
+  %scale1 = load half, ptr addrspace(1) %sptr1, align 2
+  %sptr2 = getelementptr half, ptr addrspace(1) %scales, i64 2
+  %scale2 = load half, ptr addrspace(1) %sptr2, align 2
+  %sptr3 = getelementptr half, ptr addrspace(1) %scales, i64 3
+  %scale3 = load half, ptr addrspace(1) %sptr3, align 2
+
+  %mul0 = fmul contract half %in0, 0xH3E14
+  %mul1 = fmul contract half %in1, 0xH3E14
+  %mul2 = fmul contract half %in2, 0xH3E14
+  %mul3 = fmul contract half %in3, 0xH3E14
+
+  %sub0 = fsub contract half %mul0, %scale0
+  %sub1 = fsub contract half %mul1, %scale1
+  %sub2 = fsub contract half %mul2, %scale2
+  %sub3 = fsub contract half %mul3, %scale3
+
+  %exp0 = tail call half @llvm.amdgcn.exp2.f16(half %sub0)
+  %exp1 = tail call half @llvm.amdgcn.exp2.f16(half %sub1)
+  %exp2 = tail call half @llvm.amdgcn.exp2.f16(half %sub2)
+  %exp3 = tail call half @llvm.amdgcn.exp2.f16(half %sub3)
+
+  %sum01 = fadd fast half %exp0, %exp1
+  %sum23 = fadd fast half %exp2, %exp3
+  %sum = fadd fast half %sum01, %sum23
+
+  store half %sum, ptr addrspace(1) %output, align 2
+  ret void
+}
+
+declare half @llvm.amdgcn.exp2.f16(half)
+declare float @llvm.amdgcn.exp2.f32(float)
+declare <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 immarg, <16 x i32>, i32 immarg, <16 x i32>, i16 immarg, <8 x float>, i32 immarg, i32 immarg, i32, i32 immarg, i32 immarg, i32, i1 immarg, i1 immarg)

>From 7c6b48e692269b6e3006957511117c13d7b054c9 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Wed, 1 Apr 2026 17:50:49 -0500
Subject: [PATCH 2/3] Fixed comment!

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9301a938160aa..3cb7e8e35779b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -258,7 +258,7 @@ static Value *getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
   // Skip trivially vectorizable intrinsics.
   if (isTriviallyVectorizable(ID))
     return nullptr;
-  // Only look through unary intrinsic calls.
+  // Only consider unary intrinsic calls.
   if (CI->arg_size() != 1)
     return nullptr;
   // Check that it is speculatable, does not access memory, and will return.

>From 54f51671b21e84100ef4a3f9356920e9a4ad358a Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Fri, 3 Apr 2026 01:16:28 -0500
Subject: [PATCH 3/3] Fixed checks and enabled for all operands

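With all instruction operands of a non-trivially-vectorizable intrinsic call
now collected, the seed list gathered for a single intrinsic ID can mix
opcodes, so the seeds are sub-grouped by opcode before each group is handed to
tryToVectorizeList. A hypothetical example (illustrative only, not taken from
the tests in this patch): four llvm.amdgcn.exp2 calls fed by two fsub and two
fmul instructions yield the groups {%s0, %s1} and {%m0, %m1}, which are tried
as two independent 2-wide lists instead of one mixed-opcode list:

declare float @llvm.amdgcn.exp2.f32(float)

define void @opcode_groups(float %a, float %b, float %c, float %d, ptr %out) {
  %s0 = fsub contract float %a, %b
  %s1 = fsub contract float %c, %d
  %m0 = fmul contract float %a, %c
  %m1 = fmul contract float %b, %d
  %e0 = call float @llvm.amdgcn.exp2.f32(float %s0)
  %e1 = call float @llvm.amdgcn.exp2.f32(float %s1)
  %e2 = call float @llvm.amdgcn.exp2.f32(float %m0)
  %e3 = call float @llvm.amdgcn.exp2.f32(float %m1)
  %a01 = fadd float %e0, %e1
  %a23 = fadd float %e2, %e3
  %sum = fadd float %a01, %a23
  store float %sum, ptr %out, align 4
  ret void
}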
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 62 ++++++++++---------
 .../Transforms/SLPVectorizer/RISCV/revec.ll   |  7 +--
 2 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3cb7e8e35779b..514acaa49fffd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -246,35 +246,32 @@ static const unsigned MaxPHINumOperands = 128;
 /// For instructions that are not trivially vectorizable, try to vectorize their
 /// operands.
 /// FIXME: Extend for all non-vectorized functions.
-static Value *getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
+static SmallVector<Value *, 4>
+getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
+
+  SmallVector<Value *, 4> Operands;
   auto *CI = dyn_cast<CallInst>(V);
-  if (!CI)
-    return nullptr;
+
+  if (!CI || isAssumeLikeIntrinsic(CI))
+    return {};
   Intrinsic::ID ID = CI->getIntrinsicID();
   // Only consider intrinsic calls.
   // FIXME: We may want to relax this condition in future.
-  if (ID == Intrinsic::not_intrinsic)
-    return nullptr;
-  // Skip trivially vectorizable intrinsics.
-  if (isTriviallyVectorizable(ID))
-    return nullptr;
-  // Only consider unary intrinsic calls.
-  if (CI->arg_size() != 1)
-    return nullptr;
-  // Check that it is speculatable, does not access memory, and will return.
-  if (!CI->hasFnAttr(Attribute::Speculatable) || !CI->doesNotAccessMemory() ||
-      !CI->willReturn())
-    return nullptr;
-  auto *Operand = dyn_cast<Instruction>(CI->getArgOperand(0));
-  if (!Operand)
-    return nullptr;
-  // The operand type should match the result type; we ignore type-changing
-  // intrinsics.
-  if (Operand->getType() != CI->getType())
-    return nullptr;
+  if (ID == Intrinsic::not_intrinsic || isTriviallyVectorizable(ID))
+    return {};
 
-  return Operand;
-}
+  // Skip memory intrinsics (e.g., masked.load, masked.gather).
+  if (CI->mayReadOrWriteMemory())
+    return {};
+
+  for (Value *ArgOp : CI->args()) {
+    if (auto *I = dyn_cast<Instruction>(ArgOp)) {
+      Operands.emplace_back(I);
+    }
+  }
+
+  return Operands;
+}
 
 /// Predicate for the element types that the SLP vectorizer supports.
 ///
@@ -29510,7 +29507,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       PostProcessCmps.insert(cast<CmpInst>(&*It));
   }
 
-  DenseMap<Intrinsic::ID, SmallSetVector<Value *, 4>> IntrinsicSeedOps;
+  SmallMapVector<Intrinsic::ID, SmallSetVector<Value *, 4>, 4> IntrinsicSeedOps;
   for (Instruction &I : *BB) {
     if (R.isDeleted(&I))
       continue;
@@ -29518,12 +29515,21 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
     // llvm.amdgcn.exp2), grouped by intrinsic ID, so they can be vectorized
     // independently.
     // FIXME: Extend for all non-vectorized functions.
-    if (Value *Op = getNonTriviallyVectorizableIntrinsicCallOperand(&I))
-      IntrinsicSeedOps[cast<CallInst>(&I)->getIntrinsicID()].insert(Op);
+    SmallVector<Value *, 4> Ops =
+        getNonTriviallyVectorizableIntrinsicCallOperand(&I);
+    if (!Ops.empty())
+      IntrinsicSeedOps[cast<CallInst>(&I)->getIntrinsicID()].insert_range(Ops);
   }
   // Try to vectorize per intrinsic call ID.
   for (auto &[ID, Ops] : IntrinsicSeedOps) {
-    Changed |= tryToVectorizeList(Ops.getArrayRef(), R);
+    // Sub-group by opcode so tryToVectorizeList does not bail out early.
+    SmallMapVector<unsigned, SmallVector<Value *, 4>, 4> OpcodeGroups;
+    for (Value *Op : Ops) {
+      if (auto *I = dyn_cast<Instruction>(Op))
+        OpcodeGroups[I->getOpcode()].push_back(Op);
+    }
+    for (auto &[Opc, Group] : OpcodeGroups)
+      Changed |= tryToVectorizeList(Group, R);
   }
 
   return Changed;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index e13dfce8c29f3..016726e5ae371 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -177,11 +177,10 @@ define ptr @test4() {
 ; NONPOWEROF2-NEXT:    [[TMP8:%.*]] = phi <6 x float> [ poison, [[TMP6:%.*]] ], [ [[TMP5]], [[TMP0:%.*]] ]
 ; NONPOWEROF2-NEXT:    br label [[TMP9:%.*]]
 ; NONPOWEROF2:       10:
-; NONPOWEROF2-NEXT:    [[TMP10:%.*]] = shufflevector <6 x float> [[TMP8]], <6 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; NONPOWEROF2-NEXT:    [[TMP11:%.*]] = fmul <3 x float> zeroinitializer, [[TMP10]]
-; NONPOWEROF2-NEXT:    [[TMP12:%.*]] = shufflevector <6 x float> [[TMP8]], <6 x float> poison, <3 x i32> <i32 3, i32 4, i32 5>
-; NONPOWEROF2-NEXT:    [[TMP13:%.*]] = fmul <3 x float> zeroinitializer, [[TMP12]]
+; NONPOWEROF2-NEXT:    [[TMP12:%.*]] = fmul <6 x float> zeroinitializer, [[TMP8]]
+; NONPOWEROF2-NEXT:    [[TMP11:%.*]] = shufflevector <6 x float> [[TMP12]], <6 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
 ; NONPOWEROF2-NEXT:    [[TMP14:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP11]])
+; NONPOWEROF2-NEXT:    [[TMP13:%.*]] = shufflevector <6 x float> [[TMP12]], <6 x float> poison, <3 x i32> <i32 3, i32 4, i32 5>
 ; NONPOWEROF2-NEXT:    [[TMP15:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP13]])
 ; NONPOWEROF2-NEXT:    [[TMP16:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP14]])
 ; NONPOWEROF2-NEXT:    [[TMP17:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP15]])


