[llvm] [SLP][AMDGPU] Vectorize operands of non-trivially-vectorizable intrinsic calls (PR #189784)
Syadus Sefat via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 5 18:56:25 PDT 2026
https://github.com/mssefat updated https://github.com/llvm/llvm-project/pull/189784
>From c5fbe3c0fdef42860e2dc715364d62f8a27937f6 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Tue, 31 Mar 2026 19:28:41 -0500
Subject: [PATCH 1/6] [SLP][AMDGPU] Vectorize operands of
non-trivially-vectorizable intrinsic calls
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 49 +++
...otriviallyvectorizableintrinsicoperands.ll | 360 ++++++++++++++++++
2 files changed, 409 insertions(+)
create mode 100644 llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f7c78db5a83ac..9301a938160aa 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -243,6 +243,39 @@ static const int MinScheduleRegionSize = 16;
/// Maximum allowed number of operands in the PHI nodes.
static const unsigned MaxPHINumOperands = 128;
+/// For instructions that are not trivially vectorizable, try to vectorize their
+/// operands.
+/// FIXME: Extend for all non-vectorized functions.
+static Value *getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
+ auto *CI = dyn_cast<CallInst>(V);
+ if (!CI)
+ return nullptr;
+ Intrinsic::ID ID = CI->getIntrinsicID();
+ // Only consider intrinsic calls.
+ // FIXME: We may want to relax this condition in future.
+ if (ID == Intrinsic::not_intrinsic)
+ return nullptr;
+ // Skip trivially vectorizable intrinsics.
+ if (isTriviallyVectorizable(ID))
+ return nullptr;
+ // Only look through unary intrinsic calls.
+ if (CI->arg_size() != 1)
+ return nullptr;
+ // Check if it is speculatable, no memory access and will return
+ if (!CI->hasFnAttr(Attribute::Speculatable) || !CI->doesNotAccessMemory() ||
+ !CI->willReturn())
+ return nullptr;
+ auto *Operand = dyn_cast<Instruction>(CI->getArgOperand(0));
+ if (!Operand)
+ return nullptr;
+ // Operand type should match the result type we ignore type changing
+ // intrinsics.
+ if (Operand->getType() != CI->getType())
+ return nullptr;
+
+ return Operand;
+}
+
/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
@@ -29477,6 +29510,22 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
PostProcessCmps.insert(cast<CmpInst>(&*It));
}
+ DenseMap<Intrinsic::ID, SmallSetVector<Value *, 4>> IntrinsicSeedOps;
+ for (Instruction &I : *BB) {
+ if (R.isDeleted(&I))
+ continue;
+ // Collect operands of non-trivially vectorizable intrinsic calls (e.g.,
+ // llvm.amdgcn.exp2) and group by intrinsic ID, so their operands can be
+ // vectorized independently.
+ // FIXME: Extend for all non-vectorized functions.
+ if (Value *Op = getNonTriviallyVectorizableIntrinsicCallOperand(&I))
+ IntrinsicSeedOps[cast<CallInst>(&I)->getIntrinsicID()].insert(Op);
+ }
+ // Try to vectorize per intrinsic call ID.
+ for (auto &[ID, Ops] : IntrinsicSeedOps) {
+ Changed |= tryToVectorizeList(Ops.getArrayRef(), R);
+ }
+
return Changed;
}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
new file mode 100644
index 0000000000000..0ca33e0e6b09f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
@@ -0,0 +1,360 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S -mcpu=gfx1250 -mtriple=amdgcn-amd-amdhsa -o - %s | FileCheck %s --check-prefix GCN
+
+define amdgpu_kernel void @test_with_wmma( ptr addrspace(1) %input, ptr addrspace(1) %output, float %scaled_max, <16 x i32> %A, <16 x i32> %B, i32 %scale_idx) {
+; GCN-LABEL: define amdgpu_kernel void @test_with_wmma(
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[OUTPUT:%.*]], float [[SCALED_MAX:%.*]], <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i32 [[SCALE_IDX:%.*]]) #[[ATTR0:[0-9]+]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[INPUT]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = fmul contract <2 x float> [[TMP0]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[SCALED_MAX]], i32 0
+; GCN-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; GCN-NEXT: [[TMP4:%.*]] = fsub contract <2 x float> [[TMP1]], [[TMP3]]
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP5]])
+; GCN-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP6]])
+; GCN-NEXT: [[VEC0:%.*]] = insertelement <2 x float> poison, float [[EXP0]], i64 0
+; GCN-NEXT: [[VEC1:%.*]] = insertelement <2 x float> [[VEC0]], float [[EXP1]], i64 1
+; GCN-NEXT: [[VEC_I32:%.*]] = bitcast <2 x float> [[VEC1]] to <2 x i32>
+; GCN-NEXT: [[SCALE0:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 0
+; GCN-NEXT: [[SCALE1:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 1
+; GCN-NEXT: [[WMMA0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
+; GCN-NEXT: [[WMMA1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[WMMA0]], i32 0, i32 0, i32 [[SCALE1]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
+; GCN-NEXT: store <8 x float> [[WMMA1]], ptr addrspace(1) [[OUTPUT]], align 32
+; GCN-NEXT: ret void
+;
+entry:
+
+ %in0 = load float, ptr addrspace(1) %input, align 4
+ %ptr1 = getelementptr float, ptr addrspace(1) %input, i64 1
+ %in1 = load float, ptr addrspace(1) %ptr1, align 4
+
+ %mul0 = fmul contract float %in0, 0x3FC0527DC0000000
+ %mul1 = fmul contract float %in1, 0x3FC0527DC0000000
+
+ %sub0 = fsub contract float %mul0, %scaled_max
+ %sub1 = fsub contract float %mul1, %scaled_max
+
+ %exp0 = tail call float @llvm.amdgcn.exp2.f32(float %sub0)
+ %exp1 = tail call float @llvm.amdgcn.exp2.f32(float %sub1)
+
+ %vec0 = insertelement <2 x float> poison, float %exp0, i64 0
+ %vec1 = insertelement <2 x float> %vec0, float %exp1, i64 1
+
+ %vec_i32 = bitcast <2 x float> %vec1 to <2 x i32>
+
+ %scale0 = extractelement <2 x i32> %vec_i32, i64 0
+ %scale1 = extractelement <2 x i32> %vec_i32, i64 1
+
+ %wmma0 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(
+ i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> zeroinitializer,
+ i32 0, i32 0, i32 %scale0, i32 0, i32 0, i32 %scale_idx, i1 false, i1 false)
+
+ %wmma1 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(
+ i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %wmma0,
+ i32 0, i32 0, i32 %scale1, i32 0, i32 0, i32 %scale_idx, i1 false, i1 false)
+
+ store <8 x float> %wmma1, ptr addrspace(1) %output, align 32
+ ret void
+}
+
+define amdgpu_kernel void @test_amdgcn_exp_log(ptr addrspace(1) %input, ptr addrspace(1) %scales, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @test_amdgcn_exp_log(
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[INPUT]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[SCALES]], align 4
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x float> [[TMP0]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x float> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP4]])
+; GCN-NEXT: [[LOG0:%.*]] = tail call float @llvm.amdgcn.log.f32(float [[EXP0]])
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP5]])
+; GCN-NEXT: [[LOG1:%.*]] = tail call float @llvm.amdgcn.log.f32(float [[EXP1]])
+; GCN-NEXT: [[SUM:%.*]] = fadd fast float [[LOG0]], [[LOG1]]
+; GCN-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; GCN-NEXT: ret void
+;
+entry:
+ %in0 = load float, ptr addrspace(1) %input, align 4
+ %ptr1 = getelementptr float, ptr addrspace(1) %input, i64 1
+ %in1 = load float, ptr addrspace(1) %ptr1, align 4
+ %scale0 = load float, ptr addrspace(1) %scales, align 4
+ %sptr1 = getelementptr float, ptr addrspace(1) %scales, i64 1
+ %scale1 = load float, ptr addrspace(1) %sptr1, align 4
+ %mul0 = fmul contract float %in0, 0x3FC0527DC0000000
+ %mul1 = fmul contract float %in1, 0x3FC0527DC0000000
+ %sub0 = fsub contract float %mul0, %scale0
+ %sub1 = fsub contract float %mul1, %scale1
+ %exp0 = tail call float @llvm.amdgcn.exp2.f32(float %sub0)
+ %log0 = tail call float @llvm.amdgcn.log.f32(float %exp0)
+ %exp1 = tail call float @llvm.amdgcn.exp2.f32(float %sub1)
+ %log1 = tail call float @llvm.amdgcn.log.f32(float %exp1)
+ %sum = fadd fast float %log0, %log1
+ store float %sum, ptr addrspace(1) %output, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_amdgcn_exp_f16(ptr addrspace(1) %input, ptr addrspace(1) %scales, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @test_amdgcn_exp_f16(
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP4]])
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP5]])
+; GCN-NEXT: [[SUM:%.*]] = fadd fast half [[EXP0]], [[EXP1]]
+; GCN-NEXT: store half [[SUM]], ptr addrspace(1) [[OUTPUT]], align 2
+; GCN-NEXT: ret void
+;
+entry:
+ %in0 = load half, ptr addrspace(1) %input, align 2
+ %ptr1 = getelementptr half, ptr addrspace(1) %input, i64 1
+ %in1 = load half, ptr addrspace(1) %ptr1, align 2
+ %scale0 = load half, ptr addrspace(1) %scales, align 2
+ %sptr1 = getelementptr half, ptr addrspace(1) %scales, i64 1
+ %scale1 = load half, ptr addrspace(1) %sptr1, align 2
+ %mul0 = fmul contract half %in0, 0xH3E14
+ %mul1 = fmul contract half %in1, 0xH3E14
+ %sub0 = fsub contract half %mul0, %scale0
+ %sub1 = fsub contract half %mul1, %scale1
+ %exp0 = tail call half @llvm.amdgcn.exp2.f16(half %sub0)
+ %exp1 = tail call half @llvm.amdgcn.exp2.f16(half %sub1)
+ %sum = fadd fast half %exp0, %exp1
+ store half %sum, ptr addrspace(1) %output, align 2
+ ret void
+}
+
+define amdgpu_kernel void @kernel_f16(ptr addrspace(1) %input, ptr addrspace(1) %scales, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @kernel_f16(
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP4]])
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP5]])
+; GCN-NEXT: [[LOG0:%.*]] = tail call half @llvm.amdgcn.log.f16(half [[EXP0]])
+; GCN-NEXT: [[LOG1:%.*]] = tail call half @llvm.amdgcn.log.f16(half [[EXP1]])
+; GCN-NEXT: [[SUM:%.*]] = fadd fast half [[LOG0]], [[LOG1]]
+; GCN-NEXT: store half [[SUM]], ptr addrspace(1) [[OUTPUT]], align 2
+; GCN-NEXT: ret void
+;
+entry:
+ %in0 = load half, ptr addrspace(1) %input, align 2
+ %ptr1 = getelementptr half, ptr addrspace(1) %input, i64 1
+ %in1 = load half, ptr addrspace(1) %ptr1, align 2
+ %scale0 = load half, ptr addrspace(1) %scales, align 2
+ %sptr1 = getelementptr half, ptr addrspace(1) %scales, i64 1
+ %scale1 = load half, ptr addrspace(1) %sptr1, align 2
+ %mul0 = fmul contract half %in0, 0xH3E14
+ %mul1 = fmul contract half %in1, 0xH3E14
+ %sub0 = fsub contract half %mul0, %scale0
+ %sub1 = fsub contract half %mul1, %scale1
+ %exp0 = tail call half @llvm.amdgcn.exp2.f16(half %sub0)
+ %exp1 = tail call half @llvm.amdgcn.exp2.f16(half %sub1)
+ %log0 = tail call half @llvm.amdgcn.log.f16(half %exp0)
+ %log1 = tail call half @llvm.amdgcn.log.f16(half %exp1)
+ %sum = fadd fast half %log0, %log1
+ store half %sum, ptr addrspace(1) %output, align 2
+ ret void
+}
+
+define amdgpu_kernel void @look_through_reuse_shuffle(
+; GCN-LABEL: define amdgpu_kernel void @look_through_reuse_shuffle(
+; GCN-SAME: ptr addrspace(1) noalias [[INPUT:%.*]], ptr addrspace(1) noalias [[SCALES:%.*]], ptr addrspace(1) noalias [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT: [[TMP2:%.*]] = fadd contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP3:%.*]] = fmul contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP4]])
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP5]])
+; GCN-NEXT: [[TMP6:%.*]] = insertelement <4 x half> poison, half [[EXP0]], i32 0
+; GCN-NEXT: [[TMP7:%.*]] = insertelement <4 x half> [[TMP6]], half [[EXP1]], i32 1
+; GCN-NEXT: [[TMP8:%.*]] = shufflevector <4 x half> [[TMP7]], <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; GCN-NEXT: store <4 x half> [[TMP8]], ptr addrspace(1) [[OUTPUT]], align 2
+; GCN-NEXT: ret void
+;
+ ptr addrspace(1) noalias %input, ptr addrspace(1) noalias %scales,
+ ptr addrspace(1) noalias %output) {
+entry:
+ %iptr1 = getelementptr half, ptr addrspace(1) %input, i64 1
+ %sptr1 = getelementptr half, ptr addrspace(1) %scales, i64 1
+ %optr1 = getelementptr half, ptr addrspace(1) %output, i64 1
+ %optr2 = getelementptr half, ptr addrspace(1) %output, i64 2
+ %optr3 = getelementptr half, ptr addrspace(1) %output, i64 3
+
+ %in0 = load half, ptr addrspace(1) %input, align 2
+ %in1 = load half, ptr addrspace(1) %iptr1, align 2
+ %s0 = load half, ptr addrspace(1) %scales, align 2
+ %s1 = load half, ptr addrspace(1) %sptr1, align 2
+
+ %add0 = fadd contract half %in0, 0xH3E14
+ %add1 = fadd contract half %in1, 0xH3E14
+
+ %mul0 = fmul contract half %add0, %s0
+ %mul1 = fmul contract half %add1, %s1
+
+ %exp0 = tail call half @llvm.amdgcn.exp2.f16(half %mul0)
+ %exp1 = tail call half @llvm.amdgcn.exp2.f16(half %mul1)
+
+ store half %exp0, ptr addrspace(1) %output, align 2
+ store half %exp1, ptr addrspace(1) %optr1, align 2
+ store half %exp1, ptr addrspace(1) %optr2, align 2
+ store half %exp1, ptr addrspace(1) %optr3, align 2
+ ret void
+}
+
+define amdgpu_kernel void @wider_exp2_f32(ptr addrspace(1) %input, ptr addrspace(1) %scales, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @wider_exp2_f32(
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[PTR2:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 2
+; GCN-NEXT: [[SPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[SCALES]], i64 2
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[INPUT]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[SCALES]], align 4
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x float> [[TMP0]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x float> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR2]], align 4
+; GCN-NEXT: [[TMP5:%.*]] = load <2 x float>, ptr addrspace(1) [[SPTR2]], align 4
+; GCN-NEXT: [[TMP6:%.*]] = fmul contract <2 x float> [[TMP4]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT: [[TMP7:%.*]] = fsub contract <2 x float> [[TMP6]], [[TMP5]]
+; GCN-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP8]])
+; GCN-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP9]])
+; GCN-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP7]], i32 0
+; GCN-NEXT: [[EXP2:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP10]])
+; GCN-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP7]], i32 1
+; GCN-NEXT: [[EXP3:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP11]])
+; GCN-NEXT: [[SUM01:%.*]] = fadd fast float [[EXP0]], [[EXP1]]
+; GCN-NEXT: [[SUM23:%.*]] = fadd fast float [[EXP2]], [[EXP3]]
+; GCN-NEXT: [[SUM:%.*]] = fadd fast float [[SUM01]], [[SUM23]]
+; GCN-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; GCN-NEXT: ret void
+;
+entry:
+ %in0 = load float, ptr addrspace(1) %input, align 4
+ %ptr1 = getelementptr float, ptr addrspace(1) %input, i64 1
+ %in1 = load float, ptr addrspace(1) %ptr1, align 4
+ %ptr2 = getelementptr float, ptr addrspace(1) %input, i64 2
+ %in2 = load float, ptr addrspace(1) %ptr2, align 4
+ %ptr3 = getelementptr float, ptr addrspace(1) %input, i64 3
+ %in3 = load float, ptr addrspace(1) %ptr3, align 4
+
+ %scale0 = load float, ptr addrspace(1) %scales, align 4
+ %sptr1 = getelementptr float, ptr addrspace(1) %scales, i64 1
+ %scale1 = load float, ptr addrspace(1) %sptr1, align 4
+ %sptr2 = getelementptr float, ptr addrspace(1) %scales, i64 2
+ %scale2 = load float, ptr addrspace(1) %sptr2, align 4
+ %sptr3 = getelementptr float, ptr addrspace(1) %scales, i64 3
+ %scale3 = load float, ptr addrspace(1) %sptr3, align 4
+
+ %mul0 = fmul contract float %in0, 0x3FC0527DC0000000
+ %mul1 = fmul contract float %in1, 0x3FC0527DC0000000
+ %mul2 = fmul contract float %in2, 0x3FC0527DC0000000
+ %mul3 = fmul contract float %in3, 0x3FC0527DC0000000
+
+ %sub0 = fsub contract float %mul0, %scale0
+ %sub1 = fsub contract float %mul1, %scale1
+ %sub2 = fsub contract float %mul2, %scale2
+ %sub3 = fsub contract float %mul3, %scale3
+
+ %exp0 = tail call float @llvm.amdgcn.exp2.f32(float %sub0)
+ %exp1 = tail call float @llvm.amdgcn.exp2.f32(float %sub1)
+ %exp2 = tail call float @llvm.amdgcn.exp2.f32(float %sub2)
+ %exp3 = tail call float @llvm.amdgcn.exp2.f32(float %sub3)
+
+ %sum01 = fadd fast float %exp0, %exp1
+ %sum23 = fadd fast float %exp2, %exp3
+ %sum = fadd fast float %sum01, %sum23
+
+ store float %sum, ptr addrspace(1) %output, align 4
+ ret void
+}
+
+define amdgpu_kernel void @wider_exp2_half(ptr addrspace(1) %input, ptr addrspace(1) %scales, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @wider_exp2_half(
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[PTR2:%.*]] = getelementptr half, ptr addrspace(1) [[INPUT]], i64 2
+; GCN-NEXT: [[SPTR2:%.*]] = getelementptr half, ptr addrspace(1) [[SCALES]], i64 2
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR2]], align 2
+; GCN-NEXT: [[TMP5:%.*]] = load <2 x half>, ptr addrspace(1) [[SPTR2]], align 2
+; GCN-NEXT: [[TMP6:%.*]] = fmul contract <2 x half> [[TMP4]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP7:%.*]] = fsub contract <2 x half> [[TMP6]], [[TMP5]]
+; GCN-NEXT: [[TMP8:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP8]])
+; GCN-NEXT: [[TMP9:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP9]])
+; GCN-NEXT: [[TMP10:%.*]] = extractelement <2 x half> [[TMP7]], i32 0
+; GCN-NEXT: [[EXP2:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP10]])
+; GCN-NEXT: [[TMP11:%.*]] = extractelement <2 x half> [[TMP7]], i32 1
+; GCN-NEXT: [[EXP3:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP11]])
+; GCN-NEXT: [[SUM01:%.*]] = fadd fast half [[EXP0]], [[EXP1]]
+; GCN-NEXT: [[SUM23:%.*]] = fadd fast half [[EXP2]], [[EXP3]]
+; GCN-NEXT: [[SUM:%.*]] = fadd fast half [[SUM01]], [[SUM23]]
+; GCN-NEXT: store half [[SUM]], ptr addrspace(1) [[OUTPUT]], align 2
+; GCN-NEXT: ret void
+;
+entry:
+ ;; Same 4-wide pattern as wider_exp2_f32. Cost model gives half higher
+ ;; LookThrough cost, so we get 2-wide vectorization (or none on stricter targets).
+ %in0 = load half, ptr addrspace(1) %input, align 2
+ %ptr1 = getelementptr half, ptr addrspace(1) %input, i64 1
+ %in1 = load half, ptr addrspace(1) %ptr1, align 2
+ %ptr2 = getelementptr half, ptr addrspace(1) %input, i64 2
+ %in2 = load half, ptr addrspace(1) %ptr2, align 2
+ %ptr3 = getelementptr half, ptr addrspace(1) %input, i64 3
+ %in3 = load half, ptr addrspace(1) %ptr3, align 2
+
+ %scale0 = load half, ptr addrspace(1) %scales, align 2
+ %sptr1 = getelementptr half, ptr addrspace(1) %scales, i64 1
+ %scale1 = load half, ptr addrspace(1) %sptr1, align 2
+ %sptr2 = getelementptr half, ptr addrspace(1) %scales, i64 2
+ %scale2 = load half, ptr addrspace(1) %sptr2, align 2
+ %sptr3 = getelementptr half, ptr addrspace(1) %scales, i64 3
+ %scale3 = load half, ptr addrspace(1) %sptr3, align 2
+
+ %mul0 = fmul contract half %in0, 0xH3E14
+ %mul1 = fmul contract half %in1, 0xH3E14
+ %mul2 = fmul contract half %in2, 0xH3E14
+ %mul3 = fmul contract half %in3, 0xH3E14
+
+ %sub0 = fsub contract half %mul0, %scale0
+ %sub1 = fsub contract half %mul1, %scale1
+ %sub2 = fsub contract half %mul2, %scale2
+ %sub3 = fsub contract half %mul3, %scale3
+
+ %exp0 = tail call half @llvm.amdgcn.exp2.f16(half %sub0)
+ %exp1 = tail call half @llvm.amdgcn.exp2.f16(half %sub1)
+ %exp2 = tail call half @llvm.amdgcn.exp2.f16(half %sub2)
+ %exp3 = tail call half @llvm.amdgcn.exp2.f16(half %sub3)
+
+ %sum01 = fadd fast half %exp0, %exp1
+ %sum23 = fadd fast half %exp2, %exp3
+ %sum = fadd fast half %sum01, %sum23
+
+ store half %sum, ptr addrspace(1) %output, align 2
+ ret void
+}
+
+declare half @llvm.amdgcn.exp2.f16(half)
+declare float @llvm.amdgcn.exp2.f32(float)
+declare <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 immarg, <16 x i32>, i32 immarg, <16 x i32>, i16 immarg, <8 x float>, i32 immarg, i32 immarg, i32, i32 immarg, i32 immarg, i32, i1 immarg, i1 immarg)
>From 7c6b48e692269b6e3006957511117c13d7b054c9 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Wed, 1 Apr 2026 17:50:49 -0500
Subject: [PATCH 2/6] Fixed comment!
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9301a938160aa..3cb7e8e35779b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -258,7 +258,7 @@ static Value *getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
// Skip trivially vectorizable intrinsics.
if (isTriviallyVectorizable(ID))
return nullptr;
- // Only look through unary intrinsic calls.
+ // Only consider unary intrinsic calls.
if (CI->arg_size() != 1)
return nullptr;
// Check if it is speculatable, no memory access and will return
>From 54f51671b21e84100ef4a3f9356920e9a4ad358a Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Fri, 3 Apr 2026 01:16:28 -0500
Subject: [PATCH 3/6] Fixed checks and enabled for all operands
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 62 ++++++++++---------
.../Transforms/SLPVectorizer/RISCV/revec.ll | 7 +--
2 files changed, 37 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3cb7e8e35779b..514acaa49fffd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -246,35 +246,32 @@ static const unsigned MaxPHINumOperands = 128;
/// For instructions that are not trivially vectorizable, try to vectorize their
/// operands.
/// FIXME: Extend for all non-vectorized functions.
-static Value *getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
+SmallVector<Value *, 4>
+getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
+
+ SmallVector<Value *, 4> Operands;
auto *CI = dyn_cast<CallInst>(V);
- if (!CI)
- return nullptr;
+
+ if (!CI || isAssumeLikeIntrinsic(CI))
+ return {};
Intrinsic::ID ID = CI->getIntrinsicID();
// Only consider intrinsic calls.
// FIXME: We may want to relax this condition in future.
- if (ID == Intrinsic::not_intrinsic)
- return nullptr;
- // Skip trivially vectorizable intrinsics.
- if (isTriviallyVectorizable(ID))
- return nullptr;
- // Only consider unary intrinsic calls.
- if (CI->arg_size() != 1)
- return nullptr;
- // Check if it is speculatable, no memory access and will return
- if (!CI->hasFnAttr(Attribute::Speculatable) || !CI->doesNotAccessMemory() ||
- !CI->willReturn())
- return nullptr;
- auto *Operand = dyn_cast<Instruction>(CI->getArgOperand(0));
- if (!Operand)
- return nullptr;
- // Operand type should match the result type we ignore type changing
- // intrinsics.
- if (Operand->getType() != CI->getType())
- return nullptr;
+ if (ID == Intrinsic::not_intrinsic || isTriviallyVectorizable(ID))
+ return {};
- return Operand;
-}
+ // Skip memory intrinsics (e.g., masked.load, masked.gather etc.)
+ if (CI->mayReadOrWriteMemory())
+ return {};
+
+ for (Value *ArgOp : CI->args()) {
+ if (auto *I = dyn_cast<Instruction>(ArgOp)) {
+ Operands.emplace_back(I);
+ }
+ }
+
+ return Operands;
+}
/// Predicate for the element types that the SLP vectorizer supports.
///
@@ -29510,7 +29507,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
PostProcessCmps.insert(cast<CmpInst>(&*It));
}
- DenseMap<Intrinsic::ID, SmallSetVector<Value *, 4>> IntrinsicSeedOps;
+ SmallMapVector<Intrinsic::ID, SmallSetVector<Value *, 4>, 4> IntrinsicSeedOps;
for (Instruction &I : *BB) {
if (R.isDeleted(&I))
continue;
@@ -29518,12 +29515,21 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// llvm.amdgcn.exp2) and group by intrinsic ID, so their operands can be
// vectorized independently.
// FIXME: Extend for all non-vectorized functions.
- if (Value *Op = getNonTriviallyVectorizableIntrinsicCallOperand(&I))
- IntrinsicSeedOps[cast<CallInst>(&I)->getIntrinsicID()].insert(Op);
+ SmallVector<Value *, 4> Ops =
+ getNonTriviallyVectorizableIntrinsicCallOperand(&I);
+ if (!Ops.empty())
+ IntrinsicSeedOps[cast<CallInst>(&I)->getIntrinsicID()].insert_range(Ops);
}
// Try to vectorize per intrinsic call ID.
for (auto &[ID, Ops] : IntrinsicSeedOps) {
- Changed |= tryToVectorizeList(Ops.getArrayRef(), R);
+ // Sub-group by opcode so we do not get bailed early
+ SmallMapVector<unsigned, SmallVector<Value *, 4>, 4> OpcodeGroups;
+ for (Value *Op : Ops) {
+ if (auto *I = dyn_cast<Instruction>(Op))
+ OpcodeGroups[I->getOpcode()].push_back(Op);
+ }
+ for (auto &[Opc, Group] : OpcodeGroups)
+ Changed |= tryToVectorizeList(Group, R);
}
return Changed;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index e13dfce8c29f3..016726e5ae371 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -177,11 +177,10 @@ define ptr @test4() {
; NONPOWEROF2-NEXT: [[TMP8:%.*]] = phi <6 x float> [ poison, [[TMP6:%.*]] ], [ [[TMP5]], [[TMP0:%.*]] ]
; NONPOWEROF2-NEXT: br label [[TMP9:%.*]]
; NONPOWEROF2: 10:
-; NONPOWEROF2-NEXT: [[TMP10:%.*]] = shufflevector <6 x float> [[TMP8]], <6 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; NONPOWEROF2-NEXT: [[TMP11:%.*]] = fmul <3 x float> zeroinitializer, [[TMP10]]
-; NONPOWEROF2-NEXT: [[TMP12:%.*]] = shufflevector <6 x float> [[TMP8]], <6 x float> poison, <3 x i32> <i32 3, i32 4, i32 5>
-; NONPOWEROF2-NEXT: [[TMP13:%.*]] = fmul <3 x float> zeroinitializer, [[TMP12]]
+; NONPOWEROF2-NEXT: [[TMP12:%.*]] = fmul <6 x float> zeroinitializer, [[TMP8]]
+; NONPOWEROF2-NEXT: [[TMP11:%.*]] = shufflevector <6 x float> [[TMP12]], <6 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
; NONPOWEROF2-NEXT: [[TMP14:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP11]])
+; NONPOWEROF2-NEXT: [[TMP13:%.*]] = shufflevector <6 x float> [[TMP12]], <6 x float> poison, <3 x i32> <i32 3, i32 4, i32 5>
; NONPOWEROF2-NEXT: [[TMP15:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP13]])
; NONPOWEROF2-NEXT: [[TMP16:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP14]])
; NONPOWEROF2-NEXT: [[TMP17:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP15]])
>From 5e656dc3b1977cea7719411328cabe68f83dffd5 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Fri, 3 Apr 2026 01:32:01 -0500
Subject: [PATCH 4/6] Fixed format
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 19 +++++++++----------
1 file changed, 9 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 514acaa49fffd..36e94de1cc18e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -271,7 +271,7 @@ getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
}
return Operands;
-}
+}
/// Predicate for the element types that the SLP vectorizer supports.
///
@@ -12276,8 +12276,9 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
}
}
if (NewLoopNest.size() > CurrentLoopNest.size())
- CurrentLoopNest.append(std::next(NewLoopNest.begin(), CurrentLoopNest.size()),
- NewLoopNest.end());
+ CurrentLoopNest.append(
+ std::next(NewLoopNest.begin(), CurrentLoopNest.size()),
+ NewLoopNest.end());
}
}
}
@@ -18475,10 +18476,9 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
assert(SLPReVec && "Only supported by REVEC.");
SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
}
- InstructionCost CastCost =
- TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
- TTI::CastContextHint::None,
- TTI::TCK_RecipThroughput);
+ InstructionCost CastCost = TTI->getCastInstrCost(
+ Opcode, DstTy, SrcTy, TTI::CastContextHint::None,
+ TTI::TCK_RecipThroughput);
CastCost = ScaleCost(CastCost, Root, /*Scalar=*/nullptr, ReductionRoot);
Cost += CastCost;
}
@@ -18647,9 +18647,8 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
default:
break;
}
- InstructionCost CastCost =
- TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
- TTI::TCK_RecipThroughput);
+ InstructionCost CastCost = TTI->getCastInstrCost(
+ Opcode, DstVecTy, SrcVecTy, CCH, TTI::TCK_RecipThroughput);
CastCost = ScaleCost(CastCost, *VectorizableTree.front().get(),
/*Scalar=*/nullptr, ReductionRoot);
Cost += CastCost;
>From 25404e5d9db380e46d70d8831f51c65f2c94a721 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Fri, 3 Apr 2026 15:19:24 -0500
Subject: [PATCH 5/6] Addressed reviews
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 50 +++++++++----------
1 file changed, 24 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 36e94de1cc18e..05dcc1d134f0f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -261,7 +261,7 @@ getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
return {};
// Skip memory intrinsics (e.g., masked.load, masked.gather etc.)
- if (CI->mayReadOrWriteMemory())
+ if (!SLPReVec && CI->getType()->isVectorTy())
return {};
for (Value *ArgOp : CI->args()) {
@@ -12276,9 +12276,8 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
}
}
if (NewLoopNest.size() > CurrentLoopNest.size())
- CurrentLoopNest.append(
- std::next(NewLoopNest.begin(), CurrentLoopNest.size()),
- NewLoopNest.end());
+ CurrentLoopNest.append(std::next(NewLoopNest.begin(), CurrentLoopNest.size()),
+ NewLoopNest.end());
}
}
}
@@ -18476,9 +18475,10 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
assert(SLPReVec && "Only supported by REVEC.");
SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
}
- InstructionCost CastCost = TTI->getCastInstrCost(
- Opcode, DstTy, SrcTy, TTI::CastContextHint::None,
- TTI::TCK_RecipThroughput);
+ InstructionCost CastCost =
+ TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
+ TTI::CastContextHint::None,
+ TTI::TCK_RecipThroughput);
CastCost = ScaleCost(CastCost, Root, /*Scalar=*/nullptr, ReductionRoot);
Cost += CastCost;
}
@@ -18647,8 +18647,9 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
default:
break;
}
- InstructionCost CastCost = TTI->getCastInstrCost(
- Opcode, DstVecTy, SrcVecTy, CCH, TTI::TCK_RecipThroughput);
+ InstructionCost CastCost =
+ TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
+ TTI::TCK_RecipThroughput);
CastCost = ScaleCost(CastCost, *VectorizableTree.front().get(),
/*Scalar=*/nullptr, ReductionRoot);
Cost += CastCost;
@@ -29506,30 +29507,27 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
PostProcessCmps.insert(cast<CmpInst>(&*It));
}
- SmallMapVector<Intrinsic::ID, SmallSetVector<Value *, 4>, 4> IntrinsicSeedOps;
+ // Collect operands of non-trivially vectorizable intrinsic calls (e.g.,
+ // llvm.amdgcn.exp2) and group by intrinsic ID, so their operands can be
+ // vectorized independently.
+ // FIXME: Extend for all non-vectorized functions.
+ SmallMapVector<std::pair<Intrinsic::ID, unsigned>, SmallVector<Value *, 4>, 4>
+ OpcodeGroups;
+
for (Instruction &I : *BB) {
if (R.isDeleted(&I))
continue;
- // Collect operands of non-trivially vectorizable intrinsic calls (e.g.,
- // llvm.amdgcn.exp2) and group by intrinsic ID, so their operands can be
- // vectorized independently.
- // FIXME: Extend for all non-vectorized functions.
SmallVector<Value *, 4> Ops =
getNonTriviallyVectorizableIntrinsicCallOperand(&I);
- if (!Ops.empty())
- IntrinsicSeedOps[cast<CallInst>(&I)->getIntrinsicID()].insert_range(Ops);
- }
- // Try to vectorize per intrinsic call ID.
- for (auto &[ID, Ops] : IntrinsicSeedOps) {
- // Sub-group by opcode so we do not get bailed early
- SmallMapVector<unsigned, SmallVector<Value *, 4>, 4> OpcodeGroups;
- for (Value *Op : Ops) {
- if (auto *I = dyn_cast<Instruction>(Op))
- OpcodeGroups[I->getOpcode()].push_back(Op);
+ if (!Ops.empty()) {
+ Intrinsic::ID ID = cast<CallInst>(&I)->getIntrinsicID();
+ for (Value *Op : Ops)
+ if (auto *OpI = dyn_cast<Instruction>(Op))
+ OpcodeGroups[{ID, OpI->getOpcode()}].push_back(Op);
}
- for (auto &[Opc, Group] : OpcodeGroups)
- Changed |= tryToVectorizeList(Group, R);
}
+ for (auto &[_, OpGroup] : OpcodeGroups)
+ Changed |= tryToVectorizeList(OpGroup, R);
return Changed;
}
>From daca0bb67c57abfdb91ed34143421e843778dd63 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Sun, 5 Apr 2026 20:53:37 -0500
Subject: [PATCH 6/6] Addressed reviewers' comments, added new tests.
---
.../llvm/Transforms/Vectorize/SLPVectorizer.h | 5 +
.../Transforms/Vectorize/SLPVectorizer.cpp | 55 ++++--
...otriviallyvectorizableintrinsicoperands.ll | 184 ++++++++++++++++++
3 files changed, 226 insertions(+), 18 deletions(-)
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 877c83291170b..7c35dd70e2a77 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -152,6 +152,11 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
/// a vectorization chain.
bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
+ /// Tries to vectorize the operands of non-trivially-vectorizable intrinsic
+ /// calls in the block.
+ bool vectorizeIntrinsicSeedsInBlock(BasicBlock *BB,
+ slpvectorizer::BoUpSLP &R);
+
std::optional<bool> vectorizeStoreChain(ArrayRef<Value *> Chain,
slpvectorizer::BoUpSLP &R,
unsigned Idx, unsigned MinVF,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 05dcc1d134f0f..db28c2ba18fe2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -249,22 +249,20 @@ static const unsigned MaxPHINumOperands = 128;
SmallVector<Value *, 4>
getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
- SmallVector<Value *, 4> Operands;
- auto *CI = dyn_cast<CallInst>(V);
-
- if (!CI || isAssumeLikeIntrinsic(CI))
+ auto *II = dyn_cast<IntrinsicInst>(V);
+ if (!II || isAssumeLikeIntrinsic(II))
return {};
- Intrinsic::ID ID = CI->getIntrinsicID();
- // Only consider intrinsic calls.
- // FIXME: We may want to relax this condition in future.
- if (ID == Intrinsic::not_intrinsic || isTriviallyVectorizable(ID))
+
+ if (isTriviallyVectorizable(II->getIntrinsicID()))
return {};
// Skip memory intrinsics (e.g., masked.load, masked.gather etc.)
- if (!SLPReVec && CI->getType()->isVectorTy())
+ if (!SLPReVec && II->getType()->isVectorTy())
return {};
- for (Value *ArgOp : CI->args()) {
+ // FIXME: Add non-instructions operands to the list.
+ SmallVector<Value *, 4> Operands;
+ for (Value *ArgOp : II->args()) {
if (auto *I = dyn_cast<Instruction>(ArgOp)) {
Operands.emplace_back(I);
}
@@ -25298,6 +25296,9 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// Vectorize trees that end at reductions.
Changed |= vectorizeChainsInBlock(BB, R);
+ // Vectorize operands of non-trivially-vectorizable intrinsic calls.
+ Changed |= vectorizeIntrinsicSeedsInBlock(BB, R);
+
// Vectorize the index computations of getelementptr instructions. This
// is primarily intended to catch gather-like idioms ending at
// non-consecutive loads.
@@ -29507,13 +29508,21 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
PostProcessCmps.insert(cast<CmpInst>(&*It));
}
+ return Changed;
+}
+
+bool SLPVectorizerPass::vectorizeIntrinsicSeedsInBlock(BasicBlock *BB,
+ BoUpSLP &R) {
+ bool Changed = false;
// Collect operands of non-trivially vectorizable intrinsic calls (e.g.,
// llvm.amdgcn.exp2) and group by intrinsic ID, so their operands can be
// vectorized independently.
// FIXME: Extend for all non-vectorized functions.
- SmallMapVector<std::pair<Intrinsic::ID, unsigned>, SmallVector<Value *, 4>, 4>
- OpcodeGroups;
-
+ SmallMapVector<std::pair<Intrinsic::ID, unsigned>, // (ID, OpIndex)
+ SmallMapVector<unsigned, // Opcode
+ SmallVector<Value *, 4>, 4>,
+ 4>
+ IntrinsicSeedOps;
for (Instruction &I : *BB) {
if (R.isDeleted(&I))
continue;
@@ -29521,14 +29530,24 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
getNonTriviallyVectorizableIntrinsicCallOperand(&I);
if (!Ops.empty()) {
Intrinsic::ID ID = cast<CallInst>(&I)->getIntrinsicID();
- for (Value *Op : Ops)
- if (auto *OpI = dyn_cast<Instruction>(Op))
- OpcodeGroups[{ID, OpI->getOpcode()}].push_back(Op);
+ for (auto [OpIdx, Op] : enumerate(Ops)) {
+ if (auto *OpI = dyn_cast<Instruction>(Op)) {
+ IntrinsicSeedOps[{ID, OpIdx}][OpI->getOpcode()].push_back(Op);
+ }
+ }
}
}
- for (auto &[_, OpGroup] : OpcodeGroups)
- Changed |= tryToVectorizeList(OpGroup, R);
+ for (auto &[_, OpcodeMap] : IntrinsicSeedOps)
+ for (auto &[_, Group] : OpcodeMap) {
+ // Don't include instructions that were deleted by previous
+ // vectorization.
+ auto Candidates = make_filter_range(Group, [&](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return I && !R.isDeleted(I);
+ });
+ Changed |= tryToVectorizeList(SmallVector<Value *, 4>(Candidates), R);
+ }
return Changed;
}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
index 0ca33e0e6b09f..963b7808335ac 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
@@ -355,6 +355,190 @@ entry:
ret void
}
+define amdgpu_kernel void @kernel_div_scale(ptr addrspace(1) %num, ptr addrspace(1) %den, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @kernel_div_scale(
+; GCN-SAME: ptr addrspace(1) [[NUM:%.*]], ptr addrspace(1) [[DEN:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[NPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[NUM]], i64 2
+; GCN-NEXT: [[N2:%.*]] = load float, ptr addrspace(1) [[NPTR2]], align 4
+; GCN-NEXT: [[DPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[DEN]], i64 2
+; GCN-NEXT: [[D2:%.*]] = load float, ptr addrspace(1) [[DPTR2]], align 4
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[NUM]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], splat (float 2.000000e+00)
+; GCN-NEXT: [[MUL_N2:%.*]] = fmul float [[N2]], 2.000000e+00
+; GCN-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(1) [[DEN]], align 4
+; GCN-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], splat (float 4.000000e+00)
+; GCN-NEXT: [[MUL_D2:%.*]] = fmul float [[D2]], 4.000000e+00
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; GCN-NEXT: [[DS0:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[TMP4]], float [[TMP5]], i1 false)
+; GCN-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; GCN-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; GCN-NEXT: [[DS1:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[TMP6]], float [[TMP7]], i1 false)
+; GCN-NEXT: [[DS2:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[MUL_N2]], float [[MUL_D2]], i1 false)
+; GCN-NEXT: [[R0:%.*]] = extractvalue { float, i1 } [[DS0]], 0
+; GCN-NEXT: [[R1:%.*]] = extractvalue { float, i1 } [[DS1]], 0
+; GCN-NEXT: [[R2:%.*]] = extractvalue { float, i1 } [[DS2]], 0
+; GCN-NEXT: [[SUM01:%.*]] = fadd float [[R0]], [[R1]]
+; GCN-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[R2]]
+; GCN-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; GCN-NEXT: ret void
+;
+entry:
+ %n0 = load float, ptr addrspace(1) %num, align 4
+ %nptr1 = getelementptr float, ptr addrspace(1) %num, i64 1
+ %n1 = load float, ptr addrspace(1) %nptr1, align 4
+ %nptr2 = getelementptr float, ptr addrspace(1) %num, i64 2
+ %n2 = load float, ptr addrspace(1) %nptr2, align 4
+ %d0 = load float, ptr addrspace(1) %den, align 4
+ %dptr1 = getelementptr float, ptr addrspace(1) %den, i64 1
+ %d1 = load float, ptr addrspace(1) %dptr1, align 4
+ %dptr2 = getelementptr float, ptr addrspace(1) %den, i64 2
+ %d2 = load float, ptr addrspace(1) %dptr2, align 4
+ %mul_n0 = fmul float %n0, 2.0
+ %mul_n1 = fmul float %n1, 2.0
+ %mul_n2 = fmul float %n2, 2.0
+ %mul_d0 = fmul float %d0, 4.0
+ %mul_d1 = fmul float %d1, 4.0
+ %mul_d2 = fmul float %d2, 4.0
+ %ds0 = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %mul_n0, float %mul_d0, i1 false)
+ %ds1 = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %mul_n1, float %mul_d1, i1 false)
+ %ds2 = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %mul_n2, float %mul_d2, i1 false)
+ %r0 = extractvalue { float, i1 } %ds0, 0
+ %r1 = extractvalue { float, i1 } %ds1, 0
+ %r2 = extractvalue { float, i1 } %ds2, 0
+ %sum01 = fadd float %r0, %r1
+ %sum = fadd float %sum01, %r2
+ store float %sum, ptr addrspace(1) %output, align 4
+ ret void
+}
+
+define amdgpu_kernel void @kernel_fmed3(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @kernel_fmed3(
+; GCN-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[APTR2:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 2
+; GCN-NEXT: [[A2:%.*]] = load float, ptr addrspace(1) [[APTR2]], align 4
+; GCN-NEXT: [[BPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 2
+; GCN-NEXT: [[B2:%.*]] = load float, ptr addrspace(1) [[BPTR2]], align 4
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[A]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[B]], align 4
+; GCN-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[TMP0]], [[TMP1]]
+; GCN-NEXT: [[ADD2:%.*]] = fadd float [[A2]], [[B2]]
+; GCN-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; GCN-NEXT: [[MED0:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP3]], float [[TMP3]], float 1.000000e+00)
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; GCN-NEXT: [[MED1:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP4]], float [[TMP4]], float 1.000000e+00)
+; GCN-NEXT: [[MED2:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD2]], float [[ADD2]], float 1.000000e+00)
+; GCN-NEXT: [[SUM01:%.*]] = fadd float [[MED0]], [[MED1]]
+; GCN-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[MED2]]
+; GCN-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; GCN-NEXT: ret void
+;
+entry:
+ %a0 = load float, ptr addrspace(1) %a, align 4
+ %aptr1 = getelementptr float, ptr addrspace(1) %a, i64 1
+ %a1 = load float, ptr addrspace(1) %aptr1, align 4
+ %aptr2 = getelementptr float, ptr addrspace(1) %a, i64 2
+ %a2 = load float, ptr addrspace(1) %aptr2, align 4
+
+ %b0 = load float, ptr addrspace(1) %b, align 4
+ %bptr1 = getelementptr float, ptr addrspace(1) %b, i64 1
+ %b1 = load float, ptr addrspace(1) %bptr1, align 4
+ %bptr2 = getelementptr float, ptr addrspace(1) %b, i64 2
+ %b2 = load float, ptr addrspace(1) %bptr2, align 4
+
+ %add0 = fadd float %a0, %b0
+ %add1 = fadd float %a1, %b1
+ %add2 = fadd float %a2, %b2
+
+ %med0 = call float @llvm.amdgcn.fmed3.f32(float %add0, float %add0, float 1.0)
+ %med1 = call float @llvm.amdgcn.fmed3.f32(float %add1, float %add1, float 1.0)
+ %med2 = call float @llvm.amdgcn.fmed3.f32(float %add2, float %add2, float 1.0)
+
+ %sum01 = fadd float %med0, %med1
+ %sum = fadd float %sum01, %med2
+ store float %sum, ptr addrspace(1) %output, align 4
+ ret void
+}
+
+define amdgpu_kernel void @kernel_fmed3_1(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %output) {
+; GCN-LABEL: define amdgpu_kernel void @kernel_fmed3_1(
+; GCN-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[ENTRY:.*:]]
+; GCN-NEXT: [[A0:%.*]] = load float, ptr addrspace(1) [[A]], align 4
+; GCN-NEXT: [[APTR1:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 1
+; GCN-NEXT: [[A1:%.*]] = load float, ptr addrspace(1) [[APTR1]], align 4
+; GCN-NEXT: [[APTR2:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 2
+; GCN-NEXT: [[A2:%.*]] = load float, ptr addrspace(1) [[APTR2]], align 4
+; GCN-NEXT: [[APTR3:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 3
+; GCN-NEXT: [[A3:%.*]] = load float, ptr addrspace(1) [[APTR3]], align 4
+; GCN-NEXT: [[BPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 2
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[B]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = fadd <2 x float> splat (float 5.000000e+00), [[TMP0]]
+; GCN-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(1) [[BPTR2]], align 4
+; GCN-NEXT: [[TMP3:%.*]] = fadd <2 x float> splat (float 5.000000e+00), [[TMP2]]
+; GCN-NEXT: [[TMP4:%.*]] = fadd <2 x float> splat (float 1.000000e+00), [[TMP0]]
+; GCN-NEXT: [[TMP5:%.*]] = fadd <2 x float> splat (float 1.000000e+00), [[TMP2]]
+; GCN-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; GCN-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; GCN-NEXT: [[MED0:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP6]], float [[TMP7]], float 1.000000e+00)
+; GCN-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; GCN-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; GCN-NEXT: [[MED1:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP8]], float [[TMP9]], float 1.000000e+00)
+; GCN-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; GCN-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
+; GCN-NEXT: [[MED2:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP10]], float [[TMP11]], float 1.000000e+00)
+; GCN-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; GCN-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
+; GCN-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP12]], float [[TMP13]], float 1.000000e+00)
+; GCN-NEXT: [[SUM01:%.*]] = fadd float [[MED0]], [[MED1]]
+; GCN-NEXT: [[SUM02:%.*]] = fadd float [[MED2]], [[MED3]]
+; GCN-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[SUM02]]
+; GCN-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; GCN-NEXT: ret void
+;
+entry:
+ %a0 = load float, ptr addrspace(1) %a, align 4
+ %aptr1 = getelementptr float, ptr addrspace(1) %a, i64 1
+ %a1 = load float, ptr addrspace(1) %aptr1, align 4
+ %aptr2 = getelementptr float, ptr addrspace(1) %a, i64 2
+ %a2 = load float, ptr addrspace(1) %aptr2, align 4
+ %aptr3 = getelementptr float, ptr addrspace(1) %a, i64 3
+ %a3 = load float, ptr addrspace(1) %aptr3, align 4
+
+ %b0 = load float, ptr addrspace(1) %b, align 4
+ %bptr1 = getelementptr float, ptr addrspace(1) %b, i64 1
+ %b1 = load float, ptr addrspace(1) %bptr1, align 4
+ %bptr2 = getelementptr float, ptr addrspace(1) %b, i64 2
+ %b2 = load float, ptr addrspace(1) %bptr2, align 4
+ %bptr3 = getelementptr float, ptr addrspace(1) %b, i64 3
+ %b3 = load float, ptr addrspace(1) %bptr3, align 4
+
+ %add0 = fadd float 5.0, %b0
+ %add1 = fadd float 5.0, %b1
+ %add2 = fadd float 5.0, %b2
+ %add3 = fadd float 5.0, %b3
+
+ %sub0 = fadd float 1.0, %b0
+ %sub1 = fadd float 1.0, %b1
+ %sub2 = fadd float 1.0, %b2
+ %sub3 = fadd float 1.0, %b3
+
+ %med0 = call float @llvm.amdgcn.fmed3.f32(float %add0, float %sub0, float 1.0)
+ %med1 = call float @llvm.amdgcn.fmed3.f32(float %add1, float %sub1, float 1.0)
+ %med2 = call float @llvm.amdgcn.fmed3.f32(float %add2, float %sub2, float 1.0)
+ %med3 = call float @llvm.amdgcn.fmed3.f32(float %add3, float %sub3, float 1.0)
+
+ %sum01 = fadd float %med0, %med1
+ %sum02 = fadd float %med2, %med3
+ %sum = fadd float %sum01, %sum02
+ store float %sum, ptr addrspace(1) %output, align 4
+ ret void
+}
+
+declare float @llvm.amdgcn.fmed3.f32(float, float, float)
+declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1)
declare half @llvm.amdgcn.exp2.f16(half)
declare float @llvm.amdgcn.exp2.f32(float)
declare <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 immarg, <16 x i32>, i32 immarg, <16 x i32>, i16 immarg, <8 x float>, i32 immarg, i32 immarg, i32, i32 immarg, i32 immarg, i32, i1 immarg, i1 immarg)
More information about the llvm-commits
mailing list