[llvm] 95c64b7 - AMDGPU: Reduce readfirstlane for single demanded vector element (#128647)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 4 17:36:00 PST 2025
Author: Matt Arsenault
Date: 2025-03-05T08:35:56+07:00
New Revision: 95c64b7ee6158a8a4e90638af383ab8826b03a14
URL: https://github.com/llvm/llvm-project/commit/95c64b7ee6158a8a4e90638af383ab8826b03a14
DIFF: https://github.com/llvm/llvm-project/commit/95c64b7ee6158a8a4e90638af383ab8826b03a14.diff
LOG: AMDGPU: Reduce readfirstlane for single demanded vector element (#128647)
If we are only extracting a single element, rewrite the intrinsic call
to use the element type. We should extend this to arbitrary extract
shuffles.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
llvm/test/Transforms/InstCombine/AMDGPU/simplify-demanded-vector-elts-lane-intrinsics.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index d69cfbbe4088e..70ccd7edce2ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1563,6 +1563,49 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
return NewCall;
}
+Value *GCNTTIImpl::simplifyAMDGCNLaneIntrinsicDemanded(
+ InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts,
+ APInt &UndefElts) const {
+ auto *VT = dyn_cast<FixedVectorType>(II.getType());
+ if (!VT)
+ return nullptr;
+
+ const unsigned FirstElt = DemandedElts.countr_zero();
+ const unsigned LastElt = DemandedElts.getActiveBits() - 1;
+ const unsigned MaskLen = LastElt - FirstElt + 1;
+
+ // TODO: Handle general subvector extract.
+ if (MaskLen != 1)
+ return nullptr;
+
+ Type *EltTy = VT->getElementType();
+ if (!isTypeLegal(EltTy))
+ return nullptr;
+
+ Value *Src = II.getArgOperand(0);
+
+ assert(FirstElt == LastElt);
+ Value *Extract = IC.Builder.CreateExtractElement(Src, FirstElt);
+
+ // Make sure convergence tokens are preserved.
+ // TODO: CreateIntrinsic should allow directly copying bundles
+ SmallVector<OperandBundleDef, 2> OpBundles;
+ II.getOperandBundlesAsDefs(OpBundles);
+
+ Module *M = IC.Builder.GetInsertBlock()->getModule();
+ Function *Remangled = Intrinsic::getOrInsertDeclaration(
+ M, II.getIntrinsicID(), {Extract->getType()});
+
+ // TODO: Preserve callsite attributes?
+ CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
+
+ Value *Result = IC.Builder.CreateInsertElement(PoisonValue::get(II.getType()),
+ NewCall, FirstElt);
+ IC.replaceInstUsesWith(II, Result);
+ IC.eraseInstFromFunction(II);
+ return Result;
+}
+
std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
APInt &UndefElts2, APInt &UndefElts3,
@@ -1570,9 +1613,8 @@ std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
SimplifyAndSetOp) const {
switch (II.getIntrinsicID()) {
case Intrinsic::amdgcn_readfirstlane:
- // TODO: For a vector extract, should reduce the intrinsic call type.
SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
- return std::nullopt;
+ return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index a0d62008d9ddc..f5062070ac6f4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -226,6 +226,12 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) const;
+
+ Value *simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC,
+ IntrinsicInst &II,
+ const APInt &DemandedElts,
+ APInt &UndefElts) const;
+
std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
APInt &UndefElts2, APInt &UndefElts3,
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/simplify-demanded-vector-elts-lane-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-demanded-vector-elts-lane-intrinsics.ll
index 836c739048411..e9d3b5e963b35 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/simplify-demanded-vector-elts-lane-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-demanded-vector-elts-lane-intrinsics.ll
@@ -4,8 +4,8 @@
define i16 @extract_elt0_v2i16_readfirstlane(<2 x i16> %src) {
; CHECK-LABEL: define i16 @extract_elt0_v2i16_readfirstlane(
; CHECK-SAME: <2 x i16> [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> [[SRC]])
-; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i16> [[VEC]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i16> [[SRC]], i64 0
+; CHECK-NEXT: [[ELT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[TMP1]])
; CHECK-NEXT: ret i16 [[ELT]]
;
%vec = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> %src)
@@ -16,8 +16,8 @@ define i16 @extract_elt0_v2i16_readfirstlane(<2 x i16> %src) {
define i16 @extract_elt0_v1i16_readfirstlane(<1 x i16> %src) {
; CHECK-LABEL: define i16 @extract_elt0_v1i16_readfirstlane(
; CHECK-SAME: <1 x i16> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <1 x i16> @llvm.amdgcn.readfirstlane.v1i16(<1 x i16> [[SRC]])
-; CHECK-NEXT: [[ELT:%.*]] = extractelement <1 x i16> [[VEC]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i16> [[SRC]], i64 0
+; CHECK-NEXT: [[ELT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[TMP1]])
; CHECK-NEXT: ret i16 [[ELT]]
;
%vec = call <1 x i16> @llvm.amdgcn.readfirstlane.v1i16(<1 x i16> %src)
@@ -28,8 +28,8 @@ define i16 @extract_elt0_v1i16_readfirstlane(<1 x i16> %src) {
define i16 @extract_elt1_v2i16_readfirstlane(<2 x i16> %src) {
; CHECK-LABEL: define i16 @extract_elt1_v2i16_readfirstlane(
; CHECK-SAME: <2 x i16> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> [[SRC]])
-; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i16> [[VEC]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i16> [[SRC]], i64 1
+; CHECK-NEXT: [[ELT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[TMP1]])
; CHECK-NEXT: ret i16 [[ELT]]
;
%vec = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> %src)
@@ -40,8 +40,8 @@ define i16 @extract_elt1_v2i16_readfirstlane(<2 x i16> %src) {
define i16 @extract_elt0_v4i16_readfirstlane(<4 x i16> %src) {
; CHECK-LABEL: define i16 @extract_elt0_v4i16_readfirstlane(
; CHECK-SAME: <4 x i16> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[SRC]])
-; CHECK-NEXT: [[ELT:%.*]] = extractelement <4 x i16> [[VEC]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SRC]], i64 0
+; CHECK-NEXT: [[ELT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[TMP1]])
; CHECK-NEXT: ret i16 [[ELT]]
;
%vec = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> %src)
@@ -52,8 +52,8 @@ define i16 @extract_elt0_v4i16_readfirstlane(<4 x i16> %src) {
define i16 @extract_elt2_v4i16_readfirstlane(<4 x i16> %src) {
; CHECK-LABEL: define i16 @extract_elt2_v4i16_readfirstlane(
; CHECK-SAME: <4 x i16> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[SRC]])
-; CHECK-NEXT: [[ELT:%.*]] = extractelement <4 x i16> [[VEC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SRC]], i64 2
+; CHECK-NEXT: [[ELT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[TMP1]])
; CHECK-NEXT: ret i16 [[ELT]]
;
%vec = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> %src)
@@ -136,8 +136,8 @@ define <2 x i16> @extract_elt30_v4i16_readfirstlane(<4 x i16> %src) {
define half @extract_elt0_v2f16_readfirstlane(<2 x half> %src) {
; CHECK-LABEL: define half @extract_elt0_v2f16_readfirstlane(
; CHECK-SAME: <2 x half> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> [[SRC]])
-; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x half> [[VEC]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x half> [[SRC]], i64 0
+; CHECK-NEXT: [[ELT:%.*]] = call half @llvm.amdgcn.readfirstlane.f16(half [[TMP1]])
; CHECK-NEXT: ret half [[ELT]]
;
%vec = call <2 x half> @llvm.amdgcn.readfirstlane.v2i16(<2 x half> %src)
@@ -148,8 +148,8 @@ define half @extract_elt0_v2f16_readfirstlane(<2 x half> %src) {
define half @extract_elt1_v2f16_readfirstlane(<2 x half> %src) {
; CHECK-LABEL: define half @extract_elt1_v2f16_readfirstlane(
; CHECK-SAME: <2 x half> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> [[SRC]])
-; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x half> [[VEC]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x half> [[SRC]], i64 1
+; CHECK-NEXT: [[ELT:%.*]] = call half @llvm.amdgcn.readfirstlane.f16(half [[TMP1]])
; CHECK-NEXT: ret half [[ELT]]
;
%vec = call <2 x half> @llvm.amdgcn.readfirstlane.v2i16(<2 x half> %src)
@@ -186,8 +186,8 @@ define i32 @extract_elt0_nxv4i32_readfirstlane(<vscale x 2 x i32> %src) {
define i32 @extract_elt0_v2i32_readfirstlane(<2 x i32> %src) {
; CHECK-LABEL: define i32 @extract_elt0_v2i32_readfirstlane(
; CHECK-SAME: <2 x i32> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32> [[SRC]])
-; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i32> [[VEC]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[SRC]], i64 0
+; CHECK-NEXT: [[ELT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP1]])
; CHECK-NEXT: ret i32 [[ELT]]
;
%vec = call <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32> %src)
@@ -198,8 +198,8 @@ define i32 @extract_elt0_v2i32_readfirstlane(<2 x i32> %src) {
define ptr addrspace(3) @extract_elt0_v2p3_readfirstlane(<2 x ptr addrspace(3)> %src) {
; CHECK-LABEL: define ptr addrspace(3) @extract_elt0_v2p3_readfirstlane(
; CHECK-SAME: <2 x ptr addrspace(3)> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <2 x ptr addrspace(3)> @llvm.amdgcn.readfirstlane.v2p3(<2 x ptr addrspace(3)> [[SRC]])
-; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x ptr addrspace(3)> [[VEC]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x ptr addrspace(3)> [[SRC]], i64 0
+; CHECK-NEXT: [[ELT:%.*]] = call ptr addrspace(3) @llvm.amdgcn.readfirstlane.p3(ptr addrspace(3) [[TMP1]])
; CHECK-NEXT: ret ptr addrspace(3) [[ELT]]
;
%vec = call <2 x ptr addrspace(3)> @llvm.amdgcn.readfirstlane.v2p3(<2 x ptr addrspace(3)> %src)
@@ -210,8 +210,8 @@ define ptr addrspace(3) @extract_elt0_v2p3_readfirstlane(<2 x ptr addrspace(3)>
define i64 @extract_elt0_v2i64_readfirstlane(<2 x i64> %src) {
; CHECK-LABEL: define i64 @extract_elt0_v2i64_readfirstlane(
; CHECK-SAME: <2 x i64> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> [[SRC]])
-; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i64> [[VEC]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[SRC]], i64 0
+; CHECK-NEXT: [[ELT:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[TMP1]])
; CHECK-NEXT: ret i64 [[ELT]]
;
%vec = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> %src)
@@ -222,8 +222,8 @@ define i64 @extract_elt0_v2i64_readfirstlane(<2 x i64> %src) {
define i64 @extract_elt1_v2i64_readfirstlane(<2 x i64> %src) {
; CHECK-LABEL: define i64 @extract_elt1_v2i64_readfirstlane(
; CHECK-SAME: <2 x i64> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> [[SRC]])
-; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i64> [[VEC]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[SRC]], i64 1
+; CHECK-NEXT: [[ELT:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[TMP1]])
; CHECK-NEXT: ret i64 [[ELT]]
;
%vec = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> %src)
@@ -306,9 +306,8 @@ define <2 x i16> @extract_elt13_v4i16readfirstlane(<4 x i16> %src) {
define <2 x i32> @extract_elt13_v4i32_readfirstlane_source_simplify0(i32 %src0, i32 %src2) {
; CHECK-LABEL: define <2 x i32> @extract_elt13_v4i32_readfirstlane_source_simplify0(
; CHECK-SAME: i32 [[SRC0:%.*]], i32 [[SRC2:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[INS_1:%.*]] = insertelement <4 x i32> poison, i32 [[SRC0]], i64 1
-; CHECK-NEXT: [[VEC:%.*]] = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> [[INS_1]])
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[VEC]], <4 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
+; CHECK-NEXT: [[SHUFFLE:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i64 0
; CHECK-NEXT: ret <2 x i32> [[SHUFFLE]]
;
%ins.0 = insertelement <4 x i32> poison, i32 %src0, i32 0
@@ -350,8 +349,8 @@ define i32 @extract_elt0_v2i32_readfirstlane_convergencetoken(<2 x i32> %src) co
; CHECK-LABEL: define i32 @extract_elt0_v2i32_readfirstlane_convergencetoken(
; CHECK-SAME: <2 x i32> [[SRC:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[T:%.*]] = call token @llvm.experimental.convergence.entry()
-; CHECK-NEXT: [[VEC:%.*]] = call <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32> [[SRC]]) [ "convergencectrl"(token [[T]]) ]
-; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i32> [[VEC]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[SRC]], i64 0
+; CHECK-NEXT: [[ELT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP1]]) [ "convergencectrl"(token [[T]]) ]
; CHECK-NEXT: ret i32 [[ELT]]
;
%t = call token @llvm.experimental.convergence.entry()
@@ -381,8 +380,8 @@ define < 2 x i32> @extract_elt13_v4i32_readfirstlane_source_simplify1_convergenc
define i1 @extract_elt0_v2i1_readfirstlane(<2 x i1> %src) {
; CHECK-LABEL: define i1 @extract_elt0_v2i1_readfirstlane(
; CHECK-SAME: <2 x i1> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <2 x i1> @llvm.amdgcn.readfirstlane.v2i1(<2 x i1> [[SRC]])
-; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i1> [[VEC]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[SRC]], i64 0
+; CHECK-NEXT: [[ELT:%.*]] = call i1 @llvm.amdgcn.readfirstlane.i1(i1 [[TMP1]])
; CHECK-NEXT: ret i1 [[ELT]]
;
%vec = call <2 x i1> @llvm.amdgcn.readfirstlane.v2i1(<2 x i1> %src)
More information about the llvm-commits mailing list