[llvm-branch-commits] [llvm] AMDGPU: Simplify demanded bits on readlane/writelane index arguments (PR #117963)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Dec 2 05:36:17 PST 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/117963
>From d4e97b490621bbb45ce52bc6ad81c869bb84a4e4 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 27 Nov 2024 22:24:15 -0500
Subject: [PATCH 1/3] AMDGPU: Simplify demanded bits on readlane/writelane
index arguments
The main goal is to fold away wave64 code when compiled for wave32.
If we have out-of-bounds indexing, the lane index will now be masked down
to its low bits, which may CSE with the operations on the low half of the
wave.
---
.../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 43 ++++-
.../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 4 +
.../lane-index-simplify-demanded-bits.ll | 147 ++++++++++++------
3 files changed, 142 insertions(+), 52 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 18a09c39a06387..a0bb3e181ac526 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -450,6 +450,37 @@ static bool isTriviallyUniform(const Use &U) {
return false;
}
+/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
+///
+/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
+bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
+ IntrinsicInst &II,
+ unsigned LaneArgIdx) const {
+ unsigned MaskBits = ST->isWaveSizeKnown() && ST->isWave32() ? 5 : 6;
+ APInt DemandedMask(32, maskTrailingOnes<unsigned>(MaskBits));
+
+ KnownBits Known(32);
+ if (IC.SimplifyDemandedBits(&II, LaneArgIdx, DemandedMask, Known))
+ return true;
+
+ if (!Known.isConstant())
+ return false;
+
+ // Unlike the DAG version, SimplifyDemandedBits does not change
+ // constants. Make sure we clamp these down. Out of bounds indexes may appear
+ // in wave64 code compiled for wave32.
+
+ Value *LaneArg = II.getArgOperand(LaneArgIdx);
+ Constant *MaskedConst =
+ ConstantInt::get(LaneArg->getType(), Known.getConstant() & DemandedMask);
+ if (MaskedConst != LaneArg) {
+ II.getOperandUse(LaneArgIdx).set(MaskedConst);
+ return true;
+ }
+
+ return false;
+}
+
std::optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
Intrinsic::ID IID = II.getIntrinsicID();
@@ -1092,7 +1123,17 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
const Use &Src = II.getArgOperandUse(0);
if (isTriviallyUniform(Src))
return IC.replaceInstUsesWith(II, Src.get());
- break;
+
+ if (IID == Intrinsic::amdgcn_readlane &&
+ simplifyDemandedLaneMaskArg(IC, II, 1))
+ return &II;
+
+ return std::nullopt;
+ }
+ case Intrinsic::amdgcn_writelane: {
+ if (simplifyDemandedLaneMaskArg(IC, II, 1))
+ return &II;
+ return std::nullopt;
}
case Intrinsic::amdgcn_trig_preop: {
// The intrinsic is declared with name mangling, but currently the
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 10956861650ab3..585f38fc02c29c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -220,6 +220,10 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
const Value *Op1, InstCombiner &IC) const;
+
+ bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II,
+ unsigned LaneAgIdx) const;
+
std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) const;
std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/lane-index-simplify-demanded-bits.ll b/llvm/test/Transforms/InstCombine/AMDGPU/lane-index-simplify-demanded-bits.ll
index b686f447b8d3c9..327d68bdf550e4 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/lane-index-simplify-demanded-bits.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/lane-index-simplify-demanded-bits.ll
@@ -18,30 +18,45 @@ define i32 @readlane_31(i32 %arg) #0 {
}
define i32 @readlane_32(i32 %arg) #0 {
-; CHECK-LABEL: define i32 @readlane_32(
-; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 32)
-; CHECK-NEXT: ret i32 [[RES]]
+; WAVE64-LABEL: define i32 @readlane_32(
+; WAVE64-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 32)
+; WAVE64-NEXT: ret i32 [[RES]]
+;
+; WAVE32-LABEL: define i32 @readlane_32(
+; WAVE32-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 0)
+; WAVE32-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 32)
ret i32 %res
}
define i32 @readlane_33(i32 %arg) #0 {
-; CHECK-LABEL: define i32 @readlane_33(
-; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 33)
-; CHECK-NEXT: ret i32 [[RES]]
+; WAVE64-LABEL: define i32 @readlane_33(
+; WAVE64-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 33)
+; WAVE64-NEXT: ret i32 [[RES]]
+;
+; WAVE32-LABEL: define i32 @readlane_33(
+; WAVE32-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 1)
+; WAVE32-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 33)
ret i32 %res
}
define i32 @readlane_63(i32 %arg) #0 {
-; CHECK-LABEL: define i32 @readlane_63(
-; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 63)
-; CHECK-NEXT: ret i32 [[RES]]
+; WAVE64-LABEL: define i32 @readlane_63(
+; WAVE64-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 63)
+; WAVE64-NEXT: ret i32 [[RES]]
+;
+; WAVE32-LABEL: define i32 @readlane_63(
+; WAVE32-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 31)
+; WAVE32-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 63)
ret i32 %res
@@ -50,7 +65,7 @@ define i32 @readlane_63(i32 %arg) #0 {
define i32 @readlane_64(i32 %arg) #0 {
; CHECK-LABEL: define i32 @readlane_64(
; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 64)
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 0)
; CHECK-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 64)
@@ -58,11 +73,16 @@ define i32 @readlane_64(i32 %arg) #0 {
}
define i32 @readlane_and_31(i32 %arg, i32 %idx) #0 {
-; CHECK-LABEL: define i32 @readlane_and_31(
-; CHECK-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX_CLAMP]])
-; CHECK-NEXT: ret i32 [[RES]]
+; WAVE64-LABEL: define i32 @readlane_and_31(
+; WAVE64-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31
+; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX_CLAMP]])
+; WAVE64-NEXT: ret i32 [[RES]]
+;
+; WAVE32-LABEL: define i32 @readlane_and_31(
+; WAVE32-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX]])
+; WAVE32-NEXT: ret i32 [[RES]]
;
%idx.clamp = and i32 %idx, 31
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 %idx.clamp)
@@ -72,8 +92,7 @@ define i32 @readlane_and_31(i32 %arg, i32 %idx) #0 {
define i32 @readlane_and_63(i32 %arg, i32 %idx) #0 {
; CHECK-LABEL: define i32 @readlane_and_63(
; CHECK-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 63
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX_CLAMP]])
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX]])
; CHECK-NEXT: ret i32 [[RES]]
;
%idx.clamp = and i32 %idx, 63
@@ -92,10 +111,15 @@ define i32 @readlane_poison(i32 %arg) #0 {
}
define float @readlane_f32_63(float %arg) #0 {
-; CHECK-LABEL: define float @readlane_f32_63(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 63)
-; CHECK-NEXT: ret float [[RES]]
+; WAVE64-LABEL: define float @readlane_f32_63(
+; WAVE64-SAME: float [[ARG:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 63)
+; WAVE64-NEXT: ret float [[RES]]
+;
+; WAVE32-LABEL: define float @readlane_f32_63(
+; WAVE32-SAME: float [[ARG:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 31)
+; WAVE32-NEXT: ret float [[RES]]
;
%res = call float @llvm.amdgcn.readlane.f32(float %arg, i32 63)
ret float %res
@@ -116,30 +140,45 @@ define i32 @writelane_31(i32 %arg0, i32 %arg1) #0 {
}
define i32 @writelane_32(i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: define i32 @writelane_32(
-; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 32, i32 [[ARG1]])
-; CHECK-NEXT: ret i32 [[RES]]
+; WAVE64-LABEL: define i32 @writelane_32(
+; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 32, i32 [[ARG1]])
+; WAVE64-NEXT: ret i32 [[RES]]
+;
+; WAVE32-LABEL: define i32 @writelane_32(
+; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 0, i32 [[ARG1]])
+; WAVE32-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 32, i32 %arg1)
ret i32 %res
}
define i32 @writelane_33(i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: define i32 @writelane_33(
-; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 33, i32 [[ARG1]])
-; CHECK-NEXT: ret i32 [[RES]]
+; WAVE64-LABEL: define i32 @writelane_33(
+; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 33, i32 [[ARG1]])
+; WAVE64-NEXT: ret i32 [[RES]]
+;
+; WAVE32-LABEL: define i32 @writelane_33(
+; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 1, i32 [[ARG1]])
+; WAVE32-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 33, i32 %arg1)
ret i32 %res
}
define i32 @writelane_63(i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: define i32 @writelane_63(
-; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 63, i32 [[ARG1]])
-; CHECK-NEXT: ret i32 [[RES]]
+; WAVE64-LABEL: define i32 @writelane_63(
+; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 63, i32 [[ARG1]])
+; WAVE64-NEXT: ret i32 [[RES]]
+;
+; WAVE32-LABEL: define i32 @writelane_63(
+; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 31, i32 [[ARG1]])
+; WAVE32-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 63, i32 %arg1)
ret i32 %res
@@ -148,7 +187,7 @@ define i32 @writelane_63(i32 %arg0, i32 %arg1) #0 {
define i32 @writelane_64(i32 %arg0, i32 %arg1) #0 {
; CHECK-LABEL: define i32 @writelane_64(
; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 64, i32 [[ARG1]])
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 0, i32 [[ARG1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 64, i32 %arg1)
@@ -156,11 +195,16 @@ define i32 @writelane_64(i32 %arg0, i32 %arg1) #0 {
}
define i32 @writelane_and_31(i32 %arg0, i32 %arg1, i32 %idx) #0 {
-; CHECK-LABEL: define i32 @writelane_and_31(
-; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX_CLAMP]], i32 [[ARG1]])
-; CHECK-NEXT: ret i32 [[RES]]
+; WAVE64-LABEL: define i32 @writelane_and_31(
+; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31
+; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX_CLAMP]], i32 [[ARG1]])
+; WAVE64-NEXT: ret i32 [[RES]]
+;
+; WAVE32-LABEL: define i32 @writelane_and_31(
+; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX]], i32 [[ARG1]])
+; WAVE32-NEXT: ret i32 [[RES]]
;
%idx.clamp = and i32 %idx, 31
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 %idx.clamp, i32 %arg1)
@@ -170,8 +214,7 @@ define i32 @writelane_and_31(i32 %arg0, i32 %arg1, i32 %idx) #0 {
define i32 @writelane_and_63(i32 %arg0, i32 %arg1, i32 %idx) #0 {
; CHECK-LABEL: define i32 @writelane_and_63(
; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 63
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX_CLAMP]], i32 [[ARG1]])
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX]], i32 [[ARG1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%idx.clamp = and i32 %idx, 63
@@ -190,16 +233,18 @@ define i32 @writelane_poison(i32 %arg0, i32 %arg1) #0 {
}
define float @writelane_f32_63(float %arg0, float %arg1) #0 {
-; CHECK-LABEL: define float @writelane_f32_63(
-; CHECK-SAME: float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.writelane.f32(float [[ARG0]], i32 63, float [[ARG1]])
-; CHECK-NEXT: ret float [[RES]]
+; WAVE64-LABEL: define float @writelane_f32_63(
+; WAVE64-SAME: float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.writelane.f32(float [[ARG0]], i32 63, float [[ARG1]])
+; WAVE64-NEXT: ret float [[RES]]
+;
+; WAVE32-LABEL: define float @writelane_f32_63(
+; WAVE32-SAME: float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.writelane.f32(float [[ARG0]], i32 31, float [[ARG1]])
+; WAVE32-NEXT: ret float [[RES]]
;
%res = call float @llvm.amdgcn.writelane.f32(float %arg0, i32 63, float %arg1)
ret float %res
}
attributes #0 = { nounwind }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; WAVE32: {{.*}}
-; WAVE64: {{.*}}
>From 585d57c83e20d4ba4ccce3cba38f0aa1e7c0f33a Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 2 Dec 2024 08:33:29 -0500
Subject: [PATCH 2/3] Just use getWavefrontSizeLog2
---
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index a0bb3e181ac526..20bc3c3e314e4a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -456,7 +456,7 @@ static bool isTriviallyUniform(const Use &U) {
bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
IntrinsicInst &II,
unsigned LaneArgIdx) const {
- unsigned MaskBits = ST->isWaveSizeKnown() && ST->isWave32() ? 5 : 6;
+ unsigned MaskBits = ST->getWavefrontSizeLog2();
APInt DemandedMask(32, maskTrailingOnes<unsigned>(MaskBits));
KnownBits Known(32);
>From 5a617f85379e58ab62ea634a0c00627b1deb1161 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 2 Dec 2024 08:34:33 -0500
Subject: [PATCH 3/3] Reword comment
---
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 20bc3c3e314e4a..41b33ac8a7eb4b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -466,9 +466,9 @@ bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
if (!Known.isConstant())
return false;
- // Unlike the DAG version, SimplifyDemandedBits does not change
- // constants. Make sure we clamp these down. Out of bounds indexes may appear
- // in wave64 code compiled for wave32.
+ // Out of bounds indexes may appear in wave64 code compiled for wave32.
+ // Unlike the DAG version, SimplifyDemandedBits does not change constants, so
+ // manually fix it up.
Value *LaneArg = II.getArgOperand(LaneArgIdx);
Constant *MaskedConst =
More information about the llvm-branch-commits
mailing list