[llvm-branch-commits] [llvm] [AMDGPU] Move S_BFE lowering into RegBankCombiner (PR #141589)
Pierre van Houtryve via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jul 7 02:59:58 PDT 2025
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/141589
>From d906a978145aabae8b2d1a029477d5a08272ae8c Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 27 May 2025 11:16:16 +0200
Subject: [PATCH 1/4] [AMDGPU] Move S_BFE lowering into RegBankCombiner
---
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 14 +-
.../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 51 +++++++
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 125 ++++++++----------
3 files changed, 119 insertions(+), 71 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 9587fad1ecd63..94e1175b06b14 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -151,6 +151,17 @@ def zext_of_shift_amount_combines : GICombineGroup<[
canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl
]>;
+// Early select of uniform BFX into S_BFE instructions.
+// These instructions encode the offset/width in a way that requires using
+// bitwise operations. Selecting these instructions early allows the combiner
+// to potentially fold these.
+class lower_uniform_bfx<Instruction bfx> : GICombineRule<
+ (defs root:$bfx),
+ (combine (bfx $dst, $src, $o, $w):$bfx, [{ return lowerUniformBFX(*${bfx}); }])>;
+
+def lower_uniform_sbfx : lower_uniform_bfx<G_SBFX>;
+def lower_uniform_ubfx : lower_uniform_bfx<G_UBFX>;
+
let Predicates = [Has16BitInsts, NotHasMed3_16] in {
// For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
// saves one instruction compared to the promotion.
@@ -198,5 +209,6 @@ def AMDGPURegBankCombiner : GICombiner<
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
identity_combines, redundant_and, constant_fold_cast_op,
- cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> {
+ cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines,
+ lower_uniform_sbfx, lower_uniform_ubfx]> {
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index ee324a5e93f0f..2100900bb8eb2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -89,6 +89,8 @@ class AMDGPURegBankCombinerImpl : public Combiner {
void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const;
+ bool lowerUniformBFX(MachineInstr &MI) const;
+
private:
SIModeRegisterDefaults getMode() const;
bool getIEEE() const;
@@ -392,6 +394,55 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
MI.eraseFromParent();
}
+bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const {
+ assert(MI.getOpcode() == TargetOpcode::G_UBFX ||
+ MI.getOpcode() == TargetOpcode::G_SBFX);
+ const bool Signed = (MI.getOpcode() == TargetOpcode::G_SBFX);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ const RegisterBank *RB = RBI.getRegBank(DstReg, MRI, TRI);
+ assert(RB && "No RB?");
+ if (RB->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register OffsetReg = MI.getOperand(2).getReg();
+ Register WidthReg = MI.getOperand(3).getReg();
+
+ const LLT S32 = LLT::scalar(32);
+ LLT Ty = MRI.getType(DstReg);
+
+ const unsigned Opc = (Ty == S32)
+ ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32)
+ : (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+
+ // Ensure the high bits are clear to insert the offset.
+ auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
+ auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
+
+ // Zeros out the low bits, so don't bother clamping the input value.
+ auto ShiftAmt = B.buildConstant(S32, 16);
+ auto ShiftWidth = B.buildShl(S32, WidthReg, ShiftAmt);
+
+ // Transformation function, pack the offset and width of a BFE into
+ // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
+ // source, bits [5:0] contain the offset and bits [22:16] the width.
+ auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
+
+ MRI.setRegBank(OffsetMask.getReg(0), *RB);
+ MRI.setRegBank(ClampOffset.getReg(0), *RB);
+ MRI.setRegBank(ShiftAmt.getReg(0), *RB);
+ MRI.setRegBank(ShiftWidth.getReg(0), *RB);
+ MRI.setRegBank(MergedInputs.getReg(0), *RB);
+
+ auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
+ if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
+ llvm_unreachable("failed to constrain BFE");
+
+ MI.eraseFromParent();
+ return true;
+}
+
SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
return MF.getInfo<SIMachineFunctionInfo>()->getMode();
}
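
[Reviewer-side sketch, not part of the patch.] The packed second operand that lowerUniformBFX assembles with G_AND/G_SHL/G_OR can be modelled in plain C++; the 0x3f mask and the shift by 16 follow the bit layout described in the comment (offset in bits [5:0], width in bits [22:16]). The helper name below is made up for illustration.

    #include <cstdint>

    // Illustrative only: mirrors the G_AND/G_SHL/G_OR sequence built by
    // lowerUniformBFX for the scalar S_BFE_* packed operand.
    constexpr uint32_t packBFEOperand(uint32_t Offset, uint32_t Width) {
      return (Offset & 0x3f)   // offset lives in bits [5:0]
             | (Width << 16);  // width lives in bits [22:16]
    }

    // Values from the regbankcombiner-lower-bfx.mir test later in the series:
    // offset 7, width 5.
    static_assert(packBFEOperand(7, 5) == 0x50007, "");
    // A constant width of 123 (0x7b) shifts to 0x7b0000, matching the
    // s_or_b32 ..., 0x7b0000 immediates in the llvm.amdgcn.*bfe.ll checks
    // updated by the last patch in this series.
    static_assert(packBFEOperand(0, 123) == 0x7b0000, "");
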
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 6874657a4ffe7..140c2babb013f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1492,88 +1492,73 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
const RegisterBank *DstBank =
- OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
- if (DstBank == &AMDGPU::VGPRRegBank) {
- if (Ty == S32)
- return true;
-
- // There is no 64-bit vgpr bitfield extract instructions so the operation
- // is expanded to a sequence of instructions that implement the operation.
- ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
- const LLT S64 = LLT::scalar(64);
- // Shift the source operand so that extracted bits start at bit 0.
- auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
- : B.buildLShr(S64, SrcReg, OffsetReg);
- auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
-
- // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
- // if the width is a constant.
- if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
- // Use the 32-bit bitfield extract instruction if the width is a constant.
- // Depending on the width size, use either the low or high 32-bits.
- auto Zero = B.buildConstant(S32, 0);
- auto WidthImm = ConstWidth->Value.getZExtValue();
- if (WidthImm <= 32) {
- // Use bitfield extract on the lower 32-bit source, and then sign-extend
- // or clear the upper 32-bits.
- auto Extract =
- Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
- : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
- auto Extend =
- Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
- B.buildMergeLikeInstr(DstReg, {Extract, Extend});
- } else {
- // Use bitfield extract on upper 32-bit source, and combine with lower
- // 32-bit source.
- auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
- auto Extract =
- Signed
- ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
- : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
- B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
- }
- MI.eraseFromParent();
+ if (DstBank != &AMDGPU::VGPRRegBank) {
+ // SGPR: Canonicalize to a G_S/UBFX
+ if (!isa<GIntrinsic>(MI))
return true;
- }
- // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
- // operations.
- auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
- auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
+ ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
if (Signed)
- B.buildAShr(S64, SignBit, ExtShift);
+ B.buildSbfx(DstReg, SrcReg, OffsetReg, WidthReg);
else
- B.buildLShr(S64, SignBit, ExtShift);
+ B.buildUbfx(DstReg, SrcReg, OffsetReg, WidthReg);
MI.eraseFromParent();
return true;
}
- // The scalar form packs the offset and width in a single operand.
-
- ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
-
- // Ensure the high bits are clear to insert the offset.
- auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
- auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
-
- // Zeros out the low bits, so don't bother clamping the input value.
- auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
-
- // Transformation function, pack the offset and width of a BFE into
- // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
- // source, bits [5:0] contain the offset and bits [22:16] the width.
- auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
+ // VGPR
+ if (Ty == S32)
+ return true;
- // TODO: It might be worth using a pseudo here to avoid scc clobber and
- // register class constraints.
- unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
- (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+ // There are no 64-bit vgpr bitfield extract instructions, so the operation
+ // is expanded to a sequence of instructions that implement the operation.
+ ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
- auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
- if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
- llvm_unreachable("failed to constrain BFE");
+ const LLT S64 = LLT::scalar(64);
+ // Shift the source operand so that extracted bits start at bit 0.
+ auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
+ : B.buildLShr(S64, SrcReg, OffsetReg);
+ auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
+
+ // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
+ // if the width is a constant.
+ if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
+ // Use the 32-bit bitfield extract instruction if the width is a constant.
+ // Depending on the width size, use either the low or high 32-bits.
+ auto Zero = B.buildConstant(S32, 0);
+ auto WidthImm = ConstWidth->Value.getZExtValue();
+ if (WidthImm <= 32) {
+ // Use bitfield extract on the lower 32-bit source, and then sign-extend
+ // or clear the upper 32-bits.
+ auto Extract =
+ Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
+ : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
+ auto Extend =
+ Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
+ B.buildMergeLikeInstr(DstReg, {Extract, Extend});
+ } else {
+ // Use bitfield extract on upper 32-bit source, and combine with lower
+ // 32-bit source.
+ auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
+ auto Extract =
+ Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
+ : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
+ B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
+ }
+ MI.eraseFromParent();
+ return true;
+ }
+ // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
+ // operations.
+ auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
+ auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
+ if (Signed)
+ B.buildAShr(S64, SignBit, ExtShift);
+ else
+ B.buildLShr(S64, SignBit, ExtShift);
MI.eraseFromParent();
return true;
}
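
[Reviewer note, not part of the patch.] For the VGPR path moved above, the non-constant-width fallback computes Src >> Offset and then shifts left and back right by (64 - Width). A rough C++ model of those semantics, assuming 1 <= Width <= 64 so the shift amounts stay in range:

    #include <cstdint>

    // Rough model of the 64-bit VGPR fallback: move the field to bit 0,
    // then clear (ubfx) or sign-extend (sbfx) everything above it.
    uint64_t ubfx64(uint64_t Src, unsigned Offset, unsigned Width) {
      uint64_t Shifted = Src >> Offset;  // buildLShr: field now starts at bit 0
      unsigned Ext = 64 - Width;
      return (Shifted << Ext) >> Ext;    // logical shifts clear the high bits
    }

    int64_t sbfx64(int64_t Src, unsigned Offset, unsigned Width) {
      int64_t Shifted = Src >> Offset;   // buildAShr (arithmetic shift)
      unsigned Ext = 64 - Width;
      // Final arithmetic shift sign-extends the extracted field.
      return (int64_t)((uint64_t)Shifted << Ext) >> Ext;
    }
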
>From 1fadb4e1cb00432ff55b3180bd37756419bd2ab2 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 28 May 2025 10:37:09 +0200
Subject: [PATCH 2/4] style change
---
llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 2100900bb8eb2..a364887fa69d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -412,7 +412,7 @@ bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const {
const LLT S32 = LLT::scalar(32);
LLT Ty = MRI.getType(DstReg);
- const unsigned Opc = (Ty == S32)
+ const unsigned Opc = Ty == S32
? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32)
: (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
>From 9927a43fca7371b02cbdfb0d17b3a9498258b8ba Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 1 Jul 2025 11:46:19 +0200
Subject: [PATCH 3/4] comments
---
.../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 7 +-
.../GlobalISel/regbankcombiner-lower-bfx.mir | 107 ++++++++++++++++++
2 files changed, 111 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-lower-bfx.mir
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index a364887fa69d5..8d0c1b673ee40 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -416,6 +416,10 @@ bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const {
? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32)
: (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+ // Pack the offset and width of a BFE into
+ // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
+ // source, bits [5:0] contain the offset and bits [22:16] the width.
+
// Ensure the high bits are clear to insert the offset.
auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
@@ -424,9 +428,6 @@ bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const {
auto ShiftAmt = B.buildConstant(S32, 16);
auto ShiftWidth = B.buildShl(S32, WidthReg, ShiftAmt);
- // Transformation function, pack the offset and width of a BFE into
- // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
- // source, bits [5:0] contain the offset and bits [22:16] the width.
auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
MRI.setRegBank(OffsetMask.getReg(0), *RB);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-lower-bfx.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-lower-bfx.mir
new file mode 100644
index 0000000000000..3b7cb4158a897
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-lower-bfx.mir
@@ -0,0 +1,107 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: test_s_bfe_i32__constants
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: test_s_bfe_i32__constants
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %reg:sreg_32(s32) = COPY $sgpr0
+ ; CHECK-NEXT: %width:sgpr(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: %offset:sgpr(s32) = G_CONSTANT i32 7
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL %width, [[C]](s32)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:sreg_32(s32) = G_OR %offset, [[SHL]]
+ ; CHECK-NEXT: %bfx:sreg_32(s32) = S_BFE_I32 %reg(s32), [[OR]](s32), implicit-def $scc
+ ; CHECK-NEXT: $sgpr0 = COPY %bfx(s32)
+ %reg:sgpr(s32) = COPY $sgpr0
+ %width:sgpr(s32) = G_CONSTANT i32 5
+ %offset:sgpr(s32) = G_CONSTANT i32 7
+ %bfx:sgpr(s32) = G_SBFX %reg, %offset, %width
+ $sgpr0 = COPY %bfx
+...
+---
+name: test_s_bfe_u32__constants
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: test_s_bfe_u32__constants
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %reg:sreg_32(s32) = COPY $sgpr0
+ ; CHECK-NEXT: %width:sgpr(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: %offset:sgpr(s32) = G_CONSTANT i32 7
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL %width, [[C]](s32)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:sreg_32(s32) = G_OR %offset, [[SHL]]
+ ; CHECK-NEXT: %bfx:sreg_32(s32) = S_BFE_U32 %reg(s32), [[OR]](s32), implicit-def $scc
+ ; CHECK-NEXT: $sgpr0 = COPY %bfx(s32)
+ %reg:sgpr(s32) = COPY $sgpr0
+ %width:sgpr(s32) = G_CONSTANT i32 5
+ %offset:sgpr(s32) = G_CONSTANT i32 7
+ %bfx:sgpr(s32) = G_UBFX %reg, %offset, %width
+ $sgpr0 = COPY %bfx
+...
+---
+name: test_s_bfe_i64__constants
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $sgpr0_sgpr1
+
+ ; CHECK-LABEL: name: test_s_bfe_i64__constants
+ ; CHECK: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %reg:sreg_64(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: %width:sgpr(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: %offset:sgpr(s32) = G_CONSTANT i32 7
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL %width, [[C]](s32)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:sreg_32(s32) = G_OR %offset, [[SHL]]
+ ; CHECK-NEXT: %bfx:sreg_64(s64) = S_BFE_I64 %reg(s64), [[OR]](s32), implicit-def $scc
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY %bfx(s64)
+ %reg:sgpr(s64) = COPY $sgpr0_sgpr1
+ %width:sgpr(s32) = G_CONSTANT i32 5
+ %offset:sgpr(s32) = G_CONSTANT i32 7
+ %bfx:sgpr(s64) = G_SBFX %reg, %offset, %width
+ $sgpr0_sgpr1 = COPY %bfx
+...
+---
+name: test_s_bfe_u64__constants
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $sgpr0_sgpr1
+
+ ; CHECK-LABEL: name: test_s_bfe_u64__constants
+ ; CHECK: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %reg:sreg_64(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: %width:sgpr(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: %offset:sgpr(s32) = G_CONSTANT i32 7
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL %width, [[C]](s32)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:sreg_32(s32) = G_OR %offset, [[SHL]]
+ ; CHECK-NEXT: %bfx:sreg_64(s64) = S_BFE_U64 %reg(s64), [[OR]](s32), implicit-def $scc
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY %bfx(s64)
+ %reg:sgpr(s64) = COPY $sgpr0_sgpr1
+ %width:sgpr(s32) = G_CONSTANT i32 5
+ %offset:sgpr(s32) = G_CONSTANT i32 7
+ %bfx:sgpr(s64) = G_UBFX %reg, %offset, %width
+ $sgpr0_sgpr1 = COPY %bfx
+...
>From 8e8f6aaede2b1e95ffa21a1cc724107cd57b8411 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 7 Jul 2025 11:59:25 +0200
Subject: [PATCH 4/4] remove clamp
---
llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp | 13 ++++++-------
.../CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll | 9 ++-------
.../CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll | 14 ++++----------
3 files changed, 12 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 8d0c1b673ee40..257acfd7f79b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -419,19 +419,18 @@ bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const {
// Pack the offset and width of a BFE into
// the format expected by the S_BFE_I32 / S_BFE_U32. In the second
// source, bits [5:0] contain the offset and bits [22:16] the width.
-
- // Ensure the high bits are clear to insert the offset.
- auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
- auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
+ // The 64-bit variants use bits [6:0].
+ //
+ // If the value takes more than 5/6 bits, the G_UBFX/G_SBFX is ill-formed.
+ // Thus, we do not clamp the values; we assume they are in range,
+ // and if they aren't, it is UB anyway.
// Zeros out the low bits, so don't bother clamping the input value.
auto ShiftAmt = B.buildConstant(S32, 16);
auto ShiftWidth = B.buildShl(S32, WidthReg, ShiftAmt);
- auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
+ auto MergedInputs = B.buildOr(S32, OffsetReg, ShiftWidth);
- MRI.setRegBank(OffsetMask.getReg(0), *RB);
- MRI.setRegBank(ClampOffset.getReg(0), *RB);
MRI.setRegBank(ShiftAmt.getReg(0), *RB);
MRI.setRegBank(ShiftWidth.getReg(0), *RB);
MRI.setRegBank(MergedInputs.getReg(0), *RB);
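
[Reviewer note, not part of the patch.] The change above drops the G_AND that previously masked the offset into bits [5:0]. A contrast of the two packings, with hypothetical helper names; the unclamped form leans on the comment's point that an out-of-range offset already makes the G_UBFX/G_SBFX ill-formed:

    #include <cstdint>

    // Before this patch: the offset is masked to 6 bits before insertion.
    uint32_t packClamped(uint32_t Offset, uint32_t Width) {
      return (Offset & 0x3f) | (Width << 16);
    }

    // After this patch: no mask. Per the comment above, a well-formed
    // G_UBFX/G_SBFX already keeps the offset in range, so the extra
    // s_and_b32 (visible in the test diffs below) can be dropped.
    uint32_t packUnclamped(uint32_t Offset, uint32_t Width) {
      return Offset | (Width << 16);
    }
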
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
index 45bade21385be..0e65e1a23788b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
@@ -14,7 +14,6 @@ define i32 @v_bfe_i32_arg_arg_arg(i32 %src0, i32 %src1, i32 %src2) #0 {
define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 {
; GFX6-LABEL: s_bfe_i32_arg_arg_arg:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b32 s1, s1, 63
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_bfe_i32 s0, s0, s1
@@ -32,7 +31,6 @@ define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i3
define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 {
; GFX6-LABEL: s_bfe_i64_arg_arg_arg:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b32 s2, s2, 63
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], s2
@@ -46,7 +44,6 @@ define amdgpu_kernel void @bfe_i32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0,
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s3, s3, 63
; GFX6-NEXT: s_or_b32 s3, s3, 0x7b0000
; GFX6-NEXT: s_bfe_i32 s3, s2, s3
; GFX6-NEXT: s_mov_b32 s2, -1
@@ -65,7 +62,7 @@ define amdgpu_kernel void @bfe_i32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0,
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
-; GFX6-NEXT: s_or_b32 s3, 59, s3
+; GFX6-NEXT: s_or_b32 s3, 0x7b, s3
; GFX6-NEXT: s_bfe_i32 s3, s2, s3
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v0, s3
@@ -82,9 +79,8 @@ define amdgpu_kernel void @bfe_i32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1,
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s4, s2, 63
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
-; GFX6-NEXT: s_or_b32 s3, s4, s3
+; GFX6-NEXT: s_or_b32 s3, s2, s3
; GFX6-NEXT: s_bfe_i32 s3, 0x7b, s3
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v0, s3
@@ -120,7 +116,6 @@ define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(ptr addrspace(1) %out,
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s3, s3, 63
; GFX6-NEXT: s_bfe_i32 s3, s2, s3
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v0, s3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
index d327c15ae547f..6d435ace64599 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
@@ -14,7 +14,6 @@ define i32 @v_bfe_i32_arg_arg_arg(i32 %src0, i32 %src1, i32 %src2) #0 {
define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 {
; GFX6-LABEL: s_bfe_i32_arg_arg_arg:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b32 s1, s1, 63
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_bfe_u32 s0, s0, s1
@@ -32,7 +31,6 @@ define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i3
define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 {
; GFX6-LABEL: s_bfe_i64_arg_arg_arg:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b32 s2, s2, 63
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_bfe_u64 s[0:1], s[0:1], s2
@@ -46,9 +44,8 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0,
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s4, s3, 63
-; GFX6-NEXT: s_lshl_b32 s3, s3, 16
-; GFX6-NEXT: s_or_b32 s3, s4, s3
+; GFX6-NEXT: s_lshl_b32 s4, s3, 16
+; GFX6-NEXT: s_or_b32 s3, s3, s4
; GFX6-NEXT: s_bfe_u32 s3, s2, s3
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v0, s3
@@ -65,7 +62,6 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0,
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s3, s3, 63
; GFX6-NEXT: s_or_b32 s3, s3, 0x7b0000
; GFX6-NEXT: s_bfe_u32 s3, s2, s3
; GFX6-NEXT: s_mov_b32 s2, -1
@@ -84,7 +80,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0,
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
-; GFX6-NEXT: s_or_b32 s3, 59, s3
+; GFX6-NEXT: s_or_b32 s3, 0x7b, s3
; GFX6-NEXT: s_bfe_u32 s3, s2, s3
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v0, s3
@@ -101,9 +97,8 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1,
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s4, s2, 63
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
-; GFX6-NEXT: s_or_b32 s3, s4, s3
+; GFX6-NEXT: s_or_b32 s3, s2, s3
; GFX6-NEXT: s_bfe_u32 s3, 0x7b, s3
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v0, s3
@@ -120,7 +115,6 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out,
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s3, s3, 63
; GFX6-NEXT: s_bfe_u32 s3, s2, s3
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v0, s3
More information about the llvm-branch-commits mailing list