[llvm] AMDGPU/GlobalISel: Regbanklegalize rules for G_UNMERGE_VALUES (PR #171653)
Petar Avramovic via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 12 04:00:53 PST 2025
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/171653
>From 9953d40e4e3afeaf0bb537479abe1e3bba7a6ae8 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Wed, 10 Dec 2025 18:08:38 +0100
Subject: [PATCH] AMDGPU/GlobalISel: Regbanklegalize rules for G_UNMERGE_VALUES
Move G_UNMERGE_VALUES handling to AMDGPURegBankLegalizeRules.cpp.
Fix sgpr S16 unmerge by lowering using shift and using S32.
Previously sgpr S16 unmerge was selected using _lo16 and _hi16 subreg
indexes which are exclusive to vgpr register classes.
For remaing cases we do trivial mapping, assigns same reg bank
to all operands, vgpr or sgpr.
---
.../Target/AMDGPU/AMDGPUGlobalISelUtils.cpp | 16 +++++
.../lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h | 4 ++
.../Target/AMDGPU/AMDGPURegBankLegalize.cpp | 4 +-
.../AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 60 +++++++++++++++++++
.../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 11 ++++
.../AMDGPU/AMDGPURegBankLegalizeRules.h | 9 ++-
llvm/test/CodeGen/AMDGPU/GlobalISel/fpext.ll | 40 ++++---------
.../AMDGPU/GlobalISel/unmerge-sgpr-s16.ll | 36 +++++++++++
8 files changed, 150 insertions(+), 30 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/unmerge-sgpr-s16.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index f36935d8c0e8f..f684e830e20fe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -186,3 +186,19 @@ void AMDGPU::buildReadFirstLane(MachineIRBuilder &B, Register SgprDst,
.addReg(VgprSrc);
});
}
+
+bool AMDGPU::isBRC(LLT Ty) {
+ if (Ty.isPointer())
+ return true;
+
+ unsigned Size = Ty.getSizeInBits();
+ if (Size % 32 != 0)
+ return false;
+
+ // 32, 2x32, 3x32 ... 12x32, 16x32, 32x32
+ unsigned NumB32s = Size / 32;
+ if ((NumB32s >= 1 && NumB32s <= 12) || NumB32s == 16 || NumB32s == 32)
+ return true;
+
+ return false;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index 5e1000ee0ab26..86af5b8bfd2f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -53,6 +53,10 @@ void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
const RegisterBankInfo &RBI);
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
const RegisterBankInfo &RBI);
+
+// "Reg Class" low level types. LLTs that fit into some register class on both
+// sgpr and vgpr. LLTs with sizes 32, 2x32, 3x32 ... 12x32, 16x32, 32x32.
+bool isBRC(LLT Ty);
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index 839120da89711..a1fce93eb01f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -442,8 +442,8 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
// Opcodes that support pretty much all combinations of reg banks and LLTs
// (except S1). There is no point in writing rules for them.
- if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES ||
- Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) {
+ if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_MERGE_VALUES ||
+ Opc == AMDGPU::G_BITCAST) {
RBLHelper.applyMappingTrivial(*MI);
continue;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index cc31d7d5c55ac..b5a8c7b89e64c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -910,6 +910,55 @@ bool RegBankLegalizeHelper::lower(MachineInstr &MI,
return lowerUnpackAExt(MI);
case WidenMMOToS32:
return widenMMOToS32(cast<GAnyLoad>(MI));
+ case VerifyAllSgpr: {
+ for (unsigned i = 0; i < MI.getNumOperands(); ++i)
+ assert(MRI.getRegBankOrNull(MI.getOperand(i).getReg()) == SgprRB);
+ return true;
+ }
+ case ApplyAllVgpr: {
+ B.setInstr(MI);
+ unsigned NumDefs = MI.getNumDefs();
+
+ for (unsigned i = 0; i < NumDefs; ++i)
+ assert(MRI.getRegBankOrNull(MI.getOperand(i).getReg()) == VgprRB);
+
+ for (unsigned i = NumDefs; i < MI.getNumOperands(); ++i) {
+ Register Reg = MI.getOperand(i).getReg();
+ if (MRI.getRegBank(Reg) != VgprRB) {
+ auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
+ MI.getOperand(i).setReg(Copy.getReg(0));
+ }
+ }
+
+ return true;
+ }
+ case UnmergeToShiftTrunc: {
+ GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI);
+ LLT Ty = MRI.getType(Unmerge->getSourceReg());
+ if (Ty.getSizeInBits() % 32 != 0) {
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: unmerge not multiple of 32",
+ MI);
+ return false;
+ }
+
+ B.setInstr(MI);
+ if (Ty.getSizeInBits() > 32) {
+ auto Unmerge32 = B.buildUnmerge(SgprRB_S32, Unmerge->getSourceReg());
+ for (unsigned i = 0; i < Unmerge32->getNumDefs(); ++i) {
+ auto [Dst0S32, Dst1S32] = unpackAExt(Unmerge32->getOperand(i).getReg());
+ B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
+ B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
+ }
+ } else {
+ auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
+ B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
+ B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
+ }
+
+ MI.eraseFromParent();
+ return true;
+ }
}
if (!WaterfallSgprs.empty()) {
@@ -1035,6 +1084,11 @@ LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
Ty == LLT::fixed_vector(8, 64))
return Ty;
return LLT();
+ case SgprBRC:
+ case VgprBRC:
+ if (isBRC(Ty))
+ return Ty;
+ return LLT();
default:
return LLT();
}
@@ -1069,6 +1123,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case SgprB128:
case SgprB256:
case SgprB512:
+ case SgprBRC:
case UniInVcc:
case UniInVgprS16:
case UniInVgprS32:
@@ -1108,6 +1163,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case VgprB128:
case VgprB256:
case VgprB512:
+ case VgprBRC:
case Vgpr32SExt:
case Vgpr32ZExt:
return VgprRB;
@@ -1167,6 +1223,7 @@ bool RegBankLegalizeHelper::applyMappingDst(
case SgprB128:
case SgprB256:
case SgprB512:
+ case SgprBRC:
case SgprPtr32:
case SgprPtr64:
case SgprPtr128:
@@ -1176,6 +1233,7 @@ bool RegBankLegalizeHelper::applyMappingDst(
case VgprB128:
case VgprB256:
case VgprB512:
+ case VgprBRC:
case VgprPtr32:
case VgprPtr64:
case VgprPtr128: {
@@ -1307,6 +1365,7 @@ bool RegBankLegalizeHelper::applyMappingSrc(
case SgprB128:
case SgprB256:
case SgprB512:
+ case SgprBRC:
case SgprPtr32:
case SgprPtr64:
case SgprPtr128: {
@@ -1341,6 +1400,7 @@ bool RegBankLegalizeHelper::applyMappingSrc(
case VgprB128:
case VgprB256:
case VgprB512:
+ case VgprBRC:
case VgprPtr32:
case VgprPtr64:
case VgprPtr128: {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index d01afee331025..545294ac55ce4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPURegBankLegalizeRules.h"
+#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
@@ -132,6 +133,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isUniform(Reg);
case UniB512:
return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isUniform(Reg);
+ case UniBRC:
+ return isBRC(MRI.getType(Reg)) && MUI.isUniform(Reg);
case DivS1:
return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg);
case DivS16:
@@ -172,6 +175,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isDivergent(Reg);
case DivB512:
return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergent(Reg);
+ case DivBRC:
+ return isBRC(MRI.getType(Reg)) && MUI.isDivergent(Reg);
case _:
return true;
default:
@@ -560,6 +565,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniS1, _}, {{Sgpr32Trunc}, {None}, UniCstExt}});
addRulesForGOpcs({G_FREEZE}).Any({{DivS1}, {{Vcc}, {Vcc}}});
+ addRulesForGOpcs({G_UNMERGE_VALUES})
+ .Any({{UniS16}, {{}, {}, UnmergeToShiftTrunc}})
+ .Any({{DivS16}, {{}, {}, ApplyAllVgpr}})
+ .Any({{UniBRC}, {{}, {}, VerifyAllSgpr}})
+ .Any({{DivBRC}, {{}, {}, ApplyAllVgpr}});
+
addRulesForGOpcs({G_ICMP})
.Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
.Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 1ac117304b76f..4aeea206ab5ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -113,6 +113,7 @@ enum UniformityLLTOpPredicateID {
UniB128,
UniB256,
UniB512,
+ UniBRC,
DivB32,
DivB64,
@@ -120,6 +121,7 @@ enum UniformityLLTOpPredicateID {
DivB128,
DivB256,
DivB512,
+ DivBRC
};
// How to apply register bank on register operand.
@@ -156,6 +158,7 @@ enum RegBankLLTMappingApplyID {
SgprB128,
SgprB256,
SgprB512,
+ SgprBRC,
// vgpr scalars, pointers, vectors and B-types
Vgpr16,
@@ -178,6 +181,7 @@ enum RegBankLLTMappingApplyID {
VgprB128,
VgprB256,
VgprB512,
+ VgprBRC,
VgprV4S32,
// Dst only modifiers: read-any-lane and truncs
@@ -233,7 +237,10 @@ enum LoweringMethodID {
SplitLoad,
WidenLoad,
WidenMMOToS32,
- UnpackAExt
+ UnpackAExt,
+ VerifyAllSgpr,
+ ApplyAllVgpr,
+ UnmergeToShiftTrunc
};
enum FastRulesTypes {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpext.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpext.ll
index 990153ebce243..d73b56aab8505 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpext.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpext.ll
@@ -137,34 +137,20 @@ define amdgpu_ps void @fpext_f16_to_f64_div(half %a, ptr addrspace(1) %ptr) {
}
define amdgpu_ps <2 x float> @fpext_v2f16_to_v2f32_uniform(<2 x half> inreg %a) {
-; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f32_uniform:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s0, 16
-; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, s0
-; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, s1
-; GFX11-FAKE16-NEXT: ; return to shader part epilog
-;
-; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f32_uniform:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, s0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-TRUE16-NEXT: ; return to shader part epilog
-;
-; GFX12-FAKE16-LABEL: fpext_v2f16_to_v2f32_uniform:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_cvt_f32_f16 s1, s0
-; GFX12-FAKE16-NEXT: s_cvt_hi_f32_f16 s0, s0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
-; GFX12-FAKE16-NEXT: ; return to shader part epilog
+; GFX11-LABEL: fpext_v2f16_to_v2f32_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_lshr_b32 s1, s0, 16
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s1
+; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-TRUE16-LABEL: fpext_v2f16_to_v2f32_uniform:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_cvt_f32_f16 s0, s0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s0
-; GFX12-TRUE16-NEXT: ; return to shader part epilog
+; GFX12-LABEL: fpext_v2f16_to_v2f32_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_cvt_f32_f16 s1, s0
+; GFX12-NEXT: s_cvt_hi_f32_f16 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: ; return to shader part epilog
%result = fpext <2 x half> %a to <2 x float>
ret <2 x float> %result
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/unmerge-sgpr-s16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/unmerge-sgpr-s16.ll
new file mode 100644
index 0000000000000..fb013d35d540b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/unmerge-sgpr-s16.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX11 %s
+
+define amdgpu_ps void @unmerge_sgprS16_from_V2S16(ptr addrspace(1) inreg %ptr, ptr addrspace(1) inreg %out) {
+; GFX11-LABEL: unmerge_sgprS16_from_V2S16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11-NEXT: s_endpgm
+ %load = load <2 x i16>, ptr addrspace(1) %ptr
+ %shuffle = shufflevector <2 x i16> %load, <2 x i16> poison, <2 x i32> <i32 1, i32 0>
+ store <2 x i16> %shuffle, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @unmerge_sgprS16_from_V4S16(ptr addrspace(1) inreg %ptr, ptr addrspace(1) inreg %out) {
+; GFX11-LABEL: unmerge_sgprS16_from_V4S16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_pack_lh_b32_b16 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11-NEXT: s_endpgm
+ %load = load <4 x i16>, ptr addrspace(1) %ptr
+ %shuffle = shufflevector <4 x i16> %load, <4 x i16> poison, <2 x i32> <i32 0, i32 3>
+ store <2 x i16> %shuffle, ptr addrspace(1) %out
+ ret void
+}
More information about the llvm-commits
mailing list