[llvm] [AMDGPU][GlobalISel] Add register bank legalization for G_FADD (PR #163407)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 14 07:36:50 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel
Author: Abhinav Garg (abhigargrepo)
<details>
<summary>Changes</summary>
This patch adds register bank legalization support for G_FADD opcodes in the AMDGPU GlobalISel pipeline.
Added new reg bank type UniInVgprS64.
---
Full diff: https://github.com/llvm/llvm-project/pull/163407.diff
4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp (+3)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (+10-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h (+4)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll (+246)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 540756653dd22..198ee6b73b0b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -837,6 +837,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
return LLT::scalar(32);
case Sgpr64:
case Vgpr64:
+ case UniInVgprS64:
return LLT::scalar(64);
case Sgpr128:
case Vgpr128:
@@ -960,6 +961,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case UniInVcc:
case UniInVgprS16:
case UniInVgprS32:
+ case UniInVgprS64:
case UniInVgprV2S16:
case UniInVgprV4S32:
case UniInVgprB32:
@@ -1092,6 +1094,7 @@ void RegBankLegalizeHelper::applyMappingDst(
break;
}
case UniInVgprS32:
+ case UniInVgprS64:
case UniInVgprV2S16:
case UniInVgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index bfe2c80c810ef..9cf0c52717318 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -904,9 +904,18 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
bool hasSALUFloat = ST->hasSALUFloatInsts();
addRulesForGOpcs({G_FADD}, Standard)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
+ .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
.Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
- .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
+ .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
+ .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
+ .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}})
+ .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}});
addRulesForGOpcs({G_FPTOUI})
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 93e0efda77fdd..1cf9ae2e226ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -92,8 +92,10 @@ enum UniformityLLTOpPredicateID {
V4S32,
UniV2S16,
+ UniV2S32,
DivV2S16,
+ DivV2S32,
// B types
B32,
@@ -178,7 +180,9 @@ enum RegBankLLTMappingApplyID {
UniInVcc,
UniInVgprS16,
UniInVgprS32,
+ UniInVgprS64,
UniInVgprV2S16,
+ UniInVgprV2S32,
UniInVgprV4S32,
UniInVgprB32,
UniInVgprB64,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll
new file mode 100644
index 0000000000000..ec221496f450c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll
@@ -0,0 +1,246 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+
+define amdgpu_ps half @fadd_s16_uniform(half inreg %a, half inreg %b) {
+; GFX11-SDAG-FAKE16-LABEL: fadd_s16_uniform:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1
+; GFX11-SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX11-SDAG-TRUE16-LABEL: fadd_s16_uniform:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1
+; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-GISEL-FAKE16-LABEL: fadd_s16_uniform:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-GISEL-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX11-GISEL-TRUE16-LABEL: fadd_s16_uniform:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fadd_s16_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_add_f16 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %fadd = fadd half %a, %b
+ ret half %fadd
+}
+
+define amdgpu_ps half @fadd_s16_div(half %a, half %b) {
+; GFX11-SDAG-FAKE16-LABEL: fadd_s16_div:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX11-SDAG-TRUE16-LABEL: fadd_s16_div:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-GISEL-FAKE16-LABEL: fadd_s16_div:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX11-GISEL-TRUE16-LABEL: fadd_s16_div:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-FAKE16-LABEL: fadd_s16_div:
+; GFX12-SDAG-FAKE16: ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-TRUE16-LABEL: fadd_s16_div:
+; GFX12-SDAG-TRUE16: ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: fadd_s16_div:
+; GFX12-GISEL-FAKE16: ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-TRUE16-LABEL: fadd_s16_div:
+; GFX12-GISEL-TRUE16: ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog
+ %fadd = fadd half %a, %b
+ ret half %fadd
+}
+
+define amdgpu_ps float @fadd_s32_uniform(float inreg %a, float inreg %b) {
+; GFX11-LABEL: fadd_s32_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_add_f32_e64 v0, s0, s1
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fadd_s32_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_add_f32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %fadd = fadd float %a, %b
+ ret float %fadd
+}
+
+define amdgpu_ps float @fadd_s32_div(float %a, float %b) {
+; GCN-LABEL: fadd_s32_div:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_f32_e32 v0, v0, v1
+; GCN-NEXT: ; return to shader part epilog
+ %fadd = fadd float %a, %b
+ ret float %fadd
+}
+
+define amdgpu_ps double @fadd_s64_uniform(double inreg %a, double inreg %b) {
+; GFX11-LABEL: fadd_s64_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fadd_s64_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_f64_e64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v1
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: ; return to shader part epilog
+ %fadd = fadd double %a, %b
+ ret double %fadd
+}
+
+define amdgpu_ps double @fadd_s64_div(double %a, double %b) {
+; GFX11-LABEL: fadd_s64_div:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fadd_s64_div:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %fadd = fadd double %a, %b
+ ret double %fadd
+}
+
+define <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) {
+; GFX11-LABEL: fadd_v2s16_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v0, s0, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fadd_v2s16_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_add_f16 v0, s0, s1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %fadd = fadd <2 x half> %a, %b
+ ret <2 x half> %fadd
+}
+
+define <2 x half> @fadd_v2s16_div(<2 x half> %a, <2 x half> %b) {
+; GFX11-LABEL: fadd_v2s16_div:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fadd_v2s16_div:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_add_f16 v0, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %fadd = fadd <2 x half> %a, %b
+ ret <2 x half> %fadd
+}
+
+define <2 x float> @fadd_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) {
+; GFX11-LABEL: fadd_v2s32_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_f32_e64 v0, s0, s2
+; GFX11-NEXT: v_add_f32_e64 v1, s1, s3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fadd_v2s32_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_f32 s0, s0, s2
+; GFX12-NEXT: s_add_f32 s1, s1, s3
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %fadd = fadd <2 x float> %a, %b
+ ret <2 x float> %fadd
+}
+
+define <2 x float> @fadd_v2s32_div(<2 x float> %a, <2 x float> %b) {
+; GFX11-LABEL: fadd_v2s32_div:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fadd_v2s32_div:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %fadd = fadd <2 x float> %a, %b
+ ret <2 x float> %fadd
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-GISEL: {{.*}}
+; GFX11-SDAG: {{.*}}
+; GFX12-GISEL: {{.*}}
+; GFX12-SDAG: {{.*}}
``````````
</details>
https://github.com/llvm/llvm-project/pull/163407
More information about the llvm-commits
mailing list