[llvm] AMDGPU/GlobalISel: RegBankLegalize rules for dot products (PR #189110)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 27 13:56:05 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: vangthao95
<details>
<summary>Changes</summary>
---
Full diff: https://github.com/llvm/llvm-project/pull/189110.diff
6 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (+7)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll (+23-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll (+22-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll (+23-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll (+23-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll (+50-4)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 198e52a6f9ae2..910f3d93d9dd9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -1577,6 +1577,13 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
.Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
+ addRulesForIOpcs({amdgcn_udot4, amdgcn_sdot4, amdgcn_udot8, amdgcn_sdot8,
+ amdgcn_dot4_f32_bf8_bf8, amdgcn_dot4_f32_bf8_fp8,
+ amdgcn_dot4_f32_fp8_fp8, amdgcn_dot4_f32_fp8_bf8},
+ Standard)
+ .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
+ .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
+
addRulesForIOpcs({amdgcn_mul_u24, amdgcn_mul_i24}, Standard)
.Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
.Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}})
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
index cf835a0a32e28..bcaa9a2252e74 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10 %s
define i32 @v_sdot4(i32 %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot4:
@@ -129,6 +129,26 @@ define i32 @v_sdot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
ret i32 %r
}
+define amdgpu_ps i32 @v_sdot4_s_s_s(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
+; GFX906-LABEL: v_sdot4_s_s_s:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: v_mov_b32_e32 v0, s1
+; GFX906-NEXT: v_mov_b32_e32 v1, s2
+; GFX906-NEXT: v_dot4_i32_i8 v0, s0, v0, v1
+; GFX906-NEXT: v_readfirstlane_b32 s0, v0
+; GFX906-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: v_sdot4_s_s_s:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v0, s1
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: v_dot4c_i32_i8 v1, s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10-NEXT: ; return to shader part epilog
+ %r = call i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 false)
+ ret i32 %r
+}
+
declare i32 @llvm.amdgcn.sdot4(i32, i32, i32, i1 immarg) #0
attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
index 4dbcffee35942..f65fca1132502 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10 %s
define i32 @v_sdot8(i32 %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot8:
@@ -83,6 +83,25 @@ define i32 @v_sdot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
ret i32 %r
}
+define amdgpu_ps i32 @v_sdot8_s_s_s(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
+; GFX906-LABEL: v_sdot8_s_s_s:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: v_mov_b32_e32 v0, s1
+; GFX906-NEXT: v_mov_b32_e32 v1, s2
+; GFX906-NEXT: v_dot8_i32_i4 v0, s0, v0, v1
+; GFX906-NEXT: v_readfirstlane_b32 s0, v0
+; GFX906-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: v_sdot8_s_s_s:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_dot8_i32_i4 v0, s0, s1, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+ %r = call i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 false)
+ ret i32 %r
+}
+
declare i32 @llvm.amdgcn.sdot8(i32, i32, i32, i1 immarg) #0
attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
index eeedc083aa4c4..529ae8a18019a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s
define i32 @v_udot4(i32 %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot4:
@@ -148,6 +148,25 @@ define i32 @v_udot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
ret i32 %r
}
+define amdgpu_ps i32 @v_udot4_s_s_s(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
+; GFX906-LABEL: v_udot4_s_s_s:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: v_mov_b32_e32 v0, s1
+; GFX906-NEXT: v_mov_b32_e32 v1, s2
+; GFX906-NEXT: v_dot4_u32_u8 v0, s0, v0, v1
+; GFX906-NEXT: v_readfirstlane_b32 s0, v0
+; GFX906-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: v_udot4_s_s_s:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s2
+; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, s0, s1, v0
+; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: ; return to shader part epilog
+ %r = call i32 @llvm.amdgcn.udot4(i32 %a, i32 %b, i32 %c, i1 false)
+ ret i32 %r
+}
+
declare i32 @llvm.amdgcn.udot4(i32, i32, i32, i1 immarg) #0
attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
index df900856bfa82..19e64fba593ae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX10PLUS %s
define i32 @v_udot8(i32 %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot8:
@@ -84,6 +84,25 @@ define i32 @v_udot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
ret i32 %r
}
+define amdgpu_ps i32 @v_udot8_s_s_s(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
+; GFX906-LABEL: v_udot8_s_s_s:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: v_mov_b32_e32 v0, s1
+; GFX906-NEXT: v_mov_b32_e32 v1, s2
+; GFX906-NEXT: v_dot8_u32_u4 v0, s0, v0, v1
+; GFX906-NEXT: v_readfirstlane_b32 s0, v0
+; GFX906-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: v_udot8_s_s_s:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s2
+; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, s0, s1, v0
+; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: ; return to shader part epilog
+ %r = call i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c, i1 false)
+ ret i32 %r
+}
+
declare i32 @llvm.amdgcn.udot8(i32, i32, i32, i1 immarg) #0
attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll
index 5891456364fa7..7539310e9248c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck -check-prefixes=GCN,GFX1170 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck -check-prefixes=GCN,GFX1170 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck -check-prefixes=GCN,GFX1170 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
define float @test_amdgcn_dot4_f32_fp8_bf8(i32 %a, i32 %b, float %c) {
; GFX1170-LABEL: test_amdgcn_dot4_f32_fp8_bf8:
@@ -448,6 +448,54 @@ entry:
ret float %ret
}
+define amdgpu_ps float @test_amdgcn_dot4_f32_fp8_bf8_s_s_s(i32 inreg %a, i32 inreg %b, float inreg %c) {
+; GCN-LABEL: test_amdgcn_dot4_f32_fp8_bf8_s_s_s:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_dot4_f32_fp8_bf8 v0, s0, s1, v0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %ret = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %c)
+ ret float %ret
+}
+
+define amdgpu_ps float @test_amdgcn_dot4_f32_bf8_fp8_s_s_s(i32 inreg %a, i32 inreg %b, float inreg %c) {
+; GCN-LABEL: test_amdgcn_dot4_f32_bf8_fp8_s_s_s:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_dot4_f32_bf8_fp8 v0, s0, s1, v0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %ret = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %c)
+ ret float %ret
+}
+
+define amdgpu_ps float @test_amdgcn_dot4_f32_fp8_fp8_s_s_s(i32 inreg %a, i32 inreg %b, float inreg %c) {
+; GCN-LABEL: test_amdgcn_dot4_f32_fp8_fp8_s_s_s:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_dot4_f32_fp8_fp8 v0, s0, s1, v0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %ret = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %c)
+ ret float %ret
+}
+
+define amdgpu_ps float @test_amdgcn_dot4_f32_bf8_bf8_s_s_s(i32 inreg %a, i32 inreg %b, float inreg %c) {
+; GCN-LABEL: test_amdgcn_dot4_f32_bf8_bf8_s_s_s:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_dot4_f32_bf8_bf8 v0, s0, s1, v0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %ret = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %c)
+ ret float %ret
+}
+
declare float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %c)
declare float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %c)
declare float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %c)
@@ -455,5 +503,3 @@ declare float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %c)
declare float @llvm.fabs.f32(float %a)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}
``````````
</details>
https://github.com/llvm/llvm-project/pull/189110
More information about the llvm-commits mailing list