[llvm] AMDGPU/GlobalISel: RegBankLegalize rules for dot products (PR #189110)

Fri Mar 27 13:56:05 PDT 2026

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: vangthao95

<details>
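<summary>Changes</summary>

Adds RegBankLegalize rules for the `amdgcn_udot4`, `amdgcn_sdot4`, `amdgcn_udot8`, `amdgcn_sdot8` and `amdgcn_dot4_f32_{bf8,fp8}_{bf8,fp8}` intrinsics (S32 result: `UniInVgprS32` when uniform, `Vgpr32` when divergent, with all three operands mapped to `Vgpr32`), switches the affected GlobalISel RUN lines to `-new-reg-bank-select`, and adds test cases with all-`inreg` (SGPR) operands.
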
---
Full diff: https://github.com/llvm/llvm-project/pull/189110.diff


6 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (+7) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll (+23-3) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll (+22-3) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll (+23-4) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll (+23-4) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll (+50-4) 


``````````diff

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 198e52a6f9ae2..910f3d93d9dd9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -1577,6 +1577,13 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
       .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
       .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
 
+  addRulesForIOpcs({amdgcn_udot4, amdgcn_sdot4, amdgcn_udot8, amdgcn_sdot8,
+                    amdgcn_dot4_f32_bf8_bf8, amdgcn_dot4_f32_bf8_fp8,
+                    amdgcn_dot4_f32_fp8_fp8, amdgcn_dot4_f32_fp8_bf8},
+                   Standard)
+      .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
+      .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
+
   addRulesForIOpcs({amdgcn_mul_u24, amdgcn_mul_i24}, Standard)
       .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
       .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}})
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
index cf835a0a32e28..bcaa9a2252e74 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10 %s
 
 define i32 @v_sdot4(i32 %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_sdot4:
@@ -129,6 +129,26 @@ define i32 @v_sdot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
   ret i32 %r
 }
 
+define amdgpu_ps i32 @v_sdot4_s_s_s(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
+; GFX906-LABEL: v_sdot4_s_s_s:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    v_mov_b32_e32 v0, s1
+; GFX906-NEXT:    v_mov_b32_e32 v1, s2
+; GFX906-NEXT:    v_dot4_i32_i8 v0, s0, v0, v1
+; GFX906-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX906-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_sdot4_s_s_s:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mov_b32_e32 v0, s1
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    v_dot4c_i32_i8 v1, s0, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    ; return to shader part epilog
+  %r = call i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 false)
+  ret i32 %r
+}
+
 declare i32 @llvm.amdgcn.sdot4(i32, i32, i32, i1 immarg) #0
 
 attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
index 4dbcffee35942..f65fca1132502 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10 %s
 
 define i32 @v_sdot8(i32 %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_sdot8:
@@ -83,6 +83,25 @@ define i32 @v_sdot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
   ret i32 %r
 }
 
+define amdgpu_ps i32 @v_sdot8_s_s_s(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
+; GFX906-LABEL: v_sdot8_s_s_s:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    v_mov_b32_e32 v0, s1
+; GFX906-NEXT:    v_mov_b32_e32 v1, s2
+; GFX906-NEXT:    v_dot8_i32_i4 v0, s0, v0, v1
+; GFX906-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX906-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_sdot8_s_s_s:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_dot8_i32_i4 v0, s0, s1, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %r = call i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 false)
+  ret i32 %r
+}
+
 declare i32 @llvm.amdgcn.sdot8(i32, i32, i32, i1 immarg) #0
 
 attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
index eeedc083aa4c4..529ae8a18019a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s
 
 define i32 @v_udot4(i32 %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_udot4:
@@ -148,6 +148,25 @@ define i32 @v_udot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
   ret i32 %r
 }
 
+define amdgpu_ps i32 @v_udot4_s_s_s(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
+; GFX906-LABEL: v_udot4_s_s_s:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    v_mov_b32_e32 v0, s1
+; GFX906-NEXT:    v_mov_b32_e32 v1, s2
+; GFX906-NEXT:    v_dot4_u32_u8 v0, s0, v0, v1
+; GFX906-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX906-NEXT:    ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: v_udot4_s_s_s:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10PLUS-NEXT:    v_dot4_u32_u8 v0, s0, s1, v0
+; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT:    ; return to shader part epilog
+  %r = call i32 @llvm.amdgcn.udot4(i32 %a, i32 %b, i32 %c, i1 false)
+  ret i32 %r
+}
+
 declare i32 @llvm.amdgcn.udot4(i32, i32, i32, i1 immarg) #0
 
 attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
index df900856bfa82..19e64fba593ae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX10PLUS %s
 
 define i32 @v_udot8(i32 %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_udot8:
@@ -84,6 +84,25 @@ define i32 @v_udot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
   ret i32 %r
 }
 
+define amdgpu_ps i32 @v_udot8_s_s_s(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
+; GFX906-LABEL: v_udot8_s_s_s:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    v_mov_b32_e32 v0, s1
+; GFX906-NEXT:    v_mov_b32_e32 v1, s2
+; GFX906-NEXT:    v_dot8_u32_u4 v0, s0, v0, v1
+; GFX906-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX906-NEXT:    ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: v_udot8_s_s_s:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10PLUS-NEXT:    v_dot8_u32_u4 v0, s0, s1, v0
+; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT:    ; return to shader part epilog
+  %r = call i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c, i1 false)
+  ret i32 %r
+}
+
 declare i32 @llvm.amdgcn.udot8(i32, i32, i32, i1 immarg) #0
 
 attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll
index 5891456364fa7..7539310e9248c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck -check-prefixes=GCN,GFX1170 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck -check-prefixes=GCN,GFX1170 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck -check-prefixes=GCN,GFX1170 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
 
 define float @test_amdgcn_dot4_f32_fp8_bf8(i32 %a, i32 %b, float %c) {
 ; GFX1170-LABEL: test_amdgcn_dot4_f32_fp8_bf8:
@@ -448,6 +448,54 @@ entry:
   ret float %ret
 }
 
+define amdgpu_ps float @test_amdgcn_dot4_f32_fp8_bf8_s_s_s(i32 inreg %a, i32 inreg %b, float inreg %c) {
+; GCN-LABEL: test_amdgcn_dot4_f32_fp8_bf8_s_s_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_dot4_f32_fp8_bf8 v0, s0, s1, v0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %ret = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %c)
+  ret float %ret
+}
+
+define amdgpu_ps float @test_amdgcn_dot4_f32_bf8_fp8_s_s_s(i32 inreg %a, i32 inreg %b, float inreg %c) {
+; GCN-LABEL: test_amdgcn_dot4_f32_bf8_fp8_s_s_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_dot4_f32_bf8_fp8 v0, s0, s1, v0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %ret = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %c)
+  ret float %ret
+}
+
+define amdgpu_ps float @test_amdgcn_dot4_f32_fp8_fp8_s_s_s(i32 inreg %a, i32 inreg %b, float inreg %c) {
+; GCN-LABEL: test_amdgcn_dot4_f32_fp8_fp8_s_s_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_dot4_f32_fp8_fp8 v0, s0, s1, v0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %ret = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %c)
+  ret float %ret
+}
+
+define amdgpu_ps float @test_amdgcn_dot4_f32_bf8_bf8_s_s_s(i32 inreg %a, i32 inreg %b, float inreg %c) {
+; GCN-LABEL: test_amdgcn_dot4_f32_bf8_bf8_s_s_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_dot4_f32_bf8_bf8 v0, s0, s1, v0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %ret = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %c)
+  ret float %ret
+}
+
 declare float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %c)
 declare float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %c)
 declare float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %c)
@@ -455,5 +503,3 @@ declare float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %c)
 
 declare float @llvm.fabs.f32(float %a)
 
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}

``````````

</details>


https://github.com/llvm/llvm-project/pull/189110