[llvm-branch-commits] [llvm] [AMDGPU][GlobalISel] Add RegBankLegalize rules for [us]addsat/[us]subsat (PR #176255)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Jan 15 14:01:08 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel
@llvm/pr-subscribers-backend-amdgpu
Author: vangthao95
<details>
<summary>Changes</summary>
---
Patch is 298.22 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/176255.diff
5 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (+8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (+785-1050)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (+832-1085)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll (+331-220)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll (+331-220)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 5f5344b55ac35..f1810e219c7d2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -487,6 +487,14 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr32AExtBoolInReg}})
.Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}});
+ addRulesForGOpcs({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT}, Standard)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
+
addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
bool hasMulHi = ST->hasScalarMulHiInsts();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 2f956d7a0a534..f6e36241a05dc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) {
; GFX6-LABEL: v_saddsat_i7:
@@ -98,8 +98,9 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 9
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp
-; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: s_ashr_i32 s0, s0, 9
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_saddsat_i7:
@@ -107,8 +108,9 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9
; GFX10PLUS-NEXT: v_add_nc_i16 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 9, v0
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 9
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs)
ret i7 %result
@@ -207,8 +209,9 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp
-; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: s_ashr_i32 s0, s0, 8
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_saddsat_i8:
@@ -216,8 +219,9 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
; GFX10PLUS-NEXT: v_add_nc_i16 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 8
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
@@ -436,58 +440,48 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
-; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_sext_i32_i16 s1, s0
+; GFX9-NEXT: s_ashr_i32 s0, s0, 16
+; GFX9-NEXT: s_sext_i32_i16 s2, 0x80008
+; GFX9-NEXT: s_ashr_i32 s1, s1, s2
+; GFX9-NEXT: s_ashr_i32 s0, s0, 8
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX9-NEXT: s_lshr_b32 s1, s0, 16
+; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_and_b32 s0, s0, 0xff
+; GFX9-NEXT: s_lshl_b32 s1, s1, 8
+; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_saddsat_v2i8:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshr_b32 s2, s0, 8
-; GFX10-NEXT: s_lshr_b32 s3, s1, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT: s_lshr_b32 s2, s0, 16
-; GFX10-NEXT: s_lshr_b32 s3, s1, 16
-; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX10-NEXT: s_lshl_b32 s2, s2, 8
-; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX10-NEXT: s_lshl_b32 s3, s3, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp
-; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX11-LABEL: s_saddsat_v2i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s2, s0, 8
-; GFX11-NEXT: s_lshr_b32 s3, s1, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX11-NEXT: s_lshr_b32 s2, s0, 16
-; GFX11-NEXT: s_lshr_b32 s3, s1, 16
-; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX11-NEXT: s_lshl_b32 s2, s2, 8
-; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX11-NEXT: v_pk_add_i16 v0, s0, s1 clamp
-; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_saddsat_v2i8:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 8
+; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 8
+; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16
+; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 16
+; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 8
+; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX10PLUS-NEXT: s_lshl_b32 s3, s3, 8
+; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX10PLUS-NEXT: v_pk_add_i16 v0, s0, s1 clamp
+; GFX10PLUS-NEXT: s_sext_i32_i16 s1, 0x80008
+; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: s_sext_i32_i16 s2, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 16
+; GFX10PLUS-NEXT: s_ashr_i32 s1, s2, s1
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 8
+; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX10PLUS-NEXT: s_lshr_b32 s1, s0, 16
+; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xff
+; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1
+; GFX10PLUS-NEXT: ; return to shader part epilog
%lhs = bitcast i16 %lhs.arg to <2 x i8>
%rhs = bitcast i16 %rhs.arg to <2 x i8>
%result = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
@@ -886,66 +880,89 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_pk_add_i16 v1, s2, v1 clamp
-; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: v_mov_b32_e32 v3, 8
-; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT: v_mov_b32_e32 v2, 0xff
-; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 24
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_pk_add_i16 v0, s2, v0 clamp
+; GFX9-NEXT: s_sext_i32_i16 s2, s0
+; GFX9-NEXT: s_ashr_i32 s0, s0, 16
+; GFX9-NEXT: s_sext_i32_i16 s3, 0x80008
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: s_ashr_i32 s2, s2, s3
+; GFX9-NEXT: s_ashr_i32 s0, s0, 8
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX9-NEXT: s_sext_i32_i16 s2, s1
+; GFX9-NEXT: s_ashr_i32 s1, s1, 16
+; GFX9-NEXT: s_ashr_i32 s2, s2, s3
+; GFX9-NEXT: s_ashr_i32 s1, s1, 8
+; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1
+; GFX9-NEXT: s_and_b32 s2, s0, 0xff
+; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX9-NEXT: s_lshl_b32 s0, s0, 8
+; GFX9-NEXT: s_or_b32 s0, s2, s0
+; GFX9-NEXT: s_and_b32 s2, s1, 0xff
+; GFX9-NEXT: s_lshl_b32 s2, s2, 16
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_lshl_b32 s1, s1, 24
+; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_saddsat_v4i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshr_b32 s2, s0, 8
+; GFX10-NEXT: s_lshr_b32 s5, s1, 8
; GFX10-NEXT: s_lshr_b32 s3, s0, 16
; GFX10-NEXT: s_lshr_b32 s4, s0, 24
-; GFX10-NEXT: s_lshr_b32 s5, s1, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: s_lshr_b32 s7, s1, 24
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT: s_lshr_b32 s4, s0, 16
-; GFX10-NEXT: s_lshr_b32 s5, s2, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7
+; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
+; GFX10-NEXT: s_lshr_b32 s3, s0, 16
+; GFX10-NEXT: s_lshr_b32 s5, s1, 16
; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX10-NEXT: s_lshl_b32 s4, s4, 8
-; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008
-; GFX10-NEXT: s_lshl_b32 s5, s5, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX10-NEXT: s_lshr_b32 s4, s1, 16
-; GFX10-NEXT: s_lshr_b32 s5, s3, 16
+; GFX10-NEXT: s_lshl_b32 s3, s3, 8
+; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7
; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX10-NEXT: s_lshl_b32 s4, s4, 8
-; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX10-NEXT: s_lshl_b32 s5, s5, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3
+; GFX10-NEXT: s_lshr_b32 s3, s2, 16
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
+; GFX10-NEXT: s_lshr_b32 s5, s4, 16
+; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008
+; GFX10-NEXT: s_lshl_b32 s3, s3, 8
+; GFX10-NEXT: s_lshl_b32 s4, s4, 0x80008
+; GFX10-NEXT: s_lshl_b32 s5, s5, 8
; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp
-; GFX10-NEXT: v_pk_add_i16 v1, s2, s3 clamp
-; GFX10-NEXT: v_mov_b32_e32 v2, 8
-; GFX10-NEXT: v_mov_b32_e32 v4, 24
-; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s3
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s4, s5
+; GFX10-NEXT: s_sext_i32_i16 s3, 0x80008
+; GFX10-NEXT: v_pk_add_i16 v1, s0, s1 clamp
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: s_sext_i32_i16 s2, s0
+; GFX10-NEXT: s_ashr_i32 s0, s0, 16
+; GFX10-NEXT: s_ashr_i32 s2, s2, s3
+; GFX10-NEXT: s_ashr_i32 s0, s0, 8
+; GFX10-NEXT: s_sext_i32_i16 s4, s1
+; GFX10-NEXT: s_ashr_i32 s1, s1, 16
+; GFX10-NEXT: s_ashr_i32 s3, s4, s3
+; GFX10-NEXT: s_ashr_i32 s1, s1, 8
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s1
+; GFX10-NEXT: s_bfe_u32 s2, s0, 0x80010
+; GFX10-NEXT: s_and_b32 s0, s0, 0xff
+; GFX10-NEXT: s_lshl_b32 s2, s2, 8
+; GFX10-NEXT: s_and_b32 s3, s1, 0xff
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_lshl_b32 s2, s3, 16
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_lshl_b32 s1, s1, 24
+; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_saddsat_v4i8:
@@ -965,28 +982,40 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX11-NEXT: s_lshl_b32 s5, s5, 8
; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX11-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX11-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-NEXT: v_pk_add_i16 v0, s2, s3 clamp
; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX11-NEXT: s_lshl_b32 s4, s4, 8
; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX11-NEXT: s_lshl_b32 s2, s5, 8
+; GFX11-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-NEXT: v_pk_add_i16 v0, s2, s3 clamp
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2
-; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s5
+; GFX11-NEXT: s_sext_i32_i16 s3, 0x80008
; GFX11-NEXT: v_pk_add_i16 v1, s0, s1 clamp
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8
-; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8
-; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: s_sext_i32_i16 s2, s0
+; GFX11-NEXT: s_ashr_i32 s0, s0, 16
+; GFX11-NEXT: s_ashr_i32 s2, s2, s3
+; GFX11-NEXT: s_ashr_i32 s0, s0, 8
+; GFX11-NEXT: s_sext_i32_i16 s4, s1
+; GFX11-NEXT: s_ashr_i32 s1, s1, 16
+; GFX11-NEXT: s_ashr_i32 s3, s4, s3
+; GFX11-NEXT: s_ashr_i32 s1, s1, 8
+; GFX11-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX11-NEXT: s_pack_ll_b32_b16 s1, s3, s1
+; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80010
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s2, s2, 8
+; GFX11-NEXT: s_and_b32 s3, s1, 0xff
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_lshl_b32 s2, s3, 16
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_lshl_b32 s1, s1, 24
+; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
%lhs = bitcast i32 %lhs.arg to <4 x i8>
%rhs = bitcast i32 %rhs.arg to <4 x i8>
@@ -1085,8 +1114,8 @@ define amdgpu_ps i24 @s_saddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp
-; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_ashr_i32 s0, s0, 8
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_saddsat_i24:
@@ -1094,8 +1123,8 @@ define amdgpu_ps i24 @s_saddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, 8, v0
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 8
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i24 @llvm.sadd.sat.i24(i24 %lhs, i24 %rhs)
ret i24 %result
@@ -4090,9 +4119,9 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2
; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v1, v3, vcc
-; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 16
-; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
-; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX6-NEXT: v_bfe_i32 v5, v6, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5
@@ -4108,9 +4137,9 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v1, v3, vcc
-; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 16
-; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
-; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX8-NEXT: v_bfe_i32 v5, v6, 0, 16
+; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 16
; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5
@@ -4178,51 +4207,49 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX6-LABEL: s_saddsat_i48:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s4, s0, s2
-; GFX6-NEXT: s_addc_u32 s3, s1, s3
+; GFX6-NEXT: s_addc_u32 s5, s1, s3
; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000
; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
+; GFX6-NEXT: s_cselect_...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/176255
More information about the llvm-branch-commits mailing list