[llvm] a304968 - [AMDGPU][GlobalISel] Add RegBankLegalize rules for SMED3 and CVT_PK_I16_I32 (#176596)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 21 08:57:57 PST 2026
Author: vangthao95
Date: 2026-01-21T08:57:53-08:00
New Revision: a304968d5e2e7a6895ce9cc53cc4d254199ecf9a
URL: https://github.com/llvm/llvm-project/commit/a304968d5e2e7a6895ce9cc53cc4d254199ecf9a
DIFF: https://github.com/llvm/llvm-project/commit/a304968d5e2e7a6895ce9cc53cc4d254199ecf9a.diff
LOG: [AMDGPU][GlobalISel] Add RegBankLegalize rules for SMED3 and CVT_PK_I16_I32 (#176596)
These opcodes are created together for the i64->i16 signed clamp
pattern.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 1eaec8fc8f446..7dbfde0fdf517 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -1077,6 +1077,13 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
+ // TODO: This opcode is generated from the i64->i16 signed clamped pattern in
+ // the PreLegalizerCombiner. Move the combine to RegBankCombiner to keep more
+ // instructions on SALU.
+ addRulesForGOpcs({G_AMDGPU_SMED3}, Standard)
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
+
// FNEG and FABS are either folded as source modifiers or can be selected as
// bitwise XOR and AND with Mask. XOR and AND are available on SALU but for
// targets without SALU float we still select them as VGPR since there would
@@ -1124,6 +1131,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat)
.Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat);
+ addRulesForGOpcs({G_AMDGPU_CVT_PK_I16_I32}, Standard)
+ .Uni(V2S16, {{UniInVgprV2S16}, {Vgpr32, Vgpr32}})
+ .Div(V2S16, {{VgprV2S16}, {Vgpr32, Vgpr32}});
+
addRulesForGOpcs({G_FPTRUNC})
.Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
.Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
index 7db49bca36062..567be78f19614 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -global-isel -mcpu=tahiti -mtriple=amdgcn-amd-amdhsa < %s | FileCheck --check-prefixes=GCN,GFX678 %s
-; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck --check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mcpu=gfx1010 -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN,GFX10 %s
-; RUN: llc -global-isel -mcpu=gfx1100 -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mcpu=tahiti -mtriple=amdgcn-amd-amdhsa < %s | FileCheck --check-prefixes=GCN,GFX678 %s
+; RUN: llc -global-isel -new-reg-bank-select -mcpu=gfx900 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck --check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mcpu=gfx1010 -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mcpu=gfx1100 -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN,GFX11 %s
declare i64 @llvm.smax.i64(i64, i64)
declare i64 @llvm.smin.i64(i64, i64)
@@ -314,3 +314,61 @@ entry:
%result = trunc i64 %min to i16
ret i16 %result
}
+
+define i16 @clamp_i64_i16_uniform(i64 inreg %in) #0 {
+; GFX678-LABEL: clamp_i64_i16_uniform:
+; GFX678: ; %bb.0: ; %entry
+; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s17
+; GFX678-NEXT: v_cvt_pk_i16_i32_e32 v0, s16, v0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0xffff8000
+; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX678-NEXT: v_med3_i32 v0, v1, v0, v2
+; GFX678-NEXT: v_readfirstlane_b32 s4, v0
+; GFX678-NEXT: s_add_i32 s4, s4, s4
+; GFX678-NEXT: v_mov_b32_e32 v0, s4
+; GFX678-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: clamp_i64_i16_uniform:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s17
+; GFX9-NEXT: v_cvt_pk_i16_i32 v0, s16, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX9-NEXT: v_med3_i32 v0, v1, v0, v2
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_add_i32 s4, s4, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: clamp_i64_i16_uniform:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_pk_i16_i32 v0, s16, s17
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x7fff
+; GFX10-NEXT: v_med3_i32 v0, 0xffff8000, v0, v1
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-NEXT: s_add_i32 s4, s4, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: clamp_i64_i16_uniform:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cvt_pk_i16_i32 v0, s0, s1
+; GFX11-NEXT: v_mov_b32_e32 v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_med3_i32 v0, 0xffff8000, v0, v1
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_add_i32 s0, s0, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %max = call i64 @llvm.smax.i64(i64 %in, i64 -32768)
+ %min = call i64 @llvm.smin.i64(i64 %max, i64 32767)
+ %result = trunc i64 %min to i16
+ %ret = add i16 %result, %result
+ ret i16 %ret
+}
More information about the llvm-commits mailing list