[llvm] selecting v_sat_pk instruction, version 2 (PR #123297)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 16 23:57:09 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: None (Shoreshen)
<details>
<summary>Changes</summary>
This PR uses the TRUNCATE_SSAT_U node to select the v_sat_pk instruction.
Compared to the previous attempt, #<!-- -->121124, this PR moves most of the pattern-matching work into the DAG combiner instead of instruction selection.
---
Full diff: https://github.com/llvm/llvm-project/pull/123297.diff
6 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h (+1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td (+2)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+31)
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+15)
- (modified) llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll (+92-69)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index cca9fa72d0ca53..da9fe7e15e6620 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5498,6 +5498,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(UMIN3)
NODE_NAME_CASE(FMED3)
NODE_NAME_CASE(SMED3)
+ NODE_NAME_CASE(SAT_PK_CAST)
NODE_NAME_CASE(UMED3)
NODE_NAME_CASE(FMAXIMUM3)
NODE_NAME_CASE(FMINIMUM3)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index c74dc7942f52c0..6df4066c0fe6bc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -461,6 +461,7 @@ enum NodeType : unsigned {
FMED3,
SMED3,
UMED3,
+ SAT_PK_CAST,
FMAXIMUM3,
FMINIMUM3,
FDOT2,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index bec294a945d2fe..2c4c9025134015 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -332,6 +332,8 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
[]
>;
+def AMDGPUsat_pk_cast : SDNode<"AMDGPUISD::SAT_PK_CAST", SDTUnaryOp, []>;
+
def AMDGPUfmed3_impl : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e068b5f0b8769b..58361b1e633039 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -865,6 +865,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
}
+ // special dealing for v_sat_pk instruction
+ if (AMDGPU::isGFX9(STI) || AMDGPU::isGFX11(STI) || AMDGPU::isGFX12(STI)) {
+ // In foldToSaturated during DAG combine
+ // 1. isOperationLegalOrCustom(Opc, SrcVT) getOperationAction(Op, SrcVT) == Custom
+ // 2. isTypeDesirableForOp checks regclass for v2i8 (hooked now checking DstVT == v2i8)
+ // In CustomLowerNode during legalizing, checks getOperationAction(Op, DstVT) == Custom
+ setOperationAction(ISD::TRUNCATE_SSAT_U, {MVT::v2i16, MVT::v2i8}, Custom);
+ }
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
@@ -1974,6 +1983,12 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
// create setcc with i1 operands. We don't have instructions for i1 setcc.
if (VT == MVT::i1 && Op == ISD::SETCC)
return false;
+
+ // Avoiding legality check for reg type of v2i8
+ // (do not need to addRegisterClass for v2i8)
+ // VT is result type, ensure the result type is v2i8
+ if (VT == MVT::v2i8 && Op == ISD::TRUNCATE_SSAT_U)
+ return true;
return TargetLowering::isTypeDesirableForOp(Op, VT);
}
@@ -6605,6 +6620,12 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
break;
}
+ case ISD::TRUNCATE_SSAT_U: {
+ SDLoc SL(N);
+ SDValue Op = DAG.getNode(AMDGPUISD::SAT_PK_CAST, SL, MVT::i16, N->getOperand(0));
+ Results.push_back(Op);
+ break;
+ }
default:
AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
break;
@@ -15184,6 +15205,16 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return Widened;
[[fallthrough]];
}
+ case ISD::BITCAST: {
+ // This is possible because for (i16 bitcast (v2i8 trunc ...))
+ // it may be replaced by (i16 bitcast (v2i8 truncssat_u ...))
+ // and then by (i16 bitcast (i16 AMDGPUsat_pk_cast ...)).
+ // There is no instruction for casting to the same type.
+ SDValue Src = N->getOperand(0);
+ if (N->getValueType(0) == Src.getValueType()) {
+ return Src;
+ }
+ }
default: {
if (!DCI.isBeforeLegalize()) {
if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 1abbf4c217a697..7f098e37b893bf 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3309,6 +3309,21 @@ def : GCNPat <
(v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0))))
>;
+multiclass V_SAT_PK_Pat<Instruction inst> {
+ def : GCNPat<
+ (i16 (AMDGPUsat_pk_cast v2i16:$src)),
+ (inst VRegSrc_32:$src)
+ >;
+}
+
+let OtherPredicates = [NotHasTrue16BitInsts] in {
+ defm : V_SAT_PK_Pat<V_SAT_PK_U8_I16_e64>;
+} // End OtherPredicates = [NotHasTrue16BitInsts]
+
+let True16Predicate = UseFakeTrue16Insts in {
+ defm : V_SAT_PK_Pat<V_SAT_PK_U8_I16_fake16_e64>;
+} // End True16Predicate = UseFakeTrue16Insts
+
// With multiple uses of the shift, this will duplicate the shift and
// increase register pressure.
def : GCNPat <
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 2d84e877229515..695c8e1c680eef 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX12 %s
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12 %s
; <GFX9 has no V_SAT_PK, GFX9+ has V_SAT_PK, GFX11 has V_SAT_PK with t16
@@ -815,15 +815,15 @@ define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) {
; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: basic_smax_smin_bit_or:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
-; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: basic_smax_smin_bit_or:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-LABEL: basic_smax_smin_bit_or:
; SDAG-GFX12: ; %bb.0:
@@ -860,6 +860,16 @@ define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) {
; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-LABEL: basic_smax_smin_bit_or:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX12-LABEL: basic_smax_smin_bit_or:
; GISEL-GFX12: ; %bb.0:
; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -873,6 +883,15 @@ define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) {
; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1
; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: basic_smax_smin_bit_or:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0)
%src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255)
@@ -902,15 +921,15 @@ define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) {
; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: basic_umax_umin_bit_or:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_u16 v1, 0xff, v1
-; GFX11-NEXT: v_min_u16 v0, 0xff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: basic_umax_umin_bit_or:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT: v_min_u16 v1, 0xff, v1
+; SDAG-GFX11-NEXT: v_min_u16 v0, 0xff, v0
+; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-LABEL: basic_umax_umin_bit_or:
; SDAG-GFX12: ; %bb.0:
@@ -944,6 +963,16 @@ define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) {
; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-LABEL: basic_umax_umin_bit_or:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_min_u16 v1, 0xff, v1
+; GISEL-GFX11-NEXT: v_min_u16 v0, 0xff, v0
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX12-LABEL: basic_umax_umin_bit_or:
; GISEL-GFX12: ; %bb.0:
; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -957,6 +986,15 @@ define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) {
; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1
; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: basic_umax_umin_bit_or:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_min_u16 v1, 0xff, v1
+; GFX11-NEXT: v_min_u16 v0, 0xff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%src0.max = call i16 @llvm.umax.i16(i16 %src0, i16 0)
%src0.clamp = call i16 @llvm.umin.i16(i16 %src0.max, i16 255)
@@ -1093,15 +1131,15 @@ define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) {
; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: basic_smax_smin_bit_shl:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_i16 v1, v1, 0
-; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: basic_smax_smin_bit_shl:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT: v_max_i16 v1, v1, 0
+; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-LABEL: basic_smax_smin_bit_shl:
; SDAG-GFX12: ; %bb.0:
@@ -1137,6 +1175,16 @@ define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) {
; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-LABEL: basic_smax_smin_bit_shl:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_max_i16 v1, v1, 0
+; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX12-LABEL: basic_smax_smin_bit_shl:
; GISEL-GFX12: ; %bb.0:
; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1150,6 +1198,15 @@ define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) {
; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1
; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: basic_smax_smin_bit_shl:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_i16 v1, v1, 0
+; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0)
%src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255)
@@ -1174,24 +1231,13 @@ define i16 @basic_smax_smin_vec_input(<2 x i16> %src) {
; SDAG-GFX9-LABEL: basic_smax_smin_vec_input:
; SDAG-GFX9: ; %bb.0:
; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX9-NEXT: s_movk_i32 s4, 0xff
-; SDAG-GFX9-NEXT: v_pk_min_i16 v0, v0, s4 op_sel_hi:[1,0]
-; SDAG-GFX9-NEXT: v_pk_max_i16 v0, v0, 0
-; SDAG-GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_e32 v0, v0
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX11-LABEL: basic_smax_smin_vec_input:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0
-; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v0, v0
; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-LABEL: basic_smax_smin_vec_input:
@@ -1201,13 +1247,7 @@ define i16 @basic_smax_smin_vec_input(<2 x i16> %src) {
; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0
; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0
; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0
-; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
-; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0
-; SDAG-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1
-; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v0, v0
; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-VI-LABEL: basic_smax_smin_vec_input:
@@ -1290,24 +1330,13 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) {
; SDAG-GFX9-LABEL: basic_smax_smin_vec_input_rev:
; SDAG-GFX9: ; %bb.0:
; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX9-NEXT: v_pk_max_i16 v0, v0, 0
-; SDAG-GFX9-NEXT: s_movk_i32 s4, 0xff
-; SDAG-GFX9-NEXT: v_pk_min_i16 v0, v0, s4 op_sel_hi:[1,0]
-; SDAG-GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_e32 v0, v0
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX11-LABEL: basic_smax_smin_vec_input_rev:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
-; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v0, v0
; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-LABEL: basic_smax_smin_vec_input_rev:
@@ -1317,13 +1346,7 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) {
; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0
; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0
; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0
-; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0
-; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
-; SDAG-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1
-; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v0, v0
; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-VI-LABEL: basic_smax_smin_vec_input_rev:
``````````
</details>
https://github.com/llvm/llvm-project/pull/123297
More information about the llvm-commits
mailing list