[llvm] [AMDGPU] selecting v_sat_pk instruction, version 2 (PR #123297)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Jan 19 17:11:23 PST 2025
https://github.com/Shoreshen updated https://github.com/llvm/llvm-project/pull/123297
>From 5844fd43354c99d8319118e2463f8c55f0ec0058 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Fri, 17 Jan 2025 15:48:03 +0800
Subject: [PATCH 1/8] selecting v_sat_pk instruction, version 2
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 +
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 2 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 31 ++++
llvm/lib/Target/AMDGPU/SIInstructions.td | 15 ++
llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 161 ++++++++++--------
6 files changed, 142 insertions(+), 69 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index cca9fa72d0ca53..da9fe7e15e6620 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5498,6 +5498,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(UMIN3)
NODE_NAME_CASE(FMED3)
NODE_NAME_CASE(SMED3)
+ NODE_NAME_CASE(SAT_PK_CAST)
NODE_NAME_CASE(UMED3)
NODE_NAME_CASE(FMAXIMUM3)
NODE_NAME_CASE(FMINIMUM3)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index c74dc7942f52c0..6df4066c0fe6bc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -461,6 +461,7 @@ enum NodeType : unsigned {
FMED3,
SMED3,
UMED3,
+ SAT_PK_CAST,
FMAXIMUM3,
FMINIMUM3,
FDOT2,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index bec294a945d2fe..2c4c9025134015 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -332,6 +332,8 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
[]
>;
+def AMDGPUsat_pk_cast : SDNode<"AMDGPUISD::SAT_PK_CAST", SDTUnaryOp, []>;
+
def AMDGPUfmed3_impl : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e068b5f0b8769b..58361b1e633039 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -865,6 +865,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
}
+ // special dealing for v_sat_pk instruction
+ if (AMDGPU::isGFX9(STI) || AMDGPU::isGFX11(STI) || AMDGPU::isGFX12(STI)) {
+ // In foldToSaturated during DAG combine
+ // 1. isOperationLegalOrCustom(Opc, SrcVT) getOperationAction(Op, SrcVT) == Custom
+ // 2. isTypeDesirableForOp checks regclass for v2i8 (hooked now checking DstVT == v2i8)
+ // In CustomLowerNode during legalizing, checks getOperationAction(Op, DstVT) == Custom
+ setOperationAction(ISD::TRUNCATE_SSAT_U, {MVT::v2i16, MVT::v2i8}, Custom);
+ }
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
@@ -1974,6 +1983,12 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
// create setcc with i1 operands. We don't have instructions for i1 setcc.
if (VT == MVT::i1 && Op == ISD::SETCC)
return false;
+
+ // Avoiding legality check for reg type of v2i8
+ // (do not need to addRegisterClass for v2i8)
+ // VT is result type, ensure the result type is v2i8
+ if (VT == MVT::v2i8 && Op == ISD::TRUNCATE_SSAT_U)
+ return true;
return TargetLowering::isTypeDesirableForOp(Op, VT);
}
@@ -6605,6 +6620,12 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
break;
}
+ case ISD::TRUNCATE_SSAT_U: {
+ SDLoc SL(N);
+ SDValue Op = DAG.getNode(AMDGPUISD::SAT_PK_CAST, SL, MVT::i16, N->getOperand(0));
+ Results.push_back(Op);
+ break;
+ }
default:
AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
break;
@@ -15184,6 +15205,16 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return Widened;
[[fallthrough]];
}
+ case ISD::BITCAST: {
+ // This is possible beause for (i16 bitcase (v2i8 trunc ...))
+ // It may be replaced bu (i16 bitcase (v2i8 truncssat_u ...))
+ // And then (i16 bitcase (i16 AMDGPUsat_pk_cast ...))
+ // There is no instruction of casting to the same type
+ SDValue Src = N->getOperand(0);
+ if (N->getValueType(0) == Src.getValueType()) {
+ return Src;
+ }
+ }
default: {
if (!DCI.isBeforeLegalize()) {
if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 1abbf4c217a697..7f098e37b893bf 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3309,6 +3309,21 @@ def : GCNPat <
(v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0))))
>;
+multiclass V_SAT_PK_Pat<Instruction inst> {
+ def : GCNPat<
+ (i16 (AMDGPUsat_pk_cast v2i16:$src)),
+ (inst VRegSrc_32:$src)
+ >;
+}
+
+let OtherPredicates = [NotHasTrue16BitInsts] in {
+ defm : V_SAT_PK_Pat<V_SAT_PK_U8_I16_e64>;
+} // End OtherPredicates = [NotHasTrue16BitInsts]
+
+let True16Predicate = UseFakeTrue16Insts in {
+ defm : V_SAT_PK_Pat<V_SAT_PK_U8_I16_fake16_e64>;
+} // End True16Predicate = UseFakeTrue16Insts
+
// With multiple uses of the shift, this will duplicate the shift and
// increase register pressure.
def : GCNPat <
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 2d84e877229515..695c8e1c680eef 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX12 %s
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12 %s
; <GFX9 has no V_SAT_PK, GFX9+ has V_SAT_PK, GFX11 has V_SAT_PK with t16
@@ -815,15 +815,15 @@ define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) {
; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: basic_smax_smin_bit_or:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
-; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: basic_smax_smin_bit_or:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-LABEL: basic_smax_smin_bit_or:
; SDAG-GFX12: ; %bb.0:
@@ -860,6 +860,16 @@ define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) {
; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-LABEL: basic_smax_smin_bit_or:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX12-LABEL: basic_smax_smin_bit_or:
; GISEL-GFX12: ; %bb.0:
; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -873,6 +883,15 @@ define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) {
; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1
; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: basic_smax_smin_bit_or:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0)
%src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255)
@@ -902,15 +921,15 @@ define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) {
; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: basic_umax_umin_bit_or:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_u16 v1, 0xff, v1
-; GFX11-NEXT: v_min_u16 v0, 0xff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: basic_umax_umin_bit_or:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT: v_min_u16 v1, 0xff, v1
+; SDAG-GFX11-NEXT: v_min_u16 v0, 0xff, v0
+; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-LABEL: basic_umax_umin_bit_or:
; SDAG-GFX12: ; %bb.0:
@@ -944,6 +963,16 @@ define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) {
; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-LABEL: basic_umax_umin_bit_or:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_min_u16 v1, 0xff, v1
+; GISEL-GFX11-NEXT: v_min_u16 v0, 0xff, v0
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX12-LABEL: basic_umax_umin_bit_or:
; GISEL-GFX12: ; %bb.0:
; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -957,6 +986,15 @@ define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) {
; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1
; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: basic_umax_umin_bit_or:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_min_u16 v1, 0xff, v1
+; GFX11-NEXT: v_min_u16 v0, 0xff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%src0.max = call i16 @llvm.umax.i16(i16 %src0, i16 0)
%src0.clamp = call i16 @llvm.umin.i16(i16 %src0.max, i16 255)
@@ -1093,15 +1131,15 @@ define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) {
; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: basic_smax_smin_bit_shl:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_i16 v1, v1, 0
-; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: basic_smax_smin_bit_shl:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT: v_max_i16 v1, v1, 0
+; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-LABEL: basic_smax_smin_bit_shl:
; SDAG-GFX12: ; %bb.0:
@@ -1137,6 +1175,16 @@ define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) {
; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-LABEL: basic_smax_smin_bit_shl:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_max_i16 v1, v1, 0
+; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX12-LABEL: basic_smax_smin_bit_shl:
; GISEL-GFX12: ; %bb.0:
; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1150,6 +1198,15 @@ define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) {
; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1
; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: basic_smax_smin_bit_shl:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_i16 v1, v1, 0
+; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0)
%src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255)
@@ -1174,24 +1231,13 @@ define i16 @basic_smax_smin_vec_input(<2 x i16> %src) {
; SDAG-GFX9-LABEL: basic_smax_smin_vec_input:
; SDAG-GFX9: ; %bb.0:
; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX9-NEXT: s_movk_i32 s4, 0xff
-; SDAG-GFX9-NEXT: v_pk_min_i16 v0, v0, s4 op_sel_hi:[1,0]
-; SDAG-GFX9-NEXT: v_pk_max_i16 v0, v0, 0
-; SDAG-GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_e32 v0, v0
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX11-LABEL: basic_smax_smin_vec_input:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0
-; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v0, v0
; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-LABEL: basic_smax_smin_vec_input:
@@ -1201,13 +1247,7 @@ define i16 @basic_smax_smin_vec_input(<2 x i16> %src) {
; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0
; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0
; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0
-; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
-; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0
-; SDAG-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1
-; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v0, v0
; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-VI-LABEL: basic_smax_smin_vec_input:
@@ -1290,24 +1330,13 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) {
; SDAG-GFX9-LABEL: basic_smax_smin_vec_input_rev:
; SDAG-GFX9: ; %bb.0:
; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX9-NEXT: v_pk_max_i16 v0, v0, 0
-; SDAG-GFX9-NEXT: s_movk_i32 s4, 0xff
-; SDAG-GFX9-NEXT: v_pk_min_i16 v0, v0, s4 op_sel_hi:[1,0]
-; SDAG-GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_e32 v0, v0
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX11-LABEL: basic_smax_smin_vec_input_rev:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
-; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v0, v0
; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-LABEL: basic_smax_smin_vec_input_rev:
@@ -1317,13 +1346,7 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) {
; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0
; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0
; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0
-; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0
-; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
-; SDAG-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1
-; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v0, v0
; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-VI-LABEL: basic_smax_smin_vec_input_rev:
>From 8505185299a1009bda85764ca32a63e20bb6ae04 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Fri, 17 Jan 2025 16:13:11 +0800
Subject: [PATCH 2/8] fix format
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 23 +++++++++++++----------
1 file changed, 13 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 58361b1e633039..709a7def732592 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -868,12 +868,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// special dealing for v_sat_pk instruction
if (AMDGPU::isGFX9(STI) || AMDGPU::isGFX11(STI) || AMDGPU::isGFX12(STI)) {
// In foldToSaturated during DAG combine
- // 1. isOperationLegalOrCustom(Opc, SrcVT) getOperationAction(Op, SrcVT) == Custom
- // 2. isTypeDesirableForOp checks regclass for v2i8 (hooked now checking DstVT == v2i8)
- // In CustomLowerNode during legalizing, checks getOperationAction(Op, DstVT) == Custom
+ // 1. isOperationLegalOrCustom(Opc, SrcVT) getOperationAction(Op, SrcVT) ==
+ // Custom
+ // 2. isTypeDesirableForOp checks regclass for v2i8 (hooked now checking
+ // DstVT == v2i8) In CustomLowerNode during legalizing, checks
+ // getOperationAction(Op, DstVT) == Custom
setOperationAction(ISD::TRUNCATE_SSAT_U, {MVT::v2i16, MVT::v2i8}, Custom);
}
-
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
@@ -1983,8 +1985,8 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
// create setcc with i1 operands. We don't have instructions for i1 setcc.
if (VT == MVT::i1 && Op == ISD::SETCC)
return false;
-
- // Avoiding legality check for reg type of v2i8
+
+ // Avoiding legality check for reg type of v2i8
// (do not need to addRegisterClass for v2i8)
// VT is result type, ensure the result type is v2i8
if (VT == MVT::v2i8 && Op == ISD::TRUNCATE_SSAT_U)
@@ -6622,7 +6624,8 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::TRUNCATE_SSAT_U: {
SDLoc SL(N);
- SDValue Op = DAG.getNode(AMDGPUISD::SAT_PK_CAST, SL, MVT::i16, N->getOperand(0));
+ SDValue Op =
+ DAG.getNode(AMDGPUISD::SAT_PK_CAST, SL, MVT::i16, N->getOperand(0));
Results.push_back(Op);
break;
}
@@ -9977,9 +9980,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
? 1
: 0,
- DL, MVT::i8)); // swz
- Ops.push_back(M0Val.getValue(0)); // Chain
- Ops.push_back(M0Val.getValue(1)); // Glue
+ DL, MVT::i8)); // swz
+ Ops.push_back(M0Val.getValue(0)); // Chain
+ Ops.push_back(M0Val.getValue(1)); // Glue
auto *M = cast<MemSDNode>(Op);
MachineMemOperand *LoadMMO = M->getMemOperand();
>From 506ccd326000e18333182f342b2fc208ce28f09f Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Fri, 17 Jan 2025 16:29:08 +0800
Subject: [PATCH 3/8] update comments
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 34 +++++++++++++----------
1 file changed, 19 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 709a7def732592..3adbac7a1d9d59 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -867,12 +867,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// special dealing for v_sat_pk instruction
if (AMDGPU::isGFX9(STI) || AMDGPU::isGFX11(STI) || AMDGPU::isGFX12(STI)) {
- // In foldToSaturated during DAG combine
- // 1. isOperationLegalOrCustom(Opc, SrcVT) getOperationAction(Op, SrcVT) ==
- // Custom
- // 2. isTypeDesirableForOp checks regclass for v2i8 (hooked now checking
- // DstVT == v2i8) In CustomLowerNode during legalizing, checks
- // getOperationAction(Op, DstVT) == Custom
+ // Reasons for putting both {MVT::v2i16, MVT::v2i8}
+ // 1. In foldToSaturated during DAG combine
+ // a. isOperationLegalOrCustom(Opc, SrcVT)
+ // will check getOperationAction(Op, SrcVT) == Custom
+ // b. isTypeDesirableForOp checks regclass for v2i8
+ // (hooked now checking DstVT == v2i8)
+ // 2. In CustomLowerNode during legalizing, checks
+ // getOperationAction(Op, DstVT) == Custom
setOperationAction(ISD::TRUNCATE_SSAT_U, {MVT::v2i16, MVT::v2i8}, Custom);
}
@@ -1986,9 +1988,7 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
if (VT == MVT::i1 && Op == ISD::SETCC)
return false;
- // Avoiding legality check for reg type of v2i8
- // (do not need to addRegisterClass for v2i8)
- // VT is result type, ensure the result type is v2i8
+ // v2i8 is illegal and only allowed in specific cases
if (VT == MVT::v2i8 && Op == ISD::TRUNCATE_SSAT_U)
return true;
@@ -15209,14 +15209,18 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
[[fallthrough]];
}
case ISD::BITCAST: {
- // This is possible beause for (i16 bitcase (v2i8 trunc ...))
- // It may be replaced bu (i16 bitcase (v2i8 truncssat_u ...))
- // And then (i16 bitcase (i16 AMDGPUsat_pk_cast ...))
- // There is no instruction of casting to the same type
+ // If src.VT == dst.VT, there is no instruction can be select
+ // which causes selection fail.
+ //
+ // One of the stuation is (i16 (bitcast (v2i8 (trunc (v2i16 (smed ...)))))
+ // The pattern will experience the following steps to
+ // create (i16 (bitcast i16)):
+ //
+ // 1. During DAG combine: (i16 (bitcast (v2i8 (truncssat_u ...)))
+ // 2. During legalizing: (i16 (bitcast (i16 (sat_pk_cast ...)))
SDValue Src = N->getOperand(0);
- if (N->getValueType(0) == Src.getValueType()) {
+ if (N->getValueType(0) == Src.getValueType())
return Src;
- }
}
default: {
if (!DCI.isBeforeLegalize()) {
>From 7fce2b24db03cd251654f32e2df9e39c6b624205 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Fri, 17 Jan 2025 16:30:28 +0800
Subject: [PATCH 4/8] update comments
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3adbac7a1d9d59..9e99d1e25d16d4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -865,7 +865,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
}
- // special dealing for v_sat_pk instruction
+ // special case for v_sat_pk
if (AMDGPU::isGFX9(STI) || AMDGPU::isGFX11(STI) || AMDGPU::isGFX12(STI)) {
// Reasons for putting both {MVT::v2i16, MVT::v2i8}
// 1. In foldToSaturated during DAG combine
>From 424938fdafd32ccc7f5b38cf3382ae212086570a Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Fri, 17 Jan 2025 16:38:17 +0800
Subject: [PATCH 5/8] fix format
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9e99d1e25d16d4..398e143e2fc102 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -869,10 +869,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (AMDGPU::isGFX9(STI) || AMDGPU::isGFX11(STI) || AMDGPU::isGFX12(STI)) {
// Reasons for putting both {MVT::v2i16, MVT::v2i8}
// 1. In foldToSaturated during DAG combine
- // a. isOperationLegalOrCustom(Opc, SrcVT)
+ // a. isOperationLegalOrCustom(Opc, SrcVT)
// will check getOperationAction(Op, SrcVT) == Custom
- // b. isTypeDesirableForOp checks regclass for v2i8
- // (hooked now checking DstVT == v2i8)
+ // b. isTypeDesirableForOp checks regclass for v2i8
+ // (hooked now checking DstVT == v2i8)
// 2. In CustomLowerNode during legalizing, checks
// getOperationAction(Op, DstVT) == Custom
setOperationAction(ISD::TRUNCATE_SSAT_U, {MVT::v2i16, MVT::v2i8}, Custom);
@@ -15213,13 +15213,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
// which causes selection fail.
//
// One of the stuation is (i16 (bitcast (v2i8 (trunc (v2i16 (smed ...)))))
- // The pattern will experience the following steps to
+ // The pattern will experience the following steps to
// create (i16 (bitcast i16)):
//
// 1. During DAG combine: (i16 (bitcast (v2i8 (truncssat_u ...)))
// 2. During legalizing: (i16 (bitcast (i16 (sat_pk_cast ...)))
SDValue Src = N->getOperand(0);
- if (N->getValueType(0) == Src.getValueType())
+ if (N->getValueType(0) == Src.getValueType())
return Src;
}
default: {
>From 88e52c175c284784f065b36ae7eace31a6a9bd1f Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Fri, 17 Jan 2025 19:33:00 +0800
Subject: [PATCH 6/8] fix comments
---
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 1 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 42 +++++++----------------
2 files changed, 13 insertions(+), 30 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 2c4c9025134015..6e9e3f80bacf35 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -332,6 +332,7 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
[]
>;
+// Special node to handle v_sat_pk to avoid v2i8
def AMDGPUsat_pk_cast : SDNode<"AMDGPUISD::SAT_PK_CAST", SDTUnaryOp, []>;
def AMDGPUfmed3_impl : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 398e143e2fc102..a3996a78b42f56 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -816,6 +816,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
{MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
Custom);
}
+
+ // true 16 currently unsupported
+ if (!Subtarget->hasTrue16BitInsts() ||
+ (!Subtarget->useRealTrue16Insts() || !Subtarget->useRealTrue16Insts())) {
+ // MVT::v2i16 for src type check in foldToSaturated
+ // MVT::v2i8 for dst type check in CustomLowerNode
+ setOperationAction(ISD::TRUNCATE_SSAT_U, {MVT::v2i16, MVT::v2i8}, Custom);
+ }
}
setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
@@ -865,19 +873,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
}
- // special case for v_sat_pk
- if (AMDGPU::isGFX9(STI) || AMDGPU::isGFX11(STI) || AMDGPU::isGFX12(STI)) {
- // Reasons for putting both {MVT::v2i16, MVT::v2i8}
- // 1. In foldToSaturated during DAG combine
- // a. isOperationLegalOrCustom(Opc, SrcVT)
- // will check getOperationAction(Op, SrcVT) == Custom
- // b. isTypeDesirableForOp checks regclass for v2i8
- // (hooked now checking DstVT == v2i8)
- // 2. In CustomLowerNode during legalizing, checks
- // getOperationAction(Op, DstVT) == Custom
- setOperationAction(ISD::TRUNCATE_SSAT_U, {MVT::v2i16, MVT::v2i8}, Custom);
- }
-
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
@@ -6626,6 +6621,7 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
SDLoc SL(N);
SDValue Op =
DAG.getNode(AMDGPUISD::SAT_PK_CAST, SL, MVT::i16, N->getOperand(0));
+ Op = DAG.getNode(ISD::BITCAST, SL, MVT::v2i8, Op);
Results.push_back(Op);
break;
}
@@ -9980,9 +9976,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
? 1
: 0,
- DL, MVT::i8)); // swz
- Ops.push_back(M0Val.getValue(0)); // Chain
- Ops.push_back(M0Val.getValue(1)); // Glue
+ DL, MVT::i8)); // swz
+ Ops.push_back(M0Val.getValue(0)); // Chain
+ Ops.push_back(M0Val.getValue(1)); // Glue
auto *M = cast<MemSDNode>(Op);
MachineMemOperand *LoadMMO = M->getMemOperand();
@@ -15208,20 +15204,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return Widened;
[[fallthrough]];
}
- case ISD::BITCAST: {
- // If src.VT == dst.VT, there is no instruction can be select
- // which causes selection fail.
- //
- // One of the stuation is (i16 (bitcast (v2i8 (trunc (v2i16 (smed ...)))))
- // The pattern will experience the following steps to
- // create (i16 (bitcast i16)):
- //
- // 1. During DAG combine: (i16 (bitcast (v2i8 (truncssat_u ...)))
- // 2. During legalizing: (i16 (bitcast (i16 (sat_pk_cast ...)))
- SDValue Src = N->getOperand(0);
- if (N->getValueType(0) == Src.getValueType())
- return Src;
- }
default: {
if (!DCI.isBeforeLegalize()) {
if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
>From 1dc8b9c698e38a8608cfefc6d5317fdce2358952 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Fri, 17 Jan 2025 19:41:20 +0800
Subject: [PATCH 7/8] fix format
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a3996a78b42f56..335d2c88b0edce 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -818,8 +818,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
// true 16 currently unsupported
- if (!Subtarget->hasTrue16BitInsts() ||
- (!Subtarget->useRealTrue16Insts() || !Subtarget->useRealTrue16Insts())) {
+ if (!Subtarget->hasTrue16BitInsts() || (!Subtarget->useRealTrue16Insts() ||
+ !Subtarget->useRealTrue16Insts())) {
// MVT::v2i16 for src type check in foldToSaturated
// MVT::v2i8 for dst type check in CustomLowerNode
setOperationAction(ISD::TRUNCATE_SSAT_U, {MVT::v2i16, MVT::v2i8}, Custom);
>From c5e3e65cf4594982f0ba49031683559a0606266e Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Fri, 17 Jan 2025 22:52:41 +0800
Subject: [PATCH 8/8] fix comment
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 335d2c88b0edce..5492fb1f67e132 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -817,9 +817,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
Custom);
}
- // true 16 currently unsupported
- if (!Subtarget->hasTrue16BitInsts() || (!Subtarget->useRealTrue16Insts() ||
- !Subtarget->useRealTrue16Insts())) {
+ // Avoid true 16 instruction
+ if (!Subtarget->hasTrue16BitInsts() || !Subtarget->useRealTrue16Insts()) {
// MVT::v2i16 for src type check in foldToSaturated
// MVT::v2i8 for dst type check in CustomLowerNode
setOperationAction(ISD::TRUNCATE_SSAT_U, {MVT::v2i16, MVT::v2i8}, Custom);
More information about the llvm-commits mailing list