[llvm] edca49c - [AMDGPU] Match med3 for (max (min ..))
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 7 02:14:41 PST 2023
Author: pvanhout
Date: 2023-03-07T11:14:31+01:00
New Revision: edca49cfb766ef4f6e665b13808976d119bc1f1d
URL: https://github.com/llvm/llvm-project/commit/edca49cfb766ef4f6e665b13808976d119bc1f1d
DIFF: https://github.com/llvm/llvm-project/commit/edca49cfb766ef4f6e665b13808976d119bc1f1d.diff
LOG: [AMDGPU] Match med3 for (max (min ..))
We previously only matched (min (max ...))
Depends on D144728
Reviewed By: arsenm
Differential Revision: https://reviews.llvm.org/D145159
Added:
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.h
llvm/test/CodeGen/AMDGPU/mad_uint24.ll
llvm/test/CodeGen/AMDGPU/saddsat.ll
llvm/test/CodeGen/AMDGPU/smed3.ll
llvm/test/CodeGen/AMDGPU/ssubsat.ll
llvm/test/CodeGen/AMDGPU/umed3.ll
llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f30cd8d550155..aa85ee359060f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10588,45 +10588,41 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
}
}
-SDValue SITargetLowering::performIntMed3ImmCombine(
- SelectionDAG &DAG, const SDLoc &SL,
- SDValue Op0, SDValue Op1, bool Signed) const {
- ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
- if (!K1)
- return SDValue();
+SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
+ const SDLoc &SL, SDValue Src,
+ SDValue MinVal,
+ SDValue MaxVal,
+ bool Signed) const {
+
+ // med3 comes from
+ // min(max(x, K0), K1), K0 < K1
+ // max(min(x, K0), K1), K1 < K0
+ //
+ // "MinVal" and "MaxVal" respectively refer to the rhs of the
+ // min/max op.
+ ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
+ ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
- ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
- if (!K0)
+ if (!MinK || !MaxK)
return SDValue();
if (Signed) {
- if (K0->getAPIntValue().sge(K1->getAPIntValue()))
+ if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
return SDValue();
} else {
- if (K0->getAPIntValue().uge(K1->getAPIntValue()))
+ if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
return SDValue();
}
- EVT VT = K0->getValueType(0);
+ EVT VT = MinK->getValueType(0);
unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
- if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
- return DAG.getNode(Med3Opc, SL, VT,
- Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
- }
-
- // If there isn't a 16-bit med3 operation, convert to 32-bit.
- if (VT == MVT::i16) {
- MVT NVT = MVT::i32;
- unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
-
- SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
- SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
- SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
-
- SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
- return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
- }
+ if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
+ return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
+ // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
+ // not available, but this is unlikely to be profitable as constants
+ // will often need to be materialized & extended, especially on
+ // pre-GFX10 where VOP3 instructions couldn't take literal operands.
return SDValue();
}
@@ -10738,13 +10734,26 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
}
// min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
+ // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
- if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
+ if (SDValue Med3 = performIntMed3ImmCombine(
+ DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
+ return Med3;
+ }
+ if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
+ if (SDValue Med3 = performIntMed3ImmCombine(
+ DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
return Med3;
}
if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
- if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
+ if (SDValue Med3 = performIntMed3ImmCombine(
+ DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
+ return Med3;
+ }
+ if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
+ if (SDValue Med3 = performIntMed3ImmCombine(
+ DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
return Med3;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index d9fde1c6adceb..0aba20585daf7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -193,7 +193,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
SDValue Op0, SDValue Op1) const;
SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
- SDValue Op0, SDValue Op1, bool Signed) const;
+ SDValue Src, SDValue MinVal, SDValue MaxVal,
+ bool Signed) const;
SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
index b9143150280a8..ef26280ffc9e0 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
@@ -158,9 +158,11 @@ bb18: ; preds = %bb4
; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
; EG: 8
; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: v_bfe_i32 [[EXT:v[0-9]]], [[MAD]], 0, 16
+; SI: v_med3_i32 v{{[0-9]}}, [[EXT]],
; VI: v_mad_u16 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; GCN: v_bfe_i32 [[EXT:v[0-9]]], [[MAD]], 0, 16
-; GCN: v_med3_i32 v{{[0-9]}}, [[EXT]],
+; VI: v_max_i16_e32 [[MAX:v[0-9]]], 0xff80, [[MAD]]
+; VI: v_min_i16_e32 {{v[0-9]}}, 0x7f, [[MAX]]
define amdgpu_kernel void @i8_mad_sat_16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(5) %idx) {
entry:
%retval.0.i = load i64, ptr addrspace(5) %idx
diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
index 6f6ff8124ffdf..7d6b547cff7c9 100644
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -12,8 +12,9 @@ define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_min_i32_e32 v0, 0x7f, v0
-; GFX6-NEXT: v_max_i32_e32 v0, 0xffffff80, v0
+; GFX6-NEXT: s_movk_i32 s4, 0xff80
+; GFX6-NEXT: v_mov_b32_e32 v1, 0x7f
+; GFX6-NEXT: v_med3_i32 v0, v0, s4, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i8:
@@ -53,8 +54,9 @@ define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0
-; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0
+; GFX6-NEXT: s_movk_i32 s4, 0x8000
+; GFX6-NEXT: v_mov_b32_e32 v1, 0x7fff
+; GFX6-NEXT: v_med3_i32 v0, v0, s4, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i16:
@@ -135,14 +137,14 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT: s_movk_i32 s4, 0x8000
+; GFX6-NEXT: v_mov_b32_e32 v3, 0x7fff
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1
-; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0
-; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1
-; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX6-NEXT: v_med3_i32 v1, v1, s4, v3
+; GFX6-NEXT: v_med3_i32 v0, v0, s4, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -196,16 +198,15 @@ define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
+; GFX6-NEXT: s_movk_i32 s4, 0x8000
+; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fff
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3
-; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1
-; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0
+; GFX6-NEXT: v_med3_i32 v1, v1, s4, v4
+; GFX6-NEXT: v_med3_i32 v0, v0, s4, v4
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1
-; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0
-; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: v_max_i32_e32 v3, 0xffff8000, v2
+; GFX6-NEXT: v_med3_i32 v3, v2, s4, v4
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3
; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
@@ -268,11 +269,11 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5
+; GFX6-NEXT: s_movk_i32 s4, 0x8000
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x7fff
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1
-; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0
-; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1
-; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0
+; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5
+; GFX6-NEXT: v_med3_i32 v0, v0, s4, v5
; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
@@ -282,10 +283,8 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v7
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1
-; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2
-; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1
-; GFX6-NEXT: v_max_i32_e32 v2, 0xffff8000, v2
+; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5
+; GFX6-NEXT: v_med3_i32 v2, v2, s4, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll
index d9e21e5a83c99..a3c6ca1c8673d 100644
--- a/llvm/test/CodeGen/AMDGPU/smed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/smed3.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
@@ -94,7 +94,9 @@ declare i64 @llvm.smax.i64(i64, i64)
declare i64 @llvm.smin.i64(i64, i64)
; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i16:
-; SICIVI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; SI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; VI: v_max_i16_e32 [[MAX:v[0-9]]], 12, {{v[0-9]}}
+; VI: v_min_i16_e32 {{v[0-9]}}, 17, [[MAX]]
; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
define amdgpu_kernel void @v_test_smed3_r_i_i_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index 13deecbc78857..ad9ecd24555f9 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -12,8 +12,9 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_min_i32_e32 v0, 0x7f, v0
-; GFX6-NEXT: v_max_i32_e32 v0, 0xffffff80, v0
+; GFX6-NEXT: s_movk_i32 s4, 0xff80
+; GFX6-NEXT: v_mov_b32_e32 v1, 0x7f
+; GFX6-NEXT: v_med3_i32 v0, v0, s4, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_i8:
@@ -53,8 +54,9 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0
-; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0
+; GFX6-NEXT: s_movk_i32 s4, 0x8000
+; GFX6-NEXT: v_mov_b32_e32 v1, 0x7fff
+; GFX6-NEXT: v_med3_i32 v0, v0, s4, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_i16:
@@ -135,14 +137,14 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT: s_movk_i32 s4, 0x8000
+; GFX6-NEXT: v_mov_b32_e32 v3, 0x7fff
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1
-; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0
-; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1
-; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX6-NEXT: v_med3_i32 v1, v1, s4, v3
+; GFX6-NEXT: v_med3_i32 v0, v0, s4, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -196,16 +198,15 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
+; GFX6-NEXT: s_movk_i32 s4, 0x8000
+; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fff
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
-; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1
-; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0
+; GFX6-NEXT: v_med3_i32 v1, v1, s4, v4
+; GFX6-NEXT: v_med3_i32 v0, v0, s4, v4
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
-; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1
-; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0
-; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: v_max_i32_e32 v3, 0xffff8000, v2
+; GFX6-NEXT: v_med3_i32 v3, v2, s4, v4
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
@@ -268,11 +269,11 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; GFX6-NEXT: s_movk_i32 s4, 0x8000
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x7fff
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1
-; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0
-; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1
-; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0
+; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5
+; GFX6-NEXT: v_med3_i32 v0, v0, s4, v5
; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
@@ -282,10 +283,8 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v3, v7
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
-; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1
-; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2
-; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1
-; GFX6-NEXT: v_max_i32_e32 v2, 0xffff8000, v2
+; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5
+; GFX6-NEXT: v_med3_i32 v2, v2, s4, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
diff --git a/llvm/test/CodeGen/AMDGPU/umed3.ll b/llvm/test/CodeGen/AMDGPU/umed3.ll
index 13b47c0d7f8f9..ced86b98d779e 100644
--- a/llvm/test/CodeGen/AMDGPU/umed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/umed3.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
@@ -81,7 +81,9 @@ define amdgpu_kernel void @v_test_umed3_r_i_i_i64(ptr addrspace(1) %out, ptr add
}
; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i16:
-; SICIVI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; SI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; VI: v_max_u16_e32 [[MAX:v[0-9]]], 12, {{v[0-9]}}
+; VI: v_min_u16_e32 {{v[0-9]}}, 17, [[MAX]]
; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
define amdgpu_kernel void @v_test_umed3_r_i_i_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 2a9da1385e50c..4018eabd4657e 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -19,13 +19,12 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) {
; SDAG-VI-LABEL: basic_smax_smin:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-VI-NEXT: s_movk_i32 s4, 0xff
-; SDAG-VI-NEXT: v_bfe_i32 v1, v1, 0, 16
-; SDAG-VI-NEXT: v_bfe_i32 v0, v0, 0, 16
-; SDAG-VI-NEXT: v_med3_i32 v1, v1, 0, s4
-; SDAG-VI-NEXT: v_med3_i32 v0, v0, 0, s4
-; SDAG-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SDAG-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0
+; SDAG-VI-NEXT: v_max_i16_e32 v1, 0, v1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff
+; SDAG-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0
+; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: basic_smax_smin:
@@ -74,12 +73,11 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: s_sext_i32_i16 s2, s2
-; SDAG-VI-NEXT: s_sext_i32_i16 s3, s3
-; SDAG-VI-NEXT: v_med3_i32 v1, s2, 0, v0
-; SDAG-VI-NEXT: v_med3_i32 v0, s3, 0, v0
-; SDAG-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SDAG-VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0
+; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0
+; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1
+; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dword v[0:1], v2
@@ -201,29 +199,25 @@ define <2 x i16> @basic_smin_smax(i16 %src0, i16 %src1) {
; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
;
-; SDAG-GFX9-LABEL: basic_smin_smax:
-; SDAG-GFX9: ; %bb.0:
-; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX9-NEXT: v_min_i16_e32 v0, 0xff, v0
-; SDAG-GFX9-NEXT: v_min_i16_e32 v1, 0xff, v1
-; SDAG-GFX9-NEXT: v_max_i16_e32 v0, 0, v0
-; SDAG-GFX9-NEXT: v_max_i16_e32 v1, 0, v1
-; SDAG-GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; SDAG-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: basic_smin_smax:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xff
+; GFX9-NEXT: v_med3_i16 v0, v0, 0, v2
+; GFX9-NEXT: v_med3_i16 v1, v1, 0, v2
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; SDAG-GFX11-LABEL: basic_smin_smax:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; SDAG-GFX11-NEXT: v_min_i16 v0, 0xff, v0
-; SDAG-GFX11-NEXT: v_min_i16 v1, 0xff, v1
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX11-NEXT: v_max_i16 v0, v0, 0
-; SDAG-GFX11-NEXT: v_max_i16 v1, v1, 0
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: basic_smin_smax:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-VI-LABEL: basic_smin_smax:
; GISEL-VI: ; %bb.0:
@@ -235,26 +229,6 @@ define <2 x i16> @basic_smin_smax(i16 %src0, i16 %src1) {
; GISEL-VI-NEXT: v_max_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX9-LABEL: basic_smin_smax:
-; GISEL-GFX9: ; %bb.0:
-; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff
-; GISEL-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2
-; GISEL-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2
-; GISEL-GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GISEL-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX11-LABEL: basic_smin_smax:
-; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
-; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
%src0.min = call i16 @llvm.smin.i16(i16 %src0, i16 255)
%src0.clamp = call i16 @llvm.smax.i16(i16 %src0.min, i16 0)
%src1.min = call i16 @llvm.smin.i16(i16 %src1, i16 255)
@@ -268,36 +242,33 @@ define <2 x i16> @basic_smin_smax_combined(i16 %src0, i16 %src1) {
; SDAG-VI-LABEL: basic_smin_smax_combined:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-VI-NEXT: v_bfe_i32 v1, v1, 0, 16
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff
; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0
-; SDAG-VI-NEXT: v_med3_i32 v1, v1, 0, v2
-; SDAG-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SDAG-VI-NEXT: v_max_i16_e32 v1, 0, v1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff
+; SDAG-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0
; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
;
-; SDAG-GFX9-LABEL: basic_smin_smax_combined:
-; SDAG-GFX9: ; %bb.0:
-; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX9-NEXT: v_min_i16_e32 v0, 0xff, v0
-; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, 0xff
-; SDAG-GFX9-NEXT: v_max_i16_e32 v0, 0, v0
-; SDAG-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2
-; SDAG-GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; SDAG-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: basic_smin_smax_combined:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xff
+; GFX9-NEXT: v_med3_i16 v0, v0, 0, v2
+; GFX9-NEXT: v_med3_i16 v1, v1, 0, v2
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; SDAG-GFX11-LABEL: basic_smin_smax_combined:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; SDAG-GFX11-NEXT: v_min_i16 v0, 0xff, v0
-; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_max_i16 v0, v0, 0
-; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: basic_smin_smax_combined:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-VI-LABEL: basic_smin_smax_combined:
; GISEL-VI: ; %bb.0:
@@ -309,26 +280,6 @@ define <2 x i16> @basic_smin_smax_combined(i16 %src0, i16 %src1) {
; GISEL-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX9-LABEL: basic_smin_smax_combined:
-; GISEL-GFX9: ; %bb.0:
-; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff
-; GISEL-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2
-; GISEL-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2
-; GISEL-GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GISEL-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX11-LABEL: basic_smin_smax_combined:
-; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
-; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
%src0.min = call i16 @llvm.smin.i16(i16 %src0, i16 255)
%src0.clamp = call i16 @llvm.smax.i16(i16 %src0.min, i16 0)
%src1.max = call i16 @llvm.smax.i16(i16 %src1, i16 0)
@@ -342,13 +293,13 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) {
; SDAG-VI-LABEL: vec_smax_smin:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-VI-NEXT: v_bfe_i32 v1, v0, 0, 16
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-VI-NEXT: v_max_i16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff
-; SDAG-VI-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; SDAG-VI-NEXT: v_med3_i32 v0, v0, 0, v2
-; SDAG-VI-NEXT: v_med3_i32 v1, v1, 0, v2
-; SDAG-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SDAG-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0
+; SDAG-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX9-LABEL: vec_smax_smin:
@@ -400,12 +351,12 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: s_sext_i32_i16 s3, s2
-; SDAG-VI-NEXT: s_ashr_i32 s2, s2, 16
-; SDAG-VI-NEXT: v_med3_i32 v1, s3, 0, v0
-; SDAG-VI-NEXT: v_med3_i32 v0, s2, 0, v0
-; SDAG-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SDAG-VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-VI-NEXT: s_lshr_b32 s3, s2, 16
+; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0
+; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0
+; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1
+; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dword v[0:1], v2
More information about the llvm-commits mailing list