[llvm] fd85ffb - [AMDGPU] Handle min/max in isNarrowingProfitable (#140206)

via llvm-commits llvm-commits at lists.llvm.org
Fri May 16 01:16:49 PDT 2025


Author: Pierre van Houtryve
Date: 2025-05-16T10:16:44+02:00
New Revision: fd85ffb4c43be80c179145a382d78a7c9ca3c1b7

URL: https://github.com/llvm/llvm-project/commit/fd85ffb4c43be80c179145a382d78a7c9ca3c1b7
DIFF: https://github.com/llvm/llvm-project/commit/fd85ffb4c43be80c179145a382d78a7c9ca3c1b7.diff

LOG: [AMDGPU] Handle min/max in isNarrowingProfitable (#140206)

This introduces a slight codegen regression in some cases, but it will even
out once we disable the promotion in CGP (CodeGenPrepare).

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/min.ll
    llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 8b93ed342c64a..7ed055e8da2b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1042,6 +1042,10 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
   case ISD::MUL:
   case ISD::SETCC:
   case ISD::SELECT:
+  case ISD::SMIN:
+  case ISD::SMAX:
+  case ISD::UMIN:
+  case ISD::UMAX:
     if (Subtarget->has16BitInsts() &&
         (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) {
       // Don't narrow back down to i16 if promoted to i32 already.

diff  --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index d2f4f54cefe78..201b85c745c18 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -649,36 +649,35 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ;
 ; GFX9-LABEL: s_test_imin_sle_v4i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s3, s[8:9], 0x4c
 ; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x28
+; GFX9-NEXT:    s_load_dword s3, s[8:9], 0x4c
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX9-NEXT:    s_lshr_b32 s8, s3, 16
-; GFX9-NEXT:    s_ashr_i32 s9, s3, 24
-; GFX9-NEXT:    s_ashr_i32 s6, s2, 24
-; GFX9-NEXT:    s_bfe_i32 s8, s8, 0x80000
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_bfe_i32 s5, s5, 0x80000
+; GFX9-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-NEXT:    s_sext_i32_i16 s7, s3
-; GFX9-NEXT:    v_min_i16_e32 v1, s6, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-NEXT:    s_sext_i32_i16 s4, s2
-; GFX9-NEXT:    s_lshr_b32 s7, s7, 8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT:    v_min_i16_e32 v2, s5, v2
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 8
-; GFX9-NEXT:    s_bfe_i32 s3, s3, 0x80000
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-NEXT:    s_bfe_i32 s2, s2, 0x80000
-; GFX9-NEXT:    v_min_i16_e32 v2, s4, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX9-NEXT:    v_min_i16_e32 v3, s2, v3
-; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_ashr_i32 s7, s7, 8
+; GFX9-NEXT:    s_ashr_i32 s5, s5, 8
+; GFX9-NEXT:    s_ashr_i32 s4, s2, 24
+; GFX9-NEXT:    s_ashr_i32 s6, s3, 24
+; GFX9-NEXT:    s_min_i32 s5, s5, s7
+; GFX9-NEXT:    s_sext_i32_i8 s7, s3
+; GFX9-NEXT:    s_sext_i32_i8 s8, s2
+; GFX9-NEXT:    s_bfe_i32 s3, s3, 0x80010
+; GFX9-NEXT:    s_bfe_i32 s2, s2, 0x80010
+; GFX9-NEXT:    s_min_i32 s7, s8, s7
+; GFX9-NEXT:    s_min_i32 s4, s4, s6
+; GFX9-NEXT:    s_min_i32 s2, s2, s3
+; GFX9-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX9-NEXT:    s_or_b32 s5, s7, s5
+; GFX9-NEXT:    s_or_b32 s2, s2, s4
+; GFX9-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX9-NEXT:    s_or_b32 s2, s5, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -688,111 +687,70 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; GFX10-NEXT:    s_load_dword s2, s[8:9], 0x28
 ; GFX10-NEXT:    s_load_dword s3, s[8:9], 0x4c
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sext_i32_i16 s4, s2
+; GFX10-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX10-NEXT:    s_sext_i32_i16 s7, s3
-; GFX10-NEXT:    s_ashr_i32 s6, s2, 24
-; GFX10-NEXT:    s_ashr_i32 s9, s3, 24
-; GFX10-NEXT:    s_lshr_b32 s4, s4, 8
-; GFX10-NEXT:    s_lshr_b32 s7, s7, 8
-; GFX10-NEXT:    v_min_i16 v0, s6, s9
-; GFX10-NEXT:    v_min_i16 v1, s4, s7
-; GFX10-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX10-NEXT:    s_lshr_b32 s8, s3, 16
-; GFX10-NEXT:    s_bfe_i32 s2, s2, 0x80000
-; GFX10-NEXT:    s_bfe_i32 s5, s5, 0x80000
-; GFX10-NEXT:    s_bfe_i32 s4, s8, 0x80000
-; GFX10-NEXT:    s_bfe_i32 s3, s3, 0x80000
-; GFX10-NEXT:    v_min_i16 v2, s5, s4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX10-NEXT:    v_min_i16 v3, s2, s3
-; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT:    s_ashr_i32 s4, s2, 24
+; GFX10-NEXT:    s_ashr_i32 s6, s3, 24
+; GFX10-NEXT:    s_sext_i32_i8 s8, s3
+; GFX10-NEXT:    s_sext_i32_i8 s9, s2
+; GFX10-NEXT:    s_bfe_i32 s3, s3, 0x80010
+; GFX10-NEXT:    s_bfe_i32 s2, s2, 0x80010
+; GFX10-NEXT:    s_ashr_i32 s7, s7, 8
+; GFX10-NEXT:    s_ashr_i32 s5, s5, 8
+; GFX10-NEXT:    s_min_i32 s8, s9, s8
+; GFX10-NEXT:    s_min_i32 s4, s4, s6
+; GFX10-NEXT:    s_min_i32 s2, s2, s3
+; GFX10-NEXT:    s_min_i32 s3, s5, s7
+; GFX10-NEXT:    s_and_b32 s5, s8, 0xff
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX10-NEXT:    s_or_b32 s3, s5, s3
+; GFX10-NEXT:    s_or_b32 s2, s2, s4
+; GFX10-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX10-NEXT:    s_or_b32 s2, s3, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-TRUE16-LABEL: s_test_imin_sle_v4i8:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    s_load_b32 s0, s[4:5], 0x28
-; GFX11-TRUE16-NEXT:    s_load_b32 s1, s[4:5], 0x4c
-; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_sext_i32_i16 s2, s0
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX11-TRUE16-NEXT:    s_sext_i32_i16 s7, s1
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s8, s1, 16
-; GFX11-TRUE16-NEXT:    s_ashr_i32 s6, s0, 24
-; GFX11-TRUE16-NEXT:    s_ashr_i32 s9, s1, 24
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s2, s2, 8
-; GFX11-TRUE16-NEXT:    s_bfe_i32 s3, s3, 0x80000
-; GFX11-TRUE16-NEXT:    s_bfe_i32 s0, s0, 0x80000
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s7, s7, 8
-; GFX11-TRUE16-NEXT:    s_bfe_i32 s8, s8, 0x80000
-; GFX11-TRUE16-NEXT:    s_bfe_i32 s1, s1, 0x80000
-; GFX11-TRUE16-NEXT:    v_min_i16 v0.l, s6, s9
-; GFX11-TRUE16-NEXT:    v_min_i16 v1.l, s3, s8
-; GFX11-TRUE16-NEXT:    v_min_i16 v2.l, s2, s7
-; GFX11-TRUE16-NEXT:    v_min_i16 v3.l, s0, s1
-; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v3, v2
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-FAKE16-LABEL: s_test_imin_sle_v4i8:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_clause 0x1
-; GFX11-FAKE16-NEXT:    s_load_b32 s0, s[4:5], 0x28
-; GFX11-FAKE16-NEXT:    s_load_b32 s1, s[4:5], 0x4c
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_sext_i32_i16 s2, s0
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX11-FAKE16-NEXT:    s_sext_i32_i16 s7, s1
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s8, s1, 16
-; GFX11-FAKE16-NEXT:    s_ashr_i32 s6, s0, 24
-; GFX11-FAKE16-NEXT:    s_bfe_i32 s0, s0, 0x80000
-; GFX11-FAKE16-NEXT:    s_ashr_i32 s9, s1, 24
-; GFX11-FAKE16-NEXT:    s_bfe_i32 s1, s1, 0x80000
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s2, s2, 8
-; GFX11-FAKE16-NEXT:    s_bfe_i32 s3, s3, 0x80000
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s7, s7, 8
-; GFX11-FAKE16-NEXT:    s_bfe_i32 s8, s8, 0x80000
-; GFX11-FAKE16-NEXT:    v_min_i16 v0, s6, s9
-; GFX11-FAKE16-NEXT:    v_min_i16 v1, s0, s1
-; GFX11-FAKE16-NEXT:    v_min_i16 v2, s3, s8
-; GFX11-FAKE16-NEXT:    v_min_i16 v3, s2, s7
-; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-FAKE16-NEXT:    s_endpgm
+; GFX11-LABEL: s_test_imin_sle_v4i8:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x28
+; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x4c
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_sext_i32_i16 s5, s2
+; GFX11-NEXT:    s_sext_i32_i16 s7, s3
+; GFX11-NEXT:    s_ashr_i32 s4, s2, 24
+; GFX11-NEXT:    s_ashr_i32 s6, s3, 24
+; GFX11-NEXT:    s_sext_i32_i8 s8, s3
+; GFX11-NEXT:    s_sext_i32_i8 s9, s2
+; GFX11-NEXT:    s_bfe_i32 s3, s3, 0x80010
+; GFX11-NEXT:    s_bfe_i32 s2, s2, 0x80010
+; GFX11-NEXT:    s_ashr_i32 s7, s7, 8
+; GFX11-NEXT:    s_ashr_i32 s5, s5, 8
+; GFX11-NEXT:    s_min_i32 s8, s9, s8
+; GFX11-NEXT:    s_min_i32 s4, s4, s6
+; GFX11-NEXT:    s_min_i32 s2, s2, s3
+; GFX11-NEXT:    s_min_i32 s3, s5, s7
+; GFX11-NEXT:    s_and_b32 s5, s8, 0xff
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_or_b32 s3, s5, s3
+; GFX11-NEXT:    s_or_b32 s2, s2, s4
+; GFX11-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_endpgm
   %cmp = icmp sle <4 x i8> %a, %b
   %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
   store <4 x i8> %val, ptr addrspace(1) %out

diff  --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 86fc0ace2c43f..6ab3022a91cd7 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -162,10 +162,11 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
 ; SDAG-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v0, 0xff
 ; SDAG-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT:    v_max_i16_e64 v1, s2, 0
-; SDAG-VI-NEXT:    v_max_i16_e64 v2, s3, 0
-; SDAG-VI-NEXT:    v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; SDAG-VI-NEXT:    v_min_i16_e32 v1, 0xff, v1
+; SDAG-VI-NEXT:    s_sext_i32_i16 s2, s2
+; SDAG-VI-NEXT:    s_sext_i32_i16 s3, s3
+; SDAG-VI-NEXT:    v_med3_i32 v1, s2, 0, v0
+; SDAG-VI-NEXT:    v_med3_i32 v0, s3, 0, v0
+; SDAG-VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SDAG-VI-NEXT:    v_or_b32_e32 v2, v1, v0
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v0, s0
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -689,12 +690,12 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
 ; SDAG-VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v0, 0xff
 ; SDAG-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT:    s_lshr_b32 s3, s2, 16
-; SDAG-VI-NEXT:    v_max_i16_e64 v1, s2, 0
-; SDAG-VI-NEXT:    v_max_i16_e64 v2, s3, 0
-; SDAG-VI-NEXT:    v_min_i16_e32 v1, 0xff, v1
-; SDAG-VI-NEXT:    v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; SDAG-VI-NEXT:    v_or_b32_e32 v2, v1, v0
+; SDAG-VI-NEXT:    s_ashr_i32 s3, s2, 16
+; SDAG-VI-NEXT:    s_sext_i32_i16 s2, s2
+; SDAG-VI-NEXT:    v_med3_i32 v1, s2, 0, v0
+; SDAG-VI-NEXT:    v_med3_i32 v0, s3, 0, v0
+; SDAG-VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SDAG-VI-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v0, s0
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v1, s1
 ; SDAG-VI-NEXT:    flat_store_dword v[0:1], v2


        


More information about the llvm-commits mailing list