[llvm] [DAG] Use known-bits when creating umulh/smulh. (PR #160916)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 26 09:00:54 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: David Green (davemgreen)
<details>
<summary>Changes</summary>
This extends the creation of umulh/smulh instructions to handle cases where one operand is a zext/sext and the other has enough known zero bits or known sign bits for a mulh to be formed. This can be useful when one of the operands is hoisted out of a loop.
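For illustration, a minimal hypothetical IR sketch (not taken from the patch's tests; names are made up) of the kind of pattern this now covers: the left multiply operand is a zext, and the right operand is not an extend node but has its upper 32 bits known to be zero, so the wide multiply-plus-shift should now be narrowed to a 32-bit umulh. An analogous sext case applies when the other operand has more than 32 known sign bits (e.g. via an ashr), forming smulh.

```llvm
define i32 @umulh_knownbits(i32 %x, i64 %y) {
  %xe = zext i32 %x to i64
  %ym = and i64 %y, 4294967295   ; top 32 bits known zero, but not a zext node
  %m  = mul i64 %xe, %ym
  %hi = lshr i64 %m, 32          ; take the high half of the 64-bit product
  %t  = trunc i64 %hi to i32
  ret i32 %t
}
```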
---
Full diff: https://github.com/llvm/llvm-project/pull/160916.diff
5 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+22-10)
- (modified) llvm/test/CodeGen/AMDGPU/sdiv64.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/udiv64.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/urem64.ll (+8-8)
- (modified) llvm/test/CodeGen/Thumb2/mve-vmulh.ll (+32-112)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c81568672de3c..30cb410d6cf39 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10789,6 +10789,10 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
SDValue LeftOp = ShiftOperand.getOperand(0);
SDValue RightOp = ShiftOperand.getOperand(1);
+ if (LeftOp.getOpcode() != ISD::SIGN_EXTEND &&
+ LeftOp.getOpcode() != ISD::ZERO_EXTEND)
+ std::swap(LeftOp, RightOp);
+
bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND;
bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND;
@@ -10821,18 +10825,26 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
}
SDValue MulhRightOp;
- if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
- unsigned ActiveBits = IsSignExt
- ? Constant->getAPIntValue().getSignificantBits()
- : Constant->getAPIntValue().getActiveBits();
- if (ActiveBits > NarrowVTSize)
+ if (LeftOp.getOpcode() != RightOp.getOpcode()) {
+ if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
+ unsigned ActiveBits = IsSignExt
+ ? Constant->getAPIntValue().getSignificantBits()
+ : Constant->getAPIntValue().getActiveBits();
+ if (ActiveBits > NarrowVTSize)
+ return SDValue();
+ MulhRightOp = DAG.getConstant(
+ Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
+ NarrowVT);
+ } else if (IsZeroExt &&
+ DAG.computeKnownBits(RightOp).countMinLeadingZeros() >=
+ NarrowVTSize) {
+ MulhRightOp = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RightOp);
+ } else if (IsSignExt && DAG.ComputeNumSignBits(RightOp) > NarrowVTSize) {
+ MulhRightOp = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RightOp);
+ } else {
return SDValue();
- MulhRightOp = DAG.getConstant(
- Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
- NarrowVT);
+ }
} else {
- if (LeftOp.getOpcode() != RightOp.getOpcode())
- return SDValue();
// Check that the two extend nodes are the same type.
if (NarrowVT != RightOp.getOperand(0).getValueType())
return SDValue();
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index ddec6af0af69e..68fe14db7edd0 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -572,7 +572,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
; GCN-NEXT: v_mul_lo_u32 v3, v3, v2
; GCN-NEXT: v_mul_hi_u32 v3, v2, v3
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_mul_hi_u32 v2, v1, v2
+; GCN-NEXT: v_mul_hi_u32 v2, v2, v1
; GCN-NEXT: v_mul_u32_u24_e32 v3, v2, v0
; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v2
; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
@@ -599,7 +599,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v2
; GCN-IR-NEXT: v_mul_hi_u32 v3, v2, v3
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-IR-NEXT: v_mul_hi_u32 v2, v1, v2
+; GCN-IR-NEXT: v_mul_hi_u32 v2, v2, v1
; GCN-IR-NEXT: v_mul_u32_u24_e32 v3, v2, v0
; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v2
; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index dc11e81476a7e..5b48a1259c680 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -512,7 +512,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_mul_hi_u32 v0, s3, v0
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s3
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: s_mul_i32 s0, s0, s8
; GCN-NEXT: s_sub_i32 s0, s3, s0
@@ -548,7 +548,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64
; GCN-IR-NEXT: s_mov_b32 s4, s0
; GCN-IR-NEXT: s_mov_b32 s5, s1
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT: v_mul_hi_u32 v0, s3, v0
+; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s3
; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0
; GCN-IR-NEXT: s_mul_i32 s0, s0, s8
; GCN-IR-NEXT: s_sub_i32 s0, s3, s0
@@ -592,7 +592,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64
; GCN-NEXT: s_lshr_b32 s2, s3, 1
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s2
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: s_mul_i32 s0, s0, s8
@@ -630,7 +630,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64
; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1
; GCN-IR-NEXT: s_mov_b32 s4, s0
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s2
; GCN-IR-NEXT: s_mov_b32 s5, s1
; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0
; GCN-IR-NEXT: s_mul_i32 s0, s0, s8
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index dc25caadb99a9..0ae448277feaa 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -469,7 +469,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64
; GCN-NEXT: s_lshr_b32 s2, s3, 1
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s2
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
@@ -504,7 +504,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64
; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1
; GCN-IR-NEXT: s_mov_b32 s4, s0
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s2
; GCN-IR-NEXT: s_mov_b32 s5, s1
; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0
@@ -546,7 +546,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
; GCN-NEXT: s_lshr_b32 s1, s9, 1
; GCN-NEXT: v_mul_hi_u32 v1, v0, v1
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_mul_hi_u32 v0, s1, v0
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s1
; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_readfirstlane_b32 s2, v0
@@ -564,7 +564,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mul_hi_u32 v0, v1, v0
; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; GCN-NEXT: v_mul_hi_u32 v2, s7, v0
+; GCN-NEXT: v_mul_hi_u32 v2, v0, s7
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v3, v1
@@ -601,7 +601,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
; GCN-IR-NEXT: s_lshr_b32 s1, s9, 1
; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT: v_mul_hi_u32 v0, s1, v0
+; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s1
; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0
@@ -619,7 +619,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: v_mul_hi_u32 v0, v1, v0
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; GCN-IR-NEXT: v_mul_hi_u32 v2, s7, v0
+; GCN-IR-NEXT: v_mul_hi_u32 v2, v0, s7
; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
; GCN-IR-NEXT: v_mov_b32_e32 v0, s8
; GCN-IR-NEXT: v_mov_b32_e32 v3, v1
@@ -730,7 +730,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GCN-NEXT: v_mul_lo_u32 v1, v1, s4
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT: v_mul_hi_u32 v0, s7, v0
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s7
; GCN-NEXT: v_sub_i32_e32 v1, vcc, s5, v1
; GCN-NEXT: v_and_b32_e32 v2, 0x7fffff, v1
; GCN-NEXT: v_readfirstlane_b32 s4, v0
@@ -777,7 +777,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6
; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, s4
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GCN-IR-NEXT: v_mul_hi_u32 v0, s7, v0
+; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s7
; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s5, v1
; GCN-IR-NEXT: v_and_b32_e32 v2, 0x7fffff, v1
; GCN-IR-NEXT: v_readfirstlane_b32 s4, v0
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
index 32648b6b449a8..8d8e5e9f48ab8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
@@ -793,23 +793,11 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @vmulhs_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
; CHECK-LABEL: vmulhs_kb_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov r1, s9
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov.f32 s10, s1
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: smmul r0, r0, r1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: smmul r1, r1, r2
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov r1, s11
-; CHECK-NEXT: smmul r0, r0, r1
-; CHECK-NEXT: vmov r1, s10
-; CHECK-NEXT: smmul r1, r1, r2
-; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT: vmov.f32 s4, s5
+; CHECK-NEXT: vmov.f32 s5, s7
+; CHECK-NEXT: vmov.f32 s6, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vmulh.s32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = sext <4 x i32> %s0 to <4 x i64>
@@ -823,23 +811,11 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @vmulhu_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
; CHECK-LABEL: vmulhu_kb_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov r1, s9
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov.f32 s10, s1
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: umull r0, r1, r0, r1
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: umull r0, r2, r0, r2
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT: vmov r1, s11
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: umull r0, r1, r0, r1
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: umull r0, r2, r0, r2
-; CHECK-NEXT: vmov q0[3], q0[1], r2, r1
+; CHECK-NEXT: vmov.f32 s4, s5
+; CHECK-NEXT: vmov.f32 s5, s7
+; CHECK-NEXT: vmov.f32 s6, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vmulh.u32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = zext <4 x i32> %s0 to <4 x i64>
@@ -853,23 +829,11 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @vmulhs_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
; CHECK-LABEL: vmulhs_kbc_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov r1, s9
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov.f32 s10, s1
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: smmul r0, r1, r0
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: smmul r1, r2, r1
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov r1, s11
-; CHECK-NEXT: smmul r0, r1, r0
-; CHECK-NEXT: vmov r1, s10
-; CHECK-NEXT: smmul r1, r2, r1
-; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT: vmov.f32 s4, s5
+; CHECK-NEXT: vmov.f32 s5, s7
+; CHECK-NEXT: vmov.f32 s6, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vmulh.s32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = sext <4 x i32> %s0 to <4 x i64>
@@ -883,23 +847,11 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @vmulhu_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
; CHECK-LABEL: vmulhu_kbc_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov r1, s9
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov.f32 s10, s1
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: umull r0, r1, r1, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: umull r0, r2, r2, r0
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT: vmov r1, s11
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: umull r0, r1, r1, r0
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: umull r0, r2, r2, r0
-; CHECK-NEXT: vmov q0[3], q0[1], r2, r1
+; CHECK-NEXT: vmov.f32 s4, s5
+; CHECK-NEXT: vmov.f32 s5, s7
+; CHECK-NEXT: vmov.f32 s6, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vmulh.u32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = zext <4 x i32> %s0 to <4 x i64>
@@ -913,25 +865,17 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vmulhs_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
; CHECK-LABEL: vmulhs_kb_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmovlt.s16 q4, q0
; CHECK-NEXT: vmov.f32 s13, s7
-; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmov.f32 s5, s6
; CHECK-NEXT: vmov.f32 s14, s9
; CHECK-NEXT: vmov.f32 s15, s11
; CHECK-NEXT: vmov.f32 s6, s8
-; CHECK-NEXT: vshr.s32 q3, q3, #16
-; CHECK-NEXT: vmov.f32 s7, s10
-; CHECK-NEXT: vmul.i32 q3, q4, q3
-; CHECK-NEXT: vshr.s32 q1, q1, #16
; CHECK-NEXT: vshr.u32 q3, q3, #16
-; CHECK-NEXT: vmul.i32 q0, q0, q1
-; CHECK-NEXT: vshr.u32 q0, q0, #16
-; CHECK-NEXT: vmovnt.i32 q0, q3
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vmov.f32 s7, s10
+; CHECK-NEXT: vshr.u32 q1, q1, #16
+; CHECK-NEXT: vmovnt.i32 q1, q3
+; CHECK-NEXT: vmulh.s16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = sext <8 x i16> %s0 to <8 x i32>
@@ -945,25 +889,17 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vmulhu_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
; CHECK-LABEL: vmulhu_kb_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmovlt.u16 q4, q0
; CHECK-NEXT: vmov.f32 s13, s7
-; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vmov.f32 s5, s6
; CHECK-NEXT: vmov.f32 s14, s9
; CHECK-NEXT: vmov.f32 s15, s11
; CHECK-NEXT: vmov.f32 s6, s8
; CHECK-NEXT: vshr.u32 q3, q3, #16
; CHECK-NEXT: vmov.f32 s7, s10
-; CHECK-NEXT: vmul.i32 q3, q4, q3
; CHECK-NEXT: vshr.u32 q1, q1, #16
-; CHECK-NEXT: vshr.u32 q3, q3, #16
-; CHECK-NEXT: vmul.i32 q0, q0, q1
-; CHECK-NEXT: vshr.u32 q0, q0, #16
-; CHECK-NEXT: vmovnt.i32 q0, q3
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vmovnt.i32 q1, q3
+; CHECK-NEXT: vmulh.u16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = zext <8 x i16> %s0 to <8 x i32>
@@ -977,25 +913,17 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vmulhs_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
; CHECK-LABEL: vmulhs_kbc_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmovlt.s16 q4, q0
; CHECK-NEXT: vmov.f32 s13, s7
-; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmov.f32 s5, s6
; CHECK-NEXT: vmov.f32 s14, s9
; CHECK-NEXT: vmov.f32 s15, s11
; CHECK-NEXT: vmov.f32 s6, s8
-; CHECK-NEXT: vshr.s32 q3, q3, #16
-; CHECK-NEXT: vmov.f32 s7, s10
-; CHECK-NEXT: vmul.i32 q3, q3, q4
-; CHECK-NEXT: vshr.s32 q1, q1, #16
; CHECK-NEXT: vshr.u32 q3, q3, #16
-; CHECK-NEXT: vmul.i32 q0, q1, q0
-; CHECK-NEXT: vshr.u32 q0, q0, #16
-; CHECK-NEXT: vmovnt.i32 q0, q3
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vmov.f32 s7, s10
+; CHECK-NEXT: vshr.u32 q1, q1, #16
+; CHECK-NEXT: vmovnt.i32 q1, q3
+; CHECK-NEXT: vmulh.s16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = sext <8 x i16> %s0 to <8 x i32>
@@ -1009,25 +937,17 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vmulhu_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
; CHECK-LABEL: vmulhu_kbc_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmovlt.u16 q4, q0
; CHECK-NEXT: vmov.f32 s13, s7
-; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vmov.f32 s5, s6
; CHECK-NEXT: vmov.f32 s14, s9
; CHECK-NEXT: vmov.f32 s15, s11
; CHECK-NEXT: vmov.f32 s6, s8
; CHECK-NEXT: vshr.u32 q3, q3, #16
; CHECK-NEXT: vmov.f32 s7, s10
-; CHECK-NEXT: vmul.i32 q3, q3, q4
; CHECK-NEXT: vshr.u32 q1, q1, #16
-; CHECK-NEXT: vshr.u32 q3, q3, #16
-; CHECK-NEXT: vmul.i32 q0, q1, q0
-; CHECK-NEXT: vshr.u32 q0, q0, #16
-; CHECK-NEXT: vmovnt.i32 q0, q3
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vmovnt.i32 q1, q3
+; CHECK-NEXT: vmulh.u16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = zext <8 x i16> %s0 to <8 x i32>
``````````
</details>
https://github.com/llvm/llvm-project/pull/160916