[llvm] [DAG] Use known-bits when creating umulh/smulh. (PR #160916)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 28 10:24:16 PDT 2025
https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/160916
From db9d4ea65762e8d71b9905020950d992a541d104 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Fri, 26 Sep 2025 16:58:43 +0100
Subject: [PATCH 1/2] [DAG] Use known-bits when creating umulh/smulh.
This extends the creation of umulh/smulh instructions to handle cases where one
operand is a zext/sext and the other has enough known-zero bits (or sign bits)
that a narrower mulh can still be formed. This can be useful when one of the
operands has been hoisted out of a loop.
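
For illustration, here is a minimal IR sketch of the kind of pattern this now
covers (the function and value names are mine rather than taken from the
patch's tests, and it assumes a target where a 32-bit MULHU is legal): one
operand is an explicit zext, while the other merely has its upper 32 bits
proven zero by known-bits:

define i32 @umulh_known_bits(i32 %a, i64 %b) {
  %aw = zext i32 %a to i64
  %bl = and i64 %b, 4294967295   ; upper 32 bits known to be zero
  %m  = mul i64 %aw, %bl
  %hi = lshr i64 %m, 32          ; high half of the 64-bit product
  %r  = trunc i64 %hi to i32
  ret i32 %r
}

Previously the combine required both multiply operands to be matching extend
nodes, or one extend and a sufficiently small constant splat.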
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 32 ++--
llvm/test/CodeGen/AMDGPU/sdiv64.ll | 4 +-
llvm/test/CodeGen/AMDGPU/udiv64.ll | 8 +-
llvm/test/CodeGen/AMDGPU/urem64.ll | 16 +-
llvm/test/CodeGen/Thumb2/mve-vmulh.ll | 144 ++++--------------
5 files changed, 68 insertions(+), 136 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index cf221bba1e3a3..9e5311b65170f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10789,6 +10789,10 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
SDValue LeftOp = ShiftOperand.getOperand(0);
SDValue RightOp = ShiftOperand.getOperand(1);
+ if (LeftOp.getOpcode() != ISD::SIGN_EXTEND &&
+ LeftOp.getOpcode() != ISD::ZERO_EXTEND)
+ std::swap(LeftOp, RightOp);
+
bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND;
bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND;
@@ -10821,18 +10825,26 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
}
SDValue MulhRightOp;
- if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
- unsigned ActiveBits = IsSignExt
- ? Constant->getAPIntValue().getSignificantBits()
- : Constant->getAPIntValue().getActiveBits();
- if (ActiveBits > NarrowVTSize)
+ if (LeftOp.getOpcode() != RightOp.getOpcode()) {
+ if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
+ unsigned ActiveBits = IsSignExt
+ ? Constant->getAPIntValue().getSignificantBits()
+ : Constant->getAPIntValue().getActiveBits();
+ if (ActiveBits > NarrowVTSize)
+ return SDValue();
+ MulhRightOp = DAG.getConstant(
+ Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
+ NarrowVT);
+ } else if (IsZeroExt &&
+ DAG.computeKnownBits(RightOp).countMinLeadingZeros() >=
+ NarrowVTSize) {
+ MulhRightOp = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RightOp);
+ } else if (IsSignExt && DAG.ComputeNumSignBits(RightOp) > NarrowVTSize) {
+ MulhRightOp = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RightOp);
+ } else {
return SDValue();
- MulhRightOp = DAG.getConstant(
- Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
- NarrowVT);
+ }
} else {
- if (LeftOp.getOpcode() != RightOp.getOpcode())
- return SDValue();
// Check that the two extend nodes are the same type.
if (NarrowVT != RightOp.getOperand(0).getValueType())
return SDValue();
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 71f5a94a7f245..2d2468ea1c5e6 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -571,7 +571,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
; GCN-NEXT: v_mul_lo_u32 v3, v3, v2
; GCN-NEXT: v_mul_hi_u32 v3, v2, v3
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_mul_hi_u32 v2, v1, v2
+; GCN-NEXT: v_mul_hi_u32 v2, v2, v1
; GCN-NEXT: v_mul_u32_u24_e32 v3, v2, v0
; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v2
; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
@@ -598,7 +598,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v2
; GCN-IR-NEXT: v_mul_hi_u32 v3, v2, v3
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-IR-NEXT: v_mul_hi_u32 v2, v1, v2
+; GCN-IR-NEXT: v_mul_hi_u32 v2, v2, v1
; GCN-IR-NEXT: v_mul_u32_u24_e32 v3, v2, v0
; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v2
; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index fd461ac80ea55..6bc5577aec407 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -515,7 +515,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_mul_hi_u32 v0, s3, v0
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s3
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: s_mul_i32 s0, s0, s8
; GCN-NEXT: s_sub_i32 s0, s3, s0
@@ -551,7 +551,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64
; GCN-IR-NEXT: s_mov_b32 s4, s0
; GCN-IR-NEXT: s_mov_b32 s5, s1
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT: v_mul_hi_u32 v0, s3, v0
+; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s3
; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0
; GCN-IR-NEXT: s_mul_i32 s0, s0, s8
; GCN-IR-NEXT: s_sub_i32 s0, s3, s0
@@ -595,7 +595,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64
; GCN-NEXT: s_lshr_b32 s2, s3, 1
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s2
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: s_mul_i32 s0, s0, s8
@@ -633,7 +633,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64
; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1
; GCN-IR-NEXT: s_mov_b32 s4, s0
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s2
; GCN-IR-NEXT: s_mov_b32 s5, s1
; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0
; GCN-IR-NEXT: s_mul_i32 s0, s0, s8
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 137dc1fe42294..ad601d8e75973 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -467,7 +467,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64
; GCN-NEXT: s_lshr_b32 s2, s3, 1
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s2
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
@@ -502,7 +502,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64
; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1
; GCN-IR-NEXT: s_mov_b32 s4, s0
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s2
; GCN-IR-NEXT: s_mov_b32 s5, s1
; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0
@@ -544,7 +544,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
; GCN-NEXT: s_lshr_b32 s1, s9, 1
; GCN-NEXT: v_mul_hi_u32 v1, v0, v1
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_mul_hi_u32 v0, s1, v0
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s1
; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_readfirstlane_b32 s2, v0
@@ -562,7 +562,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mul_hi_u32 v0, v1, v0
; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; GCN-NEXT: v_mul_hi_u32 v2, s7, v0
+; GCN-NEXT: v_mul_hi_u32 v2, v0, s7
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v3, v1
@@ -599,7 +599,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
; GCN-IR-NEXT: s_lshr_b32 s1, s9, 1
; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT: v_mul_hi_u32 v0, s1, v0
+; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s1
; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0
@@ -617,7 +617,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: v_mul_hi_u32 v0, v1, v0
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; GCN-IR-NEXT: v_mul_hi_u32 v2, s7, v0
+; GCN-IR-NEXT: v_mul_hi_u32 v2, v0, s7
; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
; GCN-IR-NEXT: v_mov_b32_e32 v0, s8
; GCN-IR-NEXT: v_mov_b32_e32 v3, v1
@@ -728,7 +728,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GCN-NEXT: v_mul_lo_u32 v1, v1, s4
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT: v_mul_hi_u32 v0, s7, v0
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s7
; GCN-NEXT: v_sub_i32_e32 v1, vcc, s5, v1
; GCN-NEXT: v_and_b32_e32 v2, 0x7fffff, v1
; GCN-NEXT: v_readfirstlane_b32 s4, v0
@@ -775,7 +775,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6
; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, s4
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GCN-IR-NEXT: v_mul_hi_u32 v0, s7, v0
+; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s7
; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s5, v1
; GCN-IR-NEXT: v_and_b32_e32 v2, 0x7fffff, v1
; GCN-IR-NEXT: v_readfirstlane_b32 s4, v0
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
index 32648b6b449a8..8d8e5e9f48ab8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
@@ -793,23 +793,11 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @vmulhs_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
; CHECK-LABEL: vmulhs_kb_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov r1, s9
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov.f32 s10, s1
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: smmul r0, r0, r1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: smmul r1, r1, r2
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov r1, s11
-; CHECK-NEXT: smmul r0, r0, r1
-; CHECK-NEXT: vmov r1, s10
-; CHECK-NEXT: smmul r1, r1, r2
-; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT: vmov.f32 s4, s5
+; CHECK-NEXT: vmov.f32 s5, s7
+; CHECK-NEXT: vmov.f32 s6, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vmulh.s32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = sext <4 x i32> %s0 to <4 x i64>
@@ -823,23 +811,11 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @vmulhu_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
; CHECK-LABEL: vmulhu_kb_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov r1, s9
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov.f32 s10, s1
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: umull r0, r1, r0, r1
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: umull r0, r2, r0, r2
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT: vmov r1, s11
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: umull r0, r1, r0, r1
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: umull r0, r2, r0, r2
-; CHECK-NEXT: vmov q0[3], q0[1], r2, r1
+; CHECK-NEXT: vmov.f32 s4, s5
+; CHECK-NEXT: vmov.f32 s5, s7
+; CHECK-NEXT: vmov.f32 s6, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vmulh.u32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = zext <4 x i32> %s0 to <4 x i64>
@@ -853,23 +829,11 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @vmulhs_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
; CHECK-LABEL: vmulhs_kbc_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov r1, s9
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov.f32 s10, s1
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: smmul r0, r1, r0
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: smmul r1, r2, r1
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov r1, s11
-; CHECK-NEXT: smmul r0, r1, r0
-; CHECK-NEXT: vmov r1, s10
-; CHECK-NEXT: smmul r1, r2, r1
-; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT: vmov.f32 s4, s5
+; CHECK-NEXT: vmov.f32 s5, s7
+; CHECK-NEXT: vmov.f32 s6, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vmulh.s32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = sext <4 x i32> %s0 to <4 x i64>
@@ -883,23 +847,11 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @vmulhu_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
; CHECK-LABEL: vmulhu_kbc_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov r1, s9
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov.f32 s10, s1
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: umull r0, r1, r1, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: umull r0, r2, r2, r0
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT: vmov r1, s11
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: umull r0, r1, r1, r0
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: umull r0, r2, r2, r0
-; CHECK-NEXT: vmov q0[3], q0[1], r2, r1
+; CHECK-NEXT: vmov.f32 s4, s5
+; CHECK-NEXT: vmov.f32 s5, s7
+; CHECK-NEXT: vmov.f32 s6, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vmulh.u32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = zext <4 x i32> %s0 to <4 x i64>
@@ -913,25 +865,17 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vmulhs_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
; CHECK-LABEL: vmulhs_kb_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmovlt.s16 q4, q0
; CHECK-NEXT: vmov.f32 s13, s7
-; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmov.f32 s5, s6
; CHECK-NEXT: vmov.f32 s14, s9
; CHECK-NEXT: vmov.f32 s15, s11
; CHECK-NEXT: vmov.f32 s6, s8
-; CHECK-NEXT: vshr.s32 q3, q3, #16
-; CHECK-NEXT: vmov.f32 s7, s10
-; CHECK-NEXT: vmul.i32 q3, q4, q3
-; CHECK-NEXT: vshr.s32 q1, q1, #16
; CHECK-NEXT: vshr.u32 q3, q3, #16
-; CHECK-NEXT: vmul.i32 q0, q0, q1
-; CHECK-NEXT: vshr.u32 q0, q0, #16
-; CHECK-NEXT: vmovnt.i32 q0, q3
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vmov.f32 s7, s10
+; CHECK-NEXT: vshr.u32 q1, q1, #16
+; CHECK-NEXT: vmovnt.i32 q1, q3
+; CHECK-NEXT: vmulh.s16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = sext <8 x i16> %s0 to <8 x i32>
@@ -945,25 +889,17 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vmulhu_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
; CHECK-LABEL: vmulhu_kb_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmovlt.u16 q4, q0
; CHECK-NEXT: vmov.f32 s13, s7
-; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vmov.f32 s5, s6
; CHECK-NEXT: vmov.f32 s14, s9
; CHECK-NEXT: vmov.f32 s15, s11
; CHECK-NEXT: vmov.f32 s6, s8
; CHECK-NEXT: vshr.u32 q3, q3, #16
; CHECK-NEXT: vmov.f32 s7, s10
-; CHECK-NEXT: vmul.i32 q3, q4, q3
; CHECK-NEXT: vshr.u32 q1, q1, #16
-; CHECK-NEXT: vshr.u32 q3, q3, #16
-; CHECK-NEXT: vmul.i32 q0, q0, q1
-; CHECK-NEXT: vshr.u32 q0, q0, #16
-; CHECK-NEXT: vmovnt.i32 q0, q3
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vmovnt.i32 q1, q3
+; CHECK-NEXT: vmulh.u16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = zext <8 x i16> %s0 to <8 x i32>
@@ -977,25 +913,17 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vmulhs_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
; CHECK-LABEL: vmulhs_kbc_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmovlt.s16 q4, q0
; CHECK-NEXT: vmov.f32 s13, s7
-; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmov.f32 s5, s6
; CHECK-NEXT: vmov.f32 s14, s9
; CHECK-NEXT: vmov.f32 s15, s11
; CHECK-NEXT: vmov.f32 s6, s8
-; CHECK-NEXT: vshr.s32 q3, q3, #16
-; CHECK-NEXT: vmov.f32 s7, s10
-; CHECK-NEXT: vmul.i32 q3, q3, q4
-; CHECK-NEXT: vshr.s32 q1, q1, #16
; CHECK-NEXT: vshr.u32 q3, q3, #16
-; CHECK-NEXT: vmul.i32 q0, q1, q0
-; CHECK-NEXT: vshr.u32 q0, q0, #16
-; CHECK-NEXT: vmovnt.i32 q0, q3
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vmov.f32 s7, s10
+; CHECK-NEXT: vshr.u32 q1, q1, #16
+; CHECK-NEXT: vmovnt.i32 q1, q3
+; CHECK-NEXT: vmulh.s16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = sext <8 x i16> %s0 to <8 x i32>
@@ -1009,25 +937,17 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vmulhu_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
; CHECK-LABEL: vmulhu_kbc_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmovlt.u16 q4, q0
; CHECK-NEXT: vmov.f32 s13, s7
-; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vmov.f32 s5, s6
; CHECK-NEXT: vmov.f32 s14, s9
; CHECK-NEXT: vmov.f32 s15, s11
; CHECK-NEXT: vmov.f32 s6, s8
; CHECK-NEXT: vshr.u32 q3, q3, #16
; CHECK-NEXT: vmov.f32 s7, s10
-; CHECK-NEXT: vmul.i32 q3, q3, q4
; CHECK-NEXT: vshr.u32 q1, q1, #16
-; CHECK-NEXT: vshr.u32 q3, q3, #16
-; CHECK-NEXT: vmul.i32 q0, q1, q0
-; CHECK-NEXT: vshr.u32 q0, q0, #16
-; CHECK-NEXT: vmovnt.i32 q0, q3
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vmovnt.i32 q1, q3
+; CHECK-NEXT: vmulh.u16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = zext <8 x i16> %s0 to <8 x i32>
From 8c5cab314ed6bc5f5c77b2455980f3b9afc448be Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Tue, 28 Oct 2025 17:23:48 +0000
Subject: [PATCH 2/2] Update tests and add a oneuse check
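
As I read the new hasOneUse() guard, the known-bits paths now only fire when
the shifted multiply has no other users. A rough sketch of the kind of case
this is presumably meant to leave alone (names are mine, not from the tests):
the full 64-bit product is still needed, so forming a umulh would not let the
wide mul go away:

define i64 @umulh_extra_use(i32 %a, i64 %b) {
  %aw  = zext i32 %a to i64
  %bl  = and i64 %b, 4294967295  ; upper 32 bits known to be zero
  %m   = mul i64 %aw, %bl
  %hi  = lshr i64 %m, 32
  %sum = add i64 %m, %hi         ; second use of the wide multiply
  ret i64 %sum
}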
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 17 +-
llvm/test/CodeGen/RISCV/rvv/vmulhu-sdnode.ll | 71 +++----
llvm/test/CodeGen/X86/combine-pmuldq.ll | 191 +++++++++++-------
3 files changed, 142 insertions(+), 137 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9e5311b65170f..4e2b2015ccabd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10826,20 +10826,11 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
SDValue MulhRightOp;
if (LeftOp.getOpcode() != RightOp.getOpcode()) {
- if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
- unsigned ActiveBits = IsSignExt
- ? Constant->getAPIntValue().getSignificantBits()
- : Constant->getAPIntValue().getActiveBits();
- if (ActiveBits > NarrowVTSize)
- return SDValue();
- MulhRightOp = DAG.getConstant(
- Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
- NarrowVT);
- } else if (IsZeroExt &&
- DAG.computeKnownBits(RightOp).countMinLeadingZeros() >=
- NarrowVTSize) {
+ if (IsZeroExt && ShiftOperand.hasOneUse() &&
+ DAG.computeKnownBits(RightOp).countMinLeadingZeros() >= NarrowVTSize) {
MulhRightOp = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RightOp);
- } else if (IsSignExt && DAG.ComputeNumSignBits(RightOp) > NarrowVTSize) {
+ } else if (IsSignExt && ShiftOperand.hasOneUse() &&
+ DAG.ComputeNumSignBits(RightOp) > NarrowVTSize) {
MulhRightOp = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RightOp);
} else {
return SDValue();
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmulhu-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmulhu-sdnode.ll
index 3fd7f5be860cf..c0c9b1797f91f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmulhu-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmulhu-sdnode.ll
@@ -48,18 +48,11 @@ define <vscale x 1 x i32> @vmulhu_vi_nxv1i32_0(<vscale x 1 x i32> %va) {
}
define <vscale x 1 x i32> @vmulhu_vi_nxv1i32_1(<vscale x 1 x i32> %va) {
-; RV32-LABEL: vmulhu_vi_nxv1i32_1:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
-; RV32-NEXT: vsrl.vi v8, v8, 28
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vmulhu_vi_nxv1i32_1:
-; RV64: # %bb.0:
-; RV64-NEXT: li a0, 16
-; RV64-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
-; RV64-NEXT: vmulhu.vx v8, v8, a0
-; RV64-NEXT: ret
+; CHECK-LABEL: vmulhu_vi_nxv1i32_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsrl.vi v8, v8, 28
+; CHECK-NEXT: ret
%vb = zext <vscale x 1 x i32> splat (i32 16) to <vscale x 1 x i64>
%vc = zext <vscale x 1 x i32> %va to <vscale x 1 x i64>
%vd = mul <vscale x 1 x i64> %vb, %vc
@@ -114,18 +107,11 @@ define <vscale x 2 x i32> @vmulhu_vi_nxv2i32_0(<vscale x 2 x i32> %va) {
}
define <vscale x 2 x i32> @vmulhu_vi_nxv2i32_1(<vscale x 2 x i32> %va) {
-; RV32-LABEL: vmulhu_vi_nxv2i32_1:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; RV32-NEXT: vsrl.vi v8, v8, 28
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vmulhu_vi_nxv2i32_1:
-; RV64: # %bb.0:
-; RV64-NEXT: li a0, 16
-; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV64-NEXT: vmulhu.vx v8, v8, a0
-; RV64-NEXT: ret
+; CHECK-LABEL: vmulhu_vi_nxv2i32_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsrl.vi v8, v8, 28
+; CHECK-NEXT: ret
%vb = zext <vscale x 2 x i32> splat (i32 16) to <vscale x 2 x i64>
%vc = zext <vscale x 2 x i32> %va to <vscale x 2 x i64>
%vd = mul <vscale x 2 x i64> %vb, %vc
@@ -180,18 +166,11 @@ define <vscale x 4 x i32> @vmulhu_vi_nxv4i32_0(<vscale x 4 x i32> %va) {
}
define <vscale x 4 x i32> @vmulhu_vi_nxv4i32_1(<vscale x 4 x i32> %va) {
-; RV32-LABEL: vmulhu_vi_nxv4i32_1:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; RV32-NEXT: vsrl.vi v8, v8, 28
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vmulhu_vi_nxv4i32_1:
-; RV64: # %bb.0:
-; RV64-NEXT: li a0, 16
-; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV64-NEXT: vmulhu.vx v8, v8, a0
-; RV64-NEXT: ret
+; CHECK-LABEL: vmulhu_vi_nxv4i32_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT: vsrl.vi v8, v8, 28
+; CHECK-NEXT: ret
%vb = zext <vscale x 4 x i32> splat (i32 16) to <vscale x 4 x i64>
%vc = zext <vscale x 4 x i32> %va to <vscale x 4 x i64>
%vd = mul <vscale x 4 x i64> %vb, %vc
@@ -246,18 +225,11 @@ define <vscale x 8 x i32> @vmulhu_vi_nxv8i32_0(<vscale x 8 x i32> %va) {
}
define <vscale x 8 x i32> @vmulhu_vi_nxv8i32_1(<vscale x 8 x i32> %va) {
-; RV32-LABEL: vmulhu_vi_nxv8i32_1:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, m4, ta, ma
-; RV32-NEXT: vsrl.vi v8, v8, 28
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vmulhu_vi_nxv8i32_1:
-; RV64: # %bb.0:
-; RV64-NEXT: li a0, 16
-; RV64-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV64-NEXT: vmulhu.vx v8, v8, a0
-; RV64-NEXT: ret
+; CHECK-LABEL: vmulhu_vi_nxv8i32_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT: vsrl.vi v8, v8, 28
+; CHECK-NEXT: ret
%vb = zext <vscale x 8 x i32> splat (i32 16) to <vscale x 8 x i64>
%vc = zext <vscale x 8 x i32> %va to <vscale x 8 x i64>
%vd = mul <vscale x 8 x i64> %vb, %vc
@@ -265,3 +237,6 @@ define <vscale x 8 x i32> @vmulhu_vi_nxv8i32_1(<vscale x 8 x i32> %va) {
%vf = trunc <vscale x 8 x i64> %ve to <vscale x 8 x i32>
ret <vscale x 8 x i32> %vf
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index ff5329c637251..0b5ef7e3a514c 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -330,22 +330,27 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) {
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: movq $-2097152, %rax # imm = 0xFFE00000
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0,0]
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: .p2align 4
; SSE-NEXT: .LBB7_1: # %loop
; SSE-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE-NEXT: pmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
-; SSE-NEXT: pmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
-; SSE-NEXT: pmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero
-; SSE-NEXT: pmovzxdq {{.*#+}} xmm6 = mem[0],zero,mem[1],zero
-; SSE-NEXT: pmuludq %xmm2, %xmm6
+; SSE-NEXT: movdqu 2097152(%rdi,%rax), %xmm4
+; SSE-NEXT: movdqu 2097168(%rdi,%rax), %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm3, %xmm6
; SSE-NEXT: pmuludq %xmm2, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3]
-; SSE-NEXT: paddd %xmm5, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
+; SSE-NEXT: paddd %xmm5, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm3, %xmm5
; SSE-NEXT: pmuludq %xmm2, %xmm4
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm3[1,3]
-; SSE-NEXT: paddd %xmm4, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
+; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: subq $-128, %rax
; SSE-NEXT: jne .LBB7_1
; SSE-NEXT: # %bb.2: # %end
@@ -356,27 +361,33 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) {
; AVX1-NEXT: movl %esi, %eax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: movq $-2097152, %rax # imm = 0xFFE00000
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm4[1],xmm5[1]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm5[0]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm4[1],xmm5[1]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; AVX1-NEXT: .p2align 4
; AVX1-NEXT: .LBB7_1: # %loop
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = mem[0],zero,mem[1],zero
-; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm6
-; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5
-; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3]
-; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
-; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3],xmm4[1,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vmovdqu 2097152(%rdi,%rax), %xmm5
+; AVX1-NEXT: vmovdqu 2097168(%rdi,%rax), %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm1, %xmm7, %xmm7
+; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm7, %xmm7
+; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
; AVX1-NEXT: subq $-128, %rax
; AVX1-NEXT: jne .LBB7_1
; AVX1-NEXT: # %bb.2: # %end
@@ -389,16 +400,19 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) {
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm1
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: movq $-2097152, %rax # imm = 0xFFE00000
+; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT: .p2align 4
; AVX2-NEXT: .LBB7_1: # %loop
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm2
-; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vmovdqu 2097152(%rdi,%rax), %ymm3
+; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm4
+; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+; AVX2-NEXT: vpaddd %ymm0, %ymm3, %ymm0
; AVX2-NEXT: subq $-128, %rax
; AVX2-NEXT: jne .LBB7_1
; AVX2-NEXT: # %bb.2: # %end
@@ -410,14 +424,18 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) {
; AVX512VL-NEXT: vpbroadcastq %rax, %zmm1
; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: movq $-2097152, %rax # imm = 0xFFE00000
+; AVX512VL-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX512VL-NEXT: .p2align 4
; AVX512VL-NEXT: .LBB7_1: # %loop
; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX512VL-NEXT: vpmuludq %zmm2, %zmm1, %zmm2
-; AVX512VL-NEXT: vpsrlq $32, %zmm2, %zmm2
-; AVX512VL-NEXT: vpmovqd %zmm2, %ymm2
-; AVX512VL-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vmovdqu 2097152(%rdi,%rax), %ymm3
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7]
+; AVX512VL-NEXT: vpmuludq %ymm2, %ymm4, %ymm4
+; AVX512VL-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+; AVX512VL-NEXT: vpaddd %ymm0, %ymm3, %ymm0
; AVX512VL-NEXT: subq $-128, %rax
; AVX512VL-NEXT: jne .LBB7_1
; AVX512VL-NEXT: # %bb.2: # %end
@@ -429,14 +447,18 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) {
; AVX512DQVL-NEXT: vpbroadcastq %rax, %zmm1
; AVX512DQVL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512DQVL-NEXT: movq $-2097152, %rax # imm = 0xFFE00000
+; AVX512DQVL-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX512DQVL-NEXT: .p2align 4
; AVX512DQVL-NEXT: .LBB7_1: # %loop
; AVX512DQVL-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512DQVL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX512DQVL-NEXT: vpmuludq %zmm2, %zmm1, %zmm2
-; AVX512DQVL-NEXT: vpsrlq $32, %zmm2, %zmm2
-; AVX512DQVL-NEXT: vpmovqd %zmm2, %ymm2
-; AVX512DQVL-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX512DQVL-NEXT: vmovdqu 2097152(%rdi,%rax), %ymm3
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7]
+; AVX512DQVL-NEXT: vpmuludq %ymm2, %ymm4, %ymm4
+; AVX512DQVL-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
+; AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+; AVX512DQVL-NEXT: vpaddd %ymm0, %ymm3, %ymm0
; AVX512DQVL-NEXT: subq $-128, %rax
; AVX512DQVL-NEXT: jne .LBB7_1
; AVX512DQVL-NEXT: # %bb.2: # %end
@@ -531,27 +553,33 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) {
; AVX1-NEXT: movslq %esi, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: movq $-2097152, %rax # imm = 0xFFE00000
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm4[1],xmm5[1]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm5[0]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm4[1],xmm5[1]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; AVX1-NEXT: .p2align 4
; AVX1-NEXT: .LBB8_1: # %loop
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vpmovsxdq 2097152(%rdi,%rax), %xmm3
-; AVX1-NEXT: vpmovsxdq 2097160(%rdi,%rax), %xmm4
-; AVX1-NEXT: vpmovsxdq 2097168(%rdi,%rax), %xmm5
-; AVX1-NEXT: vpmovsxdq 2097176(%rdi,%rax), %xmm6
-; AVX1-NEXT: vpmuldq %xmm6, %xmm2, %xmm6
-; AVX1-NEXT: vpmuldq %xmm5, %xmm1, %xmm5
-; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3]
-; AVX1-NEXT: vpmuldq %xmm4, %xmm2, %xmm4
-; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3],xmm4[1,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vmovdqu 2097152(%rdi,%rax), %xmm5
+; AVX1-NEXT: vmovdqu 2097168(%rdi,%rax), %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm1, %xmm7, %xmm7
+; AVX1-NEXT: vpmuldq %xmm2, %xmm5, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm3, %xmm7, %xmm7
+; AVX1-NEXT: vpmuldq %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
; AVX1-NEXT: subq $-128, %rax
; AVX1-NEXT: jne .LBB8_1
; AVX1-NEXT: # %bb.2: # %end
@@ -564,16 +592,19 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) {
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm1
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: movq $-2097152, %rax # imm = 0xFFE00000
+; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT: .p2align 4
; AVX2-NEXT: .LBB8_1: # %loop
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vpmovsxdq 2097152(%rdi,%rax), %ymm2
-; AVX2-NEXT: vpmovsxdq 2097168(%rdi,%rax), %ymm3
-; AVX2-NEXT: vpmuldq %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpmuldq %ymm2, %ymm1, %ymm2
-; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vmovdqu 2097152(%rdi,%rax), %ymm3
+; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpmuldq %ymm2, %ymm4, %ymm4
+; AVX2-NEXT: vpmuldq %ymm1, %ymm3, %ymm3
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+; AVX2-NEXT: vpaddd %ymm0, %ymm3, %ymm0
; AVX2-NEXT: subq $-128, %rax
; AVX2-NEXT: jne .LBB8_1
; AVX2-NEXT: # %bb.2: # %end
@@ -585,14 +616,18 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) {
; AVX512VL-NEXT: vpbroadcastq %rax, %zmm1
; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: movq $-2097152, %rax # imm = 0xFFE00000
+; AVX512VL-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX512VL-NEXT: .p2align 4
; AVX512VL-NEXT: .LBB8_1: # %loop
; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX512VL-NEXT: vpmuldq %zmm2, %zmm1, %zmm2
-; AVX512VL-NEXT: vpsrlq $32, %zmm2, %zmm2
-; AVX512VL-NEXT: vpmovqd %zmm2, %ymm2
-; AVX512VL-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vmovdqu 2097152(%rdi,%rax), %ymm3
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7]
+; AVX512VL-NEXT: vpmuldq %ymm2, %ymm4, %ymm4
+; AVX512VL-NEXT: vpmuldq %ymm1, %ymm3, %ymm3
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+; AVX512VL-NEXT: vpaddd %ymm0, %ymm3, %ymm0
; AVX512VL-NEXT: subq $-128, %rax
; AVX512VL-NEXT: jne .LBB8_1
; AVX512VL-NEXT: # %bb.2: # %end
@@ -604,14 +639,18 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) {
; AVX512DQVL-NEXT: vpbroadcastq %rax, %zmm1
; AVX512DQVL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512DQVL-NEXT: movq $-2097152, %rax # imm = 0xFFE00000
+; AVX512DQVL-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX512DQVL-NEXT: .p2align 4
; AVX512DQVL-NEXT: .LBB8_1: # %loop
; AVX512DQVL-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512DQVL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX512DQVL-NEXT: vpmuldq %zmm2, %zmm1, %zmm2
-; AVX512DQVL-NEXT: vpsrlq $32, %zmm2, %zmm2
-; AVX512DQVL-NEXT: vpmovqd %zmm2, %ymm2
-; AVX512DQVL-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX512DQVL-NEXT: vmovdqu 2097152(%rdi,%rax), %ymm3
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7]
+; AVX512DQVL-NEXT: vpmuldq %ymm2, %ymm4, %ymm4
+; AVX512DQVL-NEXT: vpmuldq %ymm1, %ymm3, %ymm3
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
+; AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+; AVX512DQVL-NEXT: vpaddd %ymm0, %ymm3, %ymm0
; AVX512DQVL-NEXT: subq $-128, %rax
; AVX512DQVL-NEXT: jne .LBB8_1
; AVX512DQVL-NEXT: # %bb.2: # %end