[llvm] DAG: Fix vector_shuffle -> splat fold defining undef lanes (PR #123596)
via llvm-commits (llvm-commits at lists.llvm.org)
Mon Jan 20 03:52:24 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-selectiondag
Author: Matt Arsenault (arsenm)
Changes:
For shufflevector splats with undef lanes in the mask, this fold was
introducing real values for those lanes. Filter the build_vector
elements so that lanes which are undef in the mask stay undef.

This avoids AMDGPU test regressions in a future change.
test/CodeGen/X86/urem-seteq-illegal-types.ll looks worse, but I did not
investigate.
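
For reference, a minimal IR sketch of the kind of pattern this combine sees (a hypothetical example, not taken from the patch's tests): the insertelement chain below becomes a BUILD_VECTOR node in the DAG, and the shuffle is a splat of element 1 whose mask leaves lane 2 undefined. Previously the canonicalization rebuilt the splat with %b in every lane; with this change the undef mask lane stays undef in the new build_vector.

```llvm
define <4 x i32> @splat_with_undef_lane(i32 %a, i32 %b, i32 %c, i32 %d) {
  ; Build a vector from scalars (a BUILD_VECTOR in the DAG), then splat
  ; element 1 with a mask that leaves lane 2 undefined.
  %v0 = insertelement <4 x i32> poison, i32 %a, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 %b, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 %c, i32 2
  %v3 = insertelement <4 x i32> %v2, i32 %d, i32 3
  ; Splat mask <1, 1, poison, 1>: the combine should now produce
  ; build_vector(%b, %b, undef, %b) rather than %b in every lane.
  %splat = shufflevector <4 x i32> %v3, <4 x i32> poison,
                         <4 x i32> <i32 1, i32 1, i32 poison, i32 1>
  ret <4 x i32> %splat
}
```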
---
Patch is 24.32 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123596.diff
9 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+9-1)
- (modified) llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll (+10-10)
- (modified) llvm/test/CodeGen/WebAssembly/simd.ll (-30)
- (modified) llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll (+6-8)
- (modified) llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll (+4-2)
- (modified) llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll (+32-36)
- (modified) llvm/test/CodeGen/X86/vec_smulo.ll (+9-15)
- (modified) llvm/test/CodeGen/X86/vec_umulo.ll (+4-6)
- (modified) llvm/test/CodeGen/X86/widen_shuffle-1.ll (+2-3)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 49e5b7d9ef0141..671a14e6250fc6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -26373,9 +26373,17 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
if (AllSame)
return N0;
- // Canonicalize any other splat as a build_vector.
+ // Canonicalize any other splat as a build_vector, but avoid defining any
+ // undefined elements in the mask.
SDValue Splatted = V->getOperand(SplatIndex);
SmallVector<SDValue, 8> Ops(NumElts, Splatted);
+ EVT EltVT = Splatted.getValueType();
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (SVN->getMaskElt(i) < 0)
+ Ops[i] = DAG.getUNDEF(EltVT);
+ }
+
SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops);
// We may have jumped through bitcasts, so the type of the
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
index 4a036a7868c1a9..95ff0d9a3a9c60 100644
--- a/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
@@ -3628,15 +3628,15 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR9LE-LABEL: v2ppcf128_fast:
; PWR9LE: # %bb.0: # %entry
; PWR9LE-NEXT: mflr r0
-; PWR9LE-NEXT: stdu r1, -64(r1)
-; PWR9LE-NEXT: std r0, 80(r1)
+; PWR9LE-NEXT: stdu r1, -48(r1)
+; PWR9LE-NEXT: std r0, 64(r1)
; PWR9LE-NEXT: bl __gcc_qadd
; PWR9LE-NEXT: nop
; PWR9LE-NEXT: stfd f2, 40(r1)
; PWR9LE-NEXT: stfd f1, 32(r1)
; PWR9LE-NEXT: lxv vs1, 32(r1)
; PWR9LE-NEXT: xxswapd vs2, vs1
-; PWR9LE-NEXT: addi r1, r1, 64
+; PWR9LE-NEXT: addi r1, r1, 48
; PWR9LE-NEXT: ld r0, 16(r1)
; PWR9LE-NEXT: mtlr r0
; PWR9LE-NEXT: blr
@@ -3644,15 +3644,15 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR9BE-LABEL: v2ppcf128_fast:
; PWR9BE: # %bb.0: # %entry
; PWR9BE-NEXT: mflr r0
-; PWR9BE-NEXT: stdu r1, -144(r1)
-; PWR9BE-NEXT: std r0, 160(r1)
+; PWR9BE-NEXT: stdu r1, -128(r1)
+; PWR9BE-NEXT: std r0, 144(r1)
; PWR9BE-NEXT: bl __gcc_qadd
; PWR9BE-NEXT: nop
; PWR9BE-NEXT: stfd f2, 120(r1)
; PWR9BE-NEXT: stfd f1, 112(r1)
; PWR9BE-NEXT: lxv vs1, 112(r1)
; PWR9BE-NEXT: xxswapd vs2, vs1
-; PWR9BE-NEXT: addi r1, r1, 144
+; PWR9BE-NEXT: addi r1, r1, 128
; PWR9BE-NEXT: ld r0, 16(r1)
; PWR9BE-NEXT: mtlr r0
; PWR9BE-NEXT: blr
@@ -3661,13 +3661,13 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR10LE: # %bb.0: # %entry
; PWR10LE-NEXT: mflr r0
; PWR10LE-NEXT: std r0, 16(r1)
-; PWR10LE-NEXT: stdu r1, -64(r1)
+; PWR10LE-NEXT: stdu r1, -48(r1)
; PWR10LE-NEXT: bl __gcc_qadd at notoc
; PWR10LE-NEXT: stfd f2, 40(r1)
; PWR10LE-NEXT: stfd f1, 32(r1)
; PWR10LE-NEXT: lxv vs1, 32(r1)
; PWR10LE-NEXT: xxswapd vs2, vs1
-; PWR10LE-NEXT: addi r1, r1, 64
+; PWR10LE-NEXT: addi r1, r1, 48
; PWR10LE-NEXT: ld r0, 16(r1)
; PWR10LE-NEXT: mtlr r0
; PWR10LE-NEXT: blr
@@ -3676,14 +3676,14 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR10BE: # %bb.0: # %entry
; PWR10BE-NEXT: mflr r0
; PWR10BE-NEXT: std r0, 16(r1)
-; PWR10BE-NEXT: stdu r1, -144(r1)
+; PWR10BE-NEXT: stdu r1, -128(r1)
; PWR10BE-NEXT: bl __gcc_qadd
; PWR10BE-NEXT: nop
; PWR10BE-NEXT: stfd f2, 120(r1)
; PWR10BE-NEXT: stfd f1, 112(r1)
; PWR10BE-NEXT: lxv vs1, 112(r1)
; PWR10BE-NEXT: xxswapd vs2, vs1
-; PWR10BE-NEXT: addi r1, r1, 144
+; PWR10BE-NEXT: addi r1, r1, 128
; PWR10BE-NEXT: ld r0, 16(r1)
; PWR10BE-NEXT: mtlr r0
; PWR10BE-NEXT: blr
diff --git a/llvm/test/CodeGen/WebAssembly/simd.ll b/llvm/test/CodeGen/WebAssembly/simd.ll
index 5ec9f6a2a321b3..7228d5335a33f6 100644
--- a/llvm/test/CodeGen/WebAssembly/simd.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd.ll
@@ -481,21 +481,6 @@ define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v16i8:
; NO-SIMD128: .functype shuffle_undef_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.store8 15($0), $2
-; NO-SIMD128-NEXT: i32.store8 14($0), $2
-; NO-SIMD128-NEXT: i32.store8 13($0), $2
-; NO-SIMD128-NEXT: i32.store8 12($0), $2
-; NO-SIMD128-NEXT: i32.store8 11($0), $2
-; NO-SIMD128-NEXT: i32.store8 10($0), $2
-; NO-SIMD128-NEXT: i32.store8 9($0), $2
-; NO-SIMD128-NEXT: i32.store8 8($0), $2
-; NO-SIMD128-NEXT: i32.store8 7($0), $2
-; NO-SIMD128-NEXT: i32.store8 6($0), $2
-; NO-SIMD128-NEXT: i32.store8 5($0), $2
-; NO-SIMD128-NEXT: i32.store8 4($0), $2
-; NO-SIMD128-NEXT: i32.store8 3($0), $2
-; NO-SIMD128-NEXT: i32.store8 2($0), $2
-; NO-SIMD128-NEXT: i32.store8 1($0), $2
; NO-SIMD128-NEXT: i32.store8 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <16 x i8> %x, <16 x i8> %y,
@@ -994,13 +979,6 @@ define <8 x i16> @shuffle_undef_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v8i16:
; NO-SIMD128: .functype shuffle_undef_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.store16 14($0), $2
-; NO-SIMD128-NEXT: i32.store16 12($0), $2
-; NO-SIMD128-NEXT: i32.store16 10($0), $2
-; NO-SIMD128-NEXT: i32.store16 8($0), $2
-; NO-SIMD128-NEXT: i32.store16 6($0), $2
-; NO-SIMD128-NEXT: i32.store16 4($0), $2
-; NO-SIMD128-NEXT: i32.store16 2($0), $2
; NO-SIMD128-NEXT: i32.store16 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <8 x i16> %x, <8 x i16> %y,
@@ -1288,9 +1266,6 @@ define <4 x i32> @shuffle_undef_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v4i32:
; NO-SIMD128: .functype shuffle_undef_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.store 12($0), $2
-; NO-SIMD128-NEXT: i32.store 8($0), $2
-; NO-SIMD128-NEXT: i32.store 4($0), $2
; NO-SIMD128-NEXT: i32.store 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x i32> %x, <4 x i32> %y,
@@ -1550,7 +1525,6 @@ define <2 x i64> @shuffle_undef_v2i64(<2 x i64> %x, <2 x i64> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v2i64:
; NO-SIMD128: .functype shuffle_undef_v2i64 (i32, i64, i64, i64, i64) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i64.store 8($0), $2
; NO-SIMD128-NEXT: i64.store 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <2 x i64> %x, <2 x i64> %y,
@@ -1819,9 +1793,6 @@ define <4 x float> @shuffle_undef_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v4f32:
; NO-SIMD128: .functype shuffle_undef_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.store 12($0), $2
-; NO-SIMD128-NEXT: f32.store 8($0), $2
-; NO-SIMD128-NEXT: f32.store 4($0), $2
; NO-SIMD128-NEXT: f32.store 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x float> %x, <4 x float> %y,
@@ -2082,7 +2053,6 @@ define <2 x double> @shuffle_undef_v2f64(<2 x double> %x, <2 x double> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v2f64:
; NO-SIMD128: .functype shuffle_undef_v2f64 (i32, f64, f64, f64, f64) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f64.store 8($0), $2
; NO-SIMD128-NEXT: f64.store 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <2 x double> %x, <2 x double> %y,
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 08d9183bd30b67..fa95ce384533c4 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -621,15 +621,14 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -1110,15 +1109,14 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pxor %xmm5, %xmm3
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1
diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
index 28ac4496acb9be..97cc1f8a156943 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -141,8 +141,10 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: movl $1463, %eax # imm = 0x5B7
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047]
; SSE2-NEXT: movdqa %xmm0, %xmm3
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 838086e366fbfd..d17f2135cccad5 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -159,19 +159,18 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_eq:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -237,19 +236,18 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_ne:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
@@ -536,19 +534,18 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -968,19 +965,18 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 7e081310c35be5..49cb7c707a14f3 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -474,8 +474,6 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
; SSE2-NEXT: pand %xmm7, %xmm6
; SSE2-NEXT: paddd %xmm8, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
@@ -548,8 +546,6 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
; SSSE3-NEXT: pand %xmm7, %xmm6
; SSSE3-NEXT: paddd %xmm8, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSSE3-NEXT: pmuludq %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
@@ -578,25 +574,23 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmuldq %xmm2, %xmm0
; SSE41-NEXT: pinsrd $3, %r8d, %xmm2
-; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT: movd %r9d, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pmuldq %xmm3, %xmm4
-; SSE41-NEXT: pinsrd $1, %edx, %xmm3
-; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %esi
-; SSE41-NEXT: pinsrd $1, %esi, %xmm5
+; SSE41-NEXT: pinsrd $1, %ecx, %xmm3
+; SSE...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/123596