[llvm] DAG: Fix vector_shuffle -> splat fold defining undef lanes (PR #123596)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 20 03:51:24 PST 2025
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/123596
For shuffle vector splats with undef lanes in the mask,
this was introducing real values. Filter out build_vector
results based on the undef elements in the mask.
This avoids AMDGPU test regressions in a future change.
test/CodeGen/X86/urem-seteq-illegal-types.ll looks worse
but I didn't investigate.
>From 4f5020fd9220424cca64ee379de2a1c147b1b001 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 20 Jan 2025 17:30:34 +0700
Subject: [PATCH] DAG: Fix vector_shuffle -> splat fold defining undef lanes
For shuffle vector splats with undef lanes in the mask,
this was introducing real values. Filter out build_vector
results based on the undef elements in the mask.
This avoids AMDGPU test regressions in a future change.
test/CodeGen/X86/urem-seteq-illegal-types.ll looks worse
but I didn't investigate.
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 10 ++-
.../CodeGen/PowerPC/vector-reduce-fadd.ll | 20 +++---
llvm/test/CodeGen/WebAssembly/simd.ll | 30 --------
.../CodeGen/X86/srem-seteq-vec-nonsplat.ll | 14 ++--
.../CodeGen/X86/urem-seteq-illegal-types.ll | 6 +-
.../CodeGen/X86/urem-seteq-vec-nonsplat.ll | 68 +++++++++----------
llvm/test/CodeGen/X86/vec_smulo.ll | 24 +++----
llvm/test/CodeGen/X86/vec_umulo.ll | 10 ++-
llvm/test/CodeGen/X86/widen_shuffle-1.ll | 5 +-
9 files changed, 76 insertions(+), 111 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 49e5b7d9ef0141..671a14e6250fc6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -26373,9 +26373,17 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
if (AllSame)
return N0;
- // Canonicalize any other splat as a build_vector.
+ // Canonicalize any other splat as a build_vector, but avoid defining any
+ // undefined elements in the mask.
SDValue Splatted = V->getOperand(SplatIndex);
SmallVector<SDValue, 8> Ops(NumElts, Splatted);
+ EVT EltVT = Splatted.getValueType();
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (SVN->getMaskElt(i) < 0)
+ Ops[i] = DAG.getUNDEF(EltVT);
+ }
+
SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops);
// We may have jumped through bitcasts, so the type of the
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
index 4a036a7868c1a9..95ff0d9a3a9c60 100644
--- a/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
@@ -3628,15 +3628,15 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR9LE-LABEL: v2ppcf128_fast:
; PWR9LE: # %bb.0: # %entry
; PWR9LE-NEXT: mflr r0
-; PWR9LE-NEXT: stdu r1, -64(r1)
-; PWR9LE-NEXT: std r0, 80(r1)
+; PWR9LE-NEXT: stdu r1, -48(r1)
+; PWR9LE-NEXT: std r0, 64(r1)
; PWR9LE-NEXT: bl __gcc_qadd
; PWR9LE-NEXT: nop
; PWR9LE-NEXT: stfd f2, 40(r1)
; PWR9LE-NEXT: stfd f1, 32(r1)
; PWR9LE-NEXT: lxv vs1, 32(r1)
; PWR9LE-NEXT: xxswapd vs2, vs1
-; PWR9LE-NEXT: addi r1, r1, 64
+; PWR9LE-NEXT: addi r1, r1, 48
; PWR9LE-NEXT: ld r0, 16(r1)
; PWR9LE-NEXT: mtlr r0
; PWR9LE-NEXT: blr
@@ -3644,15 +3644,15 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR9BE-LABEL: v2ppcf128_fast:
; PWR9BE: # %bb.0: # %entry
; PWR9BE-NEXT: mflr r0
-; PWR9BE-NEXT: stdu r1, -144(r1)
-; PWR9BE-NEXT: std r0, 160(r1)
+; PWR9BE-NEXT: stdu r1, -128(r1)
+; PWR9BE-NEXT: std r0, 144(r1)
; PWR9BE-NEXT: bl __gcc_qadd
; PWR9BE-NEXT: nop
; PWR9BE-NEXT: stfd f2, 120(r1)
; PWR9BE-NEXT: stfd f1, 112(r1)
; PWR9BE-NEXT: lxv vs1, 112(r1)
; PWR9BE-NEXT: xxswapd vs2, vs1
-; PWR9BE-NEXT: addi r1, r1, 144
+; PWR9BE-NEXT: addi r1, r1, 128
; PWR9BE-NEXT: ld r0, 16(r1)
; PWR9BE-NEXT: mtlr r0
; PWR9BE-NEXT: blr
@@ -3661,13 +3661,13 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR10LE: # %bb.0: # %entry
; PWR10LE-NEXT: mflr r0
; PWR10LE-NEXT: std r0, 16(r1)
-; PWR10LE-NEXT: stdu r1, -64(r1)
+; PWR10LE-NEXT: stdu r1, -48(r1)
; PWR10LE-NEXT: bl __gcc_qadd at notoc
; PWR10LE-NEXT: stfd f2, 40(r1)
; PWR10LE-NEXT: stfd f1, 32(r1)
; PWR10LE-NEXT: lxv vs1, 32(r1)
; PWR10LE-NEXT: xxswapd vs2, vs1
-; PWR10LE-NEXT: addi r1, r1, 64
+; PWR10LE-NEXT: addi r1, r1, 48
; PWR10LE-NEXT: ld r0, 16(r1)
; PWR10LE-NEXT: mtlr r0
; PWR10LE-NEXT: blr
@@ -3676,14 +3676,14 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR10BE: # %bb.0: # %entry
; PWR10BE-NEXT: mflr r0
; PWR10BE-NEXT: std r0, 16(r1)
-; PWR10BE-NEXT: stdu r1, -144(r1)
+; PWR10BE-NEXT: stdu r1, -128(r1)
; PWR10BE-NEXT: bl __gcc_qadd
; PWR10BE-NEXT: nop
; PWR10BE-NEXT: stfd f2, 120(r1)
; PWR10BE-NEXT: stfd f1, 112(r1)
; PWR10BE-NEXT: lxv vs1, 112(r1)
; PWR10BE-NEXT: xxswapd vs2, vs1
-; PWR10BE-NEXT: addi r1, r1, 144
+; PWR10BE-NEXT: addi r1, r1, 128
; PWR10BE-NEXT: ld r0, 16(r1)
; PWR10BE-NEXT: mtlr r0
; PWR10BE-NEXT: blr
diff --git a/llvm/test/CodeGen/WebAssembly/simd.ll b/llvm/test/CodeGen/WebAssembly/simd.ll
index 5ec9f6a2a321b3..7228d5335a33f6 100644
--- a/llvm/test/CodeGen/WebAssembly/simd.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd.ll
@@ -481,21 +481,6 @@ define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v16i8:
; NO-SIMD128: .functype shuffle_undef_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.store8 15($0), $2
-; NO-SIMD128-NEXT: i32.store8 14($0), $2
-; NO-SIMD128-NEXT: i32.store8 13($0), $2
-; NO-SIMD128-NEXT: i32.store8 12($0), $2
-; NO-SIMD128-NEXT: i32.store8 11($0), $2
-; NO-SIMD128-NEXT: i32.store8 10($0), $2
-; NO-SIMD128-NEXT: i32.store8 9($0), $2
-; NO-SIMD128-NEXT: i32.store8 8($0), $2
-; NO-SIMD128-NEXT: i32.store8 7($0), $2
-; NO-SIMD128-NEXT: i32.store8 6($0), $2
-; NO-SIMD128-NEXT: i32.store8 5($0), $2
-; NO-SIMD128-NEXT: i32.store8 4($0), $2
-; NO-SIMD128-NEXT: i32.store8 3($0), $2
-; NO-SIMD128-NEXT: i32.store8 2($0), $2
-; NO-SIMD128-NEXT: i32.store8 1($0), $2
; NO-SIMD128-NEXT: i32.store8 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <16 x i8> %x, <16 x i8> %y,
@@ -994,13 +979,6 @@ define <8 x i16> @shuffle_undef_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v8i16:
; NO-SIMD128: .functype shuffle_undef_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.store16 14($0), $2
-; NO-SIMD128-NEXT: i32.store16 12($0), $2
-; NO-SIMD128-NEXT: i32.store16 10($0), $2
-; NO-SIMD128-NEXT: i32.store16 8($0), $2
-; NO-SIMD128-NEXT: i32.store16 6($0), $2
-; NO-SIMD128-NEXT: i32.store16 4($0), $2
-; NO-SIMD128-NEXT: i32.store16 2($0), $2
; NO-SIMD128-NEXT: i32.store16 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <8 x i16> %x, <8 x i16> %y,
@@ -1288,9 +1266,6 @@ define <4 x i32> @shuffle_undef_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v4i32:
; NO-SIMD128: .functype shuffle_undef_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.store 12($0), $2
-; NO-SIMD128-NEXT: i32.store 8($0), $2
-; NO-SIMD128-NEXT: i32.store 4($0), $2
; NO-SIMD128-NEXT: i32.store 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x i32> %x, <4 x i32> %y,
@@ -1550,7 +1525,6 @@ define <2 x i64> @shuffle_undef_v2i64(<2 x i64> %x, <2 x i64> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v2i64:
; NO-SIMD128: .functype shuffle_undef_v2i64 (i32, i64, i64, i64, i64) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i64.store 8($0), $2
; NO-SIMD128-NEXT: i64.store 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <2 x i64> %x, <2 x i64> %y,
@@ -1819,9 +1793,6 @@ define <4 x float> @shuffle_undef_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v4f32:
; NO-SIMD128: .functype shuffle_undef_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.store 12($0), $2
-; NO-SIMD128-NEXT: f32.store 8($0), $2
-; NO-SIMD128-NEXT: f32.store 4($0), $2
; NO-SIMD128-NEXT: f32.store 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x float> %x, <4 x float> %y,
@@ -2082,7 +2053,6 @@ define <2 x double> @shuffle_undef_v2f64(<2 x double> %x, <2 x double> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v2f64:
; NO-SIMD128: .functype shuffle_undef_v2f64 (i32, f64, f64, f64, f64) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f64.store 8($0), $2
; NO-SIMD128-NEXT: f64.store 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <2 x double> %x, <2 x double> %y,
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 08d9183bd30b67..fa95ce384533c4 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -621,15 +621,14 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -1110,15 +1109,14 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pxor %xmm5, %xmm3
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1
diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
index 28ac4496acb9be..97cc1f8a156943 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -141,8 +141,10 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: movl $1463, %eax # imm = 0x5B7
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047]
; SSE2-NEXT: movdqa %xmm0, %xmm3
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 838086e366fbfd..d17f2135cccad5 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -159,19 +159,18 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_eq:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -237,19 +236,18 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_ne:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
@@ -536,19 +534,18 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -968,19 +965,18 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: por %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 7e081310c35be5..49cb7c707a14f3 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -474,8 +474,6 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
; SSE2-NEXT: pand %xmm7, %xmm6
; SSE2-NEXT: paddd %xmm8, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
@@ -548,8 +546,6 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
; SSSE3-NEXT: pand %xmm7, %xmm6
; SSSE3-NEXT: paddd %xmm8, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSSE3-NEXT: pmuludq %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
@@ -578,25 +574,23 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmuldq %xmm2, %xmm0
; SSE41-NEXT: pinsrd $3, %r8d, %xmm2
-; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT: movd %r9d, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pmuldq %xmm3, %xmm4
-; SSE41-NEXT: pinsrd $1, %edx, %xmm3
-; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %esi
-; SSE41-NEXT: pinsrd $1, %esi, %xmm5
+; SSE41-NEXT: pinsrd $1, %ecx, %xmm3
+; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; SSE41-NEXT: pinsrd $1, %edx, %xmm5
; SSE41-NEXT: pmulld %xmm3, %xmm5
; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm1
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: movd %edx, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
-; SSE41-NEXT: movd %esi, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE41-NEXT: movd %ecx, %xmm3
+; SSE41-NEXT: movd %edx, %xmm6
; SSE41-NEXT: pmuldq %xmm3, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
-; SSE41-NEXT: movq %xmm5, 16(%rcx)
+; SSE41-NEXT: movq %xmm5, 16(%rsi)
; SSE41-NEXT: psrad $31, %xmm5
; SSE41-NEXT: pcmpeqd %xmm3, %xmm5
; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
@@ -607,7 +601,7 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
; SSE41-NEXT: pmulld %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, (%rcx)
+; SSE41-NEXT: movdqa %xmm1, (%rsi)
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
; SSE41-NEXT: pxor %xmm3, %xmm1
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index 68c6ca93576b76..62db6d234d3019 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -394,8 +394,8 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = mem[0,0,0,0]
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
; SSE2-NEXT: pmuludq %xmm2, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3]
@@ -444,8 +444,8 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = mem[0,0,0,0]
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
; SSSE3-NEXT: pmuludq %xmm2, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3]
@@ -492,9 +492,7 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
; SSE41-NEXT: pxor %xmm6, %xmm3
; SSE41-NEXT: movd %edi, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0]
; SSE41-NEXT: movd %r9d, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0]
; SSE41-NEXT: pmuludq %xmm7, %xmm8
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7]
diff --git a/llvm/test/CodeGen/X86/widen_shuffle-1.ll b/llvm/test/CodeGen/X86/widen_shuffle-1.ll
index 3257936f62e3b7..3d34205096afe0 100644
--- a/llvm/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/llvm/test/CodeGen/X86/widen_shuffle-1.ll
@@ -105,14 +105,13 @@ define void @shuf5(ptr %p) nounwind {
; X86-LABEL: shuf5:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movsd {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33,0,0,0,0,0,0,0,0]
+; X86-NEXT: movsd {{.*#+}} xmm0 = [33,33,u,u,u,u,u,u,0,0,u,u,u,u,u,u]
; X86-NEXT: movsd %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: shuf5:
; X64: # %bb.0:
-; X64-NEXT: movabsq $2387225703656530209, %rax # imm = 0x2121212121212121
-; X64-NEXT: movq %rax, (%rdi)
+; X64-NEXT: movq $8481, (%rdi) # imm = 0x2121
; X64-NEXT: retq
%v = shufflevector <2 x i8> <i8 4, i8 33>, <2 x i8> poison, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <8 x i8> %v, ptr %p, align 8
More information about the llvm-commits
mailing list