[llvm] 4f8fdf7 - [ISEL] Canonicalise constant splats to RHS.
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 24 01:38:54 PST 2022
Author: Sander de Smalen
Date: 2022-01-24T09:38:36Z
New Revision: 4f8fdf78279f0cb298dc0dc215ee56b0342235ee
URL: https://github.com/llvm/llvm-project/commit/4f8fdf78279f0cb298dc0dc215ee56b0342235ee
DIFF: https://github.com/llvm/llvm-project/commit/4f8fdf78279f0cb298dc0dc215ee56b0342235ee.diff
LOG: [ISEL] Canonicalise constant splats to RHS.
SelectionDAG::getNode() canonicalises constants to the RHS if the
operation is commutative, but it doesn't do so for constant splat
vectors. Doing this early helps with certain folds on vector types and
simplifies the code required for target DAGCombines that are enabled
before type legalization.
Somewhat to my surprise, DAGCombine doesn't seem to traverse the
DAG in a post-order DFS, so at the time a custom fold is applied to a node
whose input is a MUL, DAGCombiner::visitMUL hasn't yet reordered the
constant splat to the RHS.
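As a purely illustrative sketch (not taken from the patch or its tests), IR
like the following is built with the splat constant as the first MUL operand;
with this change, getNode() swaps it to the RHS at node-creation time, so a
target combine only has to match the constant splat in one operand position:

  define <4 x i32> @mul_splat_lhs(<4 x i32> %x) {
    ; The splat constant is written on the LHS in the IR; the ISD::MUL node
    ; is now created with it canonicalised to the RHS.
    %r = mul <4 x i32> <i32 2, i32 2, i32 2, i32 2>, %x
    ret <4 x i32> %r
  }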
This patch leads to a few improvements, but also a few minor regressions,
which I traced back to D46492. When I tried reverting that change to see
whether it was still necessary, I ran into some segfaults, so I'm not sure
whether there is a latent bug there.
Reviewed By: craig.topper
Differential Revision: https://reviews.llvm.org/D117794
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll
llvm/test/CodeGen/PowerPC/combine-fneg.ll
llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll
llvm/test/CodeGen/X86/dpbusd_const.ll
llvm/test/CodeGen/X86/extractelement-fp.ll
llvm/test/CodeGen/X86/fp-round.ll
llvm/test/CodeGen/X86/fp128-cast.ll
llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
llvm/test/CodeGen/X86/pr43509.ll
llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 52c69feebaf7e..7ca6f9aa4cf0a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5610,22 +5610,19 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(N1.getOpcode() != ISD::DELETED_NODE &&
N2.getOpcode() != ISD::DELETED_NODE &&
"Operand is DELETED_NODE!");
- ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
- ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
- ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
- ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
-
// Canonicalize constant to RHS if commutative.
if (TLI->isCommutativeBinOp(Opcode)) {
- if (N1C && !N2C) {
- std::swap(N1C, N2C);
- std::swap(N1, N2);
- } else if (N1CFP && !N2CFP) {
- std::swap(N1CFP, N2CFP);
+ bool IsN1C = isConstantIntBuildVectorOrConstantInt(N1);
+ bool IsN2C = isConstantIntBuildVectorOrConstantInt(N2);
+ bool IsN1CFP = isConstantFPBuildVectorOrConstantFP(N1);
+ bool IsN2CFP = isConstantFPBuildVectorOrConstantFP(N2);
+ if ((IsN1C && !IsN2C) || (IsN1CFP && !IsN2CFP))
std::swap(N1, N2);
- }
}
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
+
switch (Opcode) {
default: break;
case ISD::TokenFactor:
diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll
index 2e385fdd6f25f..aa0b7e14afc56 100644
--- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll
+++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll
@@ -126,7 +126,8 @@ define <4 x i32> @out_constant_mone_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
define <4 x i32> @in_constant_mone_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
; CHECK-LABEL: in_constant_mone_vary:
; CHECK: // %bb.0:
-; CHECK-NEXT: orr v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: bic v0.16b, v2.16b, v1.16b
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x
%n1 = and <4 x i32> %n0, %mask
@@ -152,8 +153,9 @@ define <4 x i32> @out_constant_mone_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4
define <4 x i32> @in_constant_mone_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
; CHECK-LABEL: in_constant_mone_vary_invmask:
; CHECK: // %bb.0:
-; CHECK-NEXT: and v0.16b, v1.16b, v2.16b
-; CHECK-NEXT: orn v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v1.16b
+; CHECK-NEXT: bic v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
%n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x
diff --git a/llvm/test/CodeGen/PowerPC/combine-fneg.ll b/llvm/test/CodeGen/PowerPC/combine-fneg.ll
index 1124fbd22a0e5..771c05f184a04 100644
--- a/llvm/test/CodeGen/PowerPC/combine-fneg.ll
+++ b/llvm/test/CodeGen/PowerPC/combine-fneg.ll
@@ -13,10 +13,10 @@ define <4 x double> @fneg_fdiv_splat(double %a0, <4 x double> %a1) {
; CHECK-NEXT: xvredp 2, 0
; CHECK-NEXT: xxswapd 1, 1
; CHECK-NEXT: xxlor 3, 1, 1
-; CHECK-NEXT: xvmaddadp 3, 0, 2
-; CHECK-NEXT: xvnmsubadp 2, 2, 3
-; CHECK-NEXT: xvmaddadp 1, 0, 2
-; CHECK-NEXT: xvmsubadp 2, 2, 1
+; CHECK-NEXT: xvnmsubadp 3, 0, 2
+; CHECK-NEXT: xvmaddadp 2, 2, 3
+; CHECK-NEXT: xvnmsubadp 1, 0, 2
+; CHECK-NEXT: xvnmaddadp 2, 2, 1
; CHECK-NEXT: xvmuldp 34, 34, 2
; CHECK-NEXT: xvmuldp 35, 35, 2
; CHECK-NEXT: blr
diff --git a/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll b/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll
index 2b93974263286..1185737fb0c96 100644
--- a/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll
+++ b/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll
@@ -36,9 +36,9 @@ define <4 x float> @repeated_fp_divisor(float %a, <4 x float> %b) {
; CHECK-NEXT: lvx 4, 0, 3
; CHECK-NEXT: xxspltw 0, 0, 0
; CHECK-NEXT: xvresp 1, 0
-; CHECK-NEXT: xvnmsubasp 35, 0, 1
+; CHECK-NEXT: xvmaddasp 35, 0, 1
; CHECK-NEXT: xvmulsp 0, 34, 36
-; CHECK-NEXT: xvmaddasp 1, 1, 35
+; CHECK-NEXT: xvnmsubasp 1, 1, 35
; CHECK-NEXT: xvmulsp 34, 0, 1
; CHECK-NEXT: blr
%ins = insertelement <4 x float> undef, float %a, i32 0
diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll
index aa780fe3b94ad..b0ffb23c9ced3 100644
--- a/llvm/test/CodeGen/X86/dpbusd_const.ll
+++ b/llvm/test/CodeGen/X86/dpbusd_const.ll
@@ -68,8 +68,7 @@ define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) {
; AVXVNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVXVNNI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,127,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1
+; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVXVNNI-NEXT: vmovd %xmm1, %eax
; AVXVNNI-NEXT: addl %edi, %eax
; AVXVNNI-NEXT: retq
@@ -80,10 +79,9 @@ define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) {
; AVX512VNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512VNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,127,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2
-; AVX512VNNI-NEXT: vmovd %xmm2, %eax
+; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
+; AVX512VNNI-NEXT: vmovd %xmm1, %eax
; AVX512VNNI-NEXT: addl %edi, %eax
; AVX512VNNI-NEXT: vzeroupper
; AVX512VNNI-NEXT: retq
@@ -92,10 +90,9 @@ define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) {
; AVX512VLVNNI: # %bb.0: # %entry
; AVX512VLVNNI-NEXT: vpmovdb %xmm0, %xmm0
; AVX512VLVNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512VLVNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,127,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2
-; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax
+; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax
; AVX512VLVNNI-NEXT: addl %edi, %eax
; AVX512VLVNNI-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll
index 8ff73d5a7ffa8..c398fb8c74cb8 100644
--- a/llvm/test/CodeGen/X86/extractelement-fp.ll
+++ b/llvm/test/CodeGen/X86/extractelement-fp.ll
@@ -1070,7 +1070,7 @@ define float @round_v4f32(<4 x float> %x) nounwind {
; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X64-NEXT: vandps %xmm1, %xmm0, %xmm1
; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; X64-NEXT: vorps %xmm1, %xmm2, %xmm1
+; X64-NEXT: vorps %xmm2, %xmm1, %xmm1
; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
@@ -1081,7 +1081,7 @@ define float @round_v4f32(<4 x float> %x) nounwind {
; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-NEXT: vandps %xmm1, %xmm0, %xmm1
; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; X86-NEXT: vorps %xmm1, %xmm2, %xmm1
+; X86-NEXT: vorps %xmm2, %xmm1, %xmm1
; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
@@ -1099,7 +1099,7 @@ define double @round_v4f64(<4 x double> %x) nounwind {
; X64-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-NEXT: vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
; X64-NEXT: # xmm2 = mem[0,0]
-; X64-NEXT: vorpd %xmm1, %xmm2, %xmm1
+; X64-NEXT: vorpd %xmm2, %xmm1, %xmm1
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; X64-NEXT: vzeroupper
@@ -1114,7 +1114,7 @@ define double @round_v4f64(<4 x double> %x) nounwind {
; X86-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
; X86-NEXT: vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
; X86-NEXT: # xmm2 = mem[0,0]
-; X86-NEXT: vorpd %xmm1, %xmm2, %xmm1
+; X86-NEXT: vorpd %xmm2, %xmm1, %xmm1
; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll
index 7798ab682d41e..955501544ff55 100644
--- a/llvm/test/CodeGen/X86/fp-round.ll
+++ b/llvm/test/CodeGen/X86/fp-round.ll
@@ -41,7 +41,7 @@ define half @round_f16(half %h) {
; AVX1-NEXT: callq ___extendhfsf2
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX1-NEXT: vorps %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vorps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; AVX1-NEXT: callq ___truncsfhf2
@@ -94,7 +94,7 @@ define float @round_f32(float %x) {
; AVX1: ## %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX1-NEXT: vorps %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vorps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -130,7 +130,7 @@ define double @round_f64(double %x) {
; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
; AVX1-NEXT: ## xmm2 = mem[0,0]
-; AVX1-NEXT: vorpd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vorpd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -521,11 +521,11 @@ define <16 x float> @round_v16f32(<16 x float> %x) {
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm3
; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX1-NEXT: vorps %ymm3, %ymm4, %ymm3
+; AVX1-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vaddps %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vroundps $11, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm2
-; AVX1-NEXT: vorps %ymm2, %ymm4, %ymm2
+; AVX1-NEXT: vorps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vroundps $11, %ymm1, %ymm1
; AVX1-NEXT: retq
@@ -620,11 +620,11 @@ define <8 x double> @round_v8f64(<8 x double> %x) {
; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm3
; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX1-NEXT: vorpd %ymm3, %ymm4, %ymm3
+; AVX1-NEXT: vorpd %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0
; AVX1-NEXT: vandpd %ymm2, %ymm1, %ymm2
-; AVX1-NEXT: vorpd %ymm2, %ymm4, %ymm2
+; AVX1-NEXT: vorpd %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vroundpd $11, %ymm1, %ymm1
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll
index 500cb0c677ff5..530ae967e1c07 100644
--- a/llvm/test/CodeGen/X86/fp128-cast.ll
+++ b/llvm/test/CodeGen/X86/fp128-cast.ll
@@ -1326,7 +1326,7 @@ define fp128 @TestTruncCopysign(fp128 %x, i32 %n) nounwind {
; X64-AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [+Inf,+Inf]
; X64-AVX-NEXT: # xmm1 = mem[0,0]
-; X64-AVX-NEXT: vorps %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: callq __extenddftf2 at PLT
; X64-AVX-NEXT: addq $8, %rsp
; X64-AVX-NEXT: .LBB26_2: # %cleanup
diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index 8a452ddc06b62..54d74c3c86d8e 100644
--- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -465,9 +465,9 @@ define <4 x i1> @vec_4xi32_splat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: pmuludq %xmm2, %xmm3
-; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-SSE2-NEXT: pand %xmm1, %xmm0
@@ -491,9 +491,9 @@ define <4 x i1> @vec_4xi32_splat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT: pmuludq %xmm2, %xmm3
-; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-SSE2-NEXT: pand %xmm1, %xmm0
@@ -611,9 +611,9 @@ define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwi
; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: pmuludq %xmm2, %xmm3
-; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-SSE2-NEXT: pand %xmm1, %xmm0
@@ -637,9 +637,9 @@ define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwi
; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT: pmuludq %xmm2, %xmm3
-; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-SSE2-NEXT: pand %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/pr43509.ll b/llvm/test/CodeGen/X86/pr43509.ll
index e2c3affd952bb..87ddad03e9c45 100644
--- a/llvm/test/CodeGen/X86/pr43509.ll
+++ b/llvm/test/CodeGen/X86/pr43509.ll
@@ -4,12 +4,10 @@
define <8 x i8> @foo(<8 x float> %arg) {
; CHECK-LABEL: foo:
; CHECK: # %bb.0: # %bb
-; CHECK-NEXT: vcmpgtps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k0
-; CHECK-NEXT: vpmovm2b %k0, %xmm1
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpltps %ymm2, %ymm0, %k1
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpltps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vcmpgtps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k1 {%k1}
; CHECK-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z}
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
index 2d6c4dc829ed9..705d655939c30 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
@@ -336,23 +336,26 @@ define <4 x i32> @in_constant_mone_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32
; CHECK-SSE1-LABEL: in_constant_mone_vary:
; CHECK-SSE1: # %bb.0:
; CHECK-SSE1-NEXT: movq %rdi, %rax
-; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0
; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1
-; CHECK-SSE1-NEXT: andnps (%rdx), %xmm1
-; CHECK-SSE1-NEXT: orps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andnps (%rcx), %xmm1
+; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi)
; CHECK-SSE1-NEXT: retq
;
; CHECK-SSE2-LABEL: in_constant_mone_vary:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movaps (%rsi), %xmm0
-; CHECK-SSE2-NEXT: orps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: movaps (%rsi), %xmm1
+; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andnps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_constant_mone_vary:
; CHECK-XOP: # %bb.0:
; CHECK-XOP-NEXT: vmovaps (%rsi), %xmm0
-; CHECK-XOP-NEXT: vorps (%rdx), %xmm0, %xmm0
+; CHECK-XOP-NEXT: vandnps (%rdx), %xmm0, %xmm1
+; CHECK-XOP-NEXT: vxorps %xmm0, %xmm1, %xmm0
; CHECK-XOP-NEXT: retq
%x = load <4 x i32>, <4 x i32> *%px, align 16
%y = load <4 x i32>, <4 x i32> *%py, align 16
@@ -408,30 +411,32 @@ define <4 x i32> @in_constant_mone_vary_invmask(<4 x i32> *%px, <4 x i32> *%py,
; CHECK-SSE1-LABEL: in_constant_mone_vary_invmask:
; CHECK-SSE1: # %bb.0:
; CHECK-SSE1-NEXT: movq %rdi, %rax
-; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
-; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
-; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
-; CHECK-SSE1-NEXT: andps (%rdx), %xmm0
-; CHECK-SSE1-NEXT: orps %xmm1, %xmm0
-; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm1
+; CHECK-SSE1-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE1-NEXT: movaps %xmm0, %xmm2
+; CHECK-SSE1-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE1-NEXT: xorps %xmm0, %xmm2
+; CHECK-SSE1-NEXT: movaps %xmm2, (%rdi)
; CHECK-SSE1-NEXT: retq
;
; CHECK-SSE2-LABEL: in_constant_mone_vary_invmask:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pand (%rsi), %xmm0
-; CHECK-SSE2-NEXT: por %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movdqa (%rsi), %xmm1
+; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; CHECK-SSE2-NEXT: pxor (%rdx), %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_constant_mone_vary_invmask:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0
+; CHECK-XOP-NEXT: vmovdqa (%rsi), %xmm0
; CHECK-XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-XOP-NEXT: vpxor %xmm1, %xmm0, %xmm1
-; CHECK-XOP-NEXT: vpand (%rsi), %xmm0, %xmm0
-; CHECK-XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT: vpxor (%rdx), %xmm1, %xmm1
+; CHECK-XOP-NEXT: vpandn %xmm1, %xmm0, %xmm1
+; CHECK-XOP-NEXT: vpxor %xmm0, %xmm1, %xmm0
; CHECK-XOP-NEXT: retq
%x = load <4 x i32>, <4 x i32> *%px, align 16
%y = load <4 x i32>, <4 x i32> *%py, align 16