[llvm] Fix non-splat vector SREM expansion when one of the divisors is a power of two. (PR #82706)
Owen Anderson via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 22 15:39:33 PST 2024
https://github.com/resistor created https://github.com/llvm/llvm-project/pull/82706
The expansion previously used, derived from Hacker's Delight,
does not work correctly when the dividend is INT_MIN and the
divisor is a power of two. We now use an alternate derivation
of the A and Q constants specifically for the power-of-two divisor
case to avoid this problem. Credit to Fabian Giesen for the
new derivation.
Fixes https://github.com/llvm/llvm-project/issues/77169
>From 96f8ecb859b6088d0c8a98c3e65110d56206584c Mon Sep 17 00:00:00 2001
From: Owen Anderson <resistor at mac.com>
Date: Thu, 22 Feb 2024 18:34:59 -0500
Subject: [PATCH] Fix non-splat vector SREM expansion when one of the divisors
is a power of two.
The expansion previously used, derived from Hacker's Delight,
does not work correctly when the dividend is INT_MIN and the
divisor is a power of two. We now use an alternate derivation
of the A and Q constants specifically for the power-of-two divisor
case to avoid this problem. Credit to Fabian Giesen for the
new derivation.
Fixes https://github.com/llvm/llvm-project/issues/77169
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 20 ++++
.../CodeGen/X86/srem-seteq-vec-nonsplat.ll | 103 +++++++++---------
2 files changed, 70 insertions(+), 53 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 07fb89127a737c..21d484b1962524 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6803,6 +6803,7 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
SDValue CompTargetNode, ISD::CondCode Cond,
DAGCombinerInfo &DCI, const SDLoc &DL,
SmallVectorImpl<SDNode *> &Created) const {
+ // Derived from Hacker's Delight, 2nd Edition, by Hank Warren. Section 10-17.
// Fold:
// (seteq/ne (srem N, D), 0)
// To:
@@ -6813,6 +6814,17 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
// - A = bitwiseand(floor((2^(W - 1) - 1) / D0), (-(2^k)))
// - Q = floor((2 * A) / (2^K))
// where W is the width of the common type of N and D.
+ //
+ // When D is a power of two (and thus D0 is 1), the normal
+ // formula for A and Q don't apply, because the derivation
+ // depends on D not dividing 2^(W-1), and thus theoroem ZRS
+ // does not apply. This specifically fails when N = INT_MIN.
+ //
+ // Instead, for power-of-two D, we use:
+ // - A = 2^(W-1)
+ // |-> Order-preserving map from [-2^(W-1), 2^(W-1) - 1] to [0,2^W - 1])
+ // - Q = 2^(W-K) - 1
+ // |-> Test that the top K bits are zero after rotation
assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
"Only applicable for (in)equality comparisons.");
@@ -6896,6 +6908,14 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
// Q = floor((2 * A) / (2^K))
APInt Q = (2 * A).udiv(APInt::getOneBitSet(W, K));
+ // If D was a power of two, apply the alternate constant derivation.
+ if (D0.isOne()) {
+ // A = 2^(W-1)
+ A = APInt::getSignedMinValue(W);
+ // - Q = 2^(W-K) - 1
+ Q = APInt::getAllOnes(W - K).zext(W);
+ }
+
assert(APInt::getAllOnes(SVT.getSizeInBits()).ugt(A) &&
"We are expecting that A is always less than all-ones for SVT");
assert(APInt::getAllOnes(ShSVT.getSizeInBits()).ugt(K) &&
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 8509e930ba74ae..3dde5c1c8a40c1 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -560,7 +560,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm1
; CHECK-SSE41-NEXT: por %xmm1, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,268435454,858993458]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,268435455,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -646,7 +646,7 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: por %xmm2, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,268435454,306783378]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,268435455,306783378]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -735,7 +735,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: por %xmm2, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,268435454,42949672]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,268435455,42949672]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -1041,7 +1041,7 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm2
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,0,858993458]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,1,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7]
@@ -1135,21 +1135,21 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3067833783,3067833783,1,3067833783]
; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378]
-; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT: por %xmm5, %xmm4
-; CHECK-SSE41-NEXT: pminud %xmm4, %xmm3
-; CHECK-SSE41-NEXT: pcmpeqd %xmm4, %xmm3
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: por %xmm4, %xmm3
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [306783378,306783378,1,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm3, %xmm2
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm2
; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
@@ -1157,17 +1157,16 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378]
-; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,2,2]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; CHECK-AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm3
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1179,12 +1178,11 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378]
-; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm4
+; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpor %xmm4, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpminud %xmm3, %xmm2, %xmm3
+; CHECK-AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
; CHECK-AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
@@ -1196,15 +1194,14 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX512VL-LABEL: test_srem_even_INT_MIN:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378]
-; CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; CHECK-AVX512VL-NEXT: vpminud %xmm3, %xmm2, %xmm3
-; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: retq
%srem = srem <4 x i32> %X, <i32 14, i32 14, i32 2147483648, i32 14>
@@ -1263,7 +1260,7 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; CHECK-SSE41-NEXT: por %xmm4, %xmm3
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [858993458,306783378,0,42949672]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [858993458,306783378,1,42949672]
; CHECK-SSE41-NEXT: pminud %xmm3, %xmm2
; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm2
; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1362,7 +1359,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm1
; CHECK-SSE41-NEXT: por %xmm1, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,858993458]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435455,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -1447,7 +1444,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: por %xmm2, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435454,306783378]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435455,306783378]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -1536,7 +1533,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: por %xmm2, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,42949672]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435455,42949672]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -1844,7 +1841,7 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: por %xmm2, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435454,4294967295,858993458]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435455,4294967295,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -1932,7 +1929,7 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: por %xmm2, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,268435454,4294967295,306783378]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,268435455,4294967295,306783378]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -2016,7 +2013,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: por %xmm2, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435454,4294967295,42949672]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435455,4294967295,42949672]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -2093,7 +2090,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm0
; CHECK-SSE41-NEXT: por %xmm1, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,4294967295]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435455,4294967295]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -2166,7 +2163,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm0
; CHECK-SSE41-NEXT: por %xmm1, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435454,4294967295]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435455,4294967295]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -2237,7 +2234,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
; CHECK-SSE2-NEXT: psrlw $8, %xmm6
; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm7 = [84,2,36,42,2,0,2,4,2,255,4,36,126,30,2,2]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm7 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2]
; CHECK-SSE2-NEXT: pminub %xmm6, %xmm7
; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm7
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
@@ -2264,7 +2261,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: psrlw $8, %xmm0
; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19,51,13,7,127,31,127,3,5,5,51,37,3,127,85,5]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5]
; CHECK-SSE2-NEXT: pmaxub %xmm0, %xmm1
; CHECK-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm3
@@ -2300,7 +2297,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
; CHECK-SSE41-NEXT: psrlw $8, %xmm6
; CHECK-SSE41-NEXT: packuswb %xmm0, %xmm6
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [84,2,36,42,2,0,2,4,2,255,4,36,126,30,2,2]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2]
; CHECK-SSE41-NEXT: pminub %xmm6, %xmm0
; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm7, %xmm7
@@ -2326,7 +2323,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: psrlw $8, %xmm0
; CHECK-SSE41-NEXT: packuswb %xmm4, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [19,51,13,7,127,31,127,3,5,5,51,37,3,127,85,5]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5]
; CHECK-SSE41-NEXT: pmaxub %xmm0, %xmm4
; CHECK-SSE41-NEXT: pcmpeqb %xmm0, %xmm4
; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm3
More information about the llvm-commits
mailing list