[llvm] [DAG] isConstantIntBuildVectorOrConstantInt - peek through bitcasts (PR #112710)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 17 06:55:39 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: Simon Pilgrim (RKSimon)
Alter both isConstantIntBuildVectorOrConstantInt and isConstantFPBuildVectorOrConstantFP to return a bool instead of the underlying SDNode, and adjust all callers to match.
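For callers that previously used the returned SDNode, the change is mechanical. A minimal before/after sketch, based on the combineSub hunk in the diff below:

```cpp
// Before: the helper returned the matched SDNode*, which the caller inspected.
if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
  if (auto *Cst = dyn_cast<ConstantSDNode>(C))
    return !Cst->isOpaque();
  return true;
}

// After: the helper returns bool, so the caller inspects the SDValue it
// already holds. Note that Op itself is queried, not the bitcast-peeked node.
if (DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
  if (auto *Cst = dyn_cast<ConstantSDNode>(Op))
    return !Cst->isOpaque();
  return true;
}
```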
Update isConstantIntBuildVectorOrConstantInt to peek through bitcasts when attempting to find a constant; in particular, this improves canonicalization of constants to the RHS of commutable instructions.
X86 is the main beneficiary here, as it often rematerializes 0/-1 vector constants as vXi32 and bitcasts them to the requested type.
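A sketch of the effect (the DAG shown is illustrative, not taken from the patch; the C++ is simplified from the canonicalizeCommutativeBinop hunk below):

```cpp
// X86 may rematerialize an all-ones v32i8 mask as a v8i32 constant:
//   t1: v8i32 = build_vector Constant:i32<-1>, ...
//   t2: v32i8 = bitcast t1
//   t3: v32i8 = and t2, t4   // constant operand hidden on the LHS
//
// Previously the query failed on t2 because the bitcast hid the
// build_vector; now it peeks through, so the binop is canonicalized:
bool N1C = isConstantIntBuildVectorOrConstantInt(N1); // true for t2
bool N2C = isConstantIntBuildVectorOrConstantInt(N2); // false for t4
if (N1C && !N2C)
  std::swap(N1, N2); // -> and t4, t2: constant on the RHS, matching the
                     //    vpand operand swaps in the tests below
```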
Minor cleanup that helps with #107423.

Alternative to #112682.
---
Patch is 130.37 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/112710.diff
18 Files Affected:
- (modified) llvm/include/llvm/CodeGen/SelectionDAG.h (+2-2)
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+14-20)
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (+24-20)
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+1-1)
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+2-2)
- (modified) llvm/test/CodeGen/X86/avx2-arith.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/combine-sra.ll (+4-5)
- (modified) llvm/test/CodeGen/X86/midpoint-int-vec-128.ll (+25-25)
- (modified) llvm/test/CodeGen/X86/midpoint-int-vec-256.ll (+30-30)
- (modified) llvm/test/CodeGen/X86/min-legal-vector-width.ll (+9-9)
- (modified) llvm/test/CodeGen/X86/pmul.ll (+29-33)
- (modified) llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/psubus.ll (+36-45)
- (modified) llvm/test/CodeGen/X86/sat-add.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll (+3-3)
- (modified) llvm/test/CodeGen/X86/vector-trunc-packus.ll (+128-147)
- (modified) llvm/test/CodeGen/X86/vector-trunc-ssat.ll (+128-147)
- (modified) llvm/test/CodeGen/X86/vector-trunc-usat.ll (+128-151)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index b8f80738486a71..2342a8d779f170 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -2301,10 +2301,10 @@ class SelectionDAG {
Align getEVTAlign(EVT MemoryVT) const;
/// Test whether the given value is a constant int or similar node.
- SDNode *isConstantIntBuildVectorOrConstantInt(SDValue N) const;
+ bool isConstantIntBuildVectorOrConstantInt(SDValue N) const;
/// Test whether the given value is a constant FP or similar node.
- SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) const ;
+ bool isConstantFPBuildVectorOrConstantFP(SDValue N) const ;
/// \returns true if \p N is any kind of constant or build_vector of
/// constants, int or float. If a vector, it may not necessarily be a splat.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 644054361dd3ee..70f7cee6f46602 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1205,13 +1205,13 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
- if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N01))) {
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N01)) {
SDNodeFlags NewFlags;
if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() &&
Flags.hasNoUnsignedWrap())
NewFlags.setNoUnsignedWrap(true);
- if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) {
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
// Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, {N01, N1}))
return DAG.getNode(Opc, DL, VT, N00, OpNode, NewFlags);
@@ -9931,10 +9931,10 @@ SDValue DAGCombiner::visitRotate(SDNode *N) {
// fold (rot* (rot* x, c2), c1)
// -> (rot* x, ((c1 % bitsize) +- (c2 % bitsize) + bitsize) % bitsize)
if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) {
- SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1);
- SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1));
- if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) {
- EVT ShiftVT = C1->getValueType(0);
+ bool C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1);
+ bool C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1));
+ if (C1 && C2 && N1.getValueType() == N0.getOperand(1).getValueType()) {
+ EVT ShiftVT = N1.getValueType();
bool SameSide = (N->getOpcode() == NextOp);
unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB;
SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT);
@@ -16805,8 +16805,8 @@ SDValue DAGCombiner::visitVP_FADD(SDNode *N) {
SDValue DAGCombiner::visitFADD(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- SDNode *N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0);
- SDNode *N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1);
+ bool N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0);
+ bool N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
const TargetOptions &Options = DAG.getTarget().Options;
@@ -16903,10 +16903,8 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
// of rounding steps.
if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) {
if (N0.getOpcode() == ISD::FMUL) {
- SDNode *CFP00 =
- DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
- SDNode *CFP01 =
- DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1));
+ bool CFP00 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
+ bool CFP01 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1));
// (fadd (fmul x, c), x) -> (fmul x, c+1)
if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
@@ -16926,10 +16924,8 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
}
if (N1.getOpcode() == ISD::FMUL) {
- SDNode *CFP10 =
- DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
- SDNode *CFP11 =
- DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(1));
+ bool CFP10 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
+ bool CFP11 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(1));
// (fadd x, (fmul x, c)) -> (fmul x, c+1)
if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
@@ -16949,8 +16945,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
}
if (N0.getOpcode() == ISD::FADD) {
- SDNode *CFP00 =
- DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
+ bool CFP00 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
// (fadd (fadd x, x), x) -> (fmul x, 3.0)
if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) &&
(N0.getOperand(0) == N1)) {
@@ -16960,8 +16955,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
}
if (N1.getOpcode() == ISD::FADD) {
- SDNode *CFP10 =
- DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
+ bool CFP10 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
// (fadd x, (fadd x, x)) -> (fmul x, 3.0)
if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
N1.getOperand(0) == N0) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 66c078b1d35ba5..6e3e2174f305df 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6962,10 +6962,10 @@ void SelectionDAG::canonicalizeCommutativeBinop(unsigned Opcode, SDValue &N1,
// Canonicalize:
// binop(const, nonconst) -> binop(nonconst, const)
- SDNode *N1C = isConstantIntBuildVectorOrConstantInt(N1);
- SDNode *N2C = isConstantIntBuildVectorOrConstantInt(N2);
- SDNode *N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
- SDNode *N2CFP = isConstantFPBuildVectorOrConstantFP(N2);
+ bool N1C = isConstantIntBuildVectorOrConstantInt(N1);
+ bool N2C = isConstantIntBuildVectorOrConstantInt(N2);
+ bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
+ bool N2CFP = isConstantFPBuildVectorOrConstantFP(N2);
if ((N1C && !N2C) || (N1CFP && !N2CFP))
std::swap(N1, N2);
@@ -13197,39 +13197,43 @@ bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
return true;
}
-// Returns the SDNode if it is a constant integer BuildVector
-// or constant integer.
-SDNode *SelectionDAG::isConstantIntBuildVectorOrConstantInt(SDValue N) const {
+// Returns true if it is a constant integer BuildVector or constant integer,
+// possibly hidden by a bitcast.
+bool SelectionDAG::isConstantIntBuildVectorOrConstantInt(SDValue N) const {
+ N = peekThroughBitcasts(N);
+
if (isa<ConstantSDNode>(N))
- return N.getNode();
+ return true;
+
if (ISD::isBuildVectorOfConstantSDNodes(N.getNode()))
- return N.getNode();
+ return true;
+
// Treat a GlobalAddress supporting constant offset folding as a
// constant integer.
- if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N))
+ if (auto *GA = dyn_cast<GlobalAddressSDNode>(N))
if (GA->getOpcode() == ISD::GlobalAddress &&
TLI->isOffsetFoldingLegal(GA))
- return GA;
+ return true;
+
if ((N.getOpcode() == ISD::SPLAT_VECTOR) &&
isa<ConstantSDNode>(N.getOperand(0)))
- return N.getNode();
- return nullptr;
+ return true;
+ return false;
}
-// Returns the SDNode if it is a constant float BuildVector
-// or constant float.
-SDNode *SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) const {
+// Returns true if it is a constant float BuildVector or constant float.
+bool SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) const {
if (isa<ConstantFPSDNode>(N))
- return N.getNode();
+ return true;
if (ISD::isBuildVectorOfConstantFPSDNodes(N.getNode()))
- return N.getNode();
+ return true;
if ((N.getOpcode() == ISD::SPLAT_VECTOR) &&
isa<ConstantFPSDNode>(N.getOperand(0)))
- return N.getNode();
+ return true;
- return nullptr;
+ return false;
}
std::optional<bool> SelectionDAG::isBoolConstant(SDValue N,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5e5afdb7fa0a6c..3a5fc44db8258e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20713,7 +20713,7 @@ static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
if (!Add.hasOneUse())
return SDValue();
- if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(X)))
+ if (DAG.isConstantIntBuildVectorOrConstantInt(X))
return SDValue();
SDValue M1 = Add.getOperand(0);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index aa6e75cbf41085..b524b3983376af 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56552,8 +56552,8 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
// TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
auto IsNonOpaqueConstant = [&](SDValue Op) {
- if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
- if (auto *Cst = dyn_cast<ConstantSDNode>(C))
+ if (DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
+ if (auto *Cst = dyn_cast<ConstantSDNode>(Op))
return !Cst->isOpaque();
return true;
}
diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll
index 90733dfb8465ef..44ab33ad67f272 100644
--- a/llvm/test/CodeGen/X86/avx2-arith.ll
+++ b/llvm/test/CodeGen/X86/avx2-arith.ll
@@ -122,7 +122,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; CHECK-LABEL: mul_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-NEXT: vpand %ymm1, %ymm2, %ymm3
+; CHECK-NEXT: vpand %ymm2, %ymm1, %ymm3
; CHECK-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
; CHECK-NEXT: vpand %ymm2, %ymm3, %ymm3
; CHECK-NEXT: vpandn %ymm1, %ymm2, %ymm1
diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
index 7eee418742ddb5..c982884314f623 100644
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -725,12 +725,11 @@ define <4 x i64> @combine_vec4i64_ashr_clamped(<4 x i64> %x, <4 x i64> %y) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm6
+; SSE41-NEXT: pxor %xmm7, %xmm6
; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259519,9223372039002259519]
-; SSE41-NEXT: movdqa %xmm8, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm6
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483711,2147483711,2147483711,2147483711]
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index 6fd3db3464decb..ee83a79b6dd550 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -2369,8 +2369,8 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pand %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pand %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
; SSE41-NEXT: pand %xmm3, %xmm5
@@ -2391,7 +2391,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
@@ -2432,7 +2432,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2
+; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -2450,7 +2450,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -2592,8 +2592,8 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: pmaddubsw %xmm3, %xmm5
; SSE41-NEXT: pand %xmm2, %xmm5
@@ -2616,7 +2616,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -2659,7 +2659,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2
+; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -2677,7 +2677,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -2823,8 +2823,8 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; SSE41-NEXT: psrlw $1, %xmm0
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pand %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pand %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
; SSE41-NEXT: pand %xmm3, %xmm5
@@ -2846,7 +2846,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
@@ -2889,7 +2889,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2
+; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
@@ -2908,7 +2908,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4
-; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14]
; XOPAVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
@@ -3054,8 +3054,8 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pand %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pand %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
; SSE41-NEXT: pand %xmm3, %xmm5
@@ -3077,7 +3077,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
@@ -3120,7 +3120,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2
+; XOP-FALLBA...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/112710