[llvm] [DAG] isConstantIntBuildVectorOrConstantInt - peek through bitcasts (PR #112710)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 17 06:55:39 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: Simon Pilgrim (RKSimon)
Alter both isConstantIntBuildVectorOrConstantInt and isConstantFPBuildVectorOrConstantFP to return a bool instead of the underlying SDNode, and adjust all callers to match.
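For callers that previously used the returned SDNode, the change is mechanical. A minimal before/after sketch, based on the combineSub hunk in the diff below:

```cpp
// Before: the helper returned the matched SDNode*, which the caller inspected.
if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
  if (auto *Cst = dyn_cast<ConstantSDNode>(C))
    return !Cst->isOpaque();
  return true;
}

// After: the helper returns bool, so the caller inspects the SDValue it
// already holds. Note that Op itself is queried, not the bitcast-peeked node.
if (DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
  if (auto *Cst = dyn_cast<ConstantSDNode>(Op))
    return !Cst->isOpaque();
  return true;
}
```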
Update isConstantIntBuildVectorOrConstantInt to peek through bitcasts when attempting to find a constant; in particular, this improves canonicalization of constants to the RHS of commutable instructions.
X86 is the main beneficiary here, as it often rematerializes 0/-1 vector constants as vXi32 and bitcasts them to the requested type.
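A sketch of the effect (the DAG shown is illustrative, not taken from the patch; the C++ is simplified from the canonicalizeCommutativeBinop hunk below):

```cpp
// X86 may rematerialize an all-ones v32i8 mask as a v8i32 constant:
//   t1: v8i32 = build_vector Constant:i32<-1>, ...
//   t2: v32i8 = bitcast t1
//   t3: v32i8 = and t2, t4   // constant operand hidden on the LHS
//
// Previously the query failed on t2 because the bitcast hid the
// build_vector; now it peeks through, so the binop is canonicalized:
bool N1C = isConstantIntBuildVectorOrConstantInt(N1); // true for t2
bool N2C = isConstantIntBuildVectorOrConstantInt(N2); // false for t4
if (N1C && !N2C)
  std::swap(N1, N2); // -> and t4, t2: constant on the RHS, matching the
                     //    vpand operand swaps in the tests below
```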
Minor cleanup that helps with #107423.

Alternative to #112682.
---
Patch is 130.37 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/112710.diff
18 Files Affected:
- (modified) llvm/include/llvm/CodeGen/SelectionDAG.h (+2-2)
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+14-20)
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (+24-20)
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+1-1)
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+2-2)
- (modified) llvm/test/CodeGen/X86/avx2-arith.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/combine-sra.ll (+4-5)
- (modified) llvm/test/CodeGen/X86/midpoint-int-vec-128.ll (+25-25)
- (modified) llvm/test/CodeGen/X86/midpoint-int-vec-256.ll (+30-30)
- (modified) llvm/test/CodeGen/X86/min-legal-vector-width.ll (+9-9)
- (modified) llvm/test/CodeGen/X86/pmul.ll (+29-33)
- (modified) llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/psubus.ll (+36-45)
- (modified) llvm/test/CodeGen/X86/sat-add.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll (+3-3)
- (modified) llvm/test/CodeGen/X86/vector-trunc-packus.ll (+128-147)
- (modified) llvm/test/CodeGen/X86/vector-trunc-ssat.ll (+128-147)
- (modified) llvm/test/CodeGen/X86/vector-trunc-usat.ll (+128-151)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index b8f80738486a71..2342a8d779f170 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -2301,10 +2301,10 @@ class SelectionDAG {
Align getEVTAlign(EVT MemoryVT) const;
/// Test whether the given value is a constant int or similar node.
- SDNode *isConstantIntBuildVectorOrConstantInt(SDValue N) const;
+ bool isConstantIntBuildVectorOrConstantInt(SDValue N) const;
/// Test whether the given value is a constant FP or similar node.
- SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) const ;
+ bool isConstantFPBuildVectorOrConstantFP(SDValue N) const ;
/// \returns true if \p N is any kind of constant or build_vector of
/// constants, int or float. If a vector, it may not necessarily be a splat.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 644054361dd3ee..70f7cee6f46602 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1205,13 +1205,13 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
- if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N01))) {
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N01)) {
SDNodeFlags NewFlags;
if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() &&
Flags.hasNoUnsignedWrap())
NewFlags.setNoUnsignedWrap(true);
- if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) {
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
// Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, {N01, N1}))
return DAG.getNode(Opc, DL, VT, N00, OpNode, NewFlags);
@@ -9931,10 +9931,10 @@ SDValue DAGCombiner::visitRotate(SDNode *N) {
// fold (rot* (rot* x, c2), c1)
// -> (rot* x, ((c1 % bitsize) +- (c2 % bitsize) + bitsize) % bitsize)
if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) {
- SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1);
- SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1));
- if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) {
- EVT ShiftVT = C1->getValueType(0);
+ bool C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1);
+ bool C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1));
+ if (C1 && C2 && N1.getValueType() == N0.getOperand(1).getValueType()) {
+ EVT ShiftVT = N1.getValueType();
bool SameSide = (N->getOpcode() == NextOp);
unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB;
SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT);
@@ -16805,8 +16805,8 @@ SDValue DAGCombiner::visitVP_FADD(SDNode *N) {
SDValue DAGCombiner::visitFADD(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- SDNode *N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0);
- SDNode *N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1);
+ bool N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0);
+ bool N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
const TargetOptions &Options = DAG.getTarget().Options;
@@ -16903,10 +16903,8 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
// of rounding steps.
if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) {
if (N0.getOpcode() == ISD::FMUL) {
- SDNode *CFP00 =
- DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
- SDNode *CFP01 =
- DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1));
+ bool CFP00 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
+ bool CFP01 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1));
// (fadd (fmul x, c), x) -> (fmul x, c+1)
if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
@@ -16926,10 +16924,8 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
}
if (N1.getOpcode() == ISD::FMUL) {
- SDNode *CFP10 =
- DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
- SDNode *CFP11 =
- DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(1));
+ bool CFP10 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
+ bool CFP11 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(1));
// (fadd x, (fmul x, c)) -> (fmul x, c+1)
if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
@@ -16949,8 +16945,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
}
if (N0.getOpcode() == ISD::FADD) {
- SDNode *CFP00 =
- DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
+ bool CFP00 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
// (fadd (fadd x, x), x) -> (fmul x, 3.0)
if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) &&
(N0.getOperand(0) == N1)) {
@@ -16960,8 +16955,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
}
if (N1.getOpcode() == ISD::FADD) {
- SDNode *CFP10 =
- DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
+ bool CFP10 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
// (fadd x, (fadd x, x)) -> (fmul x, 3.0)
if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
N1.getOperand(0) == N0) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 66c078b1d35ba5..6e3e2174f305df 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6962,10 +6962,10 @@ void SelectionDAG::canonicalizeCommutativeBinop(unsigned Opcode, SDValue &N1,
// Canonicalize:
// binop(const, nonconst) -> binop(nonconst, const)
- SDNode *N1C = isConstantIntBuildVectorOrConstantInt(N1);
- SDNode *N2C = isConstantIntBuildVectorOrConstantInt(N2);
- SDNode *N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
- SDNode *N2CFP = isConstantFPBuildVectorOrConstantFP(N2);
+ bool N1C = isConstantIntBuildVectorOrConstantInt(N1);
+ bool N2C = isConstantIntBuildVectorOrConstantInt(N2);
+ bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
+ bool N2CFP = isConstantFPBuildVectorOrConstantFP(N2);
if ((N1C && !N2C) || (N1CFP && !N2CFP))
std::swap(N1, N2);
@@ -13197,39 +13197,43 @@ bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
return true;
}
-// Returns the SDNode if it is a constant integer BuildVector
-// or constant integer.
-SDNode *SelectionDAG::isConstantIntBuildVectorOrConstantInt(SDValue N) const {
+// Returns true if it is a constant integer BuildVector or constant integer,
+// possibly hidden by a bitcast.
+bool SelectionDAG::isConstantIntBuildVectorOrConstantInt(SDValue N) const {
+ N = peekThroughBitcasts(N);
+
if (isa<ConstantSDNode>(N))
- return N.getNode();
+ return true;
+
if (ISD::isBuildVectorOfConstantSDNodes(N.getNode()))
- return N.getNode();
+ return true;
+
// Treat a GlobalAddress supporting constant offset folding as a
// constant integer.
- if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N))
+ if (auto *GA = dyn_cast<GlobalAddressSDNode>(N))
if (GA->getOpcode() == ISD::GlobalAddress &&
TLI->isOffsetFoldingLegal(GA))
- return GA;
+ return true;
+
if ((N.getOpcode() == ISD::SPLAT_VECTOR) &&
isa<ConstantSDNode>(N.getOperand(0)))
- return N.getNode();
- return nullptr;
+ return true;
+ return false;
}
-// Returns the SDNode if it is a constant float BuildVector
-// or constant float.
-SDNode *SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) const {
+// Returns true if it is a constant float BuildVector or constant float.
+bool SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) const {
if (isa<ConstantFPSDNode>(N))
- return N.getNode();
+ return true;
if (ISD::isBuildVectorOfConstantFPSDNodes(N.getNode()))
- return N.getNode();
+ return true;
if ((N.getOpcode() == ISD::SPLAT_VECTOR) &&
isa<ConstantFPSDNode>(N.getOperand(0)))
- return N.getNode();
+ return true;
- return nullptr;
+ return false;
}
std::optional<bool> SelectionDAG::isBoolConstant(SDValue N,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5e5afdb7fa0a6c..3a5fc44db8258e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20713,7 +20713,7 @@ static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
if (!Add.hasOneUse())
return SDValue();
- if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(X)))
+ if (DAG.isConstantIntBuildVectorOrConstantInt(X))
return SDValue();
SDValue M1 = Add.getOperand(0);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index aa6e75cbf41085..b524b3983376af 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56552,8 +56552,8 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
// TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
auto IsNonOpaqueConstant = [&](SDValue Op) {
- if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
- if (auto *Cst = dyn_cast<ConstantSDNode>(C))
+ if (DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
+ if (auto *Cst = dyn_cast<ConstantSDNode>(Op))
return !Cst->isOpaque();
return true;
}
diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll
index 90733dfb8465ef..44ab33ad67f272 100644
--- a/llvm/test/CodeGen/X86/avx2-arith.ll
+++ b/llvm/test/CodeGen/X86/avx2-arith.ll
@@ -122,7 +122,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; CHECK-LABEL: mul_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-NEXT: vpand %ymm1, %ymm2, %ymm3
+; CHECK-NEXT: vpand %ymm2, %ymm1, %ymm3
; CHECK-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
; CHECK-NEXT: vpand %ymm2, %ymm3, %ymm3
; CHECK-NEXT: vpandn %ymm1, %ymm2, %ymm1
diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
index 7eee418742ddb5..c982884314f623 100644
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -725,12 +725,11 @@ define <4 x i64> @combine_vec4i64_ashr_clamped(<4 x i64> %x, <4 x i64> %y) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm6
+; SSE41-NEXT: pxor %xmm7, %xmm6
; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259519,9223372039002259519]
-; SSE41-NEXT: movdqa %xmm8, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm6
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483711,2147483711,2147483711,2147483711]
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index 6fd3db3464decb..ee83a79b6dd550 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -2369,8 +2369,8 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pand %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pand %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
; SSE41-NEXT: pand %xmm3, %xmm5
@@ -2391,7 +2391,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
@@ -2432,7 +2432,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2
+; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -2450,7 +2450,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -2592,8 +2592,8 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: pmaddubsw %xmm3, %xmm5
; SSE41-NEXT: pand %xmm2, %xmm5
@@ -2616,7 +2616,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -2659,7 +2659,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2
+; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -2677,7 +2677,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -2823,8 +2823,8 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; SSE41-NEXT: psrlw $1, %xmm0
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pand %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pand %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
; SSE41-NEXT: pand %xmm3, %xmm5
@@ -2846,7 +2846,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
@@ -2889,7 +2889,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2
+; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
@@ -2908,7 +2908,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4
-; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14]
; XOPAVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
@@ -3054,8 +3054,8 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pand %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pand %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
; SSE41-NEXT: pand %xmm3, %xmm5
@@ -3077,7 +3077,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
@@ -3120,7 +3120,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2
+; XOP-FALLBA...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/112710