[llvm] fdd3e2c - [DAG] SelectionDAG::getNode(N1,N2) - detect N2 constant vector splats as well as scalars

Thu Jan 27 02:59:25 PST 2022

Author: Simon Pilgrim
Date: 2022-01-27T10:59:08Z
New Revision: fdd3e2c9438d0e6b8a3e2d0d1ed848d39fa25f97

URL: https://github.com/llvm/llvm-project/commit/fdd3e2c9438d0e6b8a3e2d0d1ed848d39fa25f97
DIFF: https://github.com/llvm/llvm-project/commit/fdd3e2c9438d0e6b8a3e2d0d1ed848d39fa25f97.diff

LOG: [DAG] SelectionDAG::getNode(N1,N2) - detect N2 constant vector splats as well as scalars

We already perform some basic folds (add/sub with zero etc.) on scalar types, this patch adds some basic support for constant splats as well in a few cases (we can add more with future test coverage).

In the cases I've enabled, we can handle buildvector implicit truncation as we're not creating new constant nodes from the vector types - we're just returning existing nodes. This allows us to get a number of extra cases in the aarch64 tests.

I haven't enabled support for undefs in buildvector splats, as we're often checking for zero/allones patterns that return the original constant and we shouldn't be returning undef elements in some of these cases - we can enable this later if we're OK with creating new constants.

Differential Revision: https://reviews.llvm.org/D118264

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
    llvm/test/CodeGen/ARM/2013-07-29-vector-or-combine.ll
    llvm/test/CodeGen/X86/vec_smulo.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 28363e2b9e88e..45f3005e8f570 100644

--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5639,8 +5639,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
       std::swap(N1, N2);
   }
 
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
-  ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
+  auto *N1C = dyn_cast<ConstantSDNode>(N1);
+  auto *N2C = dyn_cast<ConstantSDNode>(N2);
+
+  // Don't allow undefs in vector splats - we might be returning N2 when folding
+  // to zero etc.
+  ConstantSDNode *N2CV =
+      isConstOrConstSplat(N2, /*AllowUndefs*/ false, /*AllowTruncation*/ true);
 
   switch (Opcode) {
   default: break;
@@ -5671,9 +5676,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
            N1.getValueType() == VT && "Binary operator types must match!");
     // (X & 0) -> 0.  This commonly occurs when legalizing i64 values, so it's
     // worth handling here.
-    if (N2C && N2C->isZero())
+    if (N2CV && N2CV->isZero())
       return N2;
-    if (N2C && N2C->isAllOnes()) // X & -1 -> X
+    if (N2CV && N2CV->isAllOnes()) // X & -1 -> X
       return N1;
     break;
   case ISD::OR:
@@ -5685,7 +5690,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
            N1.getValueType() == VT && "Binary operator types must match!");
     // (X ^|+- 0) -> X.  This commonly occurs when legalizing i64 values, so
     // it's worth handling here.
-    if (N2C && N2C->isZero())
+    if (N2CV && N2CV->isZero())
       return N1;
     if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && VT.isVector() &&
         VT.getVectorElementType() == MVT::i1)
@@ -5791,7 +5796,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     // size of the value, the shift/rotate count is guaranteed to be zero.
     if (VT == MVT::i1)
       return N1;
-    if (N2C && N2C->isZero())
+    if (N2CV && N2CV->isZero())
       return N1;
     break;
   case ISD::FP_ROUND:

diff  --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index b6b9d0209cff0..67713805de7c8 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -822,7 +822,6 @@ define <8 x i8> @vselect_constant_cond_zero_v8i8(<8 x i8> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi d1, #0x00000000ff00ff
 ; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    orr v0.2s, #0
 ; CHECK-NEXT:    ret
   %b = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i8> %a, <8 x i8> zeroinitializer
   ret <8 x i8> %b
@@ -833,7 +832,6 @@ define <4 x i16> @vselect_constant_cond_zero_v4i16(<4 x i16> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi d1, #0xffff00000000ffff
 ; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    orr v0.2s, #0
 ; CHECK-NEXT:    ret
   %b = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> %a, <4 x i16> zeroinitializer
   ret <4 x i16> %b
@@ -845,7 +843,6 @@ define <4 x i32> @vselect_constant_cond_zero_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    adrp x8, .LCPI85_0
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI85_0]
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    orr v0.4s, #0
 ; CHECK-NEXT:    ret
   %b = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> %a, <4 x i32> zeroinitializer
   ret <4 x i32> %b

diff  --git a/llvm/test/CodeGen/ARM/2013-07-29-vector-or-combine.ll b/llvm/test/CodeGen/ARM/2013-07-29-vector-or-combine.ll
index f611c934617de..4ddf669ede19f 100644
--- a/llvm/test/CodeGen/ARM/2013-07-29-vector-or-combine.ll
+++ b/llvm/test/CodeGen/ARM/2013-07-29-vector-or-combine.ll
@@ -7,15 +7,13 @@ target triple = "armv7--linux-gnueabi"
 define void @function() {
 ; CHECK: cmp r0, #0
 ; CHECK: bxne lr
-; CHECK: vmov.i32 q8, #0xff0000
 entry:
   br i1 undef, label %vector.body, label %for.end
 
-; CHECK: vld1.32 {d18, d19}, [r0]
-; CHECK: vand q10, q9, q8
-; CHECK: vbic.i16 q9, #0xff
-; CHECK: vorr q9, q9, q10
-; CHECK: vst1.32 {d18, d19}, [r0]
+; CHECK: vld1.32 {d16, d17}, [r0]
+; CHECK: vbic.i32 q8, #0xff
+; CHECK: vorr q8, q8, q9
+; CHECK: vst1.32 {d16, d17}, [r0]
 vector.body:
   %wide.load = load <4 x i32>, <4 x i32>* undef, align 4
   %0 = and <4 x i32> %wide.load, <i32 -16711936, i32 -16711936, i32 -16711936, i32 -16711936>

diff  --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index fa06df5f1b576..550c755708239 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -3282,9 +3282,9 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX-LABEL: smulo_v4i1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX-NEXT:    vmovmskps %xmm0, %eax
-; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX-NEXT:    vpslld $31, %xmm0, %xmm1
+; AVX-NEXT:    vpsrad $31, %xmm1, %xmm0
+; AVX-NEXT:    vmovmskps %xmm1, %eax
 ; AVX-NEXT:    movb %al, (%rdi)
 ; AVX-NEXT:    retq
 ;