[llvm] 03380c7 - [DAGCombine] Basic combines for AVG nodes.

David Green via llvm-commits <llvm-commits@lists.llvm.org>
Mon Feb 14 03:18:41 PST 2022


Author: David Green
Date: 2022-02-14T11:18:35Z
New Revision: 03380c70ed548224e2fd8280bc92f69d356d258c

URL: https://github.com/llvm/llvm-project/commit/03380c70ed548224e2fd8280bc92f69d356d258c
DIFF: https://github.com/llvm/llvm-project/commit/03380c70ed548224e2fd8280bc92f69d356d258c.diff

LOG: [DAGCombine] Basic combines for AVG nodes.

This adds some basic combines for AVG nodes, mostly for constant folding
and the handling of degenerate (zero) cases. The code performs largely
the same transforms as visitMULHS, adjusted for AVG nodes.

Constant folding extends the operands to a wider bitwidth, adds them,
and drops the lowest bit of the result. `avg undef, x` is folded to x,
and `avgfloor x, 0` is folded to `shr x, 1`.
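
As a minimal sketch of the AVGFLOORU case (the free-standing helper name
avgFloorU is illustrative only; in-tree the logic lives in FoldValue in
SelectionDAG.cpp, as shown in the diff below):

    #include "llvm/ADT/APInt.h"
    using namespace llvm;

    // Widen by one bit so the addition cannot wrap, then drop bit 0 of
    // the sum, i.e. compute floor((C1 + C2) / 2) exactly.
    APInt avgFloorU(const APInt &C1, const APInt &C2) {
      unsigned FullWidth = C1.getBitWidth() + 1;
      APInt C1Ext = C1.zext(FullWidth);
      APInt C2Ext = C2.zext(FullWidth);
      return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
    }

For example, at i16 avgFloorU(65534, 65535) widens to i17, adds to
131069 and, after dropping the low bit, yields 65534 (0xFFFE), matching
the `mvni v0.8h, #1` now expected in hadd-combine.ll. The ceiling
variants add one to the widened sum before dropping the low bit, and the
signed variants sign-extend instead of zero-extending.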

Differential Revision: https://reviews.llvm.org/D119559

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/test/CodeGen/AArch64/hadd-combine.ll
    llvm/test/CodeGen/X86/pic-load-remat.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8646fabb12620..4f42815065b79 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -426,6 +426,7 @@ namespace {
     SDValue visitREM(SDNode *N);
     SDValue visitMULHU(SDNode *N);
     SDValue visitMULHS(SDNode *N);
+    SDValue visitAVG(SDNode *N);
     SDValue visitSMUL_LOHI(SDNode *N);
     SDValue visitUMUL_LOHI(SDNode *N);
     SDValue visitMULO(SDNode *N);
@@ -1635,6 +1636,10 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::UREM:               return visitREM(N);
   case ISD::MULHU:              return visitMULHU(N);
   case ISD::MULHS:              return visitMULHS(N);
+  case ISD::AVGFLOORS:
+  case ISD::AVGFLOORU:
+  case ISD::AVGCEILS:
+  case ISD::AVGCEILU:           return visitAVG(N);
   case ISD::SMUL_LOHI:          return visitSMUL_LOHI(N);
   case ISD::UMUL_LOHI:          return visitUMUL_LOHI(N);
   case ISD::SMULO:
@@ -4654,6 +4659,46 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
   return SDValue();
 }
 
+SDValue DAGCombiner::visitAVG(SDNode *N) {
+  unsigned Opcode = N->getOpcode();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // fold (avg c1, c2)
+  if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
+    return C;
+
+  // canonicalize constant to RHS.
+  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+    return DAG.getNode(Opcode, DL, N->getVTList(), N1, N0);
+
+  if (VT.isVector()) {
+    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+      return FoldedVOp;
+
+    // fold (avgfloor x, 0) -> x >> 1
+    if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) {
+      if (Opcode == ISD::AVGFLOORS)
+        return DAG.getNode(ISD::SRA, DL, VT, N0, DAG.getConstant(1, DL, VT));
+      if (Opcode == ISD::AVGFLOORU)
+        return DAG.getNode(ISD::SRL, DL, VT, N0, DAG.getConstant(1, DL, VT));
+    }
+  }
+
+  // fold (avg x, undef) -> x
+  if (N0.isUndef())
+    return N1;
+  if (N1.isUndef())
+    return N0;
+
+  // TODO If we use avg for scalars anywhere, we can add (avgfl x, 0) -> x >> 1
+
+  return SDValue();
+}
+
 /// Perform optimizations common to nodes that compute two values. LoOp and HiOp
 /// give the opcodes for the two computations that are being performed. Return
 /// true if a simplification was made.

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 9693cede668c6..b3a929050d7cd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5274,6 +5274,30 @@ static llvm::Optional<APInt> FoldValue(unsigned Opcode, const APInt &C1,
     APInt C2Ext = C2.zext(FullWidth);
     return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth());
   }
+  case ISD::AVGFLOORS: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.sext(FullWidth);
+    APInt C2Ext = C2.sext(FullWidth);
+    return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
+  }
+  case ISD::AVGFLOORU: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.zext(FullWidth);
+    APInt C2Ext = C2.zext(FullWidth);
+    return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
+  }
+  case ISD::AVGCEILS: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.sext(FullWidth);
+    APInt C2Ext = C2.sext(FullWidth);
+    return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
+  }
+  case ISD::AVGCEILU: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.zext(FullWidth);
+    APInt C2Ext = C2.zext(FullWidth);
+    return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
+  }
   }
   return llvm::None;
 }

diff --git a/llvm/test/CodeGen/AArch64/hadd-combine.ll b/llvm/test/CodeGen/AArch64/hadd-combine.ll
index 3de09ebcfa20c..6c2809ce2070f 100644
--- a/llvm/test/CodeGen/AArch64/hadd-combine.ll
+++ b/llvm/test/CodeGen/AArch64/hadd-combine.ll
@@ -135,8 +135,7 @@ define <8 x i16> @haddu_i_const_lhs(<8 x i16> %src1) {
 define <8 x i16> @haddu_i_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: haddu_i_const_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ushr v0.8h, v0.8h, #1
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -145,9 +144,7 @@ define <8 x i16> @haddu_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @haddu_i_const_both() {
 ; CHECK-LABEL: haddu_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #1
-; CHECK-NEXT:    movi v1.8h, #3
-; CHECK-NEXT:    uhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    movi v0.8h, #2
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -156,18 +153,16 @@ define <8 x i16> @haddu_i_const_both() {
 define <8 x i16> @haddu_i_const_bothhigh() {
 ; CHECK-LABEL: haddu_i_const_bothhigh:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
-; CHECK-NEXT:    mvni v1.8h, #1
-; CHECK-NEXT:    uhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    mvni v0.8h, #1
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534>, <8 x i16> <i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535>)
   ret <8 x i16> %result
 }
 
-define <8 x i16> @haddu_i_undef(<8 x i16> %src1) {
+define <8 x i16> @haddu_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: haddu_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uhadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -312,8 +307,7 @@ define <8 x i16> @hadds_i_const_lhs(<8 x i16> %src1) {
 define <8 x i16> @hadds_i_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: hadds_i_const_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    sshr v0.8h, v0.8h, #1
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -322,9 +316,7 @@ define <8 x i16> @hadds_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @hadds_i_const_both() {
 ; CHECK-LABEL: hadds_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #1
-; CHECK-NEXT:    movi v1.8h, #3
-; CHECK-NEXT:    shadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    movi v0.8h, #2
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -334,18 +326,16 @@ define <8 x i16> @hadds_i_const_bothhigh() {
 ; CHECK-LABEL: hadds_i_const_bothhigh:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #32766
-; CHECK-NEXT:    mvni v0.8h, #128, lsl #8
-; CHECK-NEXT:    dup v1.8h, w8
-; CHECK-NEXT:    shadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    dup v0.8h, w8
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>, <8 x i16> <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>)
   ret <8 x i16> %result
 }
 
-define <8 x i16> @hadds_i_undef(<8 x i16> %src1) {
+define <8 x i16> @hadds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: hadds_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    shadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -508,9 +498,7 @@ define <8 x i16> @rhaddu_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @rhaddu_i_const_both() {
 ; CHECK-LABEL: rhaddu_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #1
-; CHECK-NEXT:    movi v1.8h, #3
-; CHECK-NEXT:    urhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    movi v0.8h, #2
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -520,17 +508,15 @@ define <8 x i16> @rhaddu_i_const_bothhigh() {
 ; CHECK-LABEL: rhaddu_i_const_bothhigh:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
-; CHECK-NEXT:    mvni v1.8h, #1
-; CHECK-NEXT:    urhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534>, <8 x i16> <i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535>)
   ret <8 x i16> %result
 }
 
-define <8 x i16> @rhaddu_i_undef(<8 x i16> %src1) {
+define <8 x i16> @rhaddu_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: rhaddu_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    urhadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -693,9 +679,7 @@ define <8 x i16> @rhadds_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @rhadds_i_const_both() {
 ; CHECK-LABEL: rhadds_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #1
-; CHECK-NEXT:    movi v1.8h, #3
-; CHECK-NEXT:    srhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    movi v0.8h, #2
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -704,19 +688,16 @@ define <8 x i16> @rhadds_i_const_both() {
 define <8 x i16> @rhadds_i_const_bothhigh() {
 ; CHECK-LABEL: rhadds_i_const_bothhigh:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32766
 ; CHECK-NEXT:    mvni v0.8h, #128, lsl #8
-; CHECK-NEXT:    dup v1.8h, w8
-; CHECK-NEXT:    srhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>, <8 x i16> <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>)
   ret <8 x i16> %result
 }
 
-define <8 x i16> @rhadds_i_undef(<8 x i16> %src1) {
+define <8 x i16> @rhadds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: rhadds_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    srhadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result

diff --git a/llvm/test/CodeGen/X86/pic-load-remat.ll b/llvm/test/CodeGen/X86/pic-load-remat.ll
index 8596e5699b280..0b3dc41e083b2 100644
--- a/llvm/test/CodeGen/X86/pic-load-remat.ll
+++ b/llvm/test/CodeGen/X86/pic-load-remat.ll
@@ -7,10 +7,9 @@ define void @f() nounwind  {
 ; CHECK-NEXT:    calll L0$pb
 ; CHECK-NEXT:  L0$pb:
 ; CHECK-NEXT:    popl %eax
-; CHECK-NEXT:    pxor %xmm0, %xmm0
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
 ; CHECK-NEXT:    psllw {{\.?LCPI[0-9]+_[0-9]+}}-L0$pb(%eax), %xmm1
-; CHECK-NEXT:    pavgw {{\.?LCPI[0-9]+_[0-9]+}}-L0$pb(%eax), %xmm0
+; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [21183,21183,21183,21183,21183,21183,21183,21183]
 ; CHECK-NEXT:    paddsw %xmm0, %xmm0
 ; CHECK-NEXT:    paddw %xmm1, %xmm0
 ; CHECK-NEXT:    .p2align 4, 0x90
