[llvm] r330497 - [PowerPC] fix incorrect vectorization of abs() on POWER9

Sat Apr 21 02:32:17 PDT 2018

Author: inouehrs
Date: Sat Apr 21 02:32:17 2018
New Revision: 330497

URL: http://llvm.org/viewvc/llvm-project?rev=330497&view=rev
Log:
[PowerPC] fix incorrect vectorization of abs() on POWER9

Vectorized loops with abs() return incorrect results on POWER9. This patch fixes the problem.
For example, the following code returns a negative result if the input values are negative, even though it sums the absolute values of the inputs.

int vpx_satd_c(const int16_t *coeff, int length) {
  int satd = 0;
  for (int i = 0; i < length; ++i) satd += abs(coeff[i]);
  return satd;
}

This problem causes test failures for libvpx.
For vector absolute and vector absolute difference on POWER9, LLVM generates the VABSDUW (Vector Absolute Difference Unsigned Word) instruction or one of its variants.
Since these instructions operate on unsigned integers, we need an adjustment for signed integers.
For abs(sub(a, b)), we generate VABSDUW(a+0x80000000, b+0x80000000). Without this adjustment, abs(sub(-1, 0)) returns 0xFFFFFFFF (= -1) instead of 1. For abs(a), we generate VABSDUW(a+0x80000000, 0x80000000).

Differential Revision: https://reviews.llvm.org/D45522


Modified:
    llvm/trunk/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
    llvm/trunk/lib/Target/PowerPC/PPCInstrAltivec.td
    llvm/trunk/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll

Modified: llvm/trunk/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCISelDAGToDAG.cpp?rev=330497&r1=330496&r2=330497&view=diff
==============================================================================

--- llvm/trunk/lib/Target/PowerPC/PPCISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCISelDAGToDAG.cpp Sat Apr 21 02:32:17 2018
@@ -327,6 +327,7 @@ private:
 
     bool isOffsetMultipleOf(SDNode *N, unsigned Val) const;
     void transferMemOperands(SDNode *N, SDNode *Result);
+    MachineSDNode *flipSignBit(const SDValue &N, SDNode **SignBit = nullptr);
   };
 
 } // end anonymous namespace
@@ -3970,6 +3971,51 @@ void PPCDAGToDAGISel::transferMemOperand
   cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
 }
 
+/// This method returns a node after flipping the MSB of each element
+/// of vector integer type. Additionally, if SignBitVec is non-null,
+/// this method sets a node with one at MSB of all elements
+/// and zero at other bits in SignBitVec.
+MachineSDNode *
+PPCDAGToDAGISel::flipSignBit(const SDValue &N, SDNode **SignBitVec) {
+  SDLoc dl(N);
+  EVT VecVT = N.getValueType();
+  if (VecVT == MVT::v4i32) {
+    if (SignBitVec) {
+      SDNode *ZV = CurDAG->getMachineNode(PPC::V_SET0, dl, MVT::v4i32);
+      *SignBitVec = CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT,
+                                        SDValue(ZV, 0));
+    }
+    return CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT, N);
+  }
+  else if (VecVT == MVT::v8i16) {
+    SDNode *Hi = CurDAG->getMachineNode(PPC::LIS, dl, MVT::i32,
+                                     getI32Imm(0x8000, dl));
+    SDNode *ScaImm = CurDAG->getMachineNode(PPC::ORI, dl, MVT::i32,
+                                         SDValue(Hi, 0),
+                                         getI32Imm(0x8000, dl));
+    SDNode *VecImm = CurDAG->getMachineNode(PPC::MTVSRWS, dl, VecVT,
+                                         SDValue(ScaImm, 0));
+    /*
+    Alternatively, we can do this as follow to use VRF instead of GPR.
+      vspltish 5, 1
+      vspltish 6, 15
+      vslh 5, 6, 5
+    */
+    if (SignBitVec) *SignBitVec = VecImm;
+    return CurDAG->getMachineNode(PPC::VADDUHM, dl, VecVT, N,
+                                  SDValue(VecImm, 0));
+  }
+  else if (VecVT == MVT::v16i8) {
+    SDNode *VecImm = CurDAG->getMachineNode(PPC::XXSPLTIB, dl, MVT::i32,
+                                         getI32Imm(0x80, dl));
+    if (SignBitVec) *SignBitVec = VecImm;
+    return CurDAG->getMachineNode(PPC::VADDUBM, dl, VecVT, N,
+                                  SDValue(VecImm, 0));
+  }
+  else
+    llvm_unreachable("Unsupported vector data type for flipSignBit");
+}
+
 // Select - Convert the specified operand from a target-independent to a
 // target-specific node if it hasn't already been changed.
 void PPCDAGToDAGISel::Select(SDNode *N) {
@@ -4783,6 +4829,55 @@ void PPCDAGToDAGISel::Select(SDNode *N)
       return;
     }
   }
+  case ISD::ABS: {
+    assert(PPCSubTarget->hasP9Vector() && "ABS is supported with P9 Vector");
+
+    // For vector absolute difference, we use VABSDUW instruction of POWER9.
+    // Since VABSDU instructions are for unsigned integers, we need adjustment
+    // for signed integers.
+    // For abs(sub(a, b)), we generate VABSDUW(a+0x80000000, b+0x80000000).
+    // Otherwise, abs(sub(-1, 0)) returns 0xFFFFFFFF(=-1) instead of 1.
+    // For abs(a), we generate VABSDUW(a+0x80000000, 0x80000000).
+    EVT VecVT = N->getOperand(0).getValueType();
+    SDNode *AbsOp = nullptr;
+    unsigned AbsOpcode;
+
+    if (VecVT == MVT::v4i32)
+      AbsOpcode = PPC::VABSDUW;
+    else if (VecVT == MVT::v8i16)
+      AbsOpcode = PPC::VABSDUH;
+    else if (VecVT == MVT::v16i8)
+      AbsOpcode = PPC::VABSDUB;
+    else
+      llvm_unreachable("Unsupported vector data type for ISD::ABS");
+
+    // Even for signed integers, we can skip adjustment if all values are
+    // known to be positive (as signed integer) due to zero-extended inputs.
+    if (N->getOperand(0).getOpcode() == ISD::SUB &&
+        N->getOperand(0)->getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
+        N->getOperand(0)->getOperand(1).getOpcode() == ISD::ZERO_EXTEND) {
+      AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT,
+                                     SDValue(N->getOperand(0)->getOperand(0)),
+                                     SDValue(N->getOperand(0)->getOperand(1)));
+      ReplaceNode(N, AbsOp);
+      return;
+    }
+    if (N->getOperand(0).getOpcode() == ISD::SUB) {
+      SDValue SubVal = N->getOperand(0);
+      SDNode *Op0 = flipSignBit(SubVal->getOperand(0));
+      SDNode *Op1 = flipSignBit(SubVal->getOperand(1));
+      AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT,
+                                     SDValue(Op0, 0), SDValue(Op1, 0));
+    }
+    else {
+      SDNode *Op1 = nullptr;
+      SDNode *Op0 = flipSignBit(N->getOperand(0), &Op1);
+      AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT, SDValue(Op0, 0),
+                                     SDValue(Op1, 0));
+    }
+    ReplaceNode(N, AbsOp);
+    return;
+  }
   }
 
   SelectCode(N);

Modified: llvm/trunk/lib/Target/PowerPC/PPCInstrAltivec.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCInstrAltivec.td?rev=330497&r1=330496&r2=330497&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCInstrAltivec.td (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCInstrAltivec.td Sat Apr 21 02:32:17 2018
@@ -1504,18 +1504,4 @@ def VABSDUW : VXForm_1<1155, (outs vrrc:
                        "vabsduw $vD, $vA, $vB", IIC_VecGeneral,
                        [(set v4i32:$vD, (int_ppc_altivec_vabsduw v4i32:$vA, v4i32:$vB))]>;
 
-def : Pat<(v16i8:$vD (abs v16i8:$vA)),
-          (v16i8 (VABSDUB $vA, (v16i8 (V_SET0B))))>;
-def : Pat<(v8i16:$vD (abs v8i16:$vA)),
-          (v8i16 (VABSDUH $vA, (v8i16 (V_SET0H))))>;
-def : Pat<(v4i32:$vD (abs v4i32:$vA)),
-          (v4i32 (VABSDUW $vA, (v4i32 (V_SET0))))>;
-
-def : Pat<(v16i8:$vD (abs (sub v16i8:$vA, v16i8:$vB))),
-          (v16i8 (VABSDUB $vA, $vB))>;
-def : Pat<(v8i16:$vD (abs (sub v8i16:$vA, v8i16:$vB))),
-          (v8i16 (VABSDUH $vA, $vB))>;
-def : Pat<(v4i32:$vD (abs (sub v4i32:$vA, v4i32:$vB))),
-          (v4i32 (VABSDUW $vA, $vB))>;
-
 } // end HasP9Altivec

Modified: llvm/trunk/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll?rev=330497&r1=330496&r2=330497&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll (original)
+++ llvm/trunk/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll Sat Apr 21 02:32:17 2018
@@ -9,8 +9,10 @@ entry:
   %0 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %a, <4 x i32> %sub.i)
   ret <4 x i32> %0
 ; CHECK-LABEL: simple_absv_32
-; CHECK: vxor [[ZERO:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK-NEXT: vabsduw 2, 2, [[ZERO]]
+; CHECK-DAG: vxor {{[0-9]+}}, [[REG:[0-9]+]], [[REG]]
+; CHECK-DAG: xvnegsp 34, 34
+; CHECK-DAG: xvnegsp 35, {{[0-9]+}}
+; CHECK-NEXT: vabsduw 2, 2, {{[0-9]+}}
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: simple_absv_32
 ; CHECK-PWR8: xxlxor
@@ -26,8 +28,10 @@ entry:
   %0 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %sub.i, <4 x i32> %a)
   ret <4 x i32> %0
 ; CHECK-LABEL: simple_absv_32_swap
-; CHECK: vxor [[ZERO:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK-NEXT: vabsduw 2, 2, [[ZERO]]
+; CHECK-DAG: vxor {{[0-9]+}}, [[REG:[0-9]+]], [[REG]]
+; CHECK-DAG: xvnegsp 34, 34
+; CHECK-DAG: xvnegsp 35, {{[0-9]+}}
+; CHECK-NEXT: vabsduw 2, 2, {{[0-9]+}}
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: simple_absv_32_swap
 ; CHECK-PWR8: xxlxor
@@ -42,8 +46,9 @@ entry:
   %0 = tail call <8 x i16> @llvm.ppc.altivec.vmaxsh(<8 x i16> %a, <8 x i16> %sub.i)
   ret <8 x i16> %0
 ; CHECK-LABEL: simple_absv_16
-; CHECK: vxor [[ZERO:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK-NEXT: vabsduh 2, 2, [[ZERO]]
+; CHECK: mtvsrws {{[0-9]+}}, {{[0-9]+}}
+; CHECK-NEXT: vadduhm 2, 2, [[IMM:[0-9]+]]
+; CHECK-NEXT: vabsduh 2, 2, [[IMM]]
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: simple_absv_16
 ; CHECK-PWR8: xxlxor
@@ -59,8 +64,9 @@ entry:
   %0 = tail call <16 x i8> @llvm.ppc.altivec.vmaxsb(<16 x i8> %a, <16 x i8> %sub.i)
   ret <16 x i8> %0
 ; CHECK-LABEL: simple_absv_8
-; CHECK: vxor [[ZERO:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK-NEXT: vabsdub 2, 2, [[ZERO]]
+; CHECK: xxspltib {{[0-9]+}}, 128
+; CHECK-NEXT: vaddubm 2, 2, [[IMM:[0-9]+]]
+; CHECK-NEXT: vabsdub 2, 2, [[IMM]]
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: simple_absv_8
 ; CHECK-PWR8: xxlxor
@@ -79,7 +85,9 @@ entry:
   %3 = select <4 x i1> %1, <4 x i32> %0, <4 x i32> %2
   ret <4 x i32> %3
 ; CHECK-LABEL: sub_absv_32
-; CHECK: vabsduw 2, 2, 3
+; CHECK-DAG: xvnegsp 34, 34
+; CHECK-DAG: xvnegsp 35, 35
+; CHECK-NEXT: vabsduw 2, 2, 3
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: sub_absv_32
 ; CHECK-PWR8: vsubuwm