[llvm] r253952 - [X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.

Cong Hou via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 23 21:44:19 PST 2015


Author: conghou
Date: Mon Nov 23 23:44:19 2015
New Revision: 253952

URL: http://llvm.org/viewvc/llvm-project?rev=253952&view=rev
Log:
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.

This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c all have the same type: a vector of
unsigned i8 or unsigned i16 elements. In the IR, the i8/i16 values are
promoted to i32 before any arithmetic operations. The following IR shows such
an example:

%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>

and with this patch it will be converted to an X86ISD::AVG instruction.
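
For reference, the scalar computation behind this pattern can be written as
the short C++ sketch below (illustrative only, not part of this patch; the
helper names are made up). The sum is formed in a wider type, mirroring the
i32 promotion above, so a + b + 1 cannot overflow before the shift:

#include <cstddef>
#include <cstdint>

// Rounding average of two unsigned 8-bit values: (a + b + 1) / 2, computed
// in a wider type so the intermediate sum cannot overflow.
static inline uint8_t avg_round_u8(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>((static_cast<unsigned>(a) + b + 1) >> 1);
}

// A loop of this shape is the kind of code the vectorizer turns into the
// zext/add/lshr/trunc sequence shown above.
void avg_loop(const uint8_t *a, const uint8_t *b, uint8_t *c, size_t n) {
  for (size_t i = 0; i != n; ++i)
    c[i] = avg_round_u8(a[i], b[i]);
}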

The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it there because, after type
legalization, it is much more difficult to recognize the pattern across the
many instructions that perform type conversions. Therefore, for
target-specific nodes (like X86ISD::AVG) we need to take care of type
legalization ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize the operand and result
types of X86ISD::AVG together with ISD::ADD; the current design does not seem
to support this.
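
To illustrate that widening strategy concretely (a standalone sketch assuming
an SSE2 host and the hypothetical helper name avg_v4i8_widened, not code from
this patch): a 4-lane i8 average can be computed by placing the elements in
the low lanes of a full 128-bit register, running PAVGB on the whole
register, and keeping only the low four results, which is the analogue of the
CONCAT_VECTORS / EXTRACT_SUBVECTOR sequence used to legalize X86ISD::AVG:

#include <emmintrin.h>  // SSE2: _mm_avg_epu8
#include <cstdint>
#include <cstring>

// Widen a 4-element i8 average to a full 128-bit PAVGB. The upper lanes are
// don't-care values (here zero), standing in for the undef operands that pad
// the vector; only the low 4 results are copied out, which corresponds to
// extracting the subvector at index 0.
void avg_v4i8_widened(const uint8_t a[4], const uint8_t b[4], uint8_t c[4]) {
  alignas(16) uint8_t wa[16] = {0}, wb[16] = {0}, wc[16];
  std::memcpy(wa, a, 4);
  std::memcpy(wb, b, 4);
  __m128i va = _mm_load_si128(reinterpret_cast<const __m128i *>(wa));
  __m128i vb = _mm_load_si128(reinterpret_cast<const __m128i *>(wb));
  _mm_store_si128(reinterpret_cast<__m128i *>(wc), _mm_avg_epu8(va, vb));
  std::memcpy(c, wc, 4);
}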

Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16
element types at various vector sizes.
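
The tests check the generated assembly only. As a separate sanity check of
the semantics (not part of this patch, assuming an SSE2-capable host), PAVGB,
exposed as the _mm_avg_epu8 intrinsic, can be compared exhaustively against
the scalar formula:

#include <emmintrin.h>
#include <cassert>
#include <cstdint>

int main() {
  // PAVGB computes (a + b + 1) >> 1 with a widened intermediate, so it should
  // match the promoted-i32 formula for every pair of unsigned 8-bit inputs.
  for (int a = 0; a < 256; ++a)
    for (int b = 0; b < 256; ++b) {
      __m128i va = _mm_set1_epi8(static_cast<char>(a));
      __m128i vb = _mm_set1_epi8(static_cast<char>(b));
      uint8_t got =
          static_cast<uint8_t>(_mm_cvtsi128_si32(_mm_avg_epu8(va, vb)));
      assert(got == static_cast<uint8_t>((a + b + 1) >> 1));
    }
  return 0;
}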


Differential revision: http://reviews.llvm.org/D14761



Added:
    llvm/trunk/test/CodeGen/X86/avg.ll
Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86InstrSSE.td
    llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=253952&r1=253951&r2=253952&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Nov 23 23:44:19 2015
@@ -1779,6 +1779,7 @@ X86TargetLowering::X86TargetLowering(con
   setTargetDAGCombine(ISD::MLOAD);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::MSTORE);
+  setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::ANY_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
@@ -19853,6 +19854,36 @@ void X86TargetLowering::ReplaceNodeResul
   switch (N->getOpcode()) {
   default:
     llvm_unreachable("Do not know how to custom type legalize this operation!");
+  case X86ISD::AVG: {
+    // Legalize types for X86ISD::AVG by widening vectors.
+    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+
+    auto InVT = N->getValueType(0);
+    auto InVTSize = InVT.getSizeInBits();
+    const unsigned RegSize =
+        (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
+    assert((Subtarget->hasAVX512() || RegSize < 512) &&
+           "512-bit vector requires AVX512");
+    assert((Subtarget->hasAVX2() || RegSize < 256) &&
+           "256-bit vector requires AVX2");
+
+    auto ElemVT = InVT.getVectorElementType();
+    auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
+                                  RegSize / ElemVT.getSizeInBits());
+    assert(RegSize % InVT.getSizeInBits() == 0);
+    unsigned NumConcat = RegSize / InVT.getSizeInBits();
+
+    SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
+    Ops[0] = N->getOperand(0);
+    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+    Ops[0] = N->getOperand(1);
+    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+
+    SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
+    Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
+                                  DAG.getIntPtrConstant(0, dl)));
+    return;
+  }
   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
   case X86ISD::FMINC:
   case X86ISD::FMIN:
@@ -25347,6 +25378,132 @@ static SDValue PerformXorCombine(SDNode
   return SDValue();
 }
 
+/// This function detects the AVG pattern between vectors of unsigned i8/i16,
+/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
+/// X86ISD::AVG instruction.
+static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+                                const X86Subtarget *Subtarget, SDLoc DL) {
+  if (!VT.isVector() || !VT.isSimple())
+    return SDValue();
+  EVT InVT = In.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  EVT ScalarVT = VT.getVectorElementType();
+  if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
+        isPowerOf2_32(NumElems)))
+    return SDValue();
+
+  // InScalarVT is the intermediate type in the AVG pattern, and it should be
+  // wider than the original input type (i8/i16).
+  EVT InScalarVT = InVT.getVectorElementType();
+  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
+    return SDValue();
+
+  if (Subtarget->hasAVX512()) {
+    if (VT.getSizeInBits() > 512)
+      return SDValue();
+  } else if (Subtarget->hasAVX2()) {
+    if (VT.getSizeInBits() > 256)
+      return SDValue();
+  } else {
+    if (VT.getSizeInBits() > 128)
+      return SDValue();
+  }
+
+  // Detect the following pattern:
+  //
+  //   %1 = zext <N x i8> %a to <N x i32>
+  //   %2 = zext <N x i8> %b to <N x i32>
+  //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
+  //   %4 = add nuw nsw <N x i32> %3, %2
+  //   %5 = lshr <N x i32> %4, <i32 1 x N>
+  //   %6 = trunc <N x i32> %5 to <N x i8>
+  //
+  // In AVX512, the last instruction can also be a trunc store.
+
+  if (In.getOpcode() != ISD::SRL)
+    return SDValue();
+
+  // A lambda that checks whether the given SDValue is a constant vector with
+  // each element in the range [Min, Max].
+  auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
+    BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
+    if (!BV || !BV->isConstant())
+      return false;
+    for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
+      if (!C)
+        return false;
+      uint64_t Val = C->getZExtValue();
+      if (Val < Min || Val > Max)
+        return false;
+    }
+    return true;
+  };
+
+  // Check if each element of the vector is right-shifted by one.
+  auto LHS = In.getOperand(0);
+  auto RHS = In.getOperand(1);
+  if (!IsConstVectorInRange(RHS, 1, 1))
+    return SDValue();
+  if (LHS.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // Detect a pattern of a + b + 1 where the order doesn't matter.
+  SDValue Operands[3];
+  Operands[0] = LHS.getOperand(0);
+  Operands[1] = LHS.getOperand(1);
+
+  // Take care of the case when one of the operands is a constant vector whose
+  // elements are in the range [1, 256] for i8 (or [1, 65536] for i16).
+  if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
+      Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
+      Operands[0].getOperand(0).getValueType() == VT) {
+    // The pattern is detected. Subtract one from the constant vector, then
+    // truncate it and emit the X86ISD::AVG instruction.
+    SDValue One = DAG.getConstant(1, DL, InScalarVT);
+    SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT,
+                               SmallVector<SDValue, 8>(NumElems, One));
+    Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones);
+    Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
+    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+                       Operands[1]);
+  }
+
+  if (Operands[0].getOpcode() == ISD::ADD)
+    std::swap(Operands[0], Operands[1]);
+  else if (Operands[1].getOpcode() != ISD::ADD)
+    return SDValue();
+  Operands[2] = Operands[1].getOperand(0);
+  Operands[1] = Operands[1].getOperand(1);
+
+  // Now we have three operands of two additions. Check that one of them is a
+  // constant vector with ones, and the other two are promoted from i8/i16.
+  for (int i = 0; i < 3; ++i) {
+    if (!IsConstVectorInRange(Operands[i], 1, 1))
+      continue;
+    std::swap(Operands[i], Operands[2]);
+
+    // Check if Operands[0] and Operands[1] are results of type promotion.
+    for (int j = 0; j < 2; ++j)
+      if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
+          Operands[j].getOperand(0).getValueType() != VT)
+        return SDValue();
+
+    // The pattern is detected; emit the X86ISD::AVG instruction.
+    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+                       Operands[1].getOperand(0));
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  return detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG, Subtarget,
+                          SDLoc(N));
+}
+
 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
@@ -25611,6 +25768,16 @@ static SDValue PerformSTORECombine(SDNod
   // First, pack all of the elements in one place. Next, store to memory
   // in fewer chunks.
   if (St->isTruncatingStore() && VT.isVector()) {
+    // Check if we can detect an AVG pattern from the truncation. If yes,
+    // replace the trunc store by a normal store with the result of X86ISD::AVG
+    // instruction.
+    SDValue Avg =
+        detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl);
+    if (Avg.getNode())
+      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
+                          St->getPointerInfo(), St->isVolatile(),
+                          St->isNonTemporal(), St->getAlignment());
+
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
     unsigned NumElems = VT.getVectorNumElements();
     assert(StVT != VT && "Cannot truncate to the same type");
@@ -26873,6 +27040,7 @@ SDValue X86TargetLowering::PerformDAGCom
   case ISD::UINT_TO_FP:     return PerformUINT_TO_FPCombine(N, DAG, Subtarget);
   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
+  case ISD::TRUNCATE:       return PerformTRUNCATECombine(N, DAG, Subtarget);
   case X86ISD::FXOR:
   case X86ISD::FOR:         return PerformFORCombine(N, DAG, Subtarget);
   case X86ISD::FMIN:

Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=253952&r1=253951&r2=253952&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Mon Nov 23 23:44:19 2015
@@ -4046,6 +4046,10 @@ defm PMAXUB  : PDI_binop_all<0xDE, "pmax
                              SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
 defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
                              SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PAVGB   : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PAVGW   : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
 
 // Intrinsic forms
 defm PSUBSB  : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b,
@@ -4062,10 +4066,6 @@ defm PADDUSW : PDI_binop_all_int<0xDD, "
                                  int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
 defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
                                  int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
-defm PAVGB   : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
-                                 int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>;
-defm PAVGW   : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
-                                 int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>;
 defm PSADBW  : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
                                  int_x86_avx2_psad_bw, SSE_PMADD, 1>;
 

Modified: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h?rev=253952&r1=253951&r2=253952&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h (original)
+++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h Mon Nov 23 23:44:19 2015
@@ -250,6 +250,8 @@ static const IntrinsicData  IntrinsicsWi
   X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx2_pavg_b,  INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(avx2_pavg_w,  INTR_TYPE_2OP, X86ISD::AVG, 0),
   X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
   X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
   X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
@@ -1699,6 +1701,8 @@ static const IntrinsicData  IntrinsicsWi
   X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(sse2_pavg_b,       INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(sse2_pavg_w,       INTR_TYPE_2OP, X86ISD::AVG, 0),
   X86_INTRINSIC_DATA(sse2_pmaxs_w,      INTR_TYPE_2OP, ISD::SMAX, 0),
   X86_INTRINSIC_DATA(sse2_pmaxu_b,      INTR_TYPE_2OP, ISD::UMAX, 0),
   X86_INTRINSIC_DATA(sse2_pmins_w,      INTR_TYPE_2OP, ISD::SMIN, 0),

Added: llvm/trunk/test/CodeGen/X86/avg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avg.ll?rev=253952&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avg.ll (added)
+++ llvm/trunk/test/CodeGen/X86/avg.ll Mon Nov 23 23:44:19 2015
@@ -0,0 +1,627 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX512BW
+
+define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) {
+; SSE2-LABEL: avg_v4i8
+; SSE2:      # BB#0:
+; SSE2-NEXT: movd (%rdi), %xmm0           # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd (%rsi), %xmm1           # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: pavgb %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v4i8
+; AVX2:       # BB#0:
+; AVX2-NEXT: vmovd (%rdi), %xmm0
+; AVX2-NEXT: vmovd (%rsi), %xmm1
+; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovd %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <4 x i8>, <4 x i8>* %a
+  %2 = load <4 x i8>, <4 x i8>* %b
+  %3 = zext <4 x i8> %1 to <4 x i32>
+  %4 = zext <4 x i8> %2 to <4 x i32>
+  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <4 x i32> %5, %4
+  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <4 x i32> %7 to <4 x i8>
+  store <4 x i8> %8, <4 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) {
+; SSE2-LABEL: avg_v8i8
+; SSE2:       # BB#0:
+; SSE2-NEXT: movq (%rdi), %xmm0           # xmm0 = mem[0],zero
+; SSE2-NEXT: movq (%rsi), %xmm1           # xmm1 = mem[0],zero
+; SSE2-NEXT: pavgb %xmm0, %xmm1
+; SSE2-NEXT: movq %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v8i8
+; AVX2:       # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vmovq (%rsi), %xmm1
+; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <8 x i8>, <8 x i8>* %a
+  %2 = load <8 x i8>, <8 x i8>* %b
+  %3 = zext <8 x i8> %1 to <8 x i32>
+  %4 = zext <8 x i8> %2 to <8 x i32>
+  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <8 x i32> %5, %4
+  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <8 x i32> %7 to <8 x i8>
+  store <8 x i8> %8, <8 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
+; SSE2-LABEL: avg_v16i8
+; SSE2:       # BB#0:
+; SSE2-NEXT: movdqa (%rsi), %xmm0
+; SSE2-NEXT: pavgb (%rdi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v16i8
+; AVX2:       # BB#0:
+; AVX2-NEXT: vmovdqa (%rsi), %xmm0
+; AVX2-NEXT: vpavgb (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <16 x i8>, <16 x i8>* %a
+  %2 = load <16 x i8>, <16 x i8>* %b
+  %3 = zext <16 x i8> %1 to <16 x i32>
+  %4 = zext <16 x i8> %2 to <16 x i32>
+  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <16 x i32> %5, %4
+  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <16 x i32> %7 to <16 x i8>
+  store <16 x i8> %8, <16 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
+; AVX2-LABEL: avg_v32i8
+; AVX2:       # BB#0:
+; AVX2-NEXT: vmovdqa (%rsi), %ymm0
+; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+;
+  %1 = load <32 x i8>, <32 x i8>* %a
+  %2 = load <32 x i8>, <32 x i8>* %b
+  %3 = zext <32 x i8> %1 to <32 x i32>
+  %4 = zext <32 x i8> %2 to <32 x i32>
+  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <32 x i32> %5, %4
+  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <32 x i32> %7 to <32 x i8>
+  store <32 x i8> %8, <32 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
+; AVX512BW-LABEL: avg_v64i8
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
+; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: retq
+;
+  %1 = load <64 x i8>, <64 x i8>* %a
+  %2 = load <64 x i8>, <64 x i8>* %b
+  %3 = zext <64 x i8> %1 to <64 x i32>
+  %4 = zext <64 x i8> %2 to <64 x i32>
+  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <64 x i32> %5, %4
+  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <64 x i32> %7 to <64 x i8>
+  store <64 x i8> %8, <64 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) {
+; SSE2-LABEL: avg_v4i16
+; SSE2:       # BB#0:
+; SSE2-NEXT: movq (%rdi), %xmm0           # xmm0 = mem[0],zero
+; SSE2-NEXT: movq (%rsi), %xmm1           # xmm1 = mem[0],zero
+; SSE2-NEXT: pavgw %xmm0, %xmm1
+; SSE2-NEXT: movq %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v4i16
+; AVX2:      # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vmovq (%rsi), %xmm1
+; AVX2-NEXT: vpavgw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <4 x i16>, <4 x i16>* %a
+  %2 = load <4 x i16>, <4 x i16>* %b
+  %3 = zext <4 x i16> %1 to <4 x i32>
+  %4 = zext <4 x i16> %2 to <4 x i32>
+  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <4 x i32> %5, %4
+  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <4 x i32> %7 to <4 x i16>
+  store <4 x i16> %8, <4 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
+; SSE2-LABEL: avg_v8i16
+; SSE2:       # BB#0:
+; SSE2-NEXT: movdqa (%rsi), %xmm0
+; SSE2-NEXT: pavgw (%rdi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v8i16
+; AVX2:       # BB#0:
+; AVX2-NEXT: vmovdqa (%rsi), %xmm0
+; AVX2-NEXT: vpavgw (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <8 x i16>, <8 x i16>* %a
+  %2 = load <8 x i16>, <8 x i16>* %b
+  %3 = zext <8 x i16> %1 to <8 x i32>
+  %4 = zext <8 x i16> %2 to <8 x i32>
+  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <8 x i32> %5, %4
+  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  store <8 x i16> %8, <8 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
+; AVX2-LABEL: avg_v16i16
+; AVX2:       # BB#0:
+; AVX2-NEXT: vmovdqa (%rsi), %ymm0
+; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+;
+  %1 = load <16 x i16>, <16 x i16>* %a
+  %2 = load <16 x i16>, <16 x i16>* %b
+  %3 = zext <16 x i16> %1 to <16 x i32>
+  %4 = zext <16 x i16> %2 to <16 x i32>
+  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <16 x i32> %5, %4
+  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  store <16 x i16> %8, <16 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
+; AVX512BW-LABEL: avg_v32i16
+; AVX512BW:      # BB#0:
+; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0
+; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT: retq
+;
+  %1 = load <32 x i16>, <32 x i16>* %a
+  %2 = load <32 x i16>, <32 x i16>* %b
+  %3 = zext <32 x i16> %1 to <32 x i32>
+  %4 = zext <32 x i16> %2 to <32 x i32>
+  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <32 x i32> %5, %4
+  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <32 x i32> %7 to <32 x i16>
+  store <32 x i16> %8, <32 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) {
+; SSE2-LABEL: avg_v4i8_2
+; SSE2:       # BB#0:
+; SSE2-NEXT:  movd (%rdi), %xmm0
+; SSE2-NEXT:  movd (%rsi), %xmm1
+; SSE2-NEXT:  pavgb %xmm0, %xmm1
+; SSE2-NEXT:  movd %xmm1, (%rax)
+; SSE2-NEXT:  retq
+;
+; AVX2-LABEL: avg_v4i8_2
+; AVX2:      # BB#0:
+; AVX2-NEXT: vmovd (%rdi), %xmm0
+; AVX2-NEXT: vmovd (%rsi), %xmm1
+; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <4 x i8>, <4 x i8>* %a
+  %2 = load <4 x i8>, <4 x i8>* %b
+  %3 = zext <4 x i8> %1 to <4 x i32>
+  %4 = zext <4 x i8> %2 to <4 x i32>
+  %5 = add nuw nsw <4 x i32> %3, %4
+  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
+  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <4 x i32> %7 to <4 x i8>
+  store <4 x i8> %8, <4 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) {
+; SSE2-LABEL: avg_v8i8_2
+; SSE2:       # BB#0:
+; SSE2-NEXT: movq (%rdi), %xmm0           # xmm0 = mem[0],zero
+; SSE2-NEXT: movq (%rsi), %xmm1           # xmm1 = mem[0],zero
+; SSE2-NEXT: pavgb %xmm0, %xmm1
+; SSE2-NEXT: movq %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v8i8_2
+; AVX2:       # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vmovq (%rsi), %xmm1
+; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <8 x i8>, <8 x i8>* %a
+  %2 = load <8 x i8>, <8 x i8>* %b
+  %3 = zext <8 x i8> %1 to <8 x i32>
+  %4 = zext <8 x i8> %2 to <8 x i32>
+  %5 = add nuw nsw <8 x i32> %3, %4
+  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <8 x i32> %7 to <8 x i8>
+  store <8 x i8> %8, <8 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
+; SSE2-LABEL: avg_v16i8_2
+; SSE2:       # BB#0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pavgb (%rsi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v16i8_2
+; AVX2:       # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpavgb (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <16 x i8>, <16 x i8>* %a
+  %2 = load <16 x i8>, <16 x i8>* %b
+  %3 = zext <16 x i8> %1 to <16 x i32>
+  %4 = zext <16 x i8> %2 to <16 x i32>
+  %5 = add nuw nsw <16 x i32> %3, %4
+  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <16 x i32> %7 to <16 x i8>
+  store <16 x i8> %8, <16 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
+; AVX2-LABEL: avg_v32i8_2
+; AVX2:       # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+;
+  %1 = load <32 x i8>, <32 x i8>* %a
+  %2 = load <32 x i8>, <32 x i8>* %b
+  %3 = zext <32 x i8> %1 to <32 x i32>
+  %4 = zext <32 x i8> %2 to <32 x i32>
+  %5 = add nuw nsw <32 x i32> %3, %4
+  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <32 x i32> %7 to <32 x i8>
+  store <32 x i8> %8, <32 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
+; AVX512BW-LABEL: avg_v64i8_2
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
+; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: retq
+;
+  %1 = load <64 x i8>, <64 x i8>* %a
+  %2 = load <64 x i8>, <64 x i8>* %b
+  %3 = zext <64 x i8> %1 to <64 x i32>
+  %4 = zext <64 x i8> %2 to <64 x i32>
+  %5 = add nuw nsw <64 x i32> %4, %4
+  %6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <64 x i32> %7 to <64 x i8>
+  store <64 x i8> %8, <64 x i8>* undef, align 4
+  ret void
+}
+
+
+define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) {
+; SSE2-LABEL: avg_v4i16_2
+; SSE2:       # BB#0:
+; SSE2-NEXT: movq (%rdi), %xmm0           # xmm0 = mem[0],zero
+; SSE2-NEXT: movq (%rsi), %xmm1           # xmm1 = mem[0],zero
+; SSE2-NEXT: pavgw %xmm0, %xmm1
+; SSE2-NEXT: movq %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v4i16_2
+; AVX2:       # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vmovq (%rsi), %xmm1
+; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <4 x i16>, <4 x i16>* %a
+  %2 = load <4 x i16>, <4 x i16>* %b
+  %3 = zext <4 x i16> %1 to <4 x i32>
+  %4 = zext <4 x i16> %2 to <4 x i32>
+  %5 = add nuw nsw <4 x i32> %3, %4
+  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
+  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <4 x i32> %7 to <4 x i16>
+  store <4 x i16> %8, <4 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
+; SSE2-LABEL: avg_v8i16_2
+; SSE2:       # BB#0:
+; SSE2-NEXT:  movdqa (%rdi), %xmm0
+; SSE2-NEXT:  pavgw (%rsi), %xmm0
+; SSE2-NEXT:  movdqu %xmm0, (%rax)
+; SSE2-NEXT:  retq
+;
+; AVX2-LABEL: avg_v8i16_2
+; AVX2:      # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpavgw (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <8 x i16>, <8 x i16>* %a
+  %2 = load <8 x i16>, <8 x i16>* %b
+  %3 = zext <8 x i16> %1 to <8 x i32>
+  %4 = zext <8 x i16> %2 to <8 x i32>
+  %5 = add nuw nsw <8 x i32> %3, %4
+  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  store <8 x i16> %8, <8 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
+; AVX2-LABEL: avg_v16i16_2
+; AVX2:       # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+;
+  %1 = load <16 x i16>, <16 x i16>* %a
+  %2 = load <16 x i16>, <16 x i16>* %b
+  %3 = zext <16 x i16> %1 to <16 x i32>
+  %4 = zext <16 x i16> %2 to <16 x i32>
+  %5 = add nuw nsw <16 x i32> %3, %4
+  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  store <16 x i16> %8, <16 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
+; AVX512BW-LABEL: avg_v32i16_2
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT: retq
+;
+  %1 = load <32 x i16>, <32 x i16>* %a
+  %2 = load <32 x i16>, <32 x i16>* %b
+  %3 = zext <32 x i16> %1 to <32 x i32>
+  %4 = zext <32 x i16> %2 to <32 x i32>
+  %5 = add nuw nsw <32 x i32> %3, %4
+  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <32 x i32> %7 to <32 x i16>
+  store <32 x i16> %8, <32 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v4i8_const(<4 x i8>* %a) {
+; SSE2-LABEL: avg_v4i8_const
+; SSE2:       # BB#0:
+; SSE2-NEXT: movd (%rdi), %xmm0           # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: pavgb {{.*}}, %xmm0
+; SSE2-NEXT: movd %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v4i8_const
+; AVX2:       # BB#0:
+; AVX2-NEXT: vmovd (%rdi), %xmm0
+; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <4 x i8>, <4 x i8>* %a
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
+  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
+  %5 = trunc <4 x i32> %4 to <4 x i8>
+  store <4 x i8> %5, <4 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v8i8_const(<8 x i8>* %a) {
+; SSE2-LABEL: avg_v8i8_const
+; SSE2:       # BB#0:
+; SSE2-NEXT: movq (%rdi), %xmm0           # xmm0 = mem[0],zero
+; SSE2-NEXT: pavgb {{.*}}, %xmm0
+; SSE2-NEXT: movq %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v8i8_const
+; AVX2:       # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <8 x i8>, <8 x i8>* %a
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %5 = trunc <8 x i32> %4 to <8 x i8>
+  store <8 x i8> %5, <8 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v16i8_const(<16 x i8>* %a) {
+; SSE2-LABEL: avg_v16i8_const
+; SSE2:       # BB#0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pavgb {{.*}}, %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v16i8_const
+; AVX2:       # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <16 x i8>, <16 x i8>* %a
+  %2 = zext <16 x i8> %1 to <16 x i32>
+  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %5 = trunc <16 x i32> %4 to <16 x i8>
+  store <16 x i8> %5, <16 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v32i8_const(<32 x i8>* %a) {
+; AVX2-LABEL: avg_v32i8_const
+; AVX2:       # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpavgb {{.*}}, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+;
+  %1 = load <32 x i8>, <32 x i8>* %a
+  %2 = zext <32 x i8> %1 to <32 x i32>
+  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 
+  %5 = trunc <32 x i32> %4 to <32 x i8>
+  store <32 x i8> %5, <32 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v64i8_const(<64 x i8>* %a) {
+; AVX512BW-LABEL: avg_v64i8_const
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
+; AVX512BW-NEXT: vpavgb {{.*}}, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: retq
+;
+  %1 = load <64 x i8>, <64 x i8>* %a
+  %2 = zext <64 x i8> %1 to <64 x i32>
+  %3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 
+  %5 = trunc <64 x i32> %4 to <64 x i8>
+  store <64 x i8> %5, <64 x i8>* undef, align 4
+  ret void
+}
+
+define void @avg_v4i16_const(<4 x i16>* %a) {
+; SSE2-LABEL: avg_v4i16_const
+; SSE2:       # BB#0:
+; SSE2-NEXT: movq (%rdi), %xmm0
+; SSE2-NEXT: pavgw {{.*}}, %xmm0
+; SSE2-NEXT: movq %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v4i16_const
+; AVX2:       # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vpavgw {{.*}}, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <4 x i16>, <4 x i16>* %a
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
+  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
+  %5 = trunc <4 x i32> %4 to <4 x i16>
+  store <4 x i16> %5, <4 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v8i16_const(<8 x i16>* %a) {
+; SSE2-LABEL: avg_v8i16_const
+; SSE2:       # BB#0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pavgw {{.*}}, %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: avg_v8i16_const
+; AVX2:       # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpavgw {{.*}}, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: retq
+;
+  %1 = load <8 x i16>, <8 x i16>* %a
+  %2 = zext <8 x i16> %1 to <8 x i32>
+  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %5 = trunc <8 x i32> %4 to <8 x i16>
+  store <8 x i16> %5, <8 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v16i16_const(<16 x i16>* %a) {
+; AVX2-LABEL: avg_v16i16_const
+; AVX2:       # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpavgw {{.*}}, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+;
+  %1 = load <16 x i16>, <16 x i16>* %a
+  %2 = zext <16 x i16> %1 to <16 x i32>
+  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %5 = trunc <16 x i32> %4 to <16 x i16>
+  store <16 x i16> %5, <16 x i16>* undef, align 4
+  ret void
+}
+
+define void @avg_v32i16_const(<32 x i16>* %a) {
+; AVX512BW-LABEL: avg_v32i16_const
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BW-NEXT: vpavgw {{.*}}, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+;
+  %1 = load <32 x i16>, <32 x i16>* %a
+  %2 = zext <32 x i16> %1 to <32 x i32>
+  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 
+  %5 = trunc <32 x i32> %4 to <32 x i16>
+  store <32 x i16> %5, <32 x i16>* undef, align 4
+  ret void
+}



