[llvm] 0c4651f - [CostModel][AArch64] Improve cost model for vector reduction intrinsics

Rosie Sumpter via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 24 04:16:05 PDT 2021


Author: Rosie Sumpter
Date: 2021-06-24T12:02:58+01:00
New Revision: 0c4651f0a883443259684aa6de69d26a5bd49e46

URL: https://github.com/llvm/llvm-project/commit/0c4651f0a883443259684aa6de69d26a5bd49e46
DIFF: https://github.com/llvm/llvm-project/commit/0c4651f0a883443259684aa6de69d26a5bd49e46.diff

LOG: [CostModel][AArch64] Improve cost model for vector reduction intrinsics

OR, XOR and AND entries are added to the cost table. An extra cost
is added when vector splitting occurs.
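
As a worked illustration of the splitting cost (derived from the cost table
and the reduce-and.ll test below): a v32i8 AND reduction is legalized as two
v16i8 halves, so its cost is the v16i8 table entry (17) plus one extra vector
AND to combine the halves, i.e. 17 + (LT.first - 1) * 1 = 18.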

This addresses a missed SLP vectorization opportunity caused by
unreasonably high costs being attributed to the vector OR reduction
(see: https://bugs.llvm.org/show_bug.cgi?id=44593).

Differential Revision: https://reviews.llvm.org/D104538

Added: 
    llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
    llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/slp-and-reduction.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/slp-or-reduction.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/slp-xor-reduction.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 37e98529d0e99..90a62344d74ee 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1791,17 +1791,68 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
   // Horizontal adds can use the 'addv' instruction. We model the cost of these
   // instructions as normal vector adds. This is the only arithmetic vector
   // reduction operation for which we have an instruction.
+  // OR, XOR and AND costs should match the codegen from:
+  // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
+  // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
+  // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
   static const CostTblEntry CostTblNoPairwise[]{
-      {ISD::ADD, MVT::v8i8,  1},
-      {ISD::ADD, MVT::v16i8, 1},
-      {ISD::ADD, MVT::v4i16, 1},
-      {ISD::ADD, MVT::v8i16, 1},
-      {ISD::ADD, MVT::v4i32, 1},
+      {ISD::ADD, MVT::v8i8,   1},
+      {ISD::ADD, MVT::v16i8,  1},
+      {ISD::ADD, MVT::v4i16,  1},
+      {ISD::ADD, MVT::v8i16,  1},
+      {ISD::ADD, MVT::v4i32,  1},
+      {ISD::OR,  MVT::v8i8,  15},
+      {ISD::OR,  MVT::v16i8, 17},
+      {ISD::OR,  MVT::v4i16,  7},
+      {ISD::OR,  MVT::v8i16,  9},
+      {ISD::OR,  MVT::v2i32,  3},
+      {ISD::OR,  MVT::v4i32,  5},
+      {ISD::OR,  MVT::v2i64,  3},
+      {ISD::XOR, MVT::v8i8,  15},
+      {ISD::XOR, MVT::v16i8, 17},
+      {ISD::XOR, MVT::v4i16,  7},
+      {ISD::XOR, MVT::v8i16,  9},
+      {ISD::XOR, MVT::v2i32,  3},
+      {ISD::XOR, MVT::v4i32,  5},
+      {ISD::XOR, MVT::v2i64,  3},
+      {ISD::AND, MVT::v8i8,  15},
+      {ISD::AND, MVT::v16i8, 17},
+      {ISD::AND, MVT::v4i16,  7},
+      {ISD::AND, MVT::v8i16,  9},
+      {ISD::AND, MVT::v2i32,  3},
+      {ISD::AND, MVT::v4i32,  5},
+      {ISD::AND, MVT::v2i64,  3},
   };
-
-  if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
-    return LT.first * Entry->Cost;
-
+  switch (ISD) {
+  default:
+    break;
+  case ISD::ADD:
+    if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
+      return LT.first * Entry->Cost;
+    break;
+  case ISD::XOR:
+  case ISD::AND:
+  case ISD::OR:
+    const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
+    if (!Entry)
+      break;
+    auto *ValVTy = cast<FixedVectorType>(ValTy);
+    if (!ValVTy->getElementType()->isIntegerTy(1) &&
+        MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
+        isPowerOf2_32(ValVTy->getNumElements())) {
+      InstructionCost ExtraCost = 0;
+      if (LT.first != 1) {
+        // Type needs to be split, so there is an extra cost of LT.first - 1
+        // arithmetic ops.
+        auto *Ty = FixedVectorType::get(ValTy->getElementType(),
+                                        MTy.getVectorNumElements());
+        ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
+        ExtraCost *= LT.first - 1;
+      }
+      return Entry->Cost + ExtraCost;
+    }
+    break;
+  }
   return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
                                            CostKind);
 }

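[Editor's note: a minimal usage sketch, not part of the commit. The file and
function names below are hypothetical; the opt invocation is the one used by
the cost-model tests in this patch. It shows how to inspect the new reduction
costs for a single intrinsic.]

    ; reduce-or-example.ll (hypothetical file name)
    declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>)

    define i8 @or_reduce_v32i8(<32 x i8> %v) {
      ; v32i8 is split into two v16i8 halves during legalization
      %r = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %v)
      ret i8 %r
    }

Run with the same command as the tests:

    opt < reduce-or-example.ll -mtriple=aarch64-unknown-linux-gnu \
        -cost-model -cost-kind=throughput -analyze

With this patch the reported cost for the v32i8 OR reduction is 18, matching
the reduce-or.ll test below.
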
diff  --git a/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
index cbf04bfa82385..6d2fdaf573136 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
@@ -11,8 +11,24 @@ define i32 @reduce_i1(i32 %arg) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 181 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 362 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32i8 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64i8 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i16 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i16 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16i16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i32 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i64 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+
   %V1   = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
   %V2   = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
   %V4   = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
@@ -21,6 +37,22 @@ define i32 @reduce_i1(i32 %arg) {
   %V32  = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
   %V64  = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
   %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
+
+  %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
+  %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
+  %V4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
+  %V8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
+  %V16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
+  %V32i8 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
+  %V64i8 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef)
+  %V4i16 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
+  %V8i16 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
+  %V16i16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
+  %V2i32 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
+  %V4i32 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
+  %V8i32 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
+  %V2i64 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
+  %V4i64 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef)
   ret i32 undef
 }
 
@@ -32,3 +64,18 @@ declare i1 @llvm.vector.reduce.and.v16i1(<16 x i1>)
 declare i1 @llvm.vector.reduce.and.v32i1(<32 x i1>)
 declare i1 @llvm.vector.reduce.and.v64i1(<64 x i1>)
 declare i1 @llvm.vector.reduce.and.v128i1(<128 x i1>)
+declare i8 @llvm.vector.reduce.and.v1i8(<1 x i8>)
+declare i8 @llvm.vector.reduce.and.v3i8(<3 x i8>)
+declare i8 @llvm.vector.reduce.and.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>)
+declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>)
+declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>)
+declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>)

diff  --git a/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
index dba196f0f042e..82068c736533d 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
@@ -11,8 +11,24 @@ define i32 @reduce_i1(i32 %arg) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 181 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 362 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32i8 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64i8 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i16 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i16 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16i16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i32 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i64 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+
   %V1   = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
   %V2   = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
   %V4   = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
@@ -21,6 +37,22 @@ define i32 @reduce_i1(i32 %arg) {
   %V32  = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
   %V64  = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
   %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
+
+  %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
+  %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
+  %V4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
+  %V8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
+  %V16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
+  %V32i8 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
+  %V64i8 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef)
+  %V4i16 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
+  %V8i16 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
+  %V16i16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
+  %V2i32 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
+  %V4i32 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
+  %V8i32 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
+  %V2i64 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
+  %V4i64 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef)
   ret i32 undef
 }
 
@@ -32,3 +64,18 @@ declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1>)
 declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1>)
 declare i1 @llvm.vector.reduce.or.v64i1(<64 x i1>)
 declare i1 @llvm.vector.reduce.or.v128i1(<128 x i1>)
+declare i8 @llvm.vector.reduce.or.v1i8(<1 x i8>)
+declare i8 @llvm.vector.reduce.or.v3i8(<3 x i8>)
+declare i8 @llvm.vector.reduce.or.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>)
+declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>)
+declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>)
+declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)

diff  --git a/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll
new file mode 100644
index 0000000000000..cebcc3aace493
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -cost-model -cost-kind=throughput -analyze | FileCheck %s
+
+define i32 @reduce_i1(i32 %arg) {
+; CHECK-LABEL: 'reduce_i1'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 364 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 455 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 637 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1001 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32i8 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64i8 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i16 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i16 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16i16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i32 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i64 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+
+  %V1   = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+  %V2   = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+  %V4   = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+  %V8   = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+  %V16  = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+  %V32  = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+  %V64  = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+  %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
+
+  %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef)
+  %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
+  %V4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
+  %V8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
+  %V16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
+  %V32i8 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
+  %V64i8 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef)
+  %V4i16 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
+  %V8i16 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
+  %V16i16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
+  %V2i32 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
+  %V4i32 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
+  %V8i32 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
+  %V2i64 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
+  %V4i64 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef)
+  ret i32 undef
+}
+
+declare i1 @llvm.vector.reduce.xor.v1i1(<1 x i1>)
+declare i1 @llvm.vector.reduce.xor.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.xor.v4i1(<4 x i1>)
+declare i1 @llvm.vector.reduce.xor.v8i1(<8 x i1>)
+declare i1 @llvm.vector.reduce.xor.v16i1(<16 x i1>)
+declare i1 @llvm.vector.reduce.xor.v32i1(<32 x i1>)
+declare i1 @llvm.vector.reduce.xor.v64i1(<64 x i1>)
+declare i1 @llvm.vector.reduce.xor.v128i1(<128 x i1>)
+declare i8 @llvm.vector.reduce.xor.v1i8(<1 x i8>)
+declare i8 @llvm.vector.reduce.xor.v3i8(<3 x i8>)
+declare i8 @llvm.vector.reduce.xor.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.xor.v64i8(<64 x i8>)
+declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>)
+declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>)
+declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>)

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-and-reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-and-reduction.ll
index 7a34b67164a68..2f7076dd25ca3 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-and-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-and-reduction.ll
@@ -7,54 +7,29 @@ define i8 @reduce_and(%struct.buf* %a, %struct.buf* %b) {
 ; CHECK-LABEL: @reduce_and(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], %struct.buf* [[A:%.*]], i64 0, i32 0, i64 0
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B:%.*]], i64 0, i32 0, i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX3]], align 1
-; CHECK-NEXT:    [[XOR12:%.*]] = xor i8 [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[AND13:%.*]] = and i8 [[XOR12]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT:    [[XOR12_1:%.*]] = xor i8 [[TMP3]], [[TMP2]]
-; CHECK-NEXT:    [[AND13_1:%.*]] = and i8 [[XOR12_1]], [[AND13]]
 ; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 2
-; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX_2]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 2
-; CHECK-NEXT:    [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX3_2]], align 1
-; CHECK-NEXT:    [[XOR12_2:%.*]] = xor i8 [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[AND13_2:%.*]] = and i8 [[XOR12_2]], [[AND13_1]]
 ; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 3
-; CHECK-NEXT:    [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX_3]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 3
-; CHECK-NEXT:    [[TMP7:%.*]] = load i8, i8* [[ARRAYIDX3_3]], align 1
-; CHECK-NEXT:    [[XOR12_3:%.*]] = xor i8 [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[AND13_3:%.*]] = and i8 [[XOR12_3]], [[AND13_2]]
 ; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX_4]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_4:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 4
-; CHECK-NEXT:    [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX3_4]], align 1
-; CHECK-NEXT:    [[XOR12_4:%.*]] = xor i8 [[TMP9]], [[TMP8]]
-; CHECK-NEXT:    [[AND13_4:%.*]] = and i8 [[XOR12_4]], [[AND13_3]]
 ; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 5
-; CHECK-NEXT:    [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX_5]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_5:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 5
-; CHECK-NEXT:    [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX3_5]], align 1
-; CHECK-NEXT:    [[XOR12_5:%.*]] = xor i8 [[TMP11]], [[TMP10]]
-; CHECK-NEXT:    [[AND13_5:%.*]] = and i8 [[XOR12_5]], [[AND13_4]]
 ; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 6
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX_6]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 6
-; CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX3_6]], align 1
-; CHECK-NEXT:    [[XOR12_6:%.*]] = xor i8 [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    [[AND13_6:%.*]] = and i8 [[XOR12_6]], [[AND13_5]]
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 7
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX_7]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[ARRAYIDX]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, i8* [[ARRAYIDX3_7]], align 1
-; CHECK-NEXT:    [[XOR12_7:%.*]] = xor i8 [[TMP15]], [[TMP14]]
-; CHECK-NEXT:    [[AND13_7:%.*]] = and i8 [[XOR12_7]], [[AND13_6]]
-; CHECK-NEXT:    ret i8 [[AND13_7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[ARRAYIDX3]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <8 x i8> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> [[TMP4]])
+; CHECK-NEXT:    [[OP_EXTRA:%.*]] = and i8 [[TMP5]], 1
+; CHECK-NEXT:    ret i8 [[OP_EXTRA]]
 ;
 entry:
   %arrayidx = getelementptr inbounds %struct.buf, %struct.buf* %a, i64 0, i32 0, i64 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-or-reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-or-reduction.ll
index c456a3b8778e8..185eb8a73ffb1 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-or-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-or-reduction.ll
@@ -7,54 +7,29 @@ define i8 @reduce_or(%struct.buf* %a, %struct.buf* %b) {
 ; CHECK-LABEL: @reduce_or(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], %struct.buf* [[A:%.*]], i64 0, i32 0, i64 0
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B:%.*]], i64 0, i32 0, i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX3]], align 1
-; CHECK-NEXT:    [[XOR12:%.*]] = xor i8 [[TMP1]], [[TMP0]]
 ; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT:    [[XOR12_1:%.*]] = xor i8 [[TMP3]], [[TMP2]]
-; CHECK-NEXT:    [[OR13_1:%.*]] = or i8 [[XOR12_1]], [[XOR12]]
 ; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 2
-; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX_2]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 2
-; CHECK-NEXT:    [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX3_2]], align 1
-; CHECK-NEXT:    [[XOR12_2:%.*]] = xor i8 [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[OR13_2:%.*]] = or i8 [[XOR12_2]], [[OR13_1]]
 ; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 3
-; CHECK-NEXT:    [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX_3]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 3
-; CHECK-NEXT:    [[TMP7:%.*]] = load i8, i8* [[ARRAYIDX3_3]], align 1
-; CHECK-NEXT:    [[XOR12_3:%.*]] = xor i8 [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[OR13_3:%.*]] = or i8 [[XOR12_3]], [[OR13_2]]
 ; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX_4]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_4:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 4
-; CHECK-NEXT:    [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX3_4]], align 1
-; CHECK-NEXT:    [[XOR12_4:%.*]] = xor i8 [[TMP9]], [[TMP8]]
-; CHECK-NEXT:    [[OR13_4:%.*]] = or i8 [[XOR12_4]], [[OR13_3]]
 ; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 5
-; CHECK-NEXT:    [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX_5]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_5:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 5
-; CHECK-NEXT:    [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX3_5]], align 1
-; CHECK-NEXT:    [[XOR12_5:%.*]] = xor i8 [[TMP11]], [[TMP10]]
-; CHECK-NEXT:    [[OR13_5:%.*]] = or i8 [[XOR12_5]], [[OR13_4]]
 ; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 6
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX_6]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 6
-; CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX3_6]], align 1
-; CHECK-NEXT:    [[XOR12_6:%.*]] = xor i8 [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    [[OR13_6:%.*]] = or i8 [[XOR12_6]], [[OR13_5]]
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 7
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX_7]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[ARRAYIDX]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, i8* [[ARRAYIDX3_7]], align 1
-; CHECK-NEXT:    [[XOR12_7:%.*]] = xor i8 [[TMP15]], [[TMP14]]
-; CHECK-NEXT:    [[OR13_7:%.*]] = or i8 [[XOR12_7]], [[OR13_6]]
-; CHECK-NEXT:    ret i8 [[OR13_7]]
-;
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[ARRAYIDX3]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <8 x i8> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP4]])
+; CHECK-NEXT:    ret i8 [[TMP5]]
+
 entry:
   %arrayidx = getelementptr inbounds %struct.buf, %struct.buf* %a, i64 0, i32 0, i64 0
   %0 = load i8, i8* %arrayidx, align 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-xor-reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-xor-reduction.ll
index 6c35679581897..2c59e57cec56a 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-xor-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-xor-reduction.ll
@@ -7,54 +7,29 @@ define i8 @reduce_xor(%struct.buf* %a, %struct.buf* %b) {
 ; CHECK-LABEL: @reduce_xor(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], %struct.buf* [[A:%.*]], i64 0, i32 0, i64 0
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B:%.*]], i64 0, i32 0, i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX3]], align 1
-; CHECK-NEXT:    [[AND12:%.*]] = and i8 [[TMP1]], [[TMP0]]
 ; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT:    [[AND12_1:%.*]] = and i8 [[TMP3]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = xor i8 [[AND12]], [[AND12_1]]
 ; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 2
-; CHECK-NEXT:    [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX_2]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 2
-; CHECK-NEXT:    [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX3_2]], align 1
-; CHECK-NEXT:    [[AND12_2:%.*]] = and i8 [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = xor i8 [[TMP4]], [[AND12_2]]
 ; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 3
-; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX_3]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 3
-; CHECK-NEXT:    [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX3_3]], align 1
-; CHECK-NEXT:    [[AND12_3:%.*]] = and i8 [[TMP9]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = xor i8 [[TMP7]], [[AND12_3]]
 ; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 4
-; CHECK-NEXT:    [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX_4]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_4:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 4
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX3_4]], align 1
-; CHECK-NEXT:    [[AND12_4:%.*]] = and i8 [[TMP12]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = xor i8 [[TMP10]], [[AND12_4]]
 ; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 5
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX_5]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_5:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 5
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, i8* [[ARRAYIDX3_5]], align 1
-; CHECK-NEXT:    [[AND12_5:%.*]] = and i8 [[TMP15]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = xor i8 [[TMP13]], [[AND12_5]]
 ; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 6
-; CHECK-NEXT:    [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX_6]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 6
-; CHECK-NEXT:    [[TMP18:%.*]] = load i8, i8* [[ARRAYIDX3_6]], align 1
-; CHECK-NEXT:    [[AND12_6:%.*]] = and i8 [[TMP18]], [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = xor i8 [[TMP16]], [[AND12_6]]
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 7
-; CHECK-NEXT:    [[TMP20:%.*]] = load i8, i8* [[ARRAYIDX_7]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[ARRAYIDX]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7
-; CHECK-NEXT:    [[TMP21:%.*]] = load i8, i8* [[ARRAYIDX3_7]], align 1
-; CHECK-NEXT:    [[AND12_7:%.*]] = and i8 [[TMP21]], [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = xor i8 [[TMP19]], [[AND12_7]]
-; CHECK-NEXT:    [[XOR13_7:%.*]] = xor i8 [[TMP22]], 1
-; CHECK-NEXT:    ret i8 [[XOR13_7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[ARRAYIDX3]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> [[TMP4]])
+; CHECK-NEXT:    [[OP_EXTRA:%.*]] = xor i8 [[TMP5]], 1
+; CHECK-NEXT:    ret i8 [[OP_EXTRA]]
 ;
 entry:
   %arrayidx = getelementptr inbounds %struct.buf, %struct.buf* %a, i64 0, i32 0, i64 0


        

