[llvm] [SLP]Add runtime stride support for strided loads. (PR #81517)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 5 06:35:40 PST 2024
https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/81517
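In short, SLP now recognizes groups of loads whose common stride is only known at run time, materializes that stride with SCEVExpander, and emits @llvm.experimental.vp.strided.load with a runtime stride operand. A minimal sketch of the effect, condensed from the updated test1 CHECK lines in the test diff below (%byte.stride and %v are illustrative names, not taken from the test):

    %str = zext i32 %stride to i64
    ; eight scalar loads at %p + i * (%str * 4) bytes, i = 0..7, become a
    ; single strided VP load whose byte stride is computed at run time:
    %byte.stride = mul i64 %str, 4
    %v = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 %p, i64 %byte.stride, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)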
From 72d585d454b69eebfd0426509ac68d49331e8d3e Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Mon, 12 Feb 2024 19:05:25 +0000
Subject: [PATCH] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 150 +++++++++++++++++-
.../RISCV/strided-loads-vectorized.ll | 98 ++----------
2 files changed, 155 insertions(+), 93 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c54d065cac6382..8fa32035d172d5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -87,6 +87,7 @@
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -3890,6 +3891,122 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) {
});
}
+/// Checks if the provided list of pointers \p PointerOps represents strided
+/// pointers for the element type \p ElemTy. If it does not, std::nullopt is
+/// returned. Otherwise, if \p Inst is not specified, an engaged optional
+/// holding nullptr is returned, just to indicate that the pointers are
+/// strided. If \p Inst is specified, the runtime stride is materialized
+/// before the given \p Inst.
+/// \returns std::nullopt if the pointers do not have a runtime stride;
+/// otherwise nullptr or the materialized stride value.
+static std::optional<Value *>
+calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
+ const DataLayout &DL, ScalarEvolution &SE,
+ SmallVectorImpl<unsigned> &SortedIndices,
+ Instruction *Inst = nullptr) {
+ SmallVector<const SCEV *> SCEVs;
+ const SCEV *PtrSCEVA = nullptr;
+ const SCEV *PtrSCEVB = nullptr;
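+ // PtrSCEVA/PtrSCEVB track the candidates for the lowest and highest pointer
+ // SCEVs seen so far; their difference gives the total span of the accesses.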
+ for (Value *Ptr : PointerOps) {
+ const SCEV *PtrSCEV = SE.getSCEV(Ptr);
+ if (!PtrSCEV)
+ return std::nullopt;
+ SCEVs.push_back(PtrSCEV);
+ if (!PtrSCEVA && !PtrSCEVB) {
+ PtrSCEVA = PtrSCEVB = PtrSCEV;
+ continue;
+ }
+ const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVA);
+ if (!Diff || isa<SCEVCouldNotCompute>(Diff))
+ return std::nullopt;
+ if (Diff->isNonConstantNegative()) {
+ PtrSCEVA = PtrSCEV;
+ continue;
+ }
+ const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVB, PtrSCEV);
+ if (!Diff1 || isa<SCEVCouldNotCompute>(Diff1))
+ return std::nullopt;
+ if (Diff1->isNonConstantNegative()) {
+ PtrSCEVB = PtrSCEV;
+ continue;
+ }
+ }
+ const SCEV *Stride = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
+ if (!Stride)
+ return std::nullopt;
+ int Size = DL.getTypeStoreSize(ElemTy);
+ auto TryGetStride = [&](const SCEV *Dist,
+ const SCEV *Multiplier) -> const SCEV * {
+ if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
+ if (M->getOperand(0) == Multiplier)
+ return M->getOperand(1);
+ if (M->getOperand(1) == Multiplier)
+ return M->getOperand(0);
+ return nullptr;
+ }
+ if (Multiplier == Dist)
+ return SE.getConstant(Dist->getType(), 1);
+ return SE.getUDivExactExpr(Dist, Multiplier);
+ };
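+ // The total span PtrSCEVB - PtrSCEVA covers SCEVs.size() - 1 gaps of
+ // Size * stride bytes each, so divide it out to recover the stride
+ // expressed in elements.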
+ if (Size != 1 || SCEVs.size() > 2) {
+ const SCEV *Sz =
+ SE.getConstant(Stride->getType(), Size * (SCEVs.size() - 1));
+ Stride = TryGetStride(Stride, Sz);
+ if (!Stride)
+ return std::nullopt;
+ }
+ if (!Stride || isa<SCEVConstant>(Stride))
+ return std::nullopt;
+ // Iterate through all pointers and check if all distances are
+ // unique multiples of Stride.
+ using DistOrdPair = std::pair<int64_t, int>;
+ auto Compare = llvm::less_first();
+ std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
+ int Cnt = 0;
+ bool IsConsecutive = true;
+ for (const SCEV *PtrSCEV : SCEVs) {
+ unsigned Dist = 0;
+ if (PtrSCEV != PtrSCEVA) {
+ const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVA);
+ const SCEV *Coeff = TryGetStride(Diff, Stride);
+ if (!Coeff)
+ return std::nullopt;
+ const auto *SC = dyn_cast<SCEVConstant>(Coeff);
+ if (!SC || isa<SCEVCouldNotCompute>(SC))
+ return std::nullopt;
+ if (!SE.getMinusSCEV(PtrSCEV,
+ SE.getAddExpr(PtrSCEVA, SE.getMulExpr(Stride, SC)))
+ ->isZero())
+ return std::nullopt;
+ Dist = SC->getAPInt().getZExtValue();
+ }
+ // If a distance is not a multiple of the element size, or an offset is
+ // repeated or out of range, we can't vectorize.
+ if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
+ return std::nullopt;
+ auto Res = Offsets.emplace(Dist, Cnt);
+ if (!Res.second)
+ return std::nullopt;
+ // Consecutive order if the inserted element is the last one.
+ IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
+ ++Cnt;
+ }
+ if (Offsets.size() != SCEVs.size())
+ return std::nullopt;
+ SortedIndices.clear();
+ if (!IsConsecutive) {
+ // Fill SortedIndices array only if it is non-consecutive.
+ SortedIndices.resize(PointerOps.size());
+ Cnt = 0;
+ for (const std::pair<int64_t, int> &Pair : Offsets) {
+ SortedIndices[Cnt] = Pair.second;
+ ++Cnt;
+ }
+ }
+ if (!Inst)
+ return nullptr;
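+ // Materialize the runtime stride as an IR value right before Inst.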
+ SCEVExpander Expander(SE, DL, "strided-load-vec");
+ return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
+}
+
/// Checks if the given array of loads can be represented as a vectorized,
/// scatter or just simple gather.
static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
@@ -3927,6 +4044,11 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
// Check the order of pointer operands or that all pointers are the same.
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
+ Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
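+ // Even if the pointers are not statically sorted, they may still form a
+ // strided pattern whose stride is only known at run time.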
+ if (!IsSorted && Sz > MinProfitableStridedLoads && TTI.isTypeLegal(VecTy) &&
+ TTI.isLegalStridedLoadStore(VecTy, CommonAlignment) &&
+ calculateRtStride(PointerOps, ScalarTy, DL, SE, Order))
+ return LoadsState::StridedVectorize;
if (IsSorted || all_of(PointerOps, [&](Value *P) {
return arePointersCompatible(P, PointerOps.front(), TLI);
})) {
@@ -11645,10 +11767,30 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
std::optional<int> Diff = getPointersDiff(
VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
Type *StrideTy = DL->getIndexType(PO->getType());
- int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
- Value *StrideVal =
- ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
- DL->getTypeAllocSize(ScalarTy));
+ Value *StrideVal;
+ if (Diff) {
+ int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
+ StrideVal =
+ ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
+ DL->getTypeAllocSize(ScalarTy));
+ } else {
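+ // The pointers differ by a runtime stride: recompute it from the pointer
+ // operands and expand it right before the strided load.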
+ SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
+ transform(E->Scalars, PointerOps.begin(), [](Value *V) {
+ return cast<LoadInst>(V)->getPointerOperand();
+ });
+ OrdersType Order;
+ std::optional<Value *> Stride =
+ calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
+ &*Builder.GetInsertPoint());
+ Value *NewStride =
+ Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
+ StrideVal = Builder.CreateMul(
+ NewStride,
+ ConstantInt::get(
+ StrideTy,
+ (IsReverseOrder ? -1 : 1) *
+ static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
+ }
Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
auto *Inst = Builder.CreateIntrinsic(
Intrinsic::experimental_vp_strided_load,
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
index 4b0b41970bbb4d..03acc0009fb04c 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
@@ -78,67 +78,13 @@ define void @test1(ptr %p, ptr noalias %s, i32 %stride) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[STR:%.*]] = zext i32 [[STRIDE:%.*]] to i64
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
-; CHECK-NEXT: [[I:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
-; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fsub fast float [[I1]], [[I]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[STR]]
-; CHECK-NEXT: [[I2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 26
-; CHECK-NEXT: [[I3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = fsub fast float [[I3]], [[I2]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[S]], i64 1
-; CHECK-NEXT: store float [[ADD7]], ptr [[ARRAYIDX9]], align 4
-; CHECK-NEXT: [[ST1:%.*]] = mul i64 [[STR]], 2
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST1]]
-; CHECK-NEXT: [[I4:%.*]] = load float, ptr [[ARRAYIDX11]], align 4
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 22
-; CHECK-NEXT: [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4
-; CHECK-NEXT: [[ADD14:%.*]] = fsub fast float [[I5]], [[I4]]
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[S]], i64 2
-; CHECK-NEXT: store float [[ADD14]], ptr [[ARRAYIDX16]], align 4
-; CHECK-NEXT: [[ST2:%.*]] = mul i64 [[STR]], 3
-; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST2]]
-; CHECK-NEXT: [[I6:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 18
-; CHECK-NEXT: [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
-; CHECK-NEXT: [[ADD21:%.*]] = fsub fast float [[I7]], [[I6]]
-; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[S]], i64 3
-; CHECK-NEXT: store float [[ADD21]], ptr [[ARRAYIDX23]], align 4
-; CHECK-NEXT: [[ST3:%.*]] = mul i64 [[STR]], 4
-; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST3]]
-; CHECK-NEXT: [[I8:%.*]] = load float, ptr [[ARRAYIDX25]], align 4
-; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 14
-; CHECK-NEXT: [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4
-; CHECK-NEXT: [[ADD28:%.*]] = fsub fast float [[I9]], [[I8]]
-; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[S]], i64 4
-; CHECK-NEXT: store float [[ADD28]], ptr [[ARRAYIDX30]], align 4
-; CHECK-NEXT: [[ST4:%.*]] = mul i64 [[STR]], 5
-; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST4]]
-; CHECK-NEXT: [[I10:%.*]] = load float, ptr [[ARRAYIDX32]], align 4
-; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 10
-; CHECK-NEXT: [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4
-; CHECK-NEXT: [[ADD35:%.*]] = fsub fast float [[I11]], [[I10]]
-; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[S]], i64 5
-; CHECK-NEXT: store float [[ADD35]], ptr [[ARRAYIDX37]], align 4
-; CHECK-NEXT: [[ST5:%.*]] = mul i64 [[STR]], 6
-; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST5]]
-; CHECK-NEXT: [[I12:%.*]] = load float, ptr [[ARRAYIDX39]], align 4
-; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 6
-; CHECK-NEXT: [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT: [[ADD42:%.*]] = fsub fast float [[I13]], [[I12]]
-; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[S]], i64 6
-; CHECK-NEXT: store float [[ADD42]], ptr [[ARRAYIDX44]], align 4
-; CHECK-NEXT: [[ST6:%.*]] = mul i64 [[STR]], 7
-; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST6]]
-; CHECK-NEXT: [[I14:%.*]] = load float, ptr [[ARRAYIDX46]], align 4
-; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 2
-; CHECK-NEXT: [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[ADD49:%.*]] = fsub fast float [[I15]], [[I14]]
-; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[S]], i64 7
-; CHECK-NEXT: store float [[ADD49]], ptr [[ARRAYIDX51]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[STR]], 4
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 [[TMP0]], <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -215,38 +161,12 @@ define void @test2(ptr %p, ptr noalias %s, i32 %stride) {
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 2
; CHECK-NEXT: [[ST6:%.*]] = mul i64 [[STR]], 7
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST6]]
-; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT: [[ST5:%.*]] = mul i64 [[STR]], 6
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST5]]
-; CHECK-NEXT: [[I3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ST4:%.*]] = mul i64 [[STR]], 5
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST4]]
-; CHECK-NEXT: [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4
-; CHECK-NEXT: [[ST3:%.*]] = mul i64 [[STR]], 4
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST3]]
-; CHECK-NEXT: [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
-; CHECK-NEXT: [[ST2:%.*]] = mul i64 [[STR]], 3
-; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST2]]
-; CHECK-NEXT: [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4
-; CHECK-NEXT: [[ST1:%.*]] = mul i64 [[STR]], 2
-; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST1]]
-; CHECK-NEXT: [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4
-; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[STR]]
-; CHECK-NEXT: [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 0
-; CHECK-NEXT: [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[I1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[I3]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[I5]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[I7]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[I9]], i32 4
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[I11]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[I13]], i32 6
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[I15]], i32 7
-; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <8 x float> [[TMP8]], [[TMP0]]
-; CHECK-NEXT: store <8 x float> [[TMP9]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[STR]], -4
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 [[TMP1]], <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP0]]
+; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: ret void
;
entry: