[llvm] c167028 - [SLP]Delay vectorization of postponable values for instructions with no users.
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 19 08:40:08 PDT 2022
Author: Alexey Bataev
Date: 2022-08-19T08:39:16-07:00
New Revision: c16702868485f157f60cd34ac64c5dd5db456d47
URL: https://github.com/llvm/llvm-project/commit/c16702868485f157f60cd34ac64c5dd5db456d47
DIFF: https://github.com/llvm/llvm-project/commit/c16702868485f157f60cd34ac64c5dd5db456d47.diff
LOG: [SLP]Delay vectorization of postponable values for instructions with no users.
The SLP vectorizer tries to find reductions starting from the operands of
instructions with no users, void returns, etc. But such operands can be
postponable instructions, like Cmp, InsertElement or InsertValue. Such
operands must still be postponed; the vectorizer should not try to vectorize
them immediately.
Differential Revision: https://reviews.llvm.org/D131965
Added:
Modified:
llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll
llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index b41f3efc5b55b..dfb30c6936883 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -20,6 +20,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/PassManager.h"
@@ -58,6 +59,7 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
using StoreListMap = MapVector<Value *, StoreList>;
using GEPList = SmallVector<GetElementPtrInst *, 8>;
using GEPListMap = MapVector<Value *, GEPList>;
+ using InstSetVector = SmallSetVector<Instruction *, 8>;
ScalarEvolution *SE = nullptr;
TargetTransformInfo *TTI = nullptr;
@@ -124,8 +126,8 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
/// Tries to vectorize constructs started from CmpInst, InsertValueInst or
/// InsertElementInst instructions.
- bool vectorizeSimpleInstructions(SmallVectorImpl<Instruction *> &Instructions,
- BasicBlock *BB, slpvectorizer::BoUpSLP &R,
+ bool vectorizeSimpleInstructions(InstSetVector &Instructions, BasicBlock *BB,
+ slpvectorizer::BoUpSLP &R,
bool AtTerminator);
/// Scan the basic block and look for patterns that are likely to start
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 843dc849a9560..5af240b71c405 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11966,9 +11966,9 @@ static bool compareCmp(Value *V, Value *V2,
return IsCompatibility;
}
-bool SLPVectorizerPass::vectorizeSimpleInstructions(
- SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R,
- bool AtTerminator) {
+bool SLPVectorizerPass::vectorizeSimpleInstructions(InstSetVector &Instructions,
+ BasicBlock *BB, BoUpSLP &R,
+ bool AtTerminator) {
bool OpsChanged = false;
SmallVector<Instruction *, 4> PostponedCmps;
for (auto *I : reverse(Instructions)) {
@@ -12037,9 +12037,10 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
/*LimitForRegisterSize=*/true);
Instructions.clear();
} else {
+ Instructions.clear();
// Insert in reverse order since the PostponedCmps vector was filled in
// reverse order.
- Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend());
+ Instructions.insert(PostponedCmps.rbegin(), PostponedCmps.rend());
}
return OpsChanged;
}
@@ -12192,7 +12193,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
VisitedInstrs.clear();
- SmallVector<Instruction *, 8> PostProcessInstructions;
+ InstSetVector PostProcessInstructions;
SmallDenseSet<Instruction *, 4> KeyNodes;
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
// Skip instructions with scalable type. The num of elements is unknown at
@@ -12244,8 +12245,12 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
!DT->isReachableFromEntry(P->getIncomingBlock(I)))
continue;
- Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I),
- P->getIncomingBlock(I), R, TTI);
+ // Postponed instructions should not be vectorized here, delay their
+ // vectorization.
+ if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
+ PI && !PostProcessInstructions.contains(PI))
+ Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I),
+ P->getIncomingBlock(I), R, TTI);
}
continue;
}
@@ -12272,8 +12277,12 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
if (TryToVectorizeRoot) {
for (auto *V : it->operand_values()) {
- // Try to match and vectorize a horizontal reduction.
- OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
+ // Postponed instructions should not be vectorized here, delay their
+ // vectorization.
+ if (auto *VI = dyn_cast<Instruction>(V);
+ VI && !PostProcessInstructions.contains(VI))
+ // Try to match and vectorize a horizontal reduction.
+ OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
}
}
// Start vectorization of post-process list of instructions from the
@@ -12292,7 +12301,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
if (isa<CmpInst, InsertElementInst, InsertValueInst>(it))
- PostProcessInstructions.push_back(&*it);
+ PostProcessInstructions.insert(&*it);
}
return Changed;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
index 470fa0caa1cd1..8224ffd4d0bcf 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
@@ -430,21 +430,9 @@ define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b,
; Make sure that vectorization happens even if insertelements operations
; must be rescheduled. The case here is from compiling Julia.
define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
-; THRESHOLD-LABEL: @reschedule_extract(
-; THRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
-; THRESHOLD-NEXT: ret <4 x float> [[TMP1]]
-;
-; NOTHRESHOLD-LABEL: @reschedule_extract(
-; NOTHRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
-; NOTHRESHOLD-NEXT: ret <4 x float> [[TMP1]]
-;
-; MINTREESIZE-LABEL: @reschedule_extract(
-; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3
-; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3
-; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
-; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1
-; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[A]], [[B]]
-; MINTREESIZE-NEXT: ret <4 x float> [[TMP5]]
+; CHECK-LABEL: @reschedule_extract(
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
%a0 = extractelement <4 x float> %a, i32 0
%b0 = extractelement <4 x float> %b, i32 0
@@ -468,21 +456,9 @@ define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
; Check that cost model for vectorization takes credit for
; instructions that are erased.
define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
-; THRESHOLD-LABEL: @take_credit(
-; THRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
-; THRESHOLD-NEXT: ret <4 x float> [[TMP1]]
-;
-; NOTHRESHOLD-LABEL: @take_credit(
-; NOTHRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
-; NOTHRESHOLD-NEXT: ret <4 x float> [[TMP1]]
-;
-; MINTREESIZE-LABEL: @take_credit(
-; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3
-; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3
-; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
-; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1
-; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[A]], [[B]]
-; MINTREESIZE-NEXT: ret <4 x float> [[TMP5]]
+; CHECK-LABEL: @take_credit(
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
%a0 = extractelement <4 x float> %a, i32 0
%b0 = extractelement <4 x float> %b, i32 0
@@ -530,21 +506,9 @@ define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
}
define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 {
-; THRESHOLD-LABEL: @_vadd256(
-; THRESHOLD-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
-; THRESHOLD-NEXT: ret <8 x float> [[TMP1]]
-;
-; NOTHRESHOLD-LABEL: @_vadd256(
-; NOTHRESHOLD-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
-; NOTHRESHOLD-NEXT: ret <8 x float> [[TMP1]]
-;
-; MINTREESIZE-LABEL: @_vadd256(
-; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[B:%.*]], i32 7
-; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[A:%.*]], i32 7
-; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
-; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1
-; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <8 x float> [[A]], [[B]]
-; MINTREESIZE-NEXT: ret <8 x float> [[TMP5]]
+; CHECK-LABEL: @_vadd256(
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: ret <8 x float> [[TMP1]]
;
%vecext = extractelement <8 x float> %a, i32 0
%vecext1 = extractelement <8 x float> %b, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
index d7adf196f7c2a..09487d560de4a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -465,21 +465,9 @@ define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b,
; Make sure that vectorization happens even if insertelements operations
; must be rescheduled. The case here is from compiling Julia.
define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
-; THRESHOLD-LABEL: @reschedule_extract(
-; THRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
-; THRESHOLD-NEXT: ret <4 x float> [[TMP1]]
-;
-; NOTHRESHOLD-LABEL: @reschedule_extract(
-; NOTHRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
-; NOTHRESHOLD-NEXT: ret <4 x float> [[TMP1]]
-;
-; MINTREESIZE-LABEL: @reschedule_extract(
-; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3
-; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3
-; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
-; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1
-; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[A]], [[B]]
-; MINTREESIZE-NEXT: ret <4 x float> [[TMP5]]
+; CHECK-LABEL: @reschedule_extract(
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
%a0 = extractelement <4 x float> %a, i32 0
%b0 = extractelement <4 x float> %b, i32 0
@@ -503,21 +491,9 @@ define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
; Check that cost model for vectorization takes credit for
; instructions that are erased.
define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
-; THRESHOLD-LABEL: @take_credit(
-; THRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
-; THRESHOLD-NEXT: ret <4 x float> [[TMP1]]
-;
-; NOTHRESHOLD-LABEL: @take_credit(
-; NOTHRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
-; NOTHRESHOLD-NEXT: ret <4 x float> [[TMP1]]
-;
-; MINTREESIZE-LABEL: @take_credit(
-; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3
-; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3
-; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
-; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1
-; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[A]], [[B]]
-; MINTREESIZE-NEXT: ret <4 x float> [[TMP5]]
+; CHECK-LABEL: @take_credit(
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
%a0 = extractelement <4 x float> %a, i32 0
%b0 = extractelement <4 x float> %b, i32 0
@@ -565,21 +541,9 @@ define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
}
define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 {
-; THRESHOLD-LABEL: @_vadd256(
-; THRESHOLD-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
-; THRESHOLD-NEXT: ret <8 x float> [[TMP1]]
-;
-; NOTHRESHOLD-LABEL: @_vadd256(
-; NOTHRESHOLD-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
-; NOTHRESHOLD-NEXT: ret <8 x float> [[TMP1]]
-;
-; MINTREESIZE-LABEL: @_vadd256(
-; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[B:%.*]], i32 7
-; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[A:%.*]], i32 7
-; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
-; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1
-; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <8 x float> [[A]], [[B]]
-; MINTREESIZE-NEXT: ret <8 x float> [[TMP5]]
+; CHECK-LABEL: @_vadd256(
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: ret <8 x float> [[TMP1]]
;
%vecext = extractelement <8 x float> %a, i32 0
%vecext1 = extractelement <8 x float> %b, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll
index 76e0d9de21181..a017c4c6bb729 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll
@@ -5,23 +5,19 @@ define <4 x double> @test(double* %p2, double %i1754, double %i1781, double %i17
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[I1771:%.*]] = getelementptr inbounds double, double* [[P2:%.*]], i64 54
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[I1754:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1778:%.*]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1754]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[I1792:%.*]] = fmul fast double [[I1754]], [[I1781:%.*]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[I1771]] to <2 x double>*
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1781]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x double> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP10]], double [[I1792]], i32 2
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP8]], i32 3
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[TMP9]], i32 3
-; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <4 x double> [[TMP12]], [[TMP13]]
-; CHECK-NEXT: ret <4 x double> [[TMP14]]
+; CHECK-NEXT: [[I1772:%.*]] = load double, double* [[I1771]], align 8
+; CHECK-NEXT: [[I1795:%.*]] = getelementptr inbounds double, double* [[P2]], i64 55
+; CHECK-NEXT: [[I1796:%.*]] = load double, double* [[I1795]], align 8
+; CHECK-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x double> [[TMP3]], [[SHUFFLE]]
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x double> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: ret <4 x double> [[TMP6]]
;
entry:
%i1771 = getelementptr inbounds double, double* %p2, i64 54
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll
index aa9d89873f551..8c2e5b790d70c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -slp-vectorizer -opaque-pointers -mcpu=x86-64 -S | FileCheck %s --check-prefixes=SSE
; RUN: opt < %s -mtriple=x86_64-unknown -slp-vectorizer -opaque-pointers -mcpu=x86-64-v2 -S | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -slp-vectorizer -opaque-pointers -mcpu=x86-64-v3 -S | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -slp-vectorizer -opaque-pointers -mcpu=x86-64-v4 -S | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -slp-vectorizer -opaque-pointers -mcpu=x86-64-v3 -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -slp-vectorizer -opaque-pointers -mcpu=x86-64-v4 -S | FileCheck %s --check-prefixes=AVX2
define { i64, i64 } @compute_min(ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %x, ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %y) {
; SSE-LABEL: @compute_min(
@@ -123,6 +123,74 @@ define { i64, i64 } @compute_min(ptr nocapture noundef nonnull readonly align 2
; AVX-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[RETVAL_SROA_5_8_INSERT_INSERT]], 1
; AVX-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
+; AVX2-LABEL: @compute_min(
+; AVX2-NEXT: entry:
+; AVX2-NEXT: [[ARRAYIDX_I_I_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[X:%.*]], i64 0, i64 1
+; AVX2-NEXT: [[ARRAYIDX_I_I10_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y:%.*]], i64 0, i64 1
+; AVX2-NEXT: [[ARRAYIDX_I_I_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 2
+; AVX2-NEXT: [[ARRAYIDX_I_I10_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 2
+; AVX2-NEXT: [[ARRAYIDX_I_I_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 3
+; AVX2-NEXT: [[ARRAYIDX_I_I10_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 3
+; AVX2-NEXT: [[ARRAYIDX_I_I_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 4
+; AVX2-NEXT: [[ARRAYIDX_I_I10_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 4
+; AVX2-NEXT: [[ARRAYIDX_I_I_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 5
+; AVX2-NEXT: [[ARRAYIDX_I_I10_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 5
+; AVX2-NEXT: [[ARRAYIDX_I_I_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 6
+; AVX2-NEXT: [[ARRAYIDX_I_I10_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 6
+; AVX2-NEXT: [[ARRAYIDX_I_I_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 7
+; AVX2-NEXT: [[ARRAYIDX_I_I10_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 7
+; AVX2-NEXT: [[TMP0:%.*]] = load i16, ptr [[Y]], align 2
+; AVX2-NEXT: [[TMP1:%.*]] = load i16, ptr [[X]], align 2
+; AVX2-NEXT: [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_1]], align 2
+; AVX2-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_I_I_1]], align 2
+; AVX2-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_2]], align 2
+; AVX2-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_I_I_2]], align 2
+; AVX2-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_3]], align 2
+; AVX2-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_I_I_3]], align 2
+; AVX2-NEXT: [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_4]], align 2
+; AVX2-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I_I_4]], align 2
+; AVX2-NEXT: [[TMP10:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i32 0
+; AVX2-NEXT: [[TMP11:%.*]] = insertelement <2 x i16> [[TMP10]], i16 [[TMP8]], i32 1
+; AVX2-NEXT: [[TMP12:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i32 0
+; AVX2-NEXT: [[TMP13:%.*]] = insertelement <2 x i16> [[TMP12]], i16 [[TMP9]], i32 1
+; AVX2-NEXT: [[TMP14:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP11]], <2 x i16> [[TMP13]])
+; AVX2-NEXT: [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_5]], align 2
+; AVX2-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX_I_I_5]], align 2
+; AVX2-NEXT: [[TMP17:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i32 0
+; AVX2-NEXT: [[TMP18:%.*]] = insertelement <2 x i16> [[TMP17]], i16 [[TMP15]], i32 1
+; AVX2-NEXT: [[TMP19:%.*]] = insertelement <2 x i16> poison, i16 [[TMP3]], i32 0
+; AVX2-NEXT: [[TMP20:%.*]] = insertelement <2 x i16> [[TMP19]], i16 [[TMP16]], i32 1
+; AVX2-NEXT: [[TMP21:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP18]], <2 x i16> [[TMP20]])
+; AVX2-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_6]], align 2
+; AVX2-NEXT: [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX_I_I_6]], align 2
+; AVX2-NEXT: [[TMP24:%.*]] = insertelement <2 x i16> poison, i16 [[TMP6]], i32 0
+; AVX2-NEXT: [[TMP25:%.*]] = insertelement <2 x i16> [[TMP24]], i16 [[TMP22]], i32 1
+; AVX2-NEXT: [[TMP26:%.*]] = insertelement <2 x i16> poison, i16 [[TMP7]], i32 0
+; AVX2-NEXT: [[TMP27:%.*]] = insertelement <2 x i16> [[TMP26]], i16 [[TMP23]], i32 1
+; AVX2-NEXT: [[TMP28:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP25]], <2 x i16> [[TMP27]])
+; AVX2-NEXT: [[TMP29:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_7]], align 2
+; AVX2-NEXT: [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX_I_I_7]], align 2
+; AVX2-NEXT: [[TMP31:%.*]] = insertelement <2 x i16> poison, i16 [[TMP4]], i32 0
+; AVX2-NEXT: [[TMP32:%.*]] = insertelement <2 x i16> [[TMP31]], i16 [[TMP29]], i32 1
+; AVX2-NEXT: [[TMP33:%.*]] = insertelement <2 x i16> poison, i16 [[TMP5]], i32 0
+; AVX2-NEXT: [[TMP34:%.*]] = insertelement <2 x i16> [[TMP33]], i16 [[TMP30]], i32 1
+; AVX2-NEXT: [[TMP35:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP32]], <2 x i16> [[TMP34]])
+; AVX2-NEXT: [[TMP36:%.*]] = zext <2 x i16> [[TMP35]] to <2 x i64>
+; AVX2-NEXT: [[TMP37:%.*]] = shl nuw <2 x i64> [[TMP36]], <i64 32, i64 48>
+; AVX2-NEXT: [[TMP38:%.*]] = zext <2 x i16> [[TMP28]] to <2 x i64>
+; AVX2-NEXT: [[TMP39:%.*]] = shl nuw <2 x i64> [[TMP38]], <i64 48, i64 32>
+; AVX2-NEXT: [[TMP40:%.*]] = or <2 x i64> [[TMP37]], [[TMP39]]
+; AVX2-NEXT: [[TMP41:%.*]] = zext <2 x i16> [[TMP21]] to <2 x i64>
+; AVX2-NEXT: [[TMP42:%.*]] = shl nuw nsw <2 x i64> [[TMP41]], <i64 16, i64 16>
+; AVX2-NEXT: [[TMP43:%.*]] = or <2 x i64> [[TMP40]], [[TMP42]]
+; AVX2-NEXT: [[TMP44:%.*]] = zext <2 x i16> [[TMP14]] to <2 x i64>
+; AVX2-NEXT: [[TMP45:%.*]] = or <2 x i64> [[TMP43]], [[TMP44]]
+; AVX2-NEXT: [[TMP46:%.*]] = extractelement <2 x i64> [[TMP45]], i32 0
+; AVX2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP46]], 0
+; AVX2-NEXT: [[TMP47:%.*]] = extractelement <2 x i64> [[TMP45]], i32 1
+; AVX2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP47]], 1
+; AVX2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
+;
entry:
%0 = load i16, ptr %y, align 2
%1 = load i16, ptr %x, align 2
More information about the llvm-commits
mailing list