[libc-commits] [flang] [libc] [llvm] [libc] Implement ucontext functions for x86_64 (PR #187712)
Jeff Bailey via libc-commits
libc-commits at lists.llvm.org
Thu Apr 30 09:12:56 PDT 2026
https://github.com/kaladron updated https://github.com/llvm/llvm-project/pull/187712
>From 16fcdb9fb6185dde371b5bef2e0b0ebdf8d109fa Mon Sep 17 00:00:00 2001
From: Jeff Bailey <jbailey at raspberryginger.com>
Date: Fri, 20 Mar 2026 14:11:51 +0000
Subject: [PATCH] [libc] Implement x86_64 ucontext functions (#187712)
Implemented the x86_64 ucontext functions: getcontext, setcontext,
makecontext, and swapcontext.
The structures in mcontext_t.h and ucontext_t.h are defined to match the
system ABI specified in the Linux kernel headers and glibc, ensuring that
ucontext_t can correctly interpret contexts provided by the kernel in
signal handlers.
---
.../flang/Lower/Support/ReductionProcessor.h | 10 +-
flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 6 +-
flang/lib/Lower/OpenMP/ClauseProcessor.h | 4 +-
flang/lib/Lower/OpenMP/OpenMP.cpp | 30 +-
.../lib/Lower/Support/ReductionProcessor.cpp | 31 +-
flang/lib/Semantics/check-omp-structure.cpp | 8 +-
flang/lib/Semantics/check-omp-structure.h | 1 -
flang/test/Lower/OpenMP/wsloop-simd.f90 | 19 +
libc/config/linux/x86_64/entrypoints.txt | 2 +
libc/docs/CMakeLists.txt | 1 +
libc/docs/headers/index.rst | 1 +
libc/src/ucontext/CMakeLists.txt | 13 +
libc/src/ucontext/getcontext.h | 5 +-
libc/src/ucontext/makecontext.h | 22 +
libc/src/ucontext/setcontext.h | 5 +-
libc/src/ucontext/swapcontext.h | 22 +
libc/src/ucontext/x86_64/CMakeLists.txt | 35 ++
libc/src/ucontext/x86_64/getcontext.cpp | 14 +-
libc/src/ucontext/x86_64/makecontext.cpp | 93 ++++
libc/src/ucontext/x86_64/setcontext.cpp | 25 +-
libc/src/ucontext/x86_64/swapcontext.cpp | 127 +++++
libc/test/src/ucontext/CMakeLists.txt | 2 +
libc/test/src/ucontext/ucontext_test.cpp | 91 +++-
libc/utils/docgen/nl_types.yaml | 13 +
.../CodeGen/SelectionDAG/TargetLowering.cpp | 39 +-
llvm/lib/ProfileData/CMakeLists.txt | 6 +-
llvm/test/CodeGen/AArch64/aarch64-mulv.ll | 78 ++--
llvm/test/CodeGen/AArch64/double_reduct.ll | 25 +-
llvm/test/CodeGen/AArch64/vecreduce-fmul.ll | 64 +--
llvm/test/CodeGen/PowerPC/cttz-elts.ll | 80 ++--
llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll | 21 +-
llvm/test/CodeGen/X86/vector-compress.ll | 432 +++++++-----------
.../CodeGen/X86/vector-extract-last-active.ll | 102 +++--
llvm/test/tools/llubi/intr_vector_manip.ll | 18 +-
llvm/test/tools/llubi/intr_vscale_poison.ll | 4 +-
llvm/tools/llubi/lib/Interpreter.cpp | 33 +-
llvm/tools/llubi/llubi.cpp | 10 +
37 files changed, 988 insertions(+), 504 deletions(-)
create mode 100644 libc/src/ucontext/makecontext.h
create mode 100644 libc/src/ucontext/swapcontext.h
create mode 100644 libc/src/ucontext/x86_64/makecontext.cpp
create mode 100644 libc/src/ucontext/x86_64/swapcontext.cpp
create mode 100644 libc/utils/docgen/nl_types.yaml
diff --git a/flang/include/flang/Lower/Support/ReductionProcessor.h b/flang/include/flang/Lower/Support/ReductionProcessor.h
index bbc4879bbe352..0b4a692827a79 100644
--- a/flang/include/flang/Lower/Support/ReductionProcessor.h
+++ b/flang/include/flang/Lower/Support/ReductionProcessor.h
@@ -144,6 +144,12 @@ class ReductionProcessor {
/// Creates a reduction declaration and associates it with an OpenMP block
/// directive.
+ /// \param [in,out] reductionVarCache - optional cache mapping reduction
+ /// symbols to their SSA values. When provided, array/box reduction
+ /// variables that have already been allocated will be reused instead of
+ /// creating new allocas. This ensures that nested composite wrappers
+ /// (e.g. wsloop and simd in DO SIMD) share the same SSA values, allowing
+ /// the genLoopVars() mapper to correctly remap inner wrapper operands.
template <typename OpType, typename RedOperatorListTy>
static bool processReductionArguments(
mlir::Location currentLocation, lower::AbstractConverter &converter,
@@ -151,7 +157,9 @@ class ReductionProcessor {
llvm::SmallVectorImpl<mlir::Value> &reductionVars,
llvm::SmallVectorImpl<bool> &reduceVarByRef,
llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
- const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols);
+ const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value>
+ *reductionVarCache = nullptr);
};
template <typename FloatOp, typename IntegerOp>
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 796dda34e3821..1c39e90a922cf 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -2023,7 +2023,9 @@ bool ClauseProcessor::processNontemporal(
bool ClauseProcessor::processReduction(
mlir::Location currentLocation, mlir::omp::ReductionClauseOps &result,
- llvm::SmallVectorImpl<const semantics::Symbol *> &outReductionSyms) const {
+ llvm::SmallVectorImpl<const semantics::Symbol *> &outReductionSyms,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache)
+ const {
return findRepeatableClause<omp::clause::Reduction>(
[&](const omp::clause::Reduction &clause, const parser::CharBlock &) {
llvm::SmallVector<mlir::Value> reductionVars;
@@ -2047,7 +2049,7 @@ bool ClauseProcessor::processReduction(
currentLocation, converter,
std::get<typename omp::clause::ReductionOperatorList>(clause.t),
reductionVars, reduceVarByRef, reductionDeclSymbols,
- reductionSyms))
+ reductionSyms, reductionVarCache))
TODO(currentLocation, "Lowering unrecognised reduction type");
// Copy local lists into the output.
llvm::copy(reductionVars, std::back_inserter(result.reductionVars));
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index 29b5c29b8e33a..acf1068efb987 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -166,7 +166,9 @@ class ClauseProcessor {
bool processNontemporal(mlir::omp::NontemporalClauseOps &result) const;
bool processReduction(
mlir::Location currentLocation, mlir::omp::ReductionClauseOps &result,
- llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) const;
+ llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value>
+ *reductionVarCache = nullptr) const;
bool processTaskReduction(
mlir::Location currentLocation, mlir::omp::TaskReductionClauseOps &result,
llvm::SmallVectorImpl<const semantics::Symbol *> &outReductionSyms) const;
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 6859f5b291342..88d28cf94b045 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1667,13 +1667,15 @@ static void genSimdClauses(
lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
const List<Clause> &clauses, mlir::Location loc,
mlir::omp::SimdOperands &clauseOps,
- llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) {
+ llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache =
+ nullptr) {
ClauseProcessor cp(converter, semaCtx, clauses);
cp.processAligned(clauseOps);
cp.processIf(llvm::omp::Directive::OMPD_simd, clauseOps);
cp.processNontemporal(clauseOps);
cp.processOrder(clauseOps);
- cp.processReduction(loc, clauseOps, reductionSyms);
+ cp.processReduction(loc, clauseOps, reductionSyms, reductionVarCache);
cp.processSafelen(clauseOps);
cp.processSimdlen(clauseOps);
cp.processLinear(clauseOps);
@@ -1943,13 +1945,15 @@ static void genWsloopClauses(
lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
lower::StatementContext &stmtCtx, const List<Clause> &clauses,
mlir::Location loc, mlir::omp::WsloopOperands &clauseOps,
- llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) {
+ llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache =
+ nullptr) {
ClauseProcessor cp(converter, semaCtx, clauses);
cp.processAllocate(clauseOps);
cp.processNowait(clauseOps);
cp.processOrder(clauseOps);
cp.processOrdered(clauseOps);
- cp.processReduction(loc, clauseOps, reductionSyms);
+ cp.processReduction(loc, clauseOps, reductionSyms, reductionVarCache);
cp.processSchedule(stmtCtx, clauseOps);
cp.processLinear(clauseOps);
}
@@ -3505,6 +3509,10 @@ static mlir::omp::DistributeOp genCompositeDistributeParallelDoSimd(
/*isComposite=*/true);
// Clause processing.
+ // Use a shared cache so that both wsloop and simd produce the same SSA
+ // values for array/box reduction variables. See genCompositeDoSimd.
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> reductionVarCache;
+
mlir::omp::DistributeOperands distributeClauseOps;
genDistributeClauses(converter, semaCtx, stmtCtx, distributeItem->clauses,
loc, distributeClauseOps);
@@ -3512,12 +3520,12 @@ static mlir::omp::DistributeOp genCompositeDistributeParallelDoSimd(
mlir::omp::WsloopOperands wsloopClauseOps;
llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
genWsloopClauses(converter, semaCtx, stmtCtx, doItem->clauses, loc,
- wsloopClauseOps, wsloopReductionSyms);
+ wsloopClauseOps, wsloopReductionSyms, &reductionVarCache);
mlir::omp::SimdOperands simdClauseOps;
llvm::SmallVector<const semantics::Symbol *> simdReductionSyms;
genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps,
- simdReductionSyms);
+ simdReductionSyms, &reductionVarCache);
DataSharingProcessor simdItemDSP(converter, semaCtx, simdItem->clauses, eval,
/*shouldCollectPreDeterminedSymbols=*/true,
@@ -3638,15 +3646,21 @@ static mlir::omp::WsloopOp genCompositeDoSimd(
ConstructQueue::const_iterator simdItem = std::next(doItem);
// Clause processing.
+ // Use a shared cache so that both wsloop and simd produce the same SSA
+ // values for array/box reduction variables, enabling genLoopVars()'s
+ // IRMapping to correctly chain the inner wrapper's operands to the outer
+ // wrapper's block arguments.
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> reductionVarCache;
+
mlir::omp::WsloopOperands wsloopClauseOps;
llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
genWsloopClauses(converter, semaCtx, stmtCtx, doItem->clauses, loc,
- wsloopClauseOps, wsloopReductionSyms);
+ wsloopClauseOps, wsloopReductionSyms, &reductionVarCache);
mlir::omp::SimdOperands simdClauseOps;
llvm::SmallVector<const semantics::Symbol *> simdReductionSyms;
genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps,
- simdReductionSyms);
+ simdReductionSyms, &reductionVarCache);
DataSharingProcessor wsloopItemDSP(
converter, semaCtx, doItem->clauses, eval,
diff --git a/flang/lib/Lower/Support/ReductionProcessor.cpp b/flang/lib/Lower/Support/ReductionProcessor.cpp
index 2aa040e4b0ca8..d5387f7a59118 100644
--- a/flang/lib/Lower/Support/ReductionProcessor.cpp
+++ b/flang/lib/Lower/Support/ReductionProcessor.cpp
@@ -46,7 +46,8 @@ template bool ReductionProcessor::processReductionArguments<
llvm::SmallVectorImpl<mlir::Value> &reductionVars,
llvm::SmallVectorImpl<bool> &reduceVarByRef,
llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
- const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols);
+ const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache);
template bool ReductionProcessor::processReductionArguments<
fir::DeclareReductionOp, llvm::SmallVector<fir::ReduceOperationEnum>>(
@@ -55,7 +56,8 @@ template bool ReductionProcessor::processReductionArguments<
llvm::SmallVectorImpl<mlir::Value> &reductionVars,
llvm::SmallVectorImpl<bool> &reduceVarByRef,
llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
- const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols);
+ const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache);
template mlir::omp::DeclareReductionOp
ReductionProcessor::createDeclareReduction<mlir::omp::DeclareReductionOp>(
@@ -658,7 +660,8 @@ bool ReductionProcessor::processReductionArguments(
llvm::SmallVectorImpl<mlir::Value> &reductionVars,
llvm::SmallVectorImpl<bool> &reduceVarByRef,
llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
- const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols) {
+ const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+ llvm::DenseMap<const semantics::Symbol *, mlir::Value> *reductionVarCache) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
if constexpr (std::is_same_v<RedOperatorListTy,
@@ -701,6 +704,21 @@ bool ReductionProcessor::processReductionArguments(
}
for (const semantics::Symbol *symbol : reductionSymbols) {
+ // If a cached reduction variable exists for this symbol, reuse it.
+ // This ensures that composite constructs (e.g. DO SIMD) where both
+ // the outer wrapper (wsloop) and inner wrapper (simd) process the same
+ // reduction clause share the same SSA value, enabling genLoopVars()'s
+ // IRMapping to correctly remap inner wrapper operands to outer wrapper
+ // block arguments.
+ if (reductionVarCache) {
+ auto it = reductionVarCache->find(symbol);
+ if (it != reductionVarCache->end()) {
+ reductionVars.push_back(it->second);
+ reduceVarByRef.push_back(doReductionByRef(it->second));
+ continue;
+ }
+ }
+
mlir::Value symVal = converter.getSymbolAddress(*symbol);
if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
@@ -753,7 +771,12 @@ bool ReductionProcessor::processReductionArguments(
reductionVars.push_back(
builder.createConvert(currentLocation, refTy, symVal));
- reduceVarByRef.push_back(doReductionByRef(symVal));
+ reduceVarByRef.push_back(doReductionByRef(reductionVars.back()));
+
+ // Cache the final SSA value for this symbol so that subsequent calls
+ // (e.g. for the inner wrapper in a composite construct) reuse it.
+ if (reductionVarCache)
+ reductionVarCache->try_emplace(symbol, reductionVars.back());
}
unsigned idx = 0;
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 7bff7de2f8fbf..40f81bd3abcdc 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -946,11 +946,6 @@ void OmpStructureChecker::Enter(const parser::OpenMPConstruct &x) {
}
void OmpStructureChecker::Leave(const parser::OpenMPConstruct &x) {
- for (const auto &[sym, source] : deferredNonVariables_) {
- context_.SayWithDecl(
- *sym, source, "'%s' must be a variable"_err_en_US, sym->name());
- }
- deferredNonVariables_.clear();
if (GetOmpDirectiveName(x).v != llvm::omp::Directive::OMPD_section) {
dirStack_.pop_back();
}
@@ -3692,7 +3687,8 @@ void OmpStructureChecker::Enter(const parser::OmpClause &x) {
for (const auto &[symbol, source] : symbols) {
if (!IsVariableListItem(*symbol) &&
!(IsNamedConstant(*symbol) && SharedOrFirstprivate)) {
- deferredNonVariables_.insert({symbol, source});
+ context_.SayWithDecl(*symbol, source,
+ "'%s' must be a variable"_err_en_US, symbol->name());
}
}
}
diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h
index e75bb5da847a9..142602c1f1118 100644
--- a/flang/lib/Semantics/check-omp-structure.h
+++ b/flang/lib/Semantics/check-omp-structure.h
@@ -420,7 +420,6 @@ class OmpStructureChecker : public OmpStructureCheckerBase {
int allocateDirectiveLevel_{0};
parser::CharBlock visitedAtomicSource_;
- SymbolSourceMap deferredNonVariables_;
// Stack of nested DO loops and OpenMP constructs.
// This is used to verify DO loop nest for DOACROSS, and branches into
diff --git a/flang/test/Lower/OpenMP/wsloop-simd.f90 b/flang/test/Lower/OpenMP/wsloop-simd.f90
index 03e35de04cace..987bc804a9b44 100644
--- a/flang/test/Lower/OpenMP/wsloop-simd.f90
+++ b/flang/test/Lower/OpenMP/wsloop-simd.f90
@@ -67,6 +67,25 @@ subroutine do_simd_reduction()
!$omp end do simd
end subroutine do_simd_reduction
+! Verify that the simd reduction var references the wsloop's reduction
+! block arg (not the original array), ensuring proper chaining of
+! per-SIMD-lane results into the wsloop's thread-private reduction copy.
+! CHECK-LABEL: {{.*}}do_simd_array_reduction{{.*}}
+subroutine do_simd_array_reduction()
+ integer :: a(100)
+ a = 0
+ ! CHECK: omp.wsloop
+ ! CHECK-SAME: reduction(byref @[[ADD_RED_SYM:.*]] %{{.*}} -> %[[ADD_RED_OUTER:.*]] : !fir.ref<!fir.box<!fir.array<100xi32>>>)
+ ! CHECK-NEXT: omp.simd
+ ! CHECK-SAME: reduction(byref @[[ADD_RED_SYM]] %[[ADD_RED_OUTER]] -> %[[ADD_RED_INNER:.*]] : !fir.ref<!fir.box<!fir.array<100xi32>>>)
+ ! CHECK-NEXT: omp.loop_nest
+ !$omp do simd reduction(+:a)
+ do index_ = 1, 10
+ a = a + index_
+ end do
+ !$omp end do simd
+end subroutine do_simd_array_reduction
+
! CHECK-LABEL: func.func @_QPdo_simd_private(
subroutine do_simd_private()
integer, allocatable :: tmp
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index d1c1d9496af67..3f9e81d8f990e 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -1265,6 +1265,8 @@ if(LLVM_LIBC_FULL_BUILD)
# ucontext.h entrypoints
libc.src.ucontext.getcontext
libc.src.ucontext.setcontext
+ libc.src.ucontext.makecontext
+ libc.src.ucontext.swapcontext
# stdio.h entrypoints
libc.src.stdio.clearerr
diff --git a/libc/docs/CMakeLists.txt b/libc/docs/CMakeLists.txt
index a709ee175a246..9162976d0744d 100644
--- a/libc/docs/CMakeLists.txt
+++ b/libc/docs/CMakeLists.txt
@@ -54,6 +54,7 @@ if (SPHINX_FOUND)
glob
inttypes
locale
+ nl_types
net/if
netinet/in
poll
diff --git a/libc/docs/headers/index.rst b/libc/docs/headers/index.rst
index 8ae0843984255..e7a4f6b701de2 100644
--- a/libc/docs/headers/index.rst
+++ b/libc/docs/headers/index.rst
@@ -23,6 +23,7 @@ Implementation Status
math/index.rst
net/if
netinet/in
+ nl_types
poll
search
setjmp
diff --git a/libc/src/ucontext/CMakeLists.txt b/libc/src/ucontext/CMakeLists.txt
index d8316abf508a3..67201bad898f5 100644
--- a/libc/src/ucontext/CMakeLists.txt
+++ b/libc/src/ucontext/CMakeLists.txt
@@ -16,3 +16,16 @@ add_entrypoint_object(
.${LIBC_TARGET_ARCHITECTURE}.setcontext
)
+add_entrypoint_object(
+ makecontext
+ ALIAS
+ DEPENDS
+ .${LIBC_TARGET_ARCHITECTURE}.makecontext
+)
+
+add_entrypoint_object(
+ swapcontext
+ ALIAS
+ DEPENDS
+ .${LIBC_TARGET_ARCHITECTURE}.swapcontext
+)
diff --git a/libc/src/ucontext/getcontext.h b/libc/src/ucontext/getcontext.h
index 00efbe04717bd..804feed18f539 100644
--- a/libc/src/ucontext/getcontext.h
+++ b/libc/src/ucontext/getcontext.h
@@ -1,4 +1,5 @@
-//===-- Implementation header for getcontext --------------------*- C++ -*-===//
+//===-- Implementation header for getcontext ----------------------*- C++
+//-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -14,7 +15,7 @@
namespace LIBC_NAMESPACE_DECL {
-int getcontext(ucontext_t *ucp) noexcept;
+int getcontext(ucontext_t *ucp);
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ucontext/makecontext.h b/libc/src/ucontext/makecontext.h
new file mode 100644
index 0000000000000..ebf4e48b8d7d9
--- /dev/null
+++ b/libc/src/ucontext/makecontext.h
@@ -0,0 +1,22 @@
+//===-- Implementation header for makecontext ---------------------*- C++
+//-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_UCONTEXT_MAKECONTEXT_H
+#define LLVM_LIBC_SRC_UCONTEXT_MAKECONTEXT_H
+
+#include "include/llvm-libc-types/ucontext_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+void makecontext(ucontext_t *ucp, void (*func)(void), int argc, ...);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_UCONTEXT_MAKECONTEXT_H
diff --git a/libc/src/ucontext/setcontext.h b/libc/src/ucontext/setcontext.h
index 86a25b3a41245..c42feb1fe24f3 100644
--- a/libc/src/ucontext/setcontext.h
+++ b/libc/src/ucontext/setcontext.h
@@ -1,4 +1,5 @@
-//===-- Implementation header for setcontext --------------------*- C++ -*-===//
+//===-- Implementation header for setcontext ----------------------*- C++
+//-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -14,7 +15,7 @@
namespace LIBC_NAMESPACE_DECL {
-int setcontext(const ucontext_t *ucp) noexcept;
+int setcontext(const ucontext_t *ucp);
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ucontext/swapcontext.h b/libc/src/ucontext/swapcontext.h
new file mode 100644
index 0000000000000..c35490aed6ffc
--- /dev/null
+++ b/libc/src/ucontext/swapcontext.h
@@ -0,0 +1,22 @@
+//===-- Implementation header for swapcontext ---------------------*- C++
+//-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_UCONTEXT_SWAPCONTEXT_H
+#define LLVM_LIBC_SRC_UCONTEXT_SWAPCONTEXT_H
+
+#include "include/llvm-libc-types/ucontext_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int swapcontext(ucontext_t *__restrict oucp, const ucontext_t *__restrict ucp);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_UCONTEXT_SWAPCONTEXT_H
diff --git a/libc/src/ucontext/x86_64/CMakeLists.txt b/libc/src/ucontext/x86_64/CMakeLists.txt
index 64b35296f36d3..2711738905b77 100644
--- a/libc/src/ucontext/x86_64/CMakeLists.txt
+++ b/libc/src/ucontext/x86_64/CMakeLists.txt
@@ -26,3 +26,38 @@ add_entrypoint_object(
libc.hdr.types.size_t
)
+add_entrypoint_object(
+ makecontext
+ SRCS
+ makecontext.cpp
+ HDRS
+ ../makecontext.h
+ COMPILE_OPTIONS
+ -O3
+ -fno-omit-frame-pointer
+ DEPENDS
+ libc.include.llvm-libc-types.ucontext_t
+ libc.src.__support.common
+ libc.src.__support.macros.config
+ libc.src.__support.OSUtil.osutil
+ .setcontext
+ libc.hdr.types.size_t
+ libc.hdr.stdint_proxy
+)
+
+add_entrypoint_object(
+ swapcontext
+ SRCS
+ swapcontext.cpp
+ HDRS
+ ../swapcontext.h
+ COMPILE_OPTIONS
+ -O3
+ -fno-omit-frame-pointer
+ DEPENDS
+ libc.include.llvm-libc-types.ucontext_t
+ libc.include.sys_syscall
+ libc.src.__support.common
+ libc.src.__support.macros.config
+ libc.hdr.types.size_t
+)
diff --git a/libc/src/ucontext/x86_64/getcontext.cpp b/libc/src/ucontext/x86_64/getcontext.cpp
index 26fce6447ffe8..24be58d3b045d 100644
--- a/libc/src/ucontext/x86_64/getcontext.cpp
+++ b/libc/src/ucontext/x86_64/getcontext.cpp
@@ -12,15 +12,17 @@
#include "src/__support/macros/config.h"
#include "hdr/types/size_t.h"
-#include "include/llvm-libc-macros/signal-macros.h"
#include <sys/syscall.h>
+// We define these locally so we don't depend on system headers.
+// These are the standard sysV x86_64 ABI indices for gregset_t.
+
namespace LIBC_NAMESPACE_DECL {
// We use naked because we need to capture the exact register state
// at the moment of the function call, avoiding any compiler prologue/epilogue.
-__attribute__((naked)) LLVM_LIBC_FUNCTION(int, getcontext,
- (ucontext_t * ucp)) noexcept {
+[[gnu::naked]]
+LLVM_LIBC_FUNCTION(int, getcontext, (ucontext_t * ucp)) {
asm(R"(
# ucp is in rdi
@@ -61,7 +63,7 @@ __attribute__((naked)) LLVM_LIBC_FUNCTION(int, getcontext,
# rt_sigprocmask(SIG_BLOCK, NULL, &ucp->uc_sigmask, sizeof(sigset_t))
leaq %c[sigmask](%%rdi), %%rdx # oldset = &ucp->uc_sigmask
xorq %%rsi, %%rsi # set = NULL
- movq $%c[sig_block], %%rdi # SIG_BLOCK (captured mask in oldset)
+ movq $0, %%rdi # SIG_BLOCK (captured mask in oldset)
movq $%c[sigset_size], %%r10
movq $%c[syscall_num], %%rax
syscall
@@ -72,7 +74,7 @@ __attribute__((naked)) LLVM_LIBC_FUNCTION(int, getcontext,
retq
)" ::[ret_size] "i"(sizeof(void *)),
[sigset_size] "i"(sizeof(sigset_t)),
- [syscall_num] "i"(SYS_rt_sigprocmask), [sig_block] "i"(SIG_BLOCK),
+ [syscall_num] "i"(SYS_rt_sigprocmask),
[r8] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_R8])),
[r9] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_R9])),
[r10] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_R10])),
@@ -93,7 +95,7 @@ __attribute__((naked)) LLVM_LIBC_FUNCTION(int, getcontext,
[fpregs_mem] "i"(__builtin_offsetof(ucontext_t, __fpregs_mem)),
[fpregs_ptr] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.fpregs)),
[sigmask] "i"(__builtin_offsetof(ucontext_t, uc_sigmask))
- : "memory", "rcx", "r11", "rdi", "rsi", "rax", "r10");
+ : "memory", "rcx", "r11");
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ucontext/x86_64/makecontext.cpp b/libc/src/ucontext/x86_64/makecontext.cpp
new file mode 100644
index 0000000000000..85d6c9521f9a1
--- /dev/null
+++ b/libc/src/ucontext/x86_64/makecontext.cpp
@@ -0,0 +1,93 @@
+//===-- Implementation of makecontext for x86_64 --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/ucontext/makecontext.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+#include "hdr/stdint_proxy.h"
+#include "hdr/types/size_t.h"
+#include "include/llvm-libc-types/ucontext_t.h"
+#include <stdarg.h>
+
+#include "src/__support/OSUtil/exit.h"
+#include "src/ucontext/setcontext.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+extern "C" void __makecontext_trampoline_c(ucontext_t *uc_link) {
+ if (uc_link)
+ setcontext(uc_link);
+
+ internal::exit(0);
+}
+
+[[gnu::naked]] void __makecontext_trampoline() {
+ asm(R"(
+ mov %rbx, %rdi
+ call __makecontext_trampoline_c
+ hlt
+ )");
+}
+
+LLVM_LIBC_FUNCTION(void, makecontext,
+ (ucontext_t * ucp, void (*func)(void), int argc, ...)) {
+ if (!ucp || !func)
+ return;
+
+ // System V AMD64 ABI requirements.
+ constexpr uintptr_t STACK_ALIGN_BYTES = 16;
+ constexpr uintptr_t STACK_ALIGN_MASK = ~(STACK_ALIGN_BYTES - 1);
+ constexpr int REGISTER_ARGS_COUNT = 6;
+ constexpr uintptr_t ARG_SIZE = sizeof(greg_t);
+
+ uintptr_t stack_top =
+ reinterpret_cast<uintptr_t>(ucp->uc_stack.ss_sp) + ucp->uc_stack.ss_size;
+ stack_top &= STACK_ALIGN_MASK;
+
+ int stack_args = argc > REGISTER_ARGS_COUNT ? argc - REGISTER_ARGS_COUNT : 0;
+
+ uintptr_t new_rsp = stack_top - stack_args * ARG_SIZE;
+ new_rsp &= STACK_ALIGN_MASK;
+
+ // The System V ABI requires the stack to be 16-byte aligned before the 'call'
+ // instruction. When a function is entered, the return address has been
+ // pushed, making the stack misaligned by 8. We simulate this state by
+ // subtracting 8, storing the trampoline address at the top of the stack.
+ new_rsp -= ARG_SIZE;
+
+ greg_t *stack_area = reinterpret_cast<greg_t *>(new_rsp);
+ stack_area[0] = reinterpret_cast<greg_t>(&__makecontext_trampoline);
+
+ va_list ap;
+ va_start(ap, argc);
+ if (argc > 0)
+ ucp->uc_mcontext.gregs[REG_RDI] = va_arg(ap, greg_t);
+ if (argc > 1)
+ ucp->uc_mcontext.gregs[REG_RSI] = va_arg(ap, greg_t);
+ if (argc > 2)
+ ucp->uc_mcontext.gregs[REG_RDX] = va_arg(ap, greg_t);
+ if (argc > 3)
+ ucp->uc_mcontext.gregs[REG_RCX] = va_arg(ap, greg_t);
+ if (argc > 4)
+ ucp->uc_mcontext.gregs[REG_R8] = va_arg(ap, greg_t);
+ if (argc > 5)
+ ucp->uc_mcontext.gregs[REG_R9] = va_arg(ap, greg_t);
+
+ for (int i = 0; i < stack_args; ++i) {
+ stack_area[i + 1] = va_arg(ap, greg_t);
+ }
+
+ va_end(ap);
+
+ ucp->uc_mcontext.gregs[REG_RIP] = reinterpret_cast<greg_t>(func);
+ ucp->uc_mcontext.gregs[REG_RSP] = new_rsp;
+ ucp->uc_mcontext.gregs[REG_RBX] = reinterpret_cast<greg_t>(ucp->uc_link);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ucontext/x86_64/setcontext.cpp b/libc/src/ucontext/x86_64/setcontext.cpp
index 408298bed3d35..2a779e3dc9a7a 100644
--- a/libc/src/ucontext/x86_64/setcontext.cpp
+++ b/libc/src/ucontext/x86_64/setcontext.cpp
@@ -12,32 +12,22 @@
#include "src/__support/macros/config.h"
#include "hdr/types/size_t.h"
-#include "include/llvm-libc-macros/signal-macros.h"
#include <sys/syscall.h>
namespace LIBC_NAMESPACE_DECL {
-__attribute__((naked)) LLVM_LIBC_FUNCTION(int, setcontext,
- (const ucontext_t *ucp)) noexcept {
+[[gnu::naked]]
+LLVM_LIBC_FUNCTION(int, setcontext, (const ucontext_t *ucp)) {
asm(R"(
# ucp is in rdi
# Restore the signal mask using rt_sigprocmask syscall.
# rt_sigprocmask(SIG_SETMASK, &ucp->uc_sigmask, NULL, sizeof(sigset_t))
- # Note: Restoring the signal mask early means that if a signal
- # arrives before the context switch is complete, it will run on
- # the old stack with the new mask. Doing this later is difficult
- # because the syscall clobbers registers.
- #
- # Note: We could avoid these stack operations by saving rdi in a
- # non-volatile register (like r12) across the syscall, since all
- # registers will be overwritten anyway. We stick to the stack for
- # simplicity and readability.
pushq %%rdi # Save ucp
leaq %c[sigmask](%%rdi), %%rsi # set = &ucp->uc_sigmask
xorq %%rdx, %%rdx # oldset = NULL
movq $%c[sigset_size], %%r10 # sigsetsize = sizeof(sigset_t)
- movq $%c[sig_setmask], %%rdi # how = SIG_SETMASK
+ movq $2, %%rdi # how = SIG_SETMASK
movq $%c[syscall_num], %%rax
syscall
popq %%rdi # Restore ucp
@@ -60,18 +50,17 @@ __attribute__((naked)) LLVM_LIBC_FUNCTION(int, setcontext,
mov %c[rax](%%rdi), %%rax
mov %c[rcx](%%rdi), %%rcx
- # Restore stack pointer
+ # Restore stack pointer and instruction pointer
mov %c[rsp](%%rdi), %%rsp
- # Push saved RIP onto the new stack to use ret later
- pushq %c[rip](%%rdi)
+ mov %c[rip](%%rdi), %%r11 # Use r11 as temp for rip
# Restore RSI and RDI last
mov %c[rsi](%%rdi), %%rsi
mov %c[rdi](%%rdi), %%rdi
- retq
+ jmpq *%%r11 # Jump to the saved instruction pointer
)" ::[sigset_size] "i"(sizeof(sigset_t)),
- [syscall_num] "i"(SYS_rt_sigprocmask), [sig_setmask] "i"(SIG_SETMASK),
+ [syscall_num] "i"(SYS_rt_sigprocmask),
[r8] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_R8])),
[r9] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_R9])),
[r10] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_R10])),
diff --git a/libc/src/ucontext/x86_64/swapcontext.cpp b/libc/src/ucontext/x86_64/swapcontext.cpp
new file mode 100644
index 0000000000000..4048751067ac3
--- /dev/null
+++ b/libc/src/ucontext/x86_64/swapcontext.cpp
@@ -0,0 +1,127 @@
+//===-- Implementation of swapcontext for x86_64 --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/ucontext/swapcontext.h"
+#include "include/llvm-libc-types/ucontext_t.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+#include "hdr/types/size_t.h"
+#include <sys/syscall.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+[[gnu::naked]]
+LLVM_LIBC_FUNCTION(int, swapcontext,
+ (ucontext_t * oucp, const ucontext_t *ucp)) {
+ asm(R"(
+ # oucp is in rdi, ucp is in rsi
+
+ // Save current context into oucp
+ // Save general purpose registers
+ mov %%r8, %c[r8](%%rdi)
+ mov %%r9, %c[r9](%%rdi)
+ mov %%r10, %c[r10](%%rdi)
+ mov %%r11, %c[r11](%%rdi)
+ mov %%r12, %c[r12](%%rdi)
+ mov %%r13, %c[r13](%%rdi)
+ mov %%r14, %c[r14](%%rdi)
+ mov %%r15, %c[r15](%%rdi)
+ mov %%rdi, %c[rdi](%%rdi) # oucp itself
+ mov %%rsi, %c[rsi](%%rdi) # ucp
+ mov %%rbp, %c[rbp](%%rdi)
+ mov %%rbx, %c[rbx](%%rdi)
+ mov %%rdx, %c[rdx](%%rdi)
+ // swapcontext should return 0 when resumed by setcontext.
+ // So we save 0 into the RAX register of the context.
+ movq $0, %c[rax](%%rdi)
+ mov %%rcx, %c[rcx](%%rdi)
+
+ // The stack pointer before the call is rsp + sizeof(void*).
+ // The return address was pushed when this function was called.
+ // Save instruction pointer and stack pointer
+ mov (%%rsp), %%rax
+ mov %%rax, %c[rip](%%rdi)
+ lea %c[ret_size](%%rsp), %%rax
+ mov %%rax, %c[rsp](%%rdi)
+
+ // Save floating point state
+ fxsaveq %c[fpregs_mem](%%rdi)
+ // Point mcontext.fpregs to our internal FP storage
+ lea %c[fpregs_mem](%%rdi), %%rax
+ mov %%rax, %c[fpregs_ptr](%%rdi)
+
+ // Capture oucp signal mask and restore ucp signal mask atomically.
+ // rt_sigprocmask(SIG_SETMASK, &ucp->uc_sigmask, &oucp->uc_sigmask, sizeof(sigset_t))
+ // oucp is in rdi, ucp is in rsi
+ pushq %%rdi # Save oucp
+ pushq %%rsi # Save ucp
+ leaq %c[sigmask](%%rdi), %%rdx # oldset = &oucp->uc_sigmask
+ leaq %c[sigmask](%%rsi), %%rsi # set = &ucp->uc_sigmask
+ movq $%c[sigset_size], %%r10 # sigsetsize = sizeof(sigset_t)
+ movq $2, %%rdi # how = SIG_SETMASK
+ movq $%c[syscall_num], %%rax
+ syscall
+ popq %%rsi # Restore ucp (new context)
+ popq %%rdi # Restore oucp (old context - not needed but for clean stack)
+
+ // Restore context from ucp (now in rsi)
+ // Restore floating point state
+ fxrstorq %c[fpregs_mem](%%rsi)
+
+ // Restore general purpose registers EXCEPT rdi, rsi, rsp, rip
+ mov %c[r8](%%rsi), %%r8
+ mov %c[r9](%%rsi), %%r9
+ mov %c[r10](%%rsi), %%r10
+ mov %c[r11](%%rsi), %%r11
+ mov %c[r12](%%rsi), %%r12
+ mov %c[r13](%%rsi), %%r13
+ mov %c[r14](%%rsi), %%r14
+ mov %c[r15](%%rsi), %%r15
+ mov %c[rbp](%%rsi), %%rbp
+ mov %c[rbx](%%rsi), %%rbx
+ mov %c[rdx](%%rsi), %%rdx
+ mov %c[rax](%%rsi), %%rax
+ mov %c[rcx](%%rsi), %%rcx
+
+ // Restore stack pointer and instruction pointer
+ mov %c[rsp](%%rsi), %%rsp
+ mov %c[rip](%%rsi), %%r11 # Use r11 as temp for rip
+
+ // Restore RSI and RDI last
+ mov %c[rdi](%%rsi), %%rdi
+ mov %c[rsi](%%rsi), %%rsi
+
+ jmpq *%%r11 # Jump to the saved instruction pointer
+ )" ::[ret_size] "i"(sizeof(void *)),
+ [sigset_size] "i"(sizeof(sigset_t)),
+ [syscall_num] "i"(SYS_rt_sigprocmask),
+ [r8] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_R8])),
+ [r9] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_R9])),
+ [r10] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_R10])),
+ [r11] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_R11])),
+ [r12] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_R12])),
+ [r13] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_R13])),
+ [r14] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_R14])),
+ [r15] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_R15])),
+ [rdi] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_RDI])),
+ [rsi] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_RSI])),
+ [rbp] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_RBP])),
+ [rbx] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_RBX])),
+ [rdx] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_RDX])),
+ [rax] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_RAX])),
+ [rcx] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_RCX])),
+ [rsp] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_RSP])),
+ [rip] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.gregs[REG_RIP])),
+ [fpregs_mem] "i"(__builtin_offsetof(ucontext_t, __fpregs_mem)),
+ [fpregs_ptr] "i"(__builtin_offsetof(ucontext_t, uc_mcontext.fpregs)),
+ [sigmask] "i"(__builtin_offsetof(ucontext_t, uc_sigmask))
+ : "memory", "rcx", "r11");
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/src/ucontext/CMakeLists.txt b/libc/test/src/ucontext/CMakeLists.txt
index 4c8444393bd35..8dae7050e1f86 100644
--- a/libc/test/src/ucontext/CMakeLists.txt
+++ b/libc/test/src/ucontext/CMakeLists.txt
@@ -10,6 +10,8 @@ if(TARGET libc.src.ucontext.getcontext)
DEPENDS
libc.src.ucontext.getcontext
libc.src.ucontext.setcontext
+ libc.src.ucontext.makecontext
+ libc.src.ucontext.swapcontext
libc.src.signal.sigemptyset
libc.src.signal.sigaddset
libc.src.signal.sigprocmask
diff --git a/libc/test/src/ucontext/ucontext_test.cpp b/libc/test/src/ucontext/ucontext_test.cpp
index 1983c2f1ec5b2..f94195e37057c 100644
--- a/libc/test/src/ucontext/ucontext_test.cpp
+++ b/libc/test/src/ucontext/ucontext_test.cpp
@@ -7,7 +7,9 @@
//===----------------------------------------------------------------------===//
#include "src/ucontext/getcontext.h"
+#include "src/ucontext/makecontext.h"
#include "src/ucontext/setcontext.h"
+#include "src/ucontext/swapcontext.h"
#include "src/signal/sigaddset.h"
#include "src/signal/sigemptyset.h"
@@ -20,7 +22,6 @@
namespace LIBC_NAMESPACE {
static bool is_signal_set(const sigset_t *set, int signum) {
- // TODO: Replace this with sigismember once it is implemented.
// NSIG is 64, sigset_t is an array of unsigned long.
// Signum is 1-indexed.
int word = (signum - 1) / (sizeof(unsigned long) * 8);
@@ -28,8 +29,8 @@ static bool is_signal_set(const sigset_t *set, int signum) {
return (set->__signals[word] & (1UL << bit)) != 0;
}
+volatile int jumped = 0;
TEST(LlvmLibcUcontextTest, BasicStubTest) {
- static volatile int jumped = 0;
ucontext_t ctx;
ASSERT_EQ(getcontext(&ctx), 0);
if (!jumped) {
@@ -39,6 +40,92 @@ TEST(LlvmLibcUcontextTest, BasicStubTest) {
}
}
+ucontext_t old_ctx, new_ctx;
+volatile int swap_called = 0;
+
+void swap_func() {
+ swap_called = 1;
+ setcontext(&old_ctx);
+}
+
+TEST(LlvmLibcUcontextTest, SwapcontextTest) {
+ getcontext(&new_ctx);
+ constexpr size_t STACK_SIZE = 8192;
+ char stack[STACK_SIZE];
+ new_ctx.uc_stack.ss_sp = stack;
+ new_ctx.uc_stack.ss_size = sizeof(stack);
+ makecontext(&new_ctx, swap_func, 0);
+
+ swapcontext(&old_ctx, &new_ctx);
+
+ ASSERT_EQ(swap_called, 1);
+}
+
+ucontext_t old_ctx_args, new_ctx_args;
+volatile int makecontext_args_called = 0;
+volatile int arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8;
+
+void args_func(int a1, int a2, int a3, int a4, int a5, int a6, int a7, int a8) {
+ makecontext_args_called = 1;
+ arg1 = a1;
+ arg2 = a2;
+ arg3 = a3;
+ arg4 = a4;
+ arg5 = a5;
+ arg6 = a6;
+ arg7 = a7;
+ arg8 = a8;
+ setcontext(&old_ctx_args);
+}
+
+TEST(LlvmLibcUcontextTest, MakecontextArgsTest) {
+ getcontext(&new_ctx_args);
+ constexpr size_t STACK_SIZE = 8192;
+ char stack[STACK_SIZE];
+ new_ctx_args.uc_stack.ss_sp = stack;
+ new_ctx_args.uc_stack.ss_size = sizeof(stack);
+
+ // Cast function pointer to void(*)(void) as required by makecontext
+ using func_t = void (*)(void);
+ auto func = reinterpret_cast<func_t>(args_func);
+ makecontext(&new_ctx_args, func, 8, 11, 22, 33, 44, 55, 66, 77, 88);
+ // args_func records all eight arguments, then resumes us via setcontext.
+ swapcontext(&old_ctx_args, &new_ctx_args);
+
+ ASSERT_EQ(makecontext_args_called, 1);
+ ASSERT_EQ(arg1, 11);
+ ASSERT_EQ(arg2, 22);
+ ASSERT_EQ(arg3, 33);
+ ASSERT_EQ(arg4, 44);
+ ASSERT_EQ(arg5, 55);
+ ASSERT_EQ(arg6, 66);
+ ASSERT_EQ(arg7, 77);
+ ASSERT_EQ(arg8, 88);
+}
+
+ucontext_t old_ctx_return, new_ctx_return;
+volatile int makecontext_return_called = 0;
+
+void return_func() { makecontext_return_called = 1; }
+
+TEST(LlvmLibcUcontextTest, MakecontextReturnTest) {
+ getcontext(&new_ctx_return);
+ constexpr size_t STACK_SIZE = 8192;
+ char stack[STACK_SIZE];
+ new_ctx_return.uc_stack.ss_sp = stack;
+ new_ctx_return.uc_stack.ss_size = sizeof(stack);
+ new_ctx_return.uc_link = &old_ctx_return;
+
+ using func_t = void (*)(void);
+ auto func = reinterpret_cast<func_t>(return_func);
+
+ makecontext(&new_ctx_return, func, 0);
+
+ swapcontext(&old_ctx_return, &new_ctx_return);
+
+ ASSERT_EQ(makecontext_return_called, 1);
+}
+
TEST(LlvmLibcUcontextTest, SignalMaskTest) {
sigset_t set, old_set;
sigemptyset(&set);
diff --git a/libc/utils/docgen/nl_types.yaml b/libc/utils/docgen/nl_types.yaml
new file mode 100644
index 0000000000000..97fbcddfc9948
--- /dev/null
+++ b/libc/utils/docgen/nl_types.yaml
@@ -0,0 +1,13 @@
+functions:
+ catclose:
+ in-latest-posix: ""
+ catgets:
+ in-latest-posix: ""
+ catopen:
+ in-latest-posix: ""
+
+macros:
+ NL_CAT_LOCALE:
+ in-latest-posix: ""
+ NL_SETD:
+ in-latest-posix: ""
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 6e4c385c72edb..ac7a83f3ae998 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -12240,16 +12240,51 @@ bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result,
SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const {
SDLoc dl(Node);
- unsigned BaseOpcode = ISD::getVecReduceBaseOpcode(Node->getOpcode());
+ ISD::NodeType BaseOpcode = ISD::getVecReduceBaseOpcode(Node->getOpcode());
SDValue Op = Node->getOperand(0);
EVT VT = Op.getValueType();
// Try to use a shuffle reduction for power of two vectors.
if (VT.isPow2VectorType()) {
+ // See if the reduction opcode is safe to use with widened types.
+ bool WidenSrc = false;
+ switch (Node->getOpcode()) {
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ WidenSrc = VT.isFixedLengthVector();
+ break;
+ }
+
while (VT.getVectorElementCount().isKnownMultipleOf(2)) {
EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
- if (!isOperationLegalOrCustom(BaseOpcode, HalfVT))
+ if (!isOperationLegalOrCustom(BaseOpcode, HalfVT)) {
+ if (WidenSrc && Op.getOpcode() != ISD::BUILD_VECTOR) {
+ // Attempt to widen the source vectors to a legal op.
+ EVT WideVT = getTypeToTransformTo(*DAG.getContext(), HalfVT);
+ if (WideVT.isVector() &&
+ WideVT.getScalarType() == HalfVT.getScalarType() &&
+ isOperationLegalOrCustom(BaseOpcode, WideVT)) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Op, dl);
+ Lo = DAG.getInsertSubvector(dl, DAG.getPOISON(WideVT), Lo, 0);
+ Hi = DAG.getInsertSubvector(dl, DAG.getPOISON(WideVT), Hi, 0);
+ Op = DAG.getNode(BaseOpcode, dl, WideVT, Lo, Hi, Node->getFlags());
+ Op = DAG.getExtractSubvector(dl, HalfVT, Op, 0);
+ VT = HalfVT;
+ continue;
+ }
+ }
break;
+ }
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(Op, dl);
diff --git a/llvm/lib/ProfileData/CMakeLists.txt b/llvm/lib/ProfileData/CMakeLists.txt
index 08e5e760252e1..131f0934fb05b 100644
--- a/llvm/lib/ProfileData/CMakeLists.txt
+++ b/llvm/lib/ProfileData/CMakeLists.txt
@@ -40,7 +40,7 @@ add_llvm_component_library(LLVMProfileData
TargetParser
)
-set(LLVM_ENABLE_OPENCSD "OFF" CACHE STRING "Enable OpenCSD support in LLVMProfileData")
+set(LLVM_ENABLE_OPENCSD "OFF" CACHE STRING "Use OpenCSD if available. Can be ON, OFF, or FORCE_ON")
set(LLVM_HAVE_OPENCSD OFF CACHE BOOL "Is OpenCSD available" FORCE)
@@ -76,7 +76,7 @@ if(NOT LLVM_ENABLE_OPENCSD STREQUAL "OFF")
message(STATUS "LLVMProfileData: OpenCSD support enabled (found ${OPENCSD_VERSION}).")
set(LLVM_HAVE_OPENCSD ON CACHE BOOL "Is OpenCSD available" FORCE)
elseif(OPENCSD_LIB AND OPENCSD_INCLUDE)
- if(LLVM_ENABLE_OPENCSD STREQUAL "ON")
+ if(LLVM_ENABLE_OPENCSD STREQUAL "FORCE_ON")
message(FATAL_ERROR
"OpenCSD found at ${OPENCSD_LIB} but its version "
"(${OPENCSD_VERSION}) is older than the required "
@@ -88,7 +88,7 @@ if(NOT LLVM_ENABLE_OPENCSD STREQUAL "OFF")
"${LLVM_OPENCSD_MIN_VERSION}; ETM decoding support disabled.")
endif()
else()
- if(LLVM_ENABLE_OPENCSD STREQUAL "ON")
+ if(LLVM_ENABLE_OPENCSD STREQUAL "FORCE_ON")
message(FATAL_ERROR "OpenCSD enabled but library or headers not found.")
else()
message(STATUS "LLVMProfileData: OpenCSD not found; ETM decoding support disabled.")
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll
index fecc6554667fa..954e8e422b55f 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll
@@ -25,13 +25,20 @@ declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)
declare i128 @llvm.vector.reduce.mul.v2i128(<2 x i128>)
define i8 @mulv_v2i8(<2 x i8> %a) {
-; CHECK-LABEL: mulv_v2i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: mul w0, w9, w8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: mulv_v2i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: mul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: mulv_v2i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: fmov w9, s0
+; CHECK-GI-NEXT: mul w0, w9, w8
+; CHECK-GI-NEXT: ret
entry:
%arg1 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> %a)
ret i8 %arg1
@@ -221,13 +228,20 @@ entry:
}
define i16 @mulv_v2i16(<2 x i16> %a) {
-; CHECK-LABEL: mulv_v2i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: mul w0, w9, w8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: mulv_v2i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: mul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: mulv_v2i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: fmov w9, s0
+; CHECK-GI-NEXT: mul w0, w9, w8
+; CHECK-GI-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> %a)
ret i16 %arg1
@@ -354,13 +368,20 @@ entry:
}
define i32 @mulv_v2i32(<2 x i32> %a) {
-; CHECK-LABEL: mulv_v2i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: mul w0, w9, w8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: mulv_v2i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: mul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: mulv_v2i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: fmov w9, s0
+; CHECK-GI-NEXT: mul w0, w9, w8
+; CHECK-GI-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %a)
ret i32 %arg1
@@ -374,9 +395,8 @@ define i32 @mulv_v3i32(<3 x i32> %a) {
; CHECK-NEXT: mov v1.s[3], w8
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-NEXT: mul v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: mul w0, w9, w8
+; CHECK-NEXT: mul v0.2s, v0.2s, v0.s[1]
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.mul.v3i32(<3 x i32> %a)
@@ -388,9 +408,8 @@ define i32 @mulv_v4i32(<4 x i32> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: mul w0, w9, w8
+; CHECK-SD-NEXT: mul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mulv_v4i32:
@@ -412,9 +431,8 @@ define i32 @mulv_v8i32(<8 x i32> %a) {
; CHECK-SD-NEXT: mul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: mul w0, w9, w8
+; CHECK-SD-NEXT: mul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mulv_v8i32:
diff --git a/llvm/test/CodeGen/AArch64/double_reduct.ll b/llvm/test/CodeGen/AArch64/double_reduct.ll
index e231b91f1199b..3172735481d70 100644
--- a/llvm/test/CodeGen/AArch64/double_reduct.ll
+++ b/llvm/test/CodeGen/AArch64/double_reduct.ll
@@ -55,7 +55,8 @@ define float @fmul_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fmul_f32:
@@ -83,7 +84,8 @@ define float @fmul_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fmul_f32_same:
@@ -378,9 +380,8 @@ define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-NEXT: mul v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: mul w0, w9, w8
+; CHECK-SD-NEXT: mul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mul_i32:
@@ -412,9 +413,8 @@ define i32 @mul_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-NEXT: mul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: mul w0, w9, w8
+; CHECK-SD-NEXT: mul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mul_i32_same:
@@ -907,8 +907,8 @@ define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d)
; CHECK-SD-NEXT: fmul s2, s2, s3
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
-; CHECK-SD-NEXT: fmul s0, s0, s2
+; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: fmul s0, s2, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_mul_f32:
@@ -1058,9 +1058,8 @@ define i32 @nested_mul_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-NEXT: mul w8, w0, w1
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w9, v0.s[1]
-; CHECK-SD-NEXT: fmov w10, s0
-; CHECK-SD-NEXT: mul w9, w10, w9
+; CHECK-SD-NEXT: mul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: mul w0, w9, w8
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
index f4825f065d316..dd633ca379941 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
@@ -8,7 +8,8 @@ define float @mul_HalfS(<2 x float> %bin.rdx) {
; CHECK-SD-LABEL: mul_HalfS:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mul_HalfS:
@@ -41,9 +42,9 @@ define half @mul_HalfH(<4 x half> %bin.rdx) {
; CHECK-SD-FP16-LABEL: mul_HalfH:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-FP16-NEXT: fmul h1, h0, v0.h[1]
-; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[2]
-; CHECK-SD-FP16-NEXT: fmul h0, h1, v0.h[3]
+; CHECK-SD-FP16-NEXT: dup v1.2s, v0.s[1]
+; CHECK-SD-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h
+; CHECK-SD-FP16-NEXT: fmul h0, h0, v0.h[1]
; CHECK-SD-FP16-NEXT: ret
;
; CHECK-GI-NOFP16-LABEL: mul_HalfH:
@@ -79,7 +80,7 @@ define half @mul_H(<8 x half> %bin.rdx) {
; CHECK-SD-NOFP16-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NOFP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-SD-NOFP16-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-SD-NOFP16-NEXT: fmul v0.2s, v0.2s, v0.s[1]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: ret
;
@@ -87,9 +88,9 @@ define half @mul_H(<8 x half> %bin.rdx) {
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h
-; CHECK-SD-FP16-NEXT: fmul h1, h0, v0.h[1]
-; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[2]
-; CHECK-SD-FP16-NEXT: fmul h0, h1, v0.h[3]
+; CHECK-SD-FP16-NEXT: dup v1.2s, v0.s[1]
+; CHECK-SD-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h
+; CHECK-SD-FP16-NEXT: fmul h0, h0, v0.h[1]
; CHECK-SD-FP16-NEXT: ret
;
; CHECK-GI-NOFP16-LABEL: mul_H:
@@ -124,7 +125,8 @@ define float @mul_S(<4 x float> %bin.rdx) {
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mul_S:
@@ -163,7 +165,7 @@ define half @mul_2H(<16 x half> %bin.rdx) {
; CHECK-SD-NOFP16-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NOFP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-SD-NOFP16-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-SD-NOFP16-NEXT: fmul v0.2s, v0.2s, v0.s[1]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: ret
;
@@ -172,9 +174,9 @@ define half @mul_2H(<16 x half> %bin.rdx) {
; CHECK-SD-FP16-NEXT: fmul v0.8h, v0.8h, v1.8h
; CHECK-SD-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h
-; CHECK-SD-FP16-NEXT: fmul h1, h0, v0.h[1]
-; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[2]
-; CHECK-SD-FP16-NEXT: fmul h0, h1, v0.h[3]
+; CHECK-SD-FP16-NEXT: dup v1.2s, v0.s[1]
+; CHECK-SD-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h
+; CHECK-SD-FP16-NEXT: fmul h0, h0, v0.h[1]
; CHECK-SD-FP16-NEXT: ret
;
; CHECK-GI-NOFP16-LABEL: mul_2H:
@@ -215,7 +217,8 @@ define float @mul_2S(<8 x float> %bin.rdx) {
; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mul_2S:
@@ -248,8 +251,8 @@ define float @mul_S_init_42(<4 x float> %bin.rdx) {
; CHECK-SD-NEXT: mov w8, #1109917696 // =0x42280000
; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: fmov s1, w8
-; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
-; CHECK-SD-NEXT: fmul s0, s0, s1
+; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: fmul s0, s1, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mul_S_init_42:
@@ -280,8 +283,8 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
; CHECK-SD-NOFP16-NEXT: ext v3.16b, v1.16b, v1.16b, #8
; CHECK-SD-NOFP16-NEXT: fmul v0.2s, v0.2s, v2.2s
; CHECK-SD-NOFP16-NEXT: fmul v1.2s, v1.2s, v3.2s
-; CHECK-SD-NOFP16-NEXT: fmul s0, s0, v0.s[1]
-; CHECK-SD-NOFP16-NEXT: fmul s1, s1, v1.s[1]
+; CHECK-SD-NOFP16-NEXT: fmul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NOFP16-NEXT: fmul v1.2s, v1.2s, v1.s[1]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
@@ -295,9 +298,9 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
; CHECK-SD-FP16-NEXT: fmul v0.8h, v0.8h, v1.8h
; CHECK-SD-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h
-; CHECK-SD-FP16-NEXT: fmul h1, h0, v0.h[1]
-; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[2]
-; CHECK-SD-FP16-NEXT: fmul h0, h1, v0.h[3]
+; CHECK-SD-FP16-NEXT: dup v1.2s, v0.s[1]
+; CHECK-SD-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h
+; CHECK-SD-FP16-NEXT: fmul h0, h0, v0.h[1]
; CHECK-SD-FP16-NEXT: ret
;
; CHECK-GI-NOFP16-LABEL: fmul_reduct_reassoc_v8f16:
@@ -358,7 +361,8 @@ define float @fmul_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) {
; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fmul_reduct_reassoc_v8f32:
@@ -387,7 +391,8 @@ define float @fmul_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32:
@@ -414,10 +419,10 @@ define float @fmul_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x floa
; CHECK-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8
; CHECK-SD-NEXT: fmul v1.2s, v1.2s, v3.2s
; CHECK-SD-NEXT: ext v3.16b, v2.16b, v2.16b, #8
-; CHECK-SD-NEXT: fmul s1, s1, v1.s[1]
+; CHECK-SD-NEXT: fmul v1.2s, v1.2s, v1.s[1]
; CHECK-SD-NEXT: fmul v2.2s, v2.2s, v3.2s
; CHECK-SD-NEXT: fmul s0, s0, s1
-; CHECK-SD-NEXT: fmul s1, s2, v2.s[1]
+; CHECK-SD-NEXT: fmul v1.2s, v2.2s, v2.s[1]
; CHECK-SD-NEXT: fmul s0, s0, s1
; CHECK-SD-NEXT: ret
;
@@ -447,7 +452,8 @@ define float @fmul_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) {
; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fmul_reduct_reassoc_v4v8f32:
@@ -499,9 +505,9 @@ define float @fmul_reduct_reassoc_v4f32_extrause(<4 x float> %a, <4 x float> %b)
; CHECK-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8
; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v2.2s
; CHECK-SD-NEXT: fmul v1.2s, v1.2s, v3.2s
-; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
-; CHECK-SD-NEXT: fmul s1, s1, v1.s[1]
-; CHECK-SD-NEXT: fmul s1, s0, s1
+; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1]
+; CHECK-SD-NEXT: fmul v1.2s, v1.2s, v1.s[1]
+; CHECK-SD-NEXT: fmul s1, s0, v1.s[0]
; CHECK-SD-NEXT: fmul s0, s1, s0
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/PowerPC/cttz-elts.ll b/llvm/test/CodeGen/PowerPC/cttz-elts.ll
index 301466657487f..d84ab409f3d9f 100644
--- a/llvm/test/CodeGen/PowerPC/cttz-elts.ll
+++ b/llvm/test/CodeGen/PowerPC/cttz-elts.ll
@@ -11,20 +11,16 @@ define i32 @v4i1(<4 x i1> %x) {
; CHECK-NEXT: vsraw 2, 2, 4
; CHECK-NEXT: lxvd2x 0, 0, 3
; CHECK-NEXT: xxswapd 35, 0
-; CHECK-NEXT: xxland 0, 34, 35
-; CHECK-NEXT: xxsldwi 1, 0, 0, 1
-; CHECK-NEXT: xxswapd 2, 0
-; CHECK-NEXT: xxsldwi 3, 0, 0, 3
-; CHECK-NEXT: mffprwz 3, 1
-; CHECK-NEXT: mffprwz 4, 2
+; CHECK-NEXT: xxland 34, 34, 35
+; CHECK-NEXT: xxmrghw 0, 34, 34
+; CHECK-NEXT: xxmrghw 35, 0, 34
+; CHECK-NEXT: vmaxuw 2, 2, 3
+; CHECK-NEXT: xxsldwi 0, 34, 34, 1
+; CHECK-NEXT: xxswapd 1, 34
+; CHECK-NEXT: mffprwz 3, 0
+; CHECK-NEXT: mffprwz 4, 1
; CHECK-NEXT: cmplw 4, 3
; CHECK-NEXT: iselgt 3, 4, 3
-; CHECK-NEXT: mffprwz 4, 0
-; CHECK-NEXT: cmplw 3, 4
-; CHECK-NEXT: iselgt 3, 3, 4
-; CHECK-NEXT: mffprwz 4, 3
-; CHECK-NEXT: cmplw 3, 4
-; CHECK-NEXT: iselgt 3, 3, 4
; CHECK-NEXT: subfic 3, 3, 4
; CHECK-NEXT: blr
%y = call i32 @llvm.experimental.cttz.elts(<4 x i1> %x, i1 false)
@@ -40,20 +36,16 @@ define i32 @v4i32(<4 x i32> %x) {
; CHECK-NEXT: vcmpgtuw 2, 2, 4
; CHECK-NEXT: lxvd2x 0, 0, 3
; CHECK-NEXT: xxswapd 35, 0
-; CHECK-NEXT: xxland 0, 34, 35
-; CHECK-NEXT: xxsldwi 1, 0, 0, 1
-; CHECK-NEXT: xxswapd 2, 0
-; CHECK-NEXT: xxsldwi 3, 0, 0, 3
-; CHECK-NEXT: mffprwz 3, 1
-; CHECK-NEXT: mffprwz 4, 2
+; CHECK-NEXT: xxland 34, 34, 35
+; CHECK-NEXT: xxmrghw 0, 34, 34
+; CHECK-NEXT: xxmrghw 35, 0, 34
+; CHECK-NEXT: vmaxuw 2, 2, 3
+; CHECK-NEXT: xxsldwi 0, 34, 34, 1
+; CHECK-NEXT: xxswapd 1, 34
+; CHECK-NEXT: mffprwz 3, 0
+; CHECK-NEXT: mffprwz 4, 1
; CHECK-NEXT: cmplw 4, 3
; CHECK-NEXT: iselgt 3, 4, 3
-; CHECK-NEXT: mffprwz 4, 0
-; CHECK-NEXT: cmplw 3, 4
-; CHECK-NEXT: iselgt 3, 3, 4
-; CHECK-NEXT: mffprwz 4, 3
-; CHECK-NEXT: cmplw 3, 4
-; CHECK-NEXT: iselgt 3, 3, 4
; CHECK-NEXT: subfic 3, 3, 4
; CHECK-NEXT: blr
%y = call i32 @llvm.experimental.cttz.elts(<4 x i32> %x, i1 false)
@@ -70,20 +62,16 @@ define i32 @v4i1_zero_is_poison(<4 x i1> %x) {
; CHECK-NEXT: vsraw 2, 2, 4
; CHECK-NEXT: lxvd2x 0, 0, 3
; CHECK-NEXT: xxswapd 35, 0
-; CHECK-NEXT: xxland 0, 34, 35
-; CHECK-NEXT: xxsldwi 1, 0, 0, 1
-; CHECK-NEXT: xxswapd 2, 0
-; CHECK-NEXT: xxsldwi 3, 0, 0, 3
-; CHECK-NEXT: mffprwz 3, 1
-; CHECK-NEXT: mffprwz 4, 2
+; CHECK-NEXT: xxland 34, 34, 35
+; CHECK-NEXT: xxmrghw 0, 34, 34
+; CHECK-NEXT: xxmrghw 35, 0, 34
+; CHECK-NEXT: vmaxuw 2, 2, 3
+; CHECK-NEXT: xxsldwi 0, 34, 34, 1
+; CHECK-NEXT: xxswapd 1, 34
+; CHECK-NEXT: mffprwz 3, 0
+; CHECK-NEXT: mffprwz 4, 1
; CHECK-NEXT: cmplw 4, 3
; CHECK-NEXT: iselgt 3, 4, 3
-; CHECK-NEXT: mffprwz 4, 0
-; CHECK-NEXT: cmplw 3, 4
-; CHECK-NEXT: iselgt 3, 3, 4
-; CHECK-NEXT: mffprwz 4, 3
-; CHECK-NEXT: cmplw 3, 4
-; CHECK-NEXT: iselgt 3, 3, 4
; CHECK-NEXT: subfic 3, 3, 4
; CHECK-NEXT: blr
%y = call i32 @llvm.experimental.cttz.elts(<4 x i1> %x, i1 true)
@@ -99,20 +87,16 @@ define i32 @v4i32_zero_is_poison(<4 x i32> %x) {
; CHECK-NEXT: vcmpgtuw 2, 2, 4
; CHECK-NEXT: lxvd2x 0, 0, 3
; CHECK-NEXT: xxswapd 35, 0
-; CHECK-NEXT: xxland 0, 34, 35
-; CHECK-NEXT: xxsldwi 1, 0, 0, 1
-; CHECK-NEXT: xxswapd 2, 0
-; CHECK-NEXT: xxsldwi 3, 0, 0, 3
-; CHECK-NEXT: mffprwz 3, 1
-; CHECK-NEXT: mffprwz 4, 2
+; CHECK-NEXT: xxland 34, 34, 35
+; CHECK-NEXT: xxmrghw 0, 34, 34
+; CHECK-NEXT: xxmrghw 35, 0, 34
+; CHECK-NEXT: vmaxuw 2, 2, 3
+; CHECK-NEXT: xxsldwi 0, 34, 34, 1
+; CHECK-NEXT: xxswapd 1, 34
+; CHECK-NEXT: mffprwz 3, 0
+; CHECK-NEXT: mffprwz 4, 1
; CHECK-NEXT: cmplw 4, 3
; CHECK-NEXT: iselgt 3, 4, 3
-; CHECK-NEXT: mffprwz 4, 0
-; CHECK-NEXT: cmplw 3, 4
-; CHECK-NEXT: iselgt 3, 3, 4
-; CHECK-NEXT: mffprwz 4, 3
-; CHECK-NEXT: cmplw 3, 4
-; CHECK-NEXT: iselgt 3, 3, 4
; CHECK-NEXT: subfic 3, 3, 4
; CHECK-NEXT: blr
%y = call i32 @llvm.experimental.cttz.elts(<4 x i32> %x, i1 true)
diff --git a/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll
index 61191fd2637e8..8159468722596 100644
--- a/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll
+++ b/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll
@@ -45,19 +45,20 @@ define i16 @ctz_v4i32(<4 x i32> %a) {
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pcmpeqd %xmm0, %xmm1
; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT: movd %xmm1, %eax
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: por %xmm0, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; CHECK-NEXT: por %xmm3, %xmm0
+; CHECK-NEXT: pcmpgtd %xmm0, %xmm2
+; CHECK-NEXT: pand %xmm2, %xmm1
+; CHECK-NEXT: pandn %xmm3, %xmm2
+; CHECK-NEXT: por %xmm1, %xmm2
+; CHECK-NEXT: movd %xmm2, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; CHECK-NEXT: movd %xmm0, %ecx
; CHECK-NEXT: cmpl %ecx, %eax
; CHECK-NEXT: cmoval %eax, %ecx
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: cmpl %eax, %ecx
-; CHECK-NEXT: cmovbel %eax, %ecx
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
-; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: cmpl %eax, %ecx
-; CHECK-NEXT: cmovbel %eax, %ecx
; CHECK-NEXT: movl $4, %eax
; CHECK-NEXT: subl %ecx, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index 01bdf0a098e7a..1839eefcd264e 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -253,12 +253,10 @@ define <8 x i32> @test_compress_v8i32(<8 x i32> %vec, <8 x i1> %mask, <8 x i32>
; AVX2-NEXT: vpsrld $31, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT: vpaddd %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $1, %xmm2, %eax
-; AVX2-NEXT: vmovd %xmm2, %ecx
-; AVX2-NEXT: addl %eax, %ecx
-; AVX2-NEXT: vpextrd $2, %xmm2, %edx
-; AVX2-NEXT: vpextrd $3, %xmm2, %eax
-; AVX2-NEXT: addl %edx, %eax
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
+; AVX2-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpextrd $1, %xmm2, %ecx
+; AVX2-NEXT: vmovd %xmm2, %eax
; AVX2-NEXT: addl %ecx, %eax
; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: vpextrd $1, %xmm3, %ecx
@@ -351,14 +349,12 @@ define <8 x float> @test_compress_v8f32(<8 x float> %vec, <8 x i1> %mask, <8 x f
; AVX2-NEXT: vpsrld $31, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
+; AVX2-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpextrd $1, %xmm2, %eax
; AVX2-NEXT: vmovd %xmm2, %ecx
; AVX2-NEXT: addl %eax, %ecx
-; AVX2-NEXT: vpextrd $2, %xmm2, %eax
-; AVX2-NEXT: vpextrd $3, %xmm2, %edx
-; AVX2-NEXT: addl %eax, %edx
-; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: andl $7, %edx
+; AVX2-NEXT: andl $7, %ecx
; AVX2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX2-NEXT: vmovss %xmm0, (%rsp)
; AVX2-NEXT: vmovd %xmm3, %eax
@@ -596,9 +592,9 @@ define <16 x i32> @test_compress_v16i32(<16 x i32> %vec, <16 x i1> %mask, <16 x
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $128, %rsp
+; AVX2-NEXT: subq $160, %rsp
; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm3, (%rsp)
+; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
@@ -606,15 +602,13 @@ define <16 x i32> @test_compress_v16i32(<16 x i32> %vec, <16 x i1> %mask, <16 x
; AVX2-NEXT: vpaddd %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpextrd $1, %xmm3, %eax
; AVX2-NEXT: vmovd %xmm3, %ecx
; AVX2-NEXT: addl %eax, %ecx
-; AVX2-NEXT: vpextrd $2, %xmm3, %eax
-; AVX2-NEXT: vpextrd $3, %xmm3, %edx
-; AVX2-NEXT: addl %eax, %edx
-; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: andl $15, %edx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vpextrb $1, %xmm2, %eax
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: vmovd %xmm2, %ecx
@@ -657,59 +651,60 @@ define <16 x i32> @test_compress_v16i32(<16 x i32> %vec, <16 x i1> %mask, <16 x
; AVX2-NEXT: vpextrb $12, %xmm2, %r13d
; AVX2-NEXT: andl $1, %r13d
; AVX2-NEXT: addq %r12, %r13
-; AVX2-NEXT: vpextrb $13, %xmm2, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %r13, %rcx
-; AVX2-NEXT: vpextrb $14, %xmm2, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vpextrb $15, %xmm2, %edx
+; AVX2-NEXT: vpextrb $13, %xmm2, %edx
; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rax, %rdx
+; AVX2-NEXT: addq %r13, %rdx
+; AVX2-NEXT: vpextrb $14, %xmm2, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: vpextrb $15, %xmm2, %edi
+; AVX2-NEXT: andl $1, %edi
+; AVX2-NEXT: addq %rcx, %rdi
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-NEXT: cmpq $16, %rdx
-; AVX2-NEXT: vextractps $3, %xmm2, %esi
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX2-NEXT: cmovbl (%rsp,%rdi,4), %esi
-; AVX2-NEXT: movl %esi, %edi
-; AVX2-NEXT: vmovss %xmm0, (%rsp)
+; AVX2-NEXT: cmpq $16, %rdi
+; AVX2-NEXT: vextractps $3, %xmm2, %eax
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rsi,4)
+; AVX2-NEXT: cmovbl 32(%rsp,%rsi,4), %eax
+; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: vmovss %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rsi,4)
+; AVX2-NEXT: vextractps $1, %xmm0, 32(%rsp,%rsi,4)
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rsi,4)
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vextractps $2, %xmm0, 32(%rsp,%rsi,4)
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX2-NEXT: vmovss %xmm0, (%rsp,%rsi,4)
+; AVX2-NEXT: vextractps $3, %xmm0, 32(%rsp,%rsi,4)
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: vmovss %xmm0, 32(%rsp,%rax,4)
; AVX2-NEXT: andl $15, %r8d
-; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%r8,4)
+; AVX2-NEXT: vextractps $1, %xmm0, 32(%rsp,%r8,4)
; AVX2-NEXT: andl $15, %r9d
-; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%r9,4)
+; AVX2-NEXT: vextractps $2, %xmm0, 32(%rsp,%r9,4)
; AVX2-NEXT: andl $15, %r10d
-; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%r10,4)
+; AVX2-NEXT: vextractps $3, %xmm0, 32(%rsp,%r10,4)
; AVX2-NEXT: andl $15, %r11d
-; AVX2-NEXT: vmovss %xmm1, (%rsp,%r11,4)
+; AVX2-NEXT: vmovss %xmm1, 32(%rsp,%r11,4)
; AVX2-NEXT: andl $15, %ebx
-; AVX2-NEXT: vextractps $1, %xmm1, (%rsp,%rbx,4)
+; AVX2-NEXT: vextractps $1, %xmm1, 32(%rsp,%rbx,4)
; AVX2-NEXT: andl $15, %r14d
-; AVX2-NEXT: vextractps $2, %xmm1, (%rsp,%r14,4)
+; AVX2-NEXT: vextractps $2, %xmm1, 32(%rsp,%r14,4)
; AVX2-NEXT: andl $15, %r15d
-; AVX2-NEXT: vextractps $3, %xmm1, (%rsp,%r15,4)
+; AVX2-NEXT: vextractps $3, %xmm1, 32(%rsp,%r15,4)
; AVX2-NEXT: andl $15, %r12d
-; AVX2-NEXT: vmovss %xmm2, (%rsp,%r12,4)
+; AVX2-NEXT: vmovss %xmm2, 32(%rsp,%r12,4)
; AVX2-NEXT: andl $15, %r13d
-; AVX2-NEXT: vextractps $1, %xmm2, (%rsp,%r13,4)
+; AVX2-NEXT: vextractps $1, %xmm2, 32(%rsp,%r13,4)
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vextractps $2, %xmm2, 32(%rsp,%rdx,4)
; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vextractps $2, %xmm2, (%rsp,%rcx,4)
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vextractps $3, %xmm2, (%rsp,%rax,4)
-; AVX2-NEXT: cmpq $15, %rdx
+; AVX2-NEXT: vextractps $3, %xmm2, 32(%rsp,%rcx,4)
+; AVX2-NEXT: cmpq $15, %rdi
; AVX2-NEXT: movl $15, %eax
-; AVX2-NEXT: cmovbq %rdx, %rax
+; AVX2-NEXT: cmovbq %rdi, %rax
; AVX2-NEXT: movl %eax, %eax
-; AVX2-NEXT: movl %edi, (%rsp,%rax,4)
-; AVX2-NEXT: vmovaps (%rsp), %ymm0
+; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; AVX2-NEXT: movl %ecx, 32(%rsp,%rax,4)
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT: leaq -40(%rbp), %rsp
; AVX2-NEXT: popq %rbx
@@ -756,14 +751,12 @@ define <16 x float> @test_compress_v16f32(<16 x float> %vec, <16 x i1> %mask, <1
; AVX2-NEXT: vpaddd %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpextrd $1, %xmm3, %eax
; AVX2-NEXT: vmovd %xmm3, %ecx
; AVX2-NEXT: addl %eax, %ecx
-; AVX2-NEXT: vpextrd $2, %xmm3, %eax
-; AVX2-NEXT: vpextrd $3, %xmm3, %edx
-; AVX2-NEXT: addl %eax, %edx
-; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: andl $15, %ecx
; AVX2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX2-NEXT: vmovss %xmm0, (%rsp)
; AVX2-NEXT: vmovd %xmm2, %eax
@@ -1342,38 +1335,16 @@ define <32 x i8> @test_compress_v32i8(<32 x i8> %vec, <32 x i1> %mask, <32 x i8>
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm4
; AVX2-NEXT: vpand %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpaddb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[8,9,10,11,12,13,14,15,12,13],zero,zero,xmm2[14,15],zero,zero
+; AVX2-NEXT: vpaddb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1]
+; AVX2-NEXT: vpaddb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpsrld $16, %xmm2, %xmm4
+; AVX2-NEXT: vpaddb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpextrb $1, %xmm2, %eax
; AVX2-NEXT: vmovd %xmm2, %ecx
; AVX2-NEXT: addb %al, %cl
-; AVX2-NEXT: vpextrb $2, %xmm2, %eax
-; AVX2-NEXT: vpextrb $3, %xmm2, %edx
-; AVX2-NEXT: addb %al, %dl
-; AVX2-NEXT: addb %cl, %dl
-; AVX2-NEXT: vpextrb $4, %xmm2, %eax
-; AVX2-NEXT: vpextrb $5, %xmm2, %ecx
-; AVX2-NEXT: addb %al, %cl
-; AVX2-NEXT: vpextrb $6, %xmm2, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: addb %dl, %al
-; AVX2-NEXT: vpextrb $7, %xmm2, %ecx
-; AVX2-NEXT: vpextrb $8, %xmm2, %edx
-; AVX2-NEXT: addb %cl, %dl
-; AVX2-NEXT: vpextrb $9, %xmm2, %ecx
-; AVX2-NEXT: addb %dl, %cl
-; AVX2-NEXT: vpextrb $10, %xmm2, %edx
-; AVX2-NEXT: addb %cl, %dl
-; AVX2-NEXT: addb %al, %dl
-; AVX2-NEXT: vpextrb $11, %xmm2, %eax
-; AVX2-NEXT: vpextrb $12, %xmm2, %ecx
-; AVX2-NEXT: addb %al, %cl
-; AVX2-NEXT: vpextrb $13, %xmm2, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: vpextrb $14, %xmm2, %ecx
-; AVX2-NEXT: addb %al, %cl
-; AVX2-NEXT: vpextrb $15, %xmm2, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: addb %dl, %al
-; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: movzbl %cl, %eax
; AVX2-NEXT: andl $31, %eax
; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp)
@@ -1556,12 +1527,13 @@ define <32 x i8> @test_compress_v32i8(<32 x i8> %vec, <32 x i1> %mask, <32 x i8>
; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $31, %ecx
; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: vpextrb $15, %xmm0, %ecx
+; AVX2-NEXT: cmpq $32, %rdx
+; AVX2-NEXT: cmovbl %eax, %ecx
; AVX2-NEXT: cmpq $31, %rdx
-; AVX2-NEXT: movl $31, %ecx
-; AVX2-NEXT: cmovbq %rdx, %rcx
-; AVX2-NEXT: vpextrb $15, %xmm0, %edx
-; AVX2-NEXT: cmovbel %eax, %edx
-; AVX2-NEXT: movb %dl, (%rsp,%rcx)
+; AVX2-NEXT: movl $31, %eax
+; AVX2-NEXT: cmovbq %rdx, %rax
+; AVX2-NEXT: movb %cl, (%rsp,%rax)
; AVX2-NEXT: vmovaps (%rsp), %ymm0
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
@@ -1589,18 +1561,16 @@ define <32 x i8> @test_compress_v32i8(<32 x i8> %vec, <32 x i1> %mask, <32 x i8>
; AVX512F-NEXT: vpaddd %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX512F-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; AVX512F-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX512F-NEXT: vpextrd $1, %xmm3, %eax
; AVX512F-NEXT: vmovd %xmm3, %ecx
; AVX512F-NEXT: addl %eax, %ecx
-; AVX512F-NEXT: vpextrd $2, %xmm3, %eax
-; AVX512F-NEXT: vpextrd $3, %xmm3, %edx
-; AVX512F-NEXT: addl %eax, %edx
-; AVX512F-NEXT: addl %ecx, %edx
-; AVX512F-NEXT: andl $31, %edx
+; AVX512F-NEXT: andl $31, %ecx
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovdb %zmm0, (%rsp,%rdx)
+; AVX512F-NEXT: vpmovdb %zmm0, (%rsp,%rcx)
; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm0
; AVX512F-NEXT: vpblendvb %ymm0, (%rsp), %ymm2, %ymm0
; AVX512F-NEXT: movq %rbp, %rsp
@@ -1637,21 +1607,13 @@ define <16 x i16> @test_compress_v16i16(<16 x i16> %vec, <16 x i1> %mask, <16 x
; AVX2-NEXT: vpsrlw $15, %ymm3, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; AVX2-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
+; AVX2-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpextrw $1, %xmm2, %eax
; AVX2-NEXT: vmovd %xmm2, %ecx
; AVX2-NEXT: addl %eax, %ecx
-; AVX2-NEXT: vpextrw $2, %xmm2, %eax
-; AVX2-NEXT: vpextrw $3, %xmm2, %edx
-; AVX2-NEXT: addl %eax, %edx
-; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: vpextrw $4, %xmm2, %eax
-; AVX2-NEXT: vpextrw $5, %xmm2, %ecx
-; AVX2-NEXT: addl %eax, %ecx
-; AVX2-NEXT: vpextrw $6, %xmm2, %eax
-; AVX2-NEXT: addl %ecx, %eax
-; AVX2-NEXT: addl %edx, %eax
-; AVX2-NEXT: vpextrw $7, %xmm2, %ecx
-; AVX2-NEXT: addl %eax, %ecx
; AVX2-NEXT: andl $15, %ecx
; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vpextrw $1, %xmm1, %eax
@@ -1793,10 +1755,10 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $96, %rsp
-; AVX2-NEXT: movl %r9d, %r11d
-; AVX2-NEXT: movl %r8d, %r10d
-; AVX2-NEXT: movl %ecx, %r9d
-; AVX2-NEXT: movl %edx, %r8d
+; AVX2-NEXT: movl %r9d, %r10d
+; AVX2-NEXT: movl %r8d, %r9d
+; AVX2-NEXT: movl %ecx, %r8d
+; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
; AVX2-NEXT: # kill: def $esi killed $esi def $rsi
; AVX2-NEXT: # kill: def $edi killed $edi def $rdi
; AVX2-NEXT: movzbl 360(%rbp), %eax
@@ -1867,21 +1829,21 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX2-NEXT: vmovd %edi, %xmm5
; AVX2-NEXT: vpinsrb $1, %esi, %xmm5, %xmm5
; AVX2-NEXT: vpinsrb $2, %edx, %xmm5, %xmm5
-; AVX2-NEXT: vpinsrb $3, %r9d, %xmm5, %xmm5
-; AVX2-NEXT: vpinsrb $4, %r10d, %xmm5, %xmm5
-; AVX2-NEXT: vpinsrb $5, %r11d, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 16(%rbp), %ebx
-; AVX2-NEXT: vpinsrb $6, %ebx, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 24(%rbp), %r14d
-; AVX2-NEXT: vpinsrb $7, %r14d, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 32(%rbp), %r15d
-; AVX2-NEXT: vpinsrb $8, %r15d, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 40(%rbp), %r12d
-; AVX2-NEXT: vpinsrb $9, %r12d, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 48(%rbp), %r13d
-; AVX2-NEXT: vpinsrb $10, %r13d, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 56(%rbp), %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
+; AVX2-NEXT: vpinsrb $3, %r8d, %xmm5, %xmm5
+; AVX2-NEXT: vpinsrb $4, %r9d, %xmm5, %xmm5
+; AVX2-NEXT: vpinsrb $5, %r10d, %xmm5, %xmm5
+; AVX2-NEXT: movzbl 16(%rbp), %r11d
+; AVX2-NEXT: vpinsrb $6, %r11d, %xmm5, %xmm5
+; AVX2-NEXT: movzbl 24(%rbp), %ebx
+; AVX2-NEXT: vpinsrb $7, %ebx, %xmm5, %xmm5
+; AVX2-NEXT: movzbl 32(%rbp), %r14d
+; AVX2-NEXT: vpinsrb $8, %r14d, %xmm5, %xmm5
+; AVX2-NEXT: movzbl 40(%rbp), %r15d
+; AVX2-NEXT: vpinsrb $9, %r15d, %xmm5, %xmm5
+; AVX2-NEXT: movzbl 48(%rbp), %r12d
+; AVX2-NEXT: vpinsrb $10, %r12d, %xmm5, %xmm5
+; AVX2-NEXT: movzbl 56(%rbp), %r13d
+; AVX2-NEXT: vpinsrb $11, %r13d, %xmm5, %xmm5
; AVX2-NEXT: movzbl 64(%rbp), %eax
; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
; AVX2-NEXT: movzbl 72(%rbp), %eax
@@ -1929,92 +1891,70 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX2-NEXT: vpaddb %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-NEXT: vpaddb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpextrb $1, %xmm4, %eax
-; AVX2-NEXT: vmovd %xmm4, %ecx
-; AVX2-NEXT: addb %al, %cl
-; AVX2-NEXT: vpextrb $2, %xmm4, %edx
-; AVX2-NEXT: vpextrb $3, %xmm4, %eax
-; AVX2-NEXT: addb %dl, %al
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: vpextrb $4, %xmm4, %ecx
-; AVX2-NEXT: vpextrb $5, %xmm4, %edx
-; AVX2-NEXT: addb %cl, %dl
-; AVX2-NEXT: vpextrb $6, %xmm4, %ecx
-; AVX2-NEXT: addb %dl, %cl
-; AVX2-NEXT: addb %al, %cl
-; AVX2-NEXT: vpextrb $7, %xmm4, %eax
-; AVX2-NEXT: vpextrb $8, %xmm4, %edx
-; AVX2-NEXT: addb %al, %dl
-; AVX2-NEXT: vpextrb $9, %xmm4, %eax
-; AVX2-NEXT: addb %dl, %al
-; AVX2-NEXT: vpextrb $10, %xmm4, %edx
-; AVX2-NEXT: addb %al, %dl
-; AVX2-NEXT: addb %cl, %dl
-; AVX2-NEXT: vpextrb $11, %xmm4, %eax
-; AVX2-NEXT: vpextrb $12, %xmm4, %ecx
-; AVX2-NEXT: addb %al, %cl
-; AVX2-NEXT: vpextrb $13, %xmm4, %eax
-; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: vpextrb $14, %xmm4, %ecx
-; AVX2-NEXT: addb %al, %cl
-; AVX2-NEXT: vpextrb $15, %xmm4, %eax
+; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8,9,10,11,12,13,14,15,12,13],zero,zero,xmm4[14,15],zero,zero
+; AVX2-NEXT: vpaddb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,1,1]
+; AVX2-NEXT: vpaddb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpsrld $16, %xmm4, %xmm5
+; AVX2-NEXT: vpaddb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpextrb $1, %xmm4, %ecx
+; AVX2-NEXT: vmovd %xmm4, %eax
; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: addb %dl, %al
; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovaps %ymm2, (%rsp)
; AVX2-NEXT: movzbl %al, %eax
; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: movzbl (%rsp,%rax), %edx
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
+; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp)
; AVX2-NEXT: andl $1, %edi
; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rdi)
; AVX2-NEXT: andl $1, %esi
; AVX2-NEXT: addq %rdi, %rsi
; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rsi)
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rsi, %rdx
+; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rdx)
; AVX2-NEXT: andl $1, %r8d
-; AVX2-NEXT: addq %rsi, %r8
-; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%r8)
+; AVX2-NEXT: addq %rdx, %r8
+; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%r8)
; AVX2-NEXT: andl $1, %r9d
; AVX2-NEXT: addq %r8, %r9
-; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%r9)
+; AVX2-NEXT: movl %r9d, %eax
+; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax)
; AVX2-NEXT: andl $1, %r10d
; AVX2-NEXT: addq %r9, %r10
-; AVX2-NEXT: movl %r10d, %eax
-; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: andl $1, %r11d
-; AVX2-NEXT: addq %r10, %r11
-; AVX2-NEXT: movzbl %bl, %eax
+; AVX2-NEXT: movzbl %r11b, %eax
; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %r11, %rax
-; AVX2-NEXT: # kill: def $r11d killed $r11d killed $r11 def $r11
-; AVX2-NEXT: andl $63, %r11d
-; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%r11)
-; AVX2-NEXT: movzbl %r14b, %ecx
+; AVX2-NEXT: addq %r10, %rax
+; AVX2-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10
+; AVX2-NEXT: andl $63, %r10d
+; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%r10)
+; AVX2-NEXT: movzbl %bl, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addq %rax, %rcx
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl %r15b, %eax
+; AVX2-NEXT: movzbl %r14b, %eax
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: addq %rcx, %rax
; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl %r12b, %ecx
+; AVX2-NEXT: movzbl %r15b, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addq %rax, %rcx
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl %r13b, %eax
+; AVX2-NEXT: movzbl %r12b, %eax
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: addq %rcx, %rax
; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 56(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movzbl %r13b, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addq %rax, %rcx
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
@@ -2388,7 +2328,7 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax)
; AVX2-NEXT: vpextrb $15, %xmm0, %eax
; AVX2-NEXT: cmpq $64, %rcx
-; AVX2-NEXT: cmovbl %edx, %eax
+; AVX2-NEXT: cmovbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
; AVX2-NEXT: cmpq $63, %rcx
; AVX2-NEXT: movl $63, %edx
; AVX2-NEXT: cmovbq %rcx, %rdx
@@ -2840,19 +2780,19 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT: vpcompressd %zmm2, %zmm4 {%k2} {z}
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm7
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero,xmm7[8],zero,zero,zero,xmm7[9],zero,zero,zero,xmm7[10],zero,zero,zero,xmm7[11],zero,zero,zero,xmm7[12],zero,zero,zero,xmm7[13],zero,zero,zero,xmm7[14],zero,zero,zero,xmm7[15],zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm6
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero,xmm6[8],zero,zero,zero,xmm6[9],zero,zero,zero,xmm6[10],zero,zero,zero,xmm6[11],zero,zero,zero,xmm6[12],zero,zero,zero,xmm6[13],zero,zero,zero,xmm6[14],zero,zero,zero,xmm6[15],zero,zero,zero
; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512F-NEXT: vpcompressd %zmm3, %zmm5 {%k2} {z}
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k2} {z} = -1
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512F-NEXT: vpcompressd %zmm0, %zmm6 {%k1} {z}
+; AVX512F-NEXT: vpcompressd %zmm0, %zmm7 {%k1} {z}
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero,xmm7[8],zero,zero,zero,xmm7[9],zero,zero,zero,xmm7[10],zero,zero,zero,xmm7[11],zero,zero,zero,xmm7[12],zero,zero,zero,xmm7[13],zero,zero,zero,xmm7[14],zero,zero,zero,xmm7[15],zero,zero,zero
+; AVX512F-NEXT: vextracti128 $1, %ymm6, %xmm6
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero,xmm6[8],zero,zero,zero,xmm6[9],zero,zero,zero,xmm6[10],zero,zero,zero,xmm6[11],zero,zero,zero,xmm6[12],zero,zero,zero,xmm6[13],zero,zero,zero,xmm6[14],zero,zero,zero,xmm6[15],zero,zero,zero
; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: vpcompressd %zmm7, %zmm7 {%k1} {z}
+; AVX512F-NEXT: vpcompressd %zmm6, %zmm6 {%k1} {z}
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm8 {%k1} {z} = -1
; AVX512F-NEXT: vpmovdb %zmm4, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vpsrld $31, %zmm2, %zmm4
@@ -2860,30 +2800,26 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX512F-NEXT: vpaddd %ymm4, %ymm9, %ymm4
; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm9
; AVX512F-NEXT: vpaddd %xmm4, %xmm9, %xmm4
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,3,2,3]
+; AVX512F-NEXT: vpaddd %xmm4, %xmm9, %xmm4
; AVX512F-NEXT: vpextrd $1, %xmm4, %eax
; AVX512F-NEXT: vmovd %xmm4, %ecx
-; AVX512F-NEXT: vpextrd $2, %xmm4, %edx
-; AVX512F-NEXT: vpextrd $3, %xmm4, %esi
; AVX512F-NEXT: addl %eax, %ecx
-; AVX512F-NEXT: addl %edx, %esi
-; AVX512F-NEXT: addl %ecx, %esi
-; AVX512F-NEXT: andl $31, %esi
-; AVX512F-NEXT: vpmovdb %zmm6, 64(%rsp,%rsi)
+; AVX512F-NEXT: andl $31, %ecx
+; AVX512F-NEXT: vpmovdb %zmm7, 64(%rsp,%rcx)
; AVX512F-NEXT: vpmovdb %zmm5, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vpsrld $31, %zmm3, %zmm4
; AVX512F-NEXT: vextracti64x4 $1, %zmm4, %ymm5
; AVX512F-NEXT: vpaddd %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX512F-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; AVX512F-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX512F-NEXT: vpextrd $1, %xmm4, %eax
; AVX512F-NEXT: vmovd %xmm4, %ecx
; AVX512F-NEXT: addl %eax, %ecx
-; AVX512F-NEXT: vpextrd $2, %xmm4, %eax
-; AVX512F-NEXT: vpextrd $3, %xmm4, %edx
-; AVX512F-NEXT: addl %eax, %edx
-; AVX512F-NEXT: addl %ecx, %edx
-; AVX512F-NEXT: andl $31, %edx
-; AVX512F-NEXT: vpmovdb %zmm7, 96(%rsp,%rdx)
+; AVX512F-NEXT: andl $31, %ecx
+; AVX512F-NEXT: vpmovdb %zmm6, 96(%rsp,%rcx)
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4
; AVX512F-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vpsrld $31, %zmm0, %zmm4
@@ -2892,16 +2828,14 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX512F-NEXT: vpaddd %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX512F-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; AVX512F-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX512F-NEXT: vpextrd $1, %xmm4, %eax
; AVX512F-NEXT: vmovd %xmm4, %ecx
; AVX512F-NEXT: addl %eax, %ecx
-; AVX512F-NEXT: vpextrd $2, %xmm4, %eax
-; AVX512F-NEXT: vpextrd $3, %xmm4, %edx
-; AVX512F-NEXT: addl %eax, %edx
-; AVX512F-NEXT: addl %ecx, %edx
-; AVX512F-NEXT: andl $63, %edx
+; AVX512F-NEXT: andl $63, %ecx
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4
-; AVX512F-NEXT: vmovaps %ymm4, 128(%rsp,%rdx)
+; AVX512F-NEXT: vmovaps %ymm4, 128(%rsp,%rcx)
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vpmovdb %zmm8, %xmm4
; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
@@ -2950,21 +2884,13 @@ define <32 x i16> @test_compress_v32i16(<32 x i16> %vec, <32 x i1> %mask, <32 x
; AVX2-NEXT: vpaddw %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-NEXT: vpaddw %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; AVX2-NEXT: vpaddw %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,1,1]
+; AVX2-NEXT: vpaddw %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vpextrw $1, %xmm4, %eax
; AVX2-NEXT: vmovd %xmm4, %ecx
; AVX2-NEXT: addl %eax, %ecx
-; AVX2-NEXT: vpextrw $2, %xmm4, %eax
-; AVX2-NEXT: vpextrw $3, %xmm4, %edx
-; AVX2-NEXT: addl %eax, %edx
-; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: vpextrw $4, %xmm4, %eax
-; AVX2-NEXT: vpextrw $5, %xmm4, %ecx
-; AVX2-NEXT: addl %eax, %ecx
-; AVX2-NEXT: vpextrw $6, %xmm4, %eax
-; AVX2-NEXT: addl %ecx, %eax
-; AVX2-NEXT: addl %edx, %eax
-; AVX2-NEXT: vpextrw $7, %xmm4, %ecx
-; AVX2-NEXT: addl %eax, %ecx
; AVX2-NEXT: andl $31, %ecx
; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vpextrb $1, %xmm2, %eax
@@ -3208,18 +3134,16 @@ define <32 x i16> @test_compress_v32i16(<32 x i16> %vec, <32 x i1> %mask, <32 x
; AVX512F-NEXT: vpaddd %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX512F-NEXT: vpaddd %xmm5, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX512F-NEXT: vpaddd %xmm5, %xmm1, %xmm1
; AVX512F-NEXT: vpextrd $1, %xmm1, %eax
; AVX512F-NEXT: vmovd %xmm1, %ecx
; AVX512F-NEXT: addl %eax, %ecx
-; AVX512F-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512F-NEXT: vpextrd $3, %xmm1, %edx
-; AVX512F-NEXT: addl %eax, %edx
-; AVX512F-NEXT: addl %ecx, %edx
-; AVX512F-NEXT: andl $31, %edx
+; AVX512F-NEXT: andl $31, %ecx
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovdw %zmm0, (%rsp,%rdx,2)
+; AVX512F-NEXT: vpmovdw %zmm0, (%rsp,%rcx,2)
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
; AVX512F-NEXT: vpsllw $15, %ymm4, %ymm1
; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1
@@ -4151,30 +4075,26 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512F-NEXT: vpaddd %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm7
; AVX512F-NEXT: vpaddd %xmm7, %xmm4, %xmm4
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,3,2,3]
+; AVX512F-NEXT: vpaddd %xmm7, %xmm4, %xmm4
; AVX512F-NEXT: vpextrd $1, %xmm4, %eax
; AVX512F-NEXT: vmovd %xmm4, %ecx
-; AVX512F-NEXT: vpextrd $2, %xmm4, %edx
-; AVX512F-NEXT: vpextrd $3, %xmm4, %esi
; AVX512F-NEXT: addl %eax, %ecx
-; AVX512F-NEXT: addl %edx, %esi
-; AVX512F-NEXT: addl %ecx, %esi
-; AVX512F-NEXT: andl $31, %esi
-; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rsp,%rsi,4)
+; AVX512F-NEXT: andl $31, %ecx
+; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rsp,%rcx,4)
; AVX512F-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vpsrld $31, %zmm5, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; AVX512F-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpextrd $1, %xmm1, %eax
; AVX512F-NEXT: vmovd %xmm1, %ecx
; AVX512F-NEXT: addl %eax, %ecx
-; AVX512F-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512F-NEXT: vpextrd $3, %xmm1, %edx
-; AVX512F-NEXT: addl %eax, %edx
-; AVX512F-NEXT: addl %ecx, %edx
-; AVX512F-NEXT: andl $31, %edx
-; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rsp,%rdx,4)
+; AVX512F-NEXT: andl $31, %ecx
+; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rsp,%rcx,4)
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2
; AVX512F-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp)
@@ -4184,19 +4104,17 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512F-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpextrd $1, %xmm0, %eax
; AVX512F-NEXT: vmovd %xmm0, %ecx
; AVX512F-NEXT: addl %eax, %ecx
-; AVX512F-NEXT: vpextrd $2, %xmm0, %eax
-; AVX512F-NEXT: vpextrd $3, %xmm0, %edx
-; AVX512F-NEXT: addl %eax, %edx
-; AVX512F-NEXT: addl %ecx, %edx
-; AVX512F-NEXT: andl $63, %edx
+; AVX512F-NEXT: andl $63, %ecx
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1
-; AVX512F-NEXT: vmovaps %zmm0, 320(%rsp,%rdx,4)
+; AVX512F-NEXT: vmovaps %zmm0, 320(%rsp,%rcx,4)
; AVX512F-NEXT: vmovaps %zmm2, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %zmm1, 384(%rsp,%rdx,4)
+; AVX512F-NEXT: vmovaps %zmm1, 384(%rsp,%rcx,4)
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2
@@ -4223,17 +4141,15 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512VL-NEXT: vpaddd %ymm5, %ymm1, %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX512VL-NEXT: vpaddd %xmm5, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX512VL-NEXT: vpaddd %xmm5, %xmm1, %xmm1
; AVX512VL-NEXT: vpextrd $1, %xmm1, %eax
; AVX512VL-NEXT: vmovd %xmm1, %ecx
; AVX512VL-NEXT: addl %eax, %ecx
-; AVX512VL-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512VL-NEXT: vpextrd $3, %xmm1, %edx
-; AVX512VL-NEXT: addl %eax, %edx
-; AVX512VL-NEXT: addl %ecx, %edx
-; AVX512VL-NEXT: andl $31, %edx
+; AVX512VL-NEXT: andl $31, %ecx
; AVX512VL-NEXT: kshiftrq $16, %k2, %k2
; AVX512VL-NEXT: vpcompressd %zmm2, %zmm1 {%k2} {z}
-; AVX512VL-NEXT: vmovdqa64 %zmm1, (%rsp,%rdx,4)
+; AVX512VL-NEXT: vmovdqa64 %zmm1, (%rsp,%rcx,4)
; AVX512VL-NEXT: vpcompressd %zmm3, %zmm1 {%k3} {z}
; AVX512VL-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 {%k3} {z} = -1
@@ -4242,16 +4158,14 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512VL-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpextrd $1, %xmm1, %eax
; AVX512VL-NEXT: vmovd %xmm1, %ecx
; AVX512VL-NEXT: addl %eax, %ecx
-; AVX512VL-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512VL-NEXT: vpextrd $3, %xmm1, %edx
-; AVX512VL-NEXT: addl %eax, %edx
-; AVX512VL-NEXT: addl %ecx, %edx
-; AVX512VL-NEXT: andl $31, %edx
+; AVX512VL-NEXT: andl $31, %ecx
; AVX512VL-NEXT: vpcompressd %zmm4, %zmm1 {%k1} {z}
-; AVX512VL-NEXT: vmovdqa64 %zmm1, 128(%rsp,%rdx,4)
+; AVX512VL-NEXT: vmovdqa64 %zmm1, 128(%rsp,%rcx,4)
; AVX512VL-NEXT: vmovdqa64 (%rsp), %zmm1
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2
; AVX512VL-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
@@ -4262,19 +4176,17 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512VL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrd $1, %xmm0, %eax
; AVX512VL-NEXT: vmovd %xmm0, %ecx
; AVX512VL-NEXT: addl %eax, %ecx
-; AVX512VL-NEXT: vpextrd $2, %xmm0, %eax
-; AVX512VL-NEXT: vpextrd $3, %xmm0, %edx
-; AVX512VL-NEXT: addl %eax, %edx
-; AVX512VL-NEXT: addl %ecx, %edx
-; AVX512VL-NEXT: andl $63, %edx
+; AVX512VL-NEXT: andl $63, %ecx
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1
-; AVX512VL-NEXT: vmovaps %zmm0, 256(%rsp,%rdx,4)
+; AVX512VL-NEXT: vmovaps %zmm0, 256(%rsp,%rcx,4)
; AVX512VL-NEXT: vmovaps %zmm2, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps %zmm1, 320(%rsp,%rdx,4)
+; AVX512VL-NEXT: vmovaps %zmm1, 320(%rsp,%rcx,4)
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2
diff --git a/llvm/test/CodeGen/X86/vector-extract-last-active.ll b/llvm/test/CodeGen/X86/vector-extract-last-active.ll
index 3f622c5c98077..a4ed74887c556 100644
--- a/llvm/test/CodeGen/X86/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/X86/vector-extract-last-active.ll
@@ -13,24 +13,29 @@ define i32 @extract_last_active_v4i32(<4 x i32> %a, <4 x i1> %c) {
; CHECK-NEXT: pslld $31, %xmm1
; CHECK-NEXT: psrad $31, %xmm1
; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-NEXT: movdqa %xmm1, %xmm3
+; CHECK-NEXT: por %xmm0, %xmm3
+; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; CHECK-NEXT: por %xmm4, %xmm0
+; CHECK-NEXT: pcmpgtd %xmm0, %xmm3
+; CHECK-NEXT: pand %xmm3, %xmm1
+; CHECK-NEXT: pandn %xmm4, %xmm3
+; CHECK-NEXT: por %xmm1, %xmm3
+; CHECK-NEXT: movd %xmm3, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
+; CHECK-NEXT: movd %xmm0, %ecx
+; CHECK-NEXT: cmpl %ecx, %eax
+; CHECK-NEXT: cmoval %eax, %ecx
+; CHECK-NEXT: andl $3, %ecx
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; CHECK-NEXT: por %xmm2, %xmm0
-; CHECK-NEXT: movd %xmm0, %ecx
-; CHECK-NEXT: andb $1, %cl
+; CHECK-NEXT: movd %xmm0, %edx
+; CHECK-NEXT: andb $1, %dl
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: cmpb $1, %cl
+; CHECK-NEXT: cmpb $1, %dl
; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; CHECK-NEXT: movd %xmm0, %ecx
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; CHECK-NEXT: movd %xmm0, %edx
-; CHECK-NEXT: cmpl %ecx, %edx
-; CHECK-NEXT: cmoval %edx, %ecx
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
-; CHECK-NEXT: movd %xmm0, %edx
-; CHECK-NEXT: cmpl %edx, %ecx
-; CHECK-NEXT: cmovbel %edx, %ecx
; CHECK-NEXT: orl -24(%rsp,%rcx,4), %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %a, <4 x i1> %c, i32 -1)
@@ -44,17 +49,22 @@ define i32 @extract_last_active_v4i32_no_default(<4 x i32> %a, <4 x i1> %c) {
; CHECK-NEXT: psrad $31, %xmm1
; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; CHECK-NEXT: movd %xmm0, %ecx
-; CHECK-NEXT: cmpl %eax, %ecx
-; CHECK-NEXT: cmoval %ecx, %eax
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: por %xmm0, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; CHECK-NEXT: por %xmm3, %xmm0
+; CHECK-NEXT: pcmpgtd %xmm0, %xmm2
+; CHECK-NEXT: pand %xmm2, %xmm1
+; CHECK-NEXT: pandn %xmm3, %xmm2
+; CHECK-NEXT: por %xmm1, %xmm2
+; CHECK-NEXT: movd %xmm2, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; CHECK-NEXT: movd %xmm0, %ecx
; CHECK-NEXT: cmpl %ecx, %eax
-; CHECK-NEXT: cmovbel %ecx, %eax
-; CHECK-NEXT: movl -24(%rsp,%rax,4), %eax
+; CHECK-NEXT: cmoval %eax, %ecx
+; CHECK-NEXT: andl $3, %ecx
+; CHECK-NEXT: movl -24(%rsp,%rcx,4), %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %a, <4 x i1> %c, i32 poison)
ret i32 %res
@@ -90,28 +100,35 @@ define i32 @extract_last_active_v3i32(<3 x i32> %a, <3 x i1> %c) {
; CHECK: # %bb.0:
; CHECK-NEXT: movd %esi, %xmm1
; CHECK-NEXT: movd %edi, %xmm2
-; CHECK-NEXT: movdqa %xmm2, %xmm3
-; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-NEXT: movd %edx, %xmm4
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; CHECK-NEXT: movdqa %xmm2, %xmm4
+; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-NEXT: movd %edx, %xmm3
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: pslld $31, %xmm3
-; CHECK-NEXT: psrad $31, %xmm3
-; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; CHECK-NEXT: pslld $31, %xmm4
+; CHECK-NEXT: psrad $31, %xmm4
+; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-NEXT: movdqa %xmm4, %xmm5
+; CHECK-NEXT: por %xmm0, %xmm5
+; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,2,3]
+; CHECK-NEXT: por %xmm6, %xmm0
+; CHECK-NEXT: pcmpgtd %xmm0, %xmm5
+; CHECK-NEXT: pand %xmm5, %xmm4
+; CHECK-NEXT: pandn %xmm6, %xmm5
+; CHECK-NEXT: por %xmm4, %xmm5
+; CHECK-NEXT: movd %xmm5, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
; CHECK-NEXT: movd %xmm0, %ecx
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
-; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: cmpl %ecx, %eax
; CHECK-NEXT: cmoval %eax, %ecx
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: cmpl $0, %ecx
-; CHECK-NEXT: cmovbel %eax, %ecx
-; CHECK-NEXT: por %xmm4, %xmm2
+; CHECK-NEXT: andl $3, %ecx
+; CHECK-NEXT: por %xmm3, %xmm2
; CHECK-NEXT: por %xmm1, %xmm2
; CHECK-NEXT: movd %xmm2, %edx
; CHECK-NEXT: andb $1, %dl
+; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: cmpb $1, %dl
; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: orl -24(%rsp,%rcx,4), %eax
@@ -212,13 +229,12 @@ define i32 @extract_last_active_v4i32_penryn(<4 x i32> %a, <4 x i1> %c) "target-
; CHECK-NEXT: pslld $31, %xmm1
; CHECK-NEXT: psrad $31, %xmm1
; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT: pextrd $2, %xmm1, %eax
-; CHECK-NEXT: pextrd $1, %xmm1, %ecx
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; CHECK-NEXT: pmaxud %xmm1, %xmm2
+; CHECK-NEXT: pextrd $1, %xmm2, %eax
+; CHECK-NEXT: movd %xmm2, %ecx
; CHECK-NEXT: cmpl %eax, %ecx
; CHECK-NEXT: cmoval %ecx, %eax
-; CHECK-NEXT: pextrd $3, %xmm1, %ecx
-; CHECK-NEXT: cmpl %ecx, %eax
-; CHECK-NEXT: cmovbel %ecx, %eax
; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movl -24(%rsp,%rax,4), %eax
; CHECK-NEXT: retq
diff --git a/llvm/test/tools/llubi/intr_vector_manip.ll b/llvm/test/tools/llubi/intr_vector_manip.ll
index bcccb9de93d8a..20895550565d9 100644
--- a/llvm/test/tools/llubi/intr_vector_manip.ll
+++ b/llvm/test/tools/llubi/intr_vector_manip.ll
@@ -5,12 +5,12 @@ define void @main() {
%insert_mid = call <6 x i32> @llvm.vector.insert.v6i32.v2i32(<6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>, <2 x i32> <i32 10, i32 11>, i64 2)
%insert_poison_lane = call <6 x i32> @llvm.vector.insert.v6i32.v2i32(<6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>, <2 x i32> <i32 poison, i32 11>, i64 2)
%insert_tail = call <6 x i32> @llvm.vector.insert.v6i32.v2i32(<6 x i32> zeroinitializer, <2 x i32> <i32 9, i32 10>, i64 4)
- %insert_poison = call <6 x i32> @llvm.vector.insert.v6i32.v2i32(<6 x i32> zeroinitializer, <2 x i32> <i32 9, i32 10>, i64 5)
+ %insert_poison = call <6 x i32> @llvm.vector.insert.v6i32.v2i32(<6 x i32> zeroinitializer, <2 x i32> <i32 9, i32 10>, i64 6)
%extract_mid = call <2 x i32> @llvm.vector.extract.v2i32.v6i32(<6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>, i64 2)
%extract_poison_lane = call <2 x i32> @llvm.vector.extract.v2i32.v6i32(<6 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5>, i64 0)
%extract_tail = call <2 x i32> @llvm.vector.extract.v2i32.v6i32(<6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>, i64 4)
- %extract_poison = call <2 x i32> @llvm.vector.extract.v2i32.v6i32(<6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>, i64 5)
+ %extract_poison = call <2 x i32> @llvm.vector.extract.v2i32.v6i32(<6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>, i64 6)
%reverse = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> <i32 0, i32 1, i32 2, i32 3>)
%reverse_poison = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> <i32 0, i32 poison, i32 2, i32 3>)
@@ -28,6 +28,12 @@ define void @main() {
%splice_left_poison_idx = call <4 x i32> @llvm.vector.splice.left.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 poison)
%splice_right_poison_idx = call <4 x i32> @llvm.vector.splice.right.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 poison)
+ %insert_bad_idx = call <6 x i32> @llvm.vector.insert.v6i32.v2i32(<6 x i32> zeroinitializer, <2 x i32> zeroinitializer, i64 1)
+ %extract_bad_idx = call <2 x i32> @llvm.vector.extract.v2i32.v6i32(<6 x i32> zeroinitializer, i64 1)
+
+ %insert_idx_overflow = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv2i32(<vscale x 4 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i64 9223372036854775808)
+ %extract_idx_overflow = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> zeroinitializer, i64 9223372036854775808)
+
ret void
}
@@ -35,11 +41,11 @@ define void @main() {
; CHECK-NEXT: %insert_mid = call <6 x i32> @llvm.vector.insert.v6i32.v2i32(<6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>, <2 x i32> <i32 10, i32 11>, i64 2) => { i32 0, i32 1, i32 10, i32 11, i32 4, i32 5 }
; CHECK-NEXT: %insert_poison_lane = call <6 x i32> @llvm.vector.insert.v6i32.v2i32(<6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>, <2 x i32> <i32 poison, i32 11>, i64 2) => { i32 0, i32 1, poison, i32 11, i32 4, i32 5 }
; CHECK-NEXT: %insert_tail = call <6 x i32> @llvm.vector.insert.v6i32.v2i32(<6 x i32> zeroinitializer, <2 x i32> <i32 9, i32 10>, i64 4) => { i32 0, i32 0, i32 0, i32 0, i32 9, i32 10 }
-; CHECK-NEXT: %insert_poison = call <6 x i32> @llvm.vector.insert.v6i32.v2i32(<6 x i32> zeroinitializer, <2 x i32> <i32 9, i32 10>, i64 5) => poison
+; CHECK-NEXT: %insert_poison = call <6 x i32> @llvm.vector.insert.v6i32.v2i32(<6 x i32> zeroinitializer, <2 x i32> <i32 9, i32 10>, i64 6) => poison
; CHECK-NEXT: %extract_mid = call <2 x i32> @llvm.vector.extract.v2i32.v6i32(<6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>, i64 2) => { i32 2, i32 3 }
; CHECK-NEXT: %extract_poison_lane = call <2 x i32> @llvm.vector.extract.v2i32.v6i32(<6 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5>, i64 0) => { i32 0, poison }
; CHECK-NEXT: %extract_tail = call <2 x i32> @llvm.vector.extract.v2i32.v6i32(<6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>, i64 4) => { i32 4, i32 5 }
-; CHECK-NEXT: %extract_poison = call <2 x i32> @llvm.vector.extract.v2i32.v6i32(<6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>, i64 5) => poison
+; CHECK-NEXT: %extract_poison = call <2 x i32> @llvm.vector.extract.v2i32.v6i32(<6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>, i64 6) => poison
; CHECK-NEXT: %reverse = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> <i32 0, i32 1, i32 2, i32 3>) => { i32 3, i32 2, i32 1, i32 0 }
; CHECK-NEXT: %reverse_poison = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> <i32 0, i32 poison, i32 2, i32 3>) => { i32 3, i32 2, poison, i32 0 }
; CHECK-NEXT: %splice_left = call <4 x i32> @llvm.vector.splice.left.v4i32(<4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> <i32 10, i32 11, i32 12, i32 13>, i32 2) => { i32 2, i32 3, i32 10, i32 11 }
@@ -52,5 +58,9 @@ define void @main() {
; CHECK-NEXT: %insert_poison_idx = call <6 x i32> @llvm.vector.insert.v6i32.v2i32(<6 x i32> zeroinitializer, <2 x i32> <i32 1, i32 2>, i64 poison) => poison
; CHECK-NEXT: %splice_left_poison_idx = call <4 x i32> @llvm.vector.splice.left.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 poison) => poison
; CHECK-NEXT: %splice_right_poison_idx = call <4 x i32> @llvm.vector.splice.right.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 poison) => poison
+; CHECK-NEXT: %insert_bad_idx = call <6 x i32> @llvm.vector.insert.v6i32.v2i32(<6 x i32> zeroinitializer, <2 x i32> zeroinitializer, i64 1) => poison
+; CHECK-NEXT: %extract_bad_idx = call <2 x i32> @llvm.vector.extract.v2i32.v6i32(<6 x i32> zeroinitializer, i64 1) => poison
+; CHECK-NEXT: %insert_idx_overflow = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv2i32(<vscale x 4 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i64 -9223372036854775808) => poison
+; CHECK-NEXT: %extract_idx_overflow = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> zeroinitializer, i64 -9223372036854775808) => poison
; CHECK-NEXT: ret void
; CHECK-NEXT: Exiting function: main
diff --git a/llvm/test/tools/llubi/intr_vscale_poison.ll b/llvm/test/tools/llubi/intr_vscale_poison.ll
index 2fe406d532254..f6314abb3c2bc 100644
--- a/llvm/test/tools/llubi/intr_vscale_poison.ll
+++ b/llvm/test/tools/llubi/intr_vscale_poison.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llubi_test_checks.py UTC_ARGS: --version 6
-; RUN: llubi --vscale=257 --verbose < %s 2>&1 | FileCheck %s
+; RUN: llubi --vscale=512 --verbose < %s 2>&1 | FileCheck %s
define void @main() {
call i8 @llvm.vscale.i8()
@@ -8,6 +8,6 @@ define void @main() {
}
; CHECK: Entering function: main
; CHECK-NEXT: %1 = call i8 @llvm.vscale.i8() => poison
-; CHECK-NEXT: %2 = call i16 @llvm.vscale.i16() => i16 257
+; CHECK-NEXT: %2 = call i16 @llvm.vscale.i16() => i16 512
; CHECK-NEXT: ret void
; CHECK-NEXT: Exiting function: main
diff --git a/llvm/tools/llubi/lib/Interpreter.cpp b/llvm/tools/llubi/lib/Interpreter.cpp
index 26e01c0e4bd70..1f68051c617a0 100644
--- a/llvm/tools/llubi/lib/Interpreter.cpp
+++ b/llvm/tools/llubi/lib/Interpreter.cpp
@@ -22,6 +22,8 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Allocator.h"
+#include <limits>
+
namespace llvm::ubi {
using namespace PatternMatch;
@@ -744,8 +746,18 @@ class InstExecutor : public InstVisitor<InstExecutor, void>,
const auto &Vec = Args[0].asAggregate();
const auto &SubVec = Args[1].asAggregate();
const auto &Idx = Args[2].asInteger();
- const uint64_t Offset = Idx.getZExtValue();
- if (Offset + SubVec.size() > Vec.size())
+ auto EC =
+ cast<VectorType>(CB.getArgOperand(1)->getType())->getElementCount();
+ const uint64_t RawOffset = Idx.getZExtValue();
+ const uint32_t MinSize = EC.getKnownMinValue();
+ if (RawOffset % MinSize != 0)
+ return AnyValue::poison();
+ const uint64_t Chunk = RawOffset / MinSize;
+ const uint64_t EVL = Ctx.getEVL(EC);
+ if (Chunk > std::numeric_limits<uint64_t>::max() / EVL)
+ return AnyValue::poison();
+ const uint64_t Offset = Chunk * EVL;
+ if (Offset > Vec.size() || SubVec.size() > Vec.size() - Offset)
return AnyValue::poison();
std::vector<AnyValue> Res;
Res.reserve(Vec.size());
@@ -762,12 +774,19 @@ class InstExecutor : public InstVisitor<InstExecutor, void>,
return AnyValue::poison();
const auto &Vec = Args[0].asAggregate();
const auto &Idx = Args[1].asInteger();
- const uint64_t Offset = Idx.getZExtValue();
- const uint64_t DstSize =
- Ctx.getEVL(cast<VectorType>(RetTy)->getElementCount());
- if (Offset + DstSize > Vec.size())
+ auto EC = cast<VectorType>(RetTy)->getElementCount();
+ const uint64_t RawOffset = Idx.getZExtValue();
+ const uint32_t MinSize = EC.getKnownMinValue();
+ if (RawOffset % MinSize != 0)
+ return AnyValue::poison();
+ const uint64_t Chunk = RawOffset / MinSize;
+ const uint64_t EVL = Ctx.getEVL(EC);
+ if (Chunk > std::numeric_limits<uint64_t>::max() / EVL)
+ return AnyValue::poison();
+ const uint64_t Offset = Chunk * EVL;
+ if (Offset > Vec.size() || EVL > Vec.size() - Offset)
return AnyValue::poison();
- return std::vector(Vec.begin() + Offset, Vec.begin() + Offset + DstSize);
+ return std::vector(Vec.begin() + Offset, Vec.begin() + Offset + EVL);
}
case Intrinsic::vector_reverse: {
auto Vec = Args[0].asAggregate();
diff --git a/llvm/tools/llubi/llubi.cpp b/llvm/tools/llubi/llubi.cpp
index 0ec2e236049dd..6e0616405b703 100644
--- a/llvm/tools/llubi/llubi.cpp
+++ b/llvm/tools/llubi/llubi.cpp
@@ -166,6 +166,16 @@ int main(int argc, char **argv) {
return 1;
}
+ if (VScale == 0) {
+ WithColor::error() << "--vscale value must be positive\n";
+ return 1;
+ }
+
+ if (!isPowerOf2_32(VScale)) {
+ WithColor::error() << "--vscale value must be a power of 2\n";
+ return 1;
+ }
+
LLVMContext Context;
// Load the bitcode...
More information about the libc-commits
mailing list