[llvm] [Scalar] Dedicated pass for identifying redundant operations on packed bytes (PR #146364)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 3 17:02:02 PDT 2025
https://github.com/zGoldthorpe updated https://github.com/llvm/llvm-project/pull/146364
>From 652e7a5fb486c992c90bf0bdfcadfb12fb41b72c Mon Sep 17 00:00:00 2001
From: Zach Goldthorpe <Zach.Goldthorpe at amd.com>
Date: Mon, 30 Jun 2025 09:24:53 -0500
Subject: [PATCH 1/2] Squashing wip branch.
---
llvm/include/llvm/InitializePasses.h | 1 +
llvm/include/llvm/Transforms/Scalar.h | 7 +
.../Scalar/PackedIntegerCombinePass.h | 32 +
llvm/lib/Passes/PassBuilder.cpp | 1 +
llvm/lib/Passes/PassBuilderPipelines.cpp | 7 +
llvm/lib/Passes/PassRegistry.def | 1 +
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 11 +-
llvm/lib/Transforms/Scalar/CMakeLists.txt | 1 +
.../Scalar/PackedIntegerCombinePass.cpp | 1936 +++++++++++++++++
llvm/lib/Transforms/Scalar/Scalar.cpp | 1 +
.../CodeGen/AMDGPU/combine-vload-extract.ll | 6 +-
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 3 +
llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 105 +-
.../AMDGPU/splitkit-getsubrangeformask.ll | 168 +-
llvm/test/Other/new-pm-defaults.ll | 1 +
.../Other/new-pm-thinlto-postlink-defaults.ll | 1 +
.../new-pm-thinlto-postlink-pgo-defaults.ll | 1 +
...-pm-thinlto-postlink-samplepgo-defaults.ll | 1 +
.../Other/new-pm-thinlto-prelink-defaults.ll | 1 +
.../new-pm-thinlto-prelink-pgo-defaults.ll | 1 +
...w-pm-thinlto-prelink-samplepgo-defaults.ll | 1 +
.../PackedIntegerCombine/instructions.ll | 601 +++++
.../PackedIntegerCombine/int2int.ll | 302 +++
.../PackedIntegerCombine/int2vec.ll | 393 ++++
.../PackedIntegerCombine/vec2int.ll | 480 ++++
.../PackedIntegerCombine/vec2vec.ll | 294 +++
26 files changed, 4173 insertions(+), 184 deletions(-)
create mode 100644 llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h
create mode 100644 llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp
create mode 100644 llvm/test/Transforms/PackedIntegerCombine/instructions.ll
create mode 100644 llvm/test/Transforms/PackedIntegerCombine/int2int.ll
create mode 100644 llvm/test/Transforms/PackedIntegerCombine/int2vec.ll
create mode 100644 llvm/test/Transforms/PackedIntegerCombine/vec2int.ll
create mode 100644 llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 1c4ed3843b390..7e934e635c063 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -237,6 +237,7 @@ initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry &);
LLVM_ABI void initializeOptimizePHIsLegacyPass(PassRegistry &);
LLVM_ABI void initializePEILegacyPass(PassRegistry &);
LLVM_ABI void initializePHIEliminationPass(PassRegistry &);
+LLVM_ABI void initializePackedIntegerCombineLegacyPassPass(PassRegistry &);
LLVM_ABI void initializePartiallyInlineLibCallsLegacyPassPass(PassRegistry &);
LLVM_ABI void initializePatchableFunctionLegacyPass(PassRegistry &);
LLVM_ABI void initializePeepholeOptimizerLegacyPass(PassRegistry &);
diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h
index 1398f171b0f78..ec9d89507c375 100644
--- a/llvm/include/llvm/Transforms/Scalar.h
+++ b/llvm/include/llvm/Transforms/Scalar.h
@@ -154,6 +154,13 @@ LLVM_ABI FunctionPass *
createInferAddressSpacesPass(unsigned AddressSpace = ~0u);
LLVM_ABI extern char &InferAddressSpacesID;
+//===----------------------------------------------------------------------===//
+//
+// PackedIntegerCombinePass - Tracks individual bytes through instructions to
+// systematically identify redundant byte packing or unpacking operations.
+//
+LLVM_ABI FunctionPass *createPackedIntegerCombinePass();
+
//===----------------------------------------------------------------------===//
//
// PartiallyInlineLibCalls - Tries to inline the fast path of library
diff --git a/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h b/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h
new file mode 100644
index 0000000000000..a5916e2e611cf
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h
@@ -0,0 +1,32 @@
+//===- PackedIntegerCombinePass.h -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file provides the interface for LLVM's Packed Integer Combine pass.
+/// The pass treats integers as packed chunks of individual bytes and
+/// leverages this view to coalesce needlessly fragmented computations.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_PACKEDINTCOMBINE_H
+#define LLVM_TRANSFORMS_SCALAR_PACKEDINTCOMBINE_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class PackedIntegerCombinePass
+ : public PassInfoMixin<PackedIntegerCombinePass> {
+public:
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_PACKEDINTCOMBINE_H
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 0697a0a6b4c74..7a382ace34dbc 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -313,6 +313,7 @@
#include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
#include "llvm/Transforms/Scalar/NaryReassociate.h"
#include "llvm/Transforms/Scalar/NewGVN.h"
+#include "llvm/Transforms/Scalar/PackedIntegerCombinePass.h"
#include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h"
#include "llvm/Transforms/Scalar/PlaceSafepoints.h"
#include "llvm/Transforms/Scalar/Reassociate.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index c83d2dc1f1514..2da72606bc47a 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -121,6 +121,7 @@
#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
#include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
#include "llvm/Transforms/Scalar/NewGVN.h"
+#include "llvm/Transforms/Scalar/PackedIntegerCombinePass.h"
#include "llvm/Transforms/Scalar/Reassociate.h"
#include "llvm/Transforms/Scalar/SCCP.h"
#include "llvm/Transforms/Scalar/SROA.h"
@@ -542,6 +543,9 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
// opportunities that creates).
FPM.addPass(BDCEPass());
+ // Simplify bit-packed operations before cleaning up with instcombine.
+ FPM.addPass(PackedIntegerCombinePass());
+
// Run instcombine after redundancy and dead bit elimination to exploit
// opportunities opened up by them.
FPM.addPass(InstCombinePass());
@@ -743,6 +747,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
// opportunities that creates).
FPM.addPass(BDCEPass());
+ // Simplify bit-packed operations before cleaning up with instcombine.
+ FPM.addPass(PackedIntegerCombinePass());
+
// Run instcombine after redundancy and dead bit elimination to exploit
// opportunities opened up by them.
FPM.addPass(InstCombinePass());
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 65276489e6f02..6f1c405a5efa7 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -476,6 +476,7 @@ FUNCTION_PASS("objc-arc", ObjCARCOptPass())
FUNCTION_PASS("objc-arc-contract", ObjCARCContractPass())
FUNCTION_PASS("objc-arc-expand", ObjCARCExpandPass())
FUNCTION_PASS("pa-eval", PAEvalPass())
+FUNCTION_PASS("packedintcombine", PackedIntegerCombinePass())
FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass())
FUNCTION_PASS("pgo-memop-opt", PGOMemOPSizeOpt())
FUNCTION_PASS("place-safepoints", PlaceSafepointsPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c3536113e9bef..8f0ef348fe778 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -104,6 +104,7 @@
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
#include "llvm/Transforms/Scalar/NaryReassociate.h"
+#include "llvm/Transforms/Scalar/PackedIntegerCombinePass.h"
#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
#include "llvm/Transforms/Scalar/Sink.h"
#include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h"
@@ -1378,8 +1379,11 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
TargetPassConfig::addCodeGenPrepare();
- if (isPassEnabled(EnableLoadStoreVectorizer))
+ if (isPassEnabled(EnableLoadStoreVectorizer)) {
addPass(createLoadStoreVectorizerPass());
+ // LSV pass opens up more opportunities for packed integer combining.
+ addPass(createPackedIntegerCombinePass());
+ }
// LowerSwitch pass may introduce unreachable blocks that can
// cause unexpected behavior for subsequent passes. Placing it
@@ -2101,8 +2105,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
Base::addCodeGenPrepare(addPass);
- if (isPassEnabled(EnableLoadStoreVectorizer))
+ if (isPassEnabled(EnableLoadStoreVectorizer)) {
addPass(LoadStoreVectorizerPass());
+ // LSV pass opens up more opportunities for packed integer combining.
+ addPass(PackedIntegerCombinePass());
+ }
// LowerSwitch pass may introduce unreachable blocks that can cause unexpected
// behavior for subsequent passes. Placing it here seems better that these
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
index 84a5b02043d01..d45a3785f9f8f 100644
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -61,6 +61,7 @@ add_llvm_component_library(LLVMScalarOpts
NaryReassociate.cpp
NewGVN.cpp
PartiallyInlineLibCalls.cpp
+ PackedIntegerCombinePass.cpp
PlaceSafepoints.cpp
Reassociate.cpp
Reg2Mem.cpp
diff --git a/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp b/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp
new file mode 100644
index 0000000000000..31edd28069a2b
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp
@@ -0,0 +1,1936 @@
+//===- PackedIntegerCombinePass.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file implements LLVM's Packed Integer Combine pass. The pass treats
+/// integers as packed chunks of individual bytes and leverages this view to
+/// coalesce needlessly fragmented computations.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/PackedIntegerCombinePass.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "packedintcombine"
+
+static cl::opt<unsigned> MaxCollectionIterations(
+ "packedint-max-iterations",
+ cl::desc("Maximum number of iterations to isolate final packed "
+ "instructions. Set to 0 to iterate until convergence."),
+ cl::init(2), cl::Hidden);
+
+static cl::opt<bool>
+ AggressiveRewriting("packedint-aggressive-rewriter",
+ cl::desc("Aggressively rewrite packed instructions."),
+ cl::init(false), cl::Hidden);
+
+namespace {
+
+/// Reference to either a constant byte, or a byte extracted from an IR value.
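+///
+/// For example, if %x is an i32 holding 0xAABBCCDD, then Byte(%x, 0) refers to
+/// its least-significant byte (0xDD), Byte(%x, 3) to its most-significant byte
+/// (0xAA), and Byte(0xFF) denotes the constant byte 0xFF.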
+class Byte {
+ /// The base value from which the byte is obtained.
+ Value *Base;
+
+ /// If the base value is not null, then this holds the index of the byte
+ /// being used, where 0 is the least significant byte.
+ /// Otherwise, this is treated as a constant byte.
+ unsigned Integer;
+
+public:
+ static constexpr unsigned BitWidth = 8;
+ static constexpr unsigned AllOnes = 0xff;
+
+ /// Construct a byte from a well-defined IR value.
+ explicit Byte(Value &Base, unsigned Index) : Base(&Base), Integer(Index) {}
+
+ /// Construct a constant byte.
+ explicit Byte(unsigned Constant) : Base(nullptr), Integer(Constant) {
+ assert(Constant <= AllOnes && "Constant is too large to fit in a byte.");
+ }
+
+ /// Construct a constant byte that is fully set.
+ static Byte ones() { return Byte(Byte::AllOnes); }
+ /// Construct the zero byte.
+ static Byte zeroes() { return Byte(0); }
+
+ /// Indicate whether the byte is a known integer constant.
+ /// Note that poison or undef base values are not recognised as constant.
+ bool isConstant() const { return !Base; }
+
+ /// Get the constant byte value.
+ unsigned getConstant() const {
+ assert(isConstant() && "Expected a constant byte.");
+ return Integer;
+ }
+
+ /// Get the base IR value from which this byte is obtained.
+ Value *getBase() const {
+ assert(!isConstant() && "Byte constants do not have a base value.");
+ return Base;
+ }
+
+ /// Get the byte offset of the IR value referenced by the byte.
+ unsigned getIndex() const {
+ assert(!isConstant() && "Byte constants are not indexed.");
+ return Integer;
+ }
+
+ bool operator==(const Byte &Other) const {
+ return Base == Other.Base && Integer == Other.Integer;
+ }
+
+ void print(raw_ostream &ROS, bool NewLine = true) const {
+ if (isConstant())
+ ROS << "const";
+ else
+ Base->printAsOperand(ROS, false);
+
+ ROS << "[" << Integer << "]";
+
+ if (NewLine)
+ ROS << "\n";
+ }
+
+ LLVM_DUMP_METHOD void dump() const { print(errs(), true); }
+};
+
+inline raw_ostream &operator<<(raw_ostream &ROS, const Byte &B) {
+ B.print(ROS, false);
+ return ROS;
+}
+
+/// Convenience data structure for describing the layout of bytes for vector and
+/// integer types, treating integer types as singleton vectors.
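+///
+/// For example, <4 x i16> has NumBytesPerElement = 2 and NumVecElements = 4
+/// (8 bytes in total), whereas i64 has NumBytesPerElement = 8 and
+/// NumVecElements = 1.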
+struct ByteLayout {
+ /// The number of bytes that fit in a single element.
+ unsigned NumBytesPerElement;
+ /// The number of vector elements (or 1, if the type is an integer type).
+ unsigned NumVecElements;
+
+ /// Get the total number of bytes held by the vector or integer type.
+ unsigned getNumBytes() const { return NumBytesPerElement * NumVecElements; }
+};
+
+/// Interpret the given type as a number of packed bytes, if possible.
+static std::optional<ByteLayout> tryGetByteLayout(const Type *Ty) {
+ unsigned IntBitWidth, NumElts;
+ if (const auto *IntTy = dyn_cast<IntegerType>(Ty)) {
+ IntBitWidth = IntTy->getBitWidth();
+ NumElts = 1;
+ } else if (const auto *VecTy = dyn_cast<FixedVectorType>(Ty)) {
+ const auto *IntTy = dyn_cast<IntegerType>(VecTy->getElementType());
+ if (!IntTy)
+ return std::nullopt;
+ IntBitWidth = IntTy->getBitWidth();
+ NumElts = VecTy->getNumElements();
+ } else
+ return std::nullopt;
+
+ if (IntBitWidth % Byte::BitWidth != 0)
+ return std::nullopt;
+
+ return ByteLayout{IntBitWidth / Byte::BitWidth, NumElts};
+}
+
+/// Interpret the given type as a number of packed bytes (aborts if impossible).
+static ByteLayout getByteLayout(const Type *Ty) {
+ const std::optional<ByteLayout> Layout = tryGetByteLayout(Ty);
+ assert(Layout);
+ return *Layout;
+}
+
+/// A convenience class for combining Byte instances obtained from the same base
+/// value, and with a common relative offset, which can hence be obtained
+/// simultaneously.
+struct CoalescedBytes {
+ /// The value from which the coalesced bytes are all derived. This pointer is
+ /// never null.
+ Value *Base;
+ /// The number of bytes to shift right to align the coalesced bytes with the
+ /// target value.
+ ///
+ /// For instance, if bytes 3, 4, 5 of some value %val are coalesced to provide
+ /// bytes 0, 1, 2 of the target %tgt, then SignedShrByteOffset = 3.
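+ ///
+ /// Conversely, if bytes 0 and 1 of %val provide bytes 2 and 3 of %tgt, then
+ /// SignedShrByteOffset = -2 (i.e. the base must be shifted left by two
+ /// bytes).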
+ signed SignedShrByteOffset;
+ /// The bitmask identifying which bytes of the target value are covered by
+ /// these coalesced bytes.
+ ///
+ /// For instance, if bytes 3, 4, 5 of some value %val are coalesced to provide
+ /// bytes 0, 1, 2 of the target %tgt, then this mask's first three bits will
+ /// be set, corresponding to the first three bytes of %tgt.
+ SmallBitVector Mask;
+
+ explicit CoalescedBytes(Value &Base, signed Offset, SmallBitVector Mask)
+ : Base(&Base), SignedShrByteOffset(Offset), Mask(Mask) {}
+ explicit CoalescedBytes(Value &Base, signed Offset, unsigned NumBytes)
+ : Base(&Base), SignedShrByteOffset(Offset), Mask(NumBytes) {}
+
+ bool alignsWith(Value *V, signed VOffset) const {
+ return Base == V && SignedShrByteOffset == VOffset;
+ }
+
+ /// Get the number of bytes to shift the base value right to align with the
+ /// target value.
+ unsigned getShrBytes() const { return std::max(0, SignedShrByteOffset); }
+
+ /// Get the number of bytes to shift the base value left to align with the
+ /// target value.
+ unsigned getShlBytes() const { return std::max(0, -SignedShrByteOffset); }
+
+ /// Get the number of bits to shift the base value right to align with the
+ /// target value.
+ unsigned getShrBits() const { return getShrBytes() * Byte::BitWidth; }
+
+ /// Get the number of bits to shift the base value left to align with the
+ /// target value.
+ unsigned getShlBits() const { return getShlBytes() * Byte::BitWidth; }
+
+ void print(raw_ostream &ROS, bool NewLine = true) const {
+ ROS << "{ ";
+ for (unsigned Idx = 0; Idx < Mask.size(); ++Idx) {
+ if (Mask.test(Idx)) {
+ Base->printAsOperand(ROS, false);
+ ROS << "[" << (static_cast<int>(Idx) + SignedShrByteOffset) << "]";
+ } else
+ ROS << 0;
+
+ ROS << "; ";
+ }
+ ROS << "}";
+
+ if (NewLine)
+ ROS << "\n";
+ }
+
+ LLVM_DUMP_METHOD void dump() const { print(errs(), true); }
+};
+
+inline raw_ostream &operator<<(raw_ostream &ROS, const CoalescedBytes &CB) {
+ CB.print(ROS, false);
+ return ROS;
+}
+
+/// Association of a Byte (constant or byte extracted from an LLVM Value) to the
+/// operand(s) responsible for producing it. A value of ByteUse::AllOperands
+/// (-1) indicates that all operands are responsible for producing the given
+/// byte.
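+///
+/// For example, for %v = or i32 %a, %b, a byte of %v produced solely by %a is
+/// recorded with operand index 0, a byte produced solely by %b with operand
+/// index 1, and a byte that both operands produce identically with
+/// AllOperands.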
+class ByteUse {
+
+ Byte B;
+ int OpIdx;
+
+public:
+ /// Sentinel value representing that all operands are responsible for the
+ /// given Byte.
+ static constexpr int AllOperands = -1;
+
+ ByteUse(Byte B, int OpIdx) : B(B), OpIdx(OpIdx) {}
+
+ const Byte &getByte() const { return B; }
+ int getOperandIndex() const { return OpIdx; }
+
+ bool operator==(const ByteUse &BU) const {
+ return BU.B == B && BU.OpIdx == OpIdx;
+ }
+};
+
+using ByteVector = SmallVector<ByteUse, 8>;
+
+/// The decomposition of an IR value into its individual bytes, tracking where
+/// each byte is obtained.
+class ByteDefinition {
+ /// Enum classifying what Ptr points to.
+ enum ByteType : uint8_t {
+ /// Ptr's value is undefined.
+ INVALID,
+ /// The byte definition is given by a ByteVector, which is referenced (but
+ /// not captured) by Ptr.
+ VECTOR,
+ /// The bytes are obtained from a (currently opaque) IR value, held by Ptr.
+ VALUE,
+ /// The bytes are obtained from a constant integer, held by Ptr.
+ CONST_INT,
+ /// The bytes are obtained from a constant vector of integers, held by Ptr.
+ CONST_VEC,
+ };
+
+ ByteType DefType;
+ void *Ptr;
+ ByteLayout Layout;
+ ByteDefinition(ByteType DefType, void *Ptr, ByteLayout Layout)
+ : DefType(DefType), Ptr(Ptr), Layout(Layout) {}
+
+public:
+ /// Indicate that a value cannot be decomposed into bytes in a known way.
+ static ByteDefinition invalid() { return {INVALID, nullptr, {0, 0}}; }
+ /// Indicate that a value's bytes are known, and track their producers.
+ static ByteDefinition vector(ByteVector &Ref, ByteLayout Layout) {
+ return {VECTOR, &Ref, Layout};
+ }
+ /// Indicate that a value's bytes are opaque.
+ static ByteDefinition value(Value &V) {
+ return {VALUE, &V, getByteLayout(V.getType())};
+ }
+ /// Indicate that the bytes come from a constant integer.
+ static ByteDefinition constInt(ConstantInt &Int) {
+ return {CONST_INT, &Int, getByteLayout(Int.getType())};
+ }
+ /// Indicate that the bytes come from a constant vector of integers.
+ static ByteDefinition constVec(Constant &Vec) {
+ assert(Vec.getType()->isVectorTy());
+ return {CONST_VEC, &Vec, getByteLayout(Vec.getType())};
+ }
+
+ ByteVector &getVector() const {
+ assert(DefType == VECTOR);
+ return *static_cast<ByteVector *>(Ptr);
+ }
+ Value &getValue() const {
+ assert(DefType == VALUE);
+ return *static_cast<Value *>(Ptr);
+ }
+ ConstantInt &getConstInt() const {
+ assert(DefType == CONST_INT);
+ return *static_cast<ConstantInt *>(Ptr);
+ }
+ Constant &getConstVec() const {
+ assert(DefType == CONST_VEC);
+ return *static_cast<Constant *>(Ptr);
+ }
+
+ bool isValid() const { return DefType != INVALID; }
+
+ /// Return true iff the byte definition is valid.
+ operator bool() const { return isValid(); }
+
+ /// Get the definition of the byte at the specified byte offset, where 0 is
+ /// the least significant byte.
+ Byte getByte(unsigned Idx) const {
+ switch (DefType) {
+ default:
+ llvm_unreachable("Invalid byte definition");
+ case VECTOR:
+ return getVector()[Idx].getByte();
+ case VALUE:
+ return Byte(getValue(), Idx);
+ case CONST_INT:
+ return Byte(getConstInt().getValue().extractBitsAsZExtValue(
+ Byte::BitWidth, Idx * Byte::BitWidth));
+ case CONST_VEC: {
+ const auto &Vec = getConstVec();
+ const ByteLayout Layout = getByteLayout(Vec.getType());
+ const unsigned VecIdx = Idx / Layout.NumBytesPerElement;
+ const unsigned EltIdx = Idx % Layout.NumBytesPerElement;
+
+ Constant *Elt = Vec.getAggregateElement(VecIdx);
+ if (const auto *Int = dyn_cast<ConstantInt>(Elt))
+ return Byte(Int->getValue().extractBitsAsZExtValue(
+ Byte::BitWidth, EltIdx * Byte::BitWidth));
+
+ return Byte(*Elt, EltIdx);
+ }
+ }
+ }
+
+ const ByteLayout &getLayout() const { return Layout; }
+
+ void print(raw_ostream &ROS, bool NewLine = true) const {
+ switch (DefType) {
+ default:
+ ROS << "[INVALID]";
+ break;
+ case VECTOR: {
+ ByteVector &BV = getVector();
+ ROS << "{ ";
+ for (unsigned ByteIdx = 0; ByteIdx < BV.size(); ++ByteIdx)
+ ROS << ByteIdx << ": " << BV[ByteIdx].getByte() << "; ";
+ ROS << "}";
+ break;
+ }
+ case VALUE:
+ ROS << "(";
+ getValue().printAsOperand(ROS);
+ ROS << ")[0:" << Layout.getNumBytes() << "]";
+ break;
+ case CONST_INT:
+ ROS << getConstInt();
+ break;
+ case CONST_VEC:
+ ROS << getConstVec();
+ break;
+ }
+
+ if (NewLine)
+ ROS << "\n";
+ }
+
+ LLVM_DUMP_METHOD void dump() const { print(errs(), true); }
+};
+
+inline raw_ostream &operator<<(raw_ostream &ROS, const ByteDefinition &Def) {
+ Def.print(ROS, false);
+ return ROS;
+}
+
+/// Tries to update byte definitions using the provided instruction.
+///
+/// In order to avoid eliminating values which are required for multiple packed
+/// integers, the ByteExpander distinguishes two types of packed integer values:
+/// - "Final" values, whose packed bytes are either used by instructions that
+/// cannot be classified as packed byte operations, or are themselves used by
+/// several other "final" values.
+/// - "Intermediate" values, whose sole purpose is to produce bytes for a
+/// unique final value.
+///
+/// Effectively, intermediate values may be eliminated or replaced freely,
+/// whereas final values must remain present in the IR after the pass completes.
+/// Accordingly, byte definitions of final values are expanded only up to other
+/// final value producers.
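+///
+/// For example, given
+///   %lo = zext i16 %a to i32
+///   %hi = shl i32 %b, 16
+///   %packed = or i32 %hi, %lo
+///   store i32 %packed, ptr %p
+/// the values %lo and %hi are intermediate (each only exists to build
+/// %packed), whereas %packed is final: its only user is a store, which is not
+/// a packed byte operation.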
+class ByteExpander final : public InstVisitor<ByteExpander, ByteVector> {
+ /// Resolution of values to their known definitions.
+ DenseMap<Value *, ByteVector> Definitions;
+ /// Map to all (eventual) non-intermediate users of a value.
+ DenseMap<Value *, DenseSet<Value *>> FinalUsers;
+
+ void updateFinalUsers(Value *V);
+ bool checkIfIntermediate(Value *V, bool IsOperand);
+
+public:
+ // Visitation implementations return a non-empty ByteVector iff a new byte
+ // definition was successfully constructed.
+
+ ByteVector visitAdd(BinaryOperator &I);
+ ByteVector visitAnd(BinaryOperator &I);
+ ByteVector visitOr(BinaryOperator &I);
+ ByteVector visitXor(BinaryOperator &I);
+ ByteVector visitShl(BinaryOperator &I);
+ ByteVector visitLShr(BinaryOperator &I);
+ ByteVector visitTruncInst(TruncInst &I);
+ ByteVector visitZExtInst(ZExtInst &I);
+ ByteVector visitBitCastInst(BitCastInst &I);
+ ByteVector visitExtractElementInst(ExtractElementInst &I);
+ ByteVector visitInsertElementInst(InsertElementInst &I);
+ ByteVector visitShuffleVectorInst(ShuffleVectorInst &I);
+ // Fallback for unhandled instructions.
+ ByteVector visitInstruction(Instruction &I) { return {}; }
+
+ /// Return the final values producing each byte of a value, if known, or
+ /// otherwise return nullptr.
+ ByteVector *expandByteDefinition(Value *V);
+
+ /// Decompose a value into its bytes. If \p ExpandDef is true, expand each
+ /// byte to the final values producing them if possible. The return value is
+ /// guaranteed to be valid so long as the value passed can be viewed as packed
+ /// bytes.
+ ByteDefinition getByteDefinition(Value *V, bool ExpandDef = true);
+
+ /// Same as above, but only expand bytes to their final value producers if the
+ /// value \p V in question is an intermediate value. This is provided as a
+ /// convenience for instruction visitation, as definitions should only expand
+ /// until final value producers, even if the final value producers' bytes can
+ /// be expanded further.
+ ByteDefinition getByteDefinitionIfIntermediateOperand(Value *V);
+
+ /// Get the set of all final values which use \p V.
+ const DenseSet<Value *> &getFinalUsers(Value *V);
+
+ /// Check if the provided value is known to be an intermediate value.
+ bool checkIfIntermediate(Value *V) { return checkIfIntermediate(V, false); }
+
+ /// Iterate over all instructions in a function over several passes to
+ /// identify all final values and their byte definitions.
+ std::vector<Instruction *> collectPIICandidates(Function &F);
+};
+
+ByteVector ByteExpander::visitAdd(BinaryOperator &I) {
+ const ByteDefinition LhsDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(0));
+ const ByteDefinition RhsDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(1));
+ if (!LhsDef || !RhsDef)
+ return {};
+
+ const ByteLayout &Layout = LhsDef.getLayout();
+ const unsigned NumBytes = Layout.getNumBytes();
+
+ ByteVector BV;
+ BV.reserve(NumBytes);
+
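+ // An add can only be decomposed bytewise if, at every byte position, at
+ // least one operand contributes a zero byte: then no carries can propagate,
+ // and the add behaves like a bytewise or.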
+ for (unsigned ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx) {
+ const Byte Lhs = LhsDef.getByte(ByteIdx);
+ const Byte Rhs = RhsDef.getByte(ByteIdx);
+
+ const bool LhsIsZero = Lhs.isConstant() && Lhs.getConstant() == 0;
+ const bool RhsIsZero = Rhs.isConstant() && Rhs.getConstant() == 0;
+ if (LhsIsZero)
+ BV.emplace_back(Rhs, RhsIsZero ? ByteUse::AllOperands : 1);
+ else if (RhsIsZero)
+ BV.emplace_back(Lhs, 0);
+ else
+ return {};
+ }
+
+ assert(BV.size() == NumBytes);
+ return BV;
+}
+
+ByteVector ByteExpander::visitAnd(BinaryOperator &I) {
+ const ByteDefinition LhsDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(0));
+ const ByteDefinition RhsDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(1));
+ if (!LhsDef || !RhsDef)
+ return {};
+
+ const ByteLayout &Layout = LhsDef.getLayout();
+ const unsigned NumBytes = Layout.getNumBytes();
+
+ ByteVector BV;
+ BV.reserve(NumBytes);
+
+ for (unsigned ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx) {
+ const Byte Lhs = LhsDef.getByte(ByteIdx);
+ const Byte Rhs = RhsDef.getByte(ByteIdx);
+
+ if (Lhs == Rhs) {
+ BV.emplace_back(Lhs, ByteUse::AllOperands);
+ continue;
+ }
+
+ if (Lhs.isConstant()) {
+ if (Lhs.getConstant() == 0) {
+ BV.emplace_back(Byte::zeroes(), 0);
+ continue;
+ }
+ if (Lhs.getConstant() == Byte::AllOnes) {
+ BV.emplace_back(Rhs, 1);
+ continue;
+ }
+ }
+ if (Rhs.isConstant()) {
+ if (Rhs.getConstant() == 0) {
+ BV.emplace_back(Byte::zeroes(), 1);
+ continue;
+ }
+ if (Rhs.getConstant() == Byte::AllOnes) {
+ BV.emplace_back(Lhs, 0);
+ continue;
+ }
+ }
+
+ return {};
+ }
+
+ assert(BV.size() == NumBytes);
+ return BV;
+}
+
+ByteVector ByteExpander::visitOr(BinaryOperator &I) {
+ const ByteDefinition LhsDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(0));
+ const ByteDefinition RhsDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(1));
+ if (!LhsDef || !RhsDef)
+ return {};
+
+ const ByteLayout &Layout = LhsDef.getLayout();
+ const unsigned NumBytes = Layout.getNumBytes();
+
+ ByteVector BV;
+ BV.reserve(NumBytes);
+
+ for (unsigned ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx) {
+ const Byte Lhs = LhsDef.getByte(ByteIdx);
+ const Byte Rhs = RhsDef.getByte(ByteIdx);
+
+ if (Lhs == Rhs) {
+ BV.emplace_back(Lhs, ByteUse::AllOperands);
+ continue;
+ }
+
+ if (Lhs.isConstant()) {
+ if (Lhs.getConstant() == 0) {
+ BV.emplace_back(Rhs, 1);
+ continue;
+ }
+ if (Lhs.getConstant() == Byte::AllOnes) {
+ BV.emplace_back(Byte::ones(), 0);
+ continue;
+ }
+ }
+
+ if (Rhs.isConstant()) {
+ if (Rhs.getConstant() == 0) {
+ BV.emplace_back(Lhs, 0);
+ continue;
+ }
+ if (Rhs.getConstant() == Byte::AllOnes) {
+ BV.emplace_back(Byte::ones(), 1);
+ continue;
+ }
+ }
+
+ return {};
+ }
+
+ assert(BV.size() == NumBytes);
+ return BV;
+}
+
+ByteVector ByteExpander::visitXor(BinaryOperator &I) {
+ const ByteDefinition LhsDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(0));
+ const ByteDefinition RhsDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(1));
+ if (!LhsDef || !RhsDef)
+ return {};
+
+ const ByteLayout &Layout = LhsDef.getLayout();
+ const unsigned NumBytes = Layout.getNumBytes();
+
+ ByteVector BV;
+ BV.reserve(NumBytes);
+
+ for (unsigned ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx) {
+ const Byte Lhs = LhsDef.getByte(ByteIdx);
+ const Byte Rhs = RhsDef.getByte(ByteIdx);
+ if (Lhs == Rhs)
+ BV.emplace_back(Byte::zeroes(), ByteUse::AllOperands);
+ else if (Lhs.isConstant() && Lhs.getConstant() == 0)
+ BV.emplace_back(Rhs, 1);
+ else if (Rhs.isConstant() && Rhs.getConstant() == 0)
+ BV.emplace_back(Lhs, 0);
+ else
+ return {};
+ }
+
+ assert(BV.size() == NumBytes);
+ return BV;
+}
+
+ByteVector ByteExpander::visitShl(BinaryOperator &I) {
+ const ByteDefinition BaseDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(0));
+ if (!BaseDef)
+ return {};
+
+ const unsigned NumBytes = BaseDef.getLayout().getNumBytes();
+
+ const auto *Const = dyn_cast<Constant>(I.getOperand(1));
+ if (!Const)
+ return {};
+
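+ // Only constant shift amounts that are whole-byte multiples can be modeled
+ // as byte movements; anything else is left opaque.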
+ if (isa<ConstantInt>(Const)) {
+ const unsigned ShAmt = Const->getUniqueInteger().getLimitedValue();
+ if (ShAmt % Byte::BitWidth != 0)
+ return {};
+ const unsigned ByteShAmt = std::min(ShAmt / Byte::BitWidth, NumBytes);
+
+ ByteVector BV;
+ BV.reserve(NumBytes);
+ BV.append(ByteShAmt, ByteUse(Byte::zeroes(), 0));
+ for (unsigned ByteIdx = 0; ByteIdx + ByteShAmt < NumBytes; ++ByteIdx)
+ BV.emplace_back(BaseDef.getByte(ByteIdx), 0);
+
+ assert(BV.size() == NumBytes);
+ return BV;
+ }
+
+ assert(Const->getType()->isVectorTy());
+
+ ByteVector BV;
+ BV.reserve(NumBytes);
+
+ const unsigned NumBytesPerElt = BaseDef.getLayout().NumBytesPerElement;
+ for (unsigned EltIdx = 0; EltIdx < BaseDef.getLayout().NumVecElements;
+ ++EltIdx) {
+ const auto *ConstInt =
+ dyn_cast<ConstantInt>(Const->getAggregateElement(EltIdx));
+ if (!ConstInt)
+ return {};
+ const unsigned ShAmt = ConstInt->getValue().getLimitedValue();
+ if (ShAmt % Byte::BitWidth != 0)
+ return {};
+
+ const unsigned ByteShAmt = std::min(ShAmt / Byte::BitWidth, NumBytesPerElt);
+ BV.append(ByteShAmt, ByteUse(Byte::zeroes(), 0));
+ for (unsigned ByteIdx = 0; ByteIdx + ByteShAmt < NumBytesPerElt; ++ByteIdx)
+ BV.emplace_back(BaseDef.getByte(EltIdx * NumBytesPerElt + ByteIdx), 0);
+ }
+
+ assert(BV.size() == NumBytes);
+ return BV;
+}
+
+ByteVector ByteExpander::visitLShr(BinaryOperator &I) {
+ const ByteDefinition BaseDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(0));
+ if (!BaseDef)
+ return {};
+
+ const unsigned NumBytes = BaseDef.getLayout().getNumBytes();
+
+ const auto *Const = dyn_cast<Constant>(I.getOperand(1));
+ if (!Const)
+ return {};
+
+ if (isa<ConstantInt>(Const)) {
+ const unsigned ShAmt = Const->getUniqueInteger().getLimitedValue();
+ if (ShAmt % Byte::BitWidth != 0)
+ return {};
+ const unsigned ByteShAmt = std::min(ShAmt / Byte::BitWidth, NumBytes);
+
+ ByteVector BV;
+ BV.reserve(NumBytes);
+ for (unsigned ByteIdx = ByteShAmt; ByteIdx < NumBytes; ++ByteIdx)
+ BV.emplace_back(BaseDef.getByte(ByteIdx), 0);
+
+ BV.append(ByteShAmt, ByteUse(Byte::zeroes(), 0));
+
+ assert(BV.size() == NumBytes);
+ return BV;
+ }
+
+ assert(Const->getType()->isVectorTy());
+
+ ByteVector BV;
+ BV.reserve(NumBytes);
+
+ const unsigned NumBytesPerElt = BaseDef.getLayout().NumBytesPerElement;
+
+ for (unsigned EltIdx = 0; EltIdx < BaseDef.getLayout().NumVecElements;
+ ++EltIdx) {
+ const auto *ConstInt =
+ dyn_cast<ConstantInt>(Const->getAggregateElement(EltIdx));
+ if (!ConstInt)
+ return {};
+ const unsigned ShAmt = ConstInt->getValue().getLimitedValue();
+ if (ShAmt % Byte::BitWidth != 0)
+ return {};
+ const unsigned ByteShAmt = std::min(ShAmt / Byte::BitWidth, NumBytesPerElt);
+ for (unsigned ByteIdx = ByteShAmt; ByteIdx < NumBytesPerElt; ++ByteIdx)
+ BV.emplace_back(BaseDef.getByte(EltIdx * NumBytesPerElt + ByteIdx), 0);
+
+ BV.append(ByteShAmt, ByteUse(Byte::zeroes(), 0));
+ }
+
+ assert(BV.size() == NumBytes);
+ return BV;
+}
+
+ByteVector ByteExpander::visitTruncInst(TruncInst &I) {
+ const std::optional<ByteLayout> Layout = tryGetByteLayout(I.getType());
+ if (!Layout)
+ return {};
+
+ const std::optional<ByteLayout> SrcLayout =
+ tryGetByteLayout(I.getOperand(0)->getType());
+ if (!SrcLayout)
+ return {};
+
+ const ByteDefinition SrcDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(0));
+ if (!SrcDef)
+ return {};
+
+ ByteVector BV;
+ const unsigned NumBytes = Layout->getNumBytes();
+ BV.reserve(NumBytes);
+
+ const unsigned NumBytesPerElt = Layout->NumBytesPerElement;
+ const unsigned NumSrcBytesPerElt = SrcLayout->NumBytesPerElement;
+ for (unsigned EltIdx = 0; EltIdx < Layout->NumVecElements; ++EltIdx)
+ for (unsigned ByteIdx = 0; ByteIdx < NumBytesPerElt; ++ByteIdx)
+ BV.emplace_back(SrcDef.getByte(EltIdx * NumSrcBytesPerElt + ByteIdx), 0);
+
+ assert(BV.size() == NumBytes);
+ return BV;
+}
+
+ByteVector ByteExpander::visitZExtInst(ZExtInst &I) {
+ const std::optional<ByteLayout> Layout = tryGetByteLayout(I.getType());
+ if (!Layout)
+ return {};
+
+ const ByteDefinition SrcDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(0));
+ if (!SrcDef)
+ return {};
+
+ ByteVector BV;
+ const unsigned NumBytes = Layout->getNumBytes();
+ BV.reserve(NumBytes);
+
+ const unsigned NumSrcBytesPerElt = SrcDef.getLayout().NumBytesPerElement;
+ const unsigned NumZExtBytesPerElt =
+ Layout->NumBytesPerElement - NumSrcBytesPerElt;
+
+ unsigned SrcIdx = 0;
+ for (unsigned EltIdx = 0; EltIdx < Layout->NumVecElements; ++EltIdx) {
+ for (unsigned ByteIdx = 0; ByteIdx < NumSrcBytesPerElt; ++SrcIdx, ++ByteIdx)
+ BV.emplace_back(SrcDef.getByte(SrcIdx), 0);
+
+ BV.append(NumZExtBytesPerElt, ByteUse(Byte::zeroes(), 0));
+ }
+
+ assert(BV.size() == NumBytes);
+ return BV;
+}
+
+ByteVector ByteExpander::visitBitCastInst(BitCastInst &I) {
+ const std::optional<ByteLayout> Layout = tryGetByteLayout(I.getType());
+ if (!Layout)
+ return {};
+
+ const ByteDefinition SrcDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(0));
+ if (!SrcDef)
+ return {};
+
+ const unsigned NumBytes = Layout->getNumBytes();
+ ByteVector BV;
+ BV.reserve(NumBytes);
+ for (unsigned Idx = 0; Idx < NumBytes; ++Idx)
+ BV.emplace_back(SrcDef.getByte(Idx), 0);
+
+ assert(BV.size() == NumBytes);
+ return BV;
+}
+
+ByteVector ByteExpander::visitExtractElementInst(ExtractElementInst &I) {
+ const ByteDefinition VecDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(0));
+ if (!VecDef)
+ return {};
+
+ const auto *VecIdx = dyn_cast<ConstantInt>(I.getOperand(1));
+ if (!VecIdx)
+ return {};
+
+ const unsigned NumBytes = VecDef.getLayout().NumBytesPerElement;
+ const unsigned ByteOffset = VecIdx->getLimitedValue() * NumBytes;
+ ByteVector BV;
+ BV.reserve(NumBytes);
+
+ for (unsigned ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx)
+ BV.emplace_back(VecDef.getByte(ByteIdx + ByteOffset), 0);
+
+ assert(BV.size() == NumBytes);
+ return BV;
+}
+
+ByteVector ByteExpander::visitInsertElementInst(InsertElementInst &I) {
+ const ByteDefinition VecDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(0));
+ if (!VecDef)
+ return {};
+
+ const ByteDefinition EltDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(1));
+ if (!EltDef)
+ return {};
+
+ const auto *VecIdx = dyn_cast<ConstantInt>(I.getOperand(2));
+ if (!VecIdx)
+ return {};
+
+ const unsigned NumBytes = VecDef.getLayout().getNumBytes();
+ const unsigned NumBytesPerElt = VecDef.getLayout().NumBytesPerElement;
+
+ ByteVector BV;
+ BV.reserve(NumBytes);
+ for (unsigned EltIdx = 0; EltIdx < VecDef.getLayout().NumVecElements;
+ ++EltIdx) {
+ if (EltIdx == VecIdx->getLimitedValue()) {
+ for (unsigned ByteIdx = 0; ByteIdx < NumBytesPerElt; ++ByteIdx)
+ BV.emplace_back(EltDef.getByte(ByteIdx), 0);
+ } else {
+ for (unsigned ByteIdx = 0; ByteIdx < NumBytesPerElt; ++ByteIdx)
+ BV.emplace_back(VecDef.getByte(EltIdx * NumBytesPerElt + ByteIdx), 1);
+ }
+ }
+
+ assert(BV.size() == NumBytes);
+ return BV;
+}
+
+ByteVector ByteExpander::visitShuffleVectorInst(ShuffleVectorInst &I) {
+ const std::optional<ByteLayout> Layout = tryGetByteLayout(I.getType());
+ if (!Layout)
+ return {};
+
+ const ByteDefinition LhsDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(0));
+ const ByteDefinition RhsDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(1));
+ if (!LhsDef || !RhsDef)
+ return {};
+
+ const int LhsSize = LhsDef.getLayout().NumVecElements;
+ const unsigned NumBytes = Layout->getNumBytes();
+ const unsigned NumBytesPerElt = Layout->NumBytesPerElement;
+
+ ByteVector BV;
+ BV.reserve(NumBytes);
+
+ for (unsigned EltIdx = 0; EltIdx < Layout->NumVecElements; ++EltIdx) {
+ const int Idx = I.getMaskValue(EltIdx);
+ if (Idx < 0) {
+ auto *Poison = PoisonValue::get(I.getType()->getElementType());
+ for (unsigned ByteIdx = 0; ByteIdx < NumBytesPerElt; ++ByteIdx)
+ BV.emplace_back(Byte(*Poison, 0), ByteIdx);
+ } else if (Idx < LhsSize) {
+ for (unsigned ByteIdx = 0; ByteIdx < NumBytesPerElt; ++ByteIdx)
+ BV.emplace_back(
+ LhsDef.getByte(static_cast<unsigned>(Idx) * NumBytesPerElt +
+ ByteIdx),
+ 0);
+ } else {
+ for (unsigned ByteIdx = 0; ByteIdx < NumBytesPerElt; ++ByteIdx)
+ BV.emplace_back(RhsDef.getByte(static_cast<unsigned>(Idx - LhsSize) *
+ NumBytesPerElt +
+ ByteIdx),
+ 1);
+ }
+ }
+
+ assert(BV.size() == NumBytes);
+ return BV;
+}
+
+void ByteExpander::updateFinalUsers(Value *V) {
+ assert(!isa<Constant>(V));
+
+ // FIXME: Old users are copied because the iterator is potentially
+ // invalidated by intermediacy checks.
+ DenseSet<Value *> OldFinalUsers = getFinalUsers(V);
+
+ DenseSet<Value *> NewFinalUsers;
+ for (Value *User : OldFinalUsers) {
+ if (!Definitions.contains(User) || !checkIfIntermediate(User)) {
+ NewFinalUsers.insert(User);
+ continue;
+ }
+ const DenseSet<Value *> &NestedUses = getFinalUsers(User);
+ NewFinalUsers.insert_range(NestedUses);
+ }
+
+ FinalUsers[V] = std::move(NewFinalUsers);
+}
+
+const DenseSet<Value *> &ByteExpander::getFinalUsers(Value *V) {
+ assert(!isa<Constant>(V));
+
+ auto It = FinalUsers.find(V);
+
+ if (It != FinalUsers.end())
+ return It->getSecond();
+
+ DenseSet<Value *> &Uses = FinalUsers[V];
+ for (Use &U : V->uses())
+ Uses.insert(U.getUser());
+
+ return Uses;
+}
+
+ByteVector *ByteExpander::expandByteDefinition(Value *V) {
+ auto It = Definitions.find(V);
+ if (It == Definitions.end())
+ return nullptr;
+ return &It->getSecond();
+}
+
+ByteDefinition ByteExpander::getByteDefinition(Value *V, bool ExpandDef) {
+ const std::optional<ByteLayout> Layout = tryGetByteLayout(V->getType());
+ if (!Layout)
+ return ByteDefinition::invalid();
+
+ if (auto *ConstInt = dyn_cast<ConstantInt>(V))
+ return ByteDefinition::constInt(*ConstInt);
+ if (auto *Const = dyn_cast<Constant>(V))
+ if (Const->getType()->isVectorTy())
+ return ByteDefinition::constVec(*Const);
+
+ if (ExpandDef)
+ if (ByteVector *BV = expandByteDefinition(V))
+ return ByteDefinition::vector(*BV, *Layout);
+
+ return ByteDefinition::value(*V);
+}
+
+ByteDefinition ByteExpander::getByteDefinitionIfIntermediateOperand(Value *V) {
+ return getByteDefinition(V, checkIfIntermediate(V, true));
+}
+
+bool ByteExpander::checkIfIntermediate(Value *V, bool IsOperand) {
+ if (isa<Constant>(V))
+ return true;
+
+ // Short-circuit check.
+ if (IsOperand && V->hasOneUse())
+ return true;
+
+ const DenseSet<Value *> &FU = getFinalUsers(V);
+ if (FU.size() != 1)
+ return false;
+
+ return Definitions.contains(*FU.begin());
+}
+
+std::vector<Instruction *> ByteExpander::collectPIICandidates(Function &F) {
+ std::vector<Instruction *> PackedIntInsts;
+ LLVM_DEBUG(dbgs() << "PICP: Entering function " << F.getName() << "\n");
+
+ unsigned NumIterations = 1;
+ for (;;) {
+ LLVM_DEBUG(dbgs() << "PICP: Iteration " << NumIterations << "\n");
+ bool Converged = true;
+
+ std::vector<Instruction *> CollectedInsts;
+ SetVector<Value *> WorkList;
+
+ for (BasicBlock *BB : ReversePostOrderTraversal<Function *>(&F)) {
+ for (Instruction &I : *BB) {
+ ByteVector BV = visit(I);
+ if (BV.empty())
+ continue;
+
+ CollectedInsts.push_back(&I);
+
+ ByteVector &Def = Definitions[&I];
+ if (Def == BV)
+ continue;
+
+ Converged = false;
+ Def = std::move(BV);
+ for (ByteUse &BU : Def) {
+ const Byte &B = BU.getByte();
+ if (!B.isConstant() && !isa<Constant>(B.getBase()))
+ WorkList.insert(B.getBase());
+ }
+
+ WorkList.insert(&I);
+
+ LLVM_DEBUG({
+ dbgs() << "PICP: Updating definition: ";
+ I.printAsOperand(dbgs());
+ dbgs() << " = " << getByteDefinition(&I) << "\n";
+ });
+ }
+ }
+
+ PackedIntInsts.swap(CollectedInsts);
+
+ if (Converged) {
+ LLVM_DEBUG(dbgs() << "PICP: Reached fixpoint\n");
+ break;
+ }
+ if (NumIterations == MaxCollectionIterations) {
+ LLVM_DEBUG(dbgs() << "PICP: Reached maximum iteration limit\n");
+ break;
+ }
+
+ // Update final uses of values before their operands.
+ for (auto RI = WorkList.rbegin(); RI != WorkList.rend(); ++RI)
+ updateFinalUsers(*RI);
+ ++NumIterations;
+ }
+
+ LLVM_DEBUG(dbgs() << "PICP: Total iterations: " << NumIterations << "\n");
+ return PackedIntInsts;
+}
+
+/// Return the value of all bits in a range, or std::nullopt if the bits vary.
+static std::optional<bool> checkAllBits(const SmallBitVector &Mask, unsigned Lo,
+ unsigned NumBits) {
+ bool Bit = Mask[Lo];
+ for (unsigned Idx = 1; Idx < NumBits; ++Idx)
+ if (Mask[Lo + Idx] != Bit)
+ return std::nullopt;
+ return Bit;
+}
+
+/// Structure for tracking the set of bytes of a final value which are produced
+/// by a given byte pack.
+struct PartialBytePack {
+
+ /// The value which produces the subset of bytes of a final value.
+ /// The byte pack is invalid if this pointer is null.
+ Value *BytePack;
+ /// A mask which identifies which bytes of a final value are provided by the
+ /// given byte pack. If a mask bit is not set, then the corresponding byte of
+ /// the byte pack must be zero.
+ SmallBitVector SetBytes;
+
+ PartialBytePack(Value *BytePack, SmallBitVector SetBytes)
+ : BytePack(BytePack), SetBytes(SetBytes) {}
+ PartialBytePack(Value *BytePack, unsigned NumBytes)
+ : BytePack(BytePack), SetBytes(NumBytes) {}
+
+ static PartialBytePack invalid() { return {nullptr, {}}; }
+
+ bool isValid() const { return BytePack != nullptr; }
+};
+
+/// Construct an integer whose bytes are set depending on the value of the
+/// corresponding \p Mask bit. A bit of \p Mask corresponds to an entire byte of
+/// the resulting APInt.
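+///
+/// For example, with \p BitWidth = 32 and \p Mask bits 0 and 2 set, the result
+/// is 0x00FF00FF.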
+static APInt createMaskConstant(unsigned BitWidth, const SmallBitVector &Mask) {
+ APInt BitMaskInt(BitWidth, 0);
+ for (unsigned ByteIdx : Mask.set_bits()) {
+ const unsigned BitIdx = ByteIdx * Byte::BitWidth;
+ if (BitIdx >= BitWidth)
+ break;
+ BitMaskInt.setBits(BitIdx, BitIdx + Byte::BitWidth);
+ }
+ return BitMaskInt;
+}
+
+/// Construct a mask whose bits correspond to vector elements identified by the
+/// \p ByteMask, or an empty vector if the \p ByteMask does not identify whole
+/// vector elements.
+static SmallBitVector getVectorElementMask(SmallBitVector &ByteMask,
+ unsigned NumBytesPerElement) {
+ if (ByteMask.size() % NumBytesPerElement != 0)
+ return {};
+ const unsigned NumElts = ByteMask.size() / NumBytesPerElement;
+
+ SmallBitVector EltMask;
+ EltMask.reserve(NumElts);
+ for (unsigned EltIdx = 0; EltIdx < NumElts; ++EltIdx) {
+ const std::optional<bool> Bits =
+ checkAllBits(ByteMask, EltIdx * NumBytesPerElement, NumBytesPerElement);
+ if (!Bits)
+ return {};
+ EltMask.push_back(*Bits);
+ }
+
+ assert(EltMask.size() == NumElts);
+ return EltMask;
+}
+
+/// A key for the CastCache of the BytePackFolder.
+struct CastEntry {
+ /// The value being casted.
+ Value *Base;
+ /// The type being casted into.
+ Type *CastTy;
+ /// The opcode of the cast instruction.
+ Instruction::CastOps OpCode;
+
+ struct MapInfo {
+ static CastEntry getEmptyKey() {
+ return {DenseMapInfo<Value *>::getEmptyKey(),
+ DenseMapInfo<Type *>::getEmptyKey(),
+ DenseMapInfo<Instruction::CastOps>::getEmptyKey()};
+ }
+ static CastEntry getTombstoneKey() {
+ return {DenseMapInfo<Value *>::getTombstoneKey(),
+ DenseMapInfo<Type *>::getTombstoneKey(),
+ DenseMapInfo<Instruction::CastOps>::getTombstoneKey()};
+ }
+ static unsigned getHashValue(const CastEntry &E) {
+ return hash_combine(
+ DenseMapInfo<Value *>::getHashValue(E.Base),
+ DenseMapInfo<Type *>::getHashValue(E.CastTy),
+ DenseMapInfo<Instruction::CastOps>::getHashValue(E.OpCode));
+ }
+ static bool isEqual(const CastEntry &Lhs, const CastEntry &Rhs) {
+ return Lhs.Base == Rhs.Base && Lhs.CastTy == Rhs.CastTy &&
+ Lhs.OpCode == Rhs.OpCode;
+ }
+ };
+};
+
+/// The class responsible for taking coalesced bytes and folding them together
+/// to produce the desired final value.
+///
+/// When coalesced bytes are pushed, they are promoted to the target type, and
+/// shifted to align the bytes to their corresponding offsets in the target
+/// value.
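+///
+/// For example, if bytes 2 and 3 of an i32 %src supply bytes 0 and 1 of an i32
+/// target, the coalesced bytes are materialized (roughly) as
+///   %src.shift = lshr i32 %src, 16
+/// and are later combined with the other partial byte packs via a disjoint or.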
+class BytePackFolder {
+ /// The target final value to produce.
+ Instruction *TargetInst;
+ /// The layout of the target value.
+ ByteLayout Layout;
+ /// The collection of intermediate partial byte packs generated while folding
+ /// coalesced bytes.
+ std::vector<PartialBytePack> WorkList;
+ /// The list of non-cast instructions generated while folding coalesced bytes.
+ SmallVector<Instruction *> Insts;
+ /// A dedicated partial byte pack for collecting vector-aligned coalesced
+ /// bytes, if the target value is a vector type.
+ PartialBytePack VectorAlignedPack;
+ /// A cache holding all value casts needed, to avoid generating duplicate
+ /// casts.
+ MapVector<CastEntry, Instruction *,
+ DenseMap<CastEntry, unsigned, CastEntry::MapInfo>>
+ CastCache;
+
+ /// Create or reuse a cast of a given value.
+ Value *pushCast(Instruction::CastOps OpCode, Value *V, Type *DstTy) {
+ if (V->getType() == DstTy)
+ return V;
+
+ CastEntry E{V, DstTy, OpCode};
+ auto *It = CastCache.find(E);
+ if (It != CastCache.end())
+ return It->second;
+
+ auto *CI = CastInst::Create(OpCode, V, DstTy, V->getName() + ".cast");
+ CastCache[E] = CI;
+
+ LLVM_DEBUG({
+ dbgs() << "PICP [";
+ TargetInst->printAsOperand(dbgs());
+ dbgs() << "]: Queuing cast " << *CI << "\n";
+ });
+ return CI;
+ }
+
+ Instruction *pushInst(Instruction *I) {
+ // Cast instructions should be handled with pushCast.
+ assert(!isa<CastInst>(I));
+ Insts.push_back(I);
+
+ LLVM_DEBUG({
+ dbgs() << "PICP [";
+ TargetInst->printAsOperand(dbgs());
+ dbgs() << "]: Queuing inst " << *I << "\n";
+ });
+ return I;
+ }
+
+ /// Common functionality for promoting coalesced bytes to a vector.
+ bool pushToVectorImpl(Value *V, SmallBitVector &ByteMask, unsigned NumSrcElts,
+ int ShrEltOffset, const Twine &Name) {
+ auto *TargetVecTy = cast<FixedVectorType>(TargetInst->getType());
+ auto *I32Ty = IntegerType::getInt32Ty(V->getContext());
+
+ // Try to push bytes to the vector-aligned builder.
+ SmallBitVector VecMask =
+ getVectorElementMask(ByteMask, Layout.NumBytesPerElement);
+ if (!VecMask.empty()) {
+ if (!VectorAlignedPack.isValid())
+ VectorAlignedPack = PartialBytePack(
+ ConstantVector::getNullValue(TargetVecTy), Layout.getNumBytes());
+
+ if (NumSrcElts == 1) {
+ // Insert a single element
+ assert(ShrEltOffset <= 0);
+ VectorAlignedPack.BytePack = pushInst(InsertElementInst::Create(
+ VectorAlignedPack.BytePack, V,
+ ConstantInt::get(I32Ty, -ShrEltOffset), Name + ".insert"));
+ VectorAlignedPack.SetBytes |= ByteMask;
+ return true;
+ }
+
+ assert(isa<FixedVectorType>(V->getType()));
+
+ if (NumSrcElts != Layout.NumVecElements) {
+ // We need to construct a vector of the same size as the vector-aligned
+ // byte pack before shuffling it in.
+ SmallVector<int> ExtractMask;
+ ExtractMask.reserve(Layout.NumVecElements);
+ for (unsigned EltIdx = 0; EltIdx < Layout.NumVecElements; ++EltIdx) {
+ if (VecMask.test(EltIdx)) {
+ const int SrcIdx = static_cast<int>(EltIdx) + ShrEltOffset;
+ assert(SrcIdx >= 0);
+ ExtractMask.push_back(SrcIdx);
+ } else
+ ExtractMask.push_back(PoisonMaskElem);
+ }
+ assert(ExtractMask.size() == Layout.NumVecElements);
+
+ V = pushInst(new ShuffleVectorInst(V, ExtractMask, Name + ".extract"));
+ // We have accounted for the shift already, so no need to account for it
+ // when shuffling into the vector-aligned byte pack.
+ ShrEltOffset = 0;
+ }
+
+ assert(V->getType() == TargetVecTy);
+
+ if (VecMask.all()) {
+ VectorAlignedPack.BytePack = V;
+ VectorAlignedPack.SetBytes.set();
+ return true;
+ }
+
+ SmallVector<int> ShuffleMask;
+ ShuffleMask.reserve(Layout.NumVecElements);
+ for (unsigned EltIdx = 0; EltIdx < Layout.NumVecElements; ++EltIdx) {
+ if (VecMask.test(EltIdx)) {
+ const int SrcIdx = static_cast<int>(EltIdx) + ShrEltOffset;
+ assert(SrcIdx >= 0);
+ ShuffleMask.push_back(SrcIdx);
+ } else
+ ShuffleMask.push_back(EltIdx + Layout.NumVecElements);
+ }
+ assert(ShuffleMask.size() == Layout.NumVecElements);
+
+ // We can shuffle directly into the vector-aligned byte pack.
+ VectorAlignedPack.BytePack = pushInst(new ShuffleVectorInst(
+ V, VectorAlignedPack.BytePack, ShuffleMask, Name + ".shuffle"));
+ VectorAlignedPack.SetBytes |= ByteMask;
+ return true;
+ }
+
+ // Otherwise, just extract and mask the relevant elements, and append to the
+ // worklist.
+
+ if (NumSrcElts == 1) {
+ assert(ShrEltOffset <= 0);
+ V = pushInst(InsertElementInst::Create(
+ ConstantVector::getNullValue(TargetVecTy), V,
+ ConstantInt::get(I32Ty, -ShrEltOffset), Name + ".insert"));
+ } else if (NumSrcElts != Layout.NumVecElements) {
+ SmallVector<int> ShuffleMask;
+ ShuffleMask.reserve(Layout.NumVecElements);
+ ShuffleMask.append(std::max(0, -ShrEltOffset), Layout.NumVecElements);
+ for (unsigned SrcIdx = std::max(0, ShrEltOffset);
+ SrcIdx < Layout.NumVecElements; ++SrcIdx)
+ ShuffleMask.push_back(SrcIdx);
+ ShuffleMask.append(std::max(0, ShrEltOffset), Layout.NumVecElements);
+ assert(ShuffleMask.size() == Layout.NumVecElements);
+
+ V = pushInst(
+ new ShuffleVectorInst(V, ConstantVector::getNullValue(V->getType()),
+ ShuffleMask, Name + ".shuffle"));
+ }
+
+ assert(V->getType() == TargetVecTy);
+
+ const unsigned TargetBitWidth = Layout.getNumBytes() * Byte::BitWidth;
+ const unsigned TargetEltBitWidth =
+ Layout.NumBytesPerElement * Byte::BitWidth;
+ Type *TargetEltTy = TargetVecTy->getElementType();
+
+ APInt MaskBits = createMaskConstant(TargetBitWidth, ByteMask);
+ SmallVector<Constant *> EltwiseMask;
+ EltwiseMask.reserve(Layout.NumVecElements);
+ for (unsigned EltIdx = 0; EltIdx < Layout.NumVecElements; ++EltIdx)
+ EltwiseMask.push_back(ConstantInt::get(
+ TargetEltTy,
+ MaskBits.extractBits(TargetEltBitWidth, EltIdx * TargetEltBitWidth)));
+
+ V = pushInst(BinaryOperator::CreateAnd(V, ConstantVector::get(EltwiseMask),
+ Name + ".mask"));
+
+ WorkList.emplace_back(V, ByteMask);
+ return true;
+ }
+
+ bool pushIntegerToInteger(CoalescedBytes CB) {
+ assert(isa<IntegerType>(CB.Base->getType()));
+ auto *TargetIntTy = cast<IntegerType>(TargetInst->getType());
+
+ const unsigned NumTargetBytes = Layout.getNumBytes();
+ Value *V = CB.Base;
+ const unsigned NumSrcBytes = getByteLayout(V->getType()).getNumBytes();
+ const StringRef &Name = V->getName();
+
+ // Transformation: shr -> trunc -> mask -> zext -> shl
+ if (const unsigned ShrAmt = CB.getShrBits())
+ V = pushInst(BinaryOperator::CreateLShr(
+ V, ConstantInt::get(V->getType(), ShrAmt), Name + ".shift"));
+
+ if (NumSrcBytes > NumTargetBytes)
+ V = pushCast(Instruction::Trunc, V, TargetIntTy);
+
+ const unsigned ShlByteOffset = CB.getShlBytes();
+ const unsigned NumBytesToCheck = std::min(
+ ShlByteOffset < NumTargetBytes ? NumTargetBytes - ShlByteOffset : 0,
+ CB.getShrBytes() < NumSrcBytes ? NumSrcBytes - CB.getShrBytes() : 0);
+ if (!checkAllBits(CB.Mask, ShlByteOffset, NumBytesToCheck)) {
+ SmallBitVector RelMask = CB.Mask;
+ RelMask >>= ShlByteOffset;
+ Constant *Mask = ConstantInt::get(
+ V->getType(),
+ createMaskConstant(V->getType()->getIntegerBitWidth(), RelMask));
+ V = pushInst(BinaryOperator::CreateAnd(V, Mask, Name + ".mask"));
+ }
+
+ if (NumSrcBytes < NumTargetBytes)
+ V = pushCast(Instruction::ZExt, V, TargetIntTy);
+
+ if (const unsigned ShlAmt = CB.getShlBits())
+ V = pushInst(BinaryOperator::CreateShl(
+ V, ConstantInt::get(V->getType(), ShlAmt), Name + ".shift"));
+
+ WorkList.emplace_back(V, CB.Mask);
+ return true;
+ }
+
+ bool pushIntegerToVector(CoalescedBytes CB) {
+ assert(isa<IntegerType>(CB.Base->getType()));
+ auto *TargetVecTy = cast<FixedVectorType>(TargetInst->getType());
+ Type *TargetEltTy = TargetVecTy->getElementType();
+
+ Value *V = CB.Base;
+ const unsigned NumSrcBytes =
+ V->getType()->getIntegerBitWidth() / Byte::BitWidth;
+ const StringRef &Name = V->getName();
+
+ // Give up if bytes are obtained from a strange offset.
+ if (CB.SignedShrByteOffset % Layout.NumBytesPerElement != 0)
+ return false;
+ const int ShrEltOffset =
+ CB.SignedShrByteOffset / static_cast<int>(Layout.NumBytesPerElement);
+
+ // Give up if the source integer does not decompose naturally into vector
+ // elements.
+ if (NumSrcBytes % Layout.NumBytesPerElement != 0)
+ return false;
+ const unsigned NumSrcElts = NumSrcBytes / Layout.NumBytesPerElement;
+
+ if (NumSrcElts > 1) {
+ auto *CastTy = FixedVectorType::get(TargetEltTy, NumSrcElts);
+ V = pushCast(Instruction::BitCast, V, CastTy);
+ }
+
+ return pushToVectorImpl(V, CB.Mask, NumSrcElts, ShrEltOffset, Name);
+ }
+
+ bool pushVectorToInteger(CoalescedBytes CB) {
+ assert(isa<FixedVectorType>(CB.Base->getType()));
+ auto *TargetIntTy = cast<IntegerType>(TargetInst->getType());
+
+ const unsigned NumTargetBytes = Layout.getNumBytes();
+ Value *V = CB.Base;
+ const StringRef &Name = V->getName();
+ ByteLayout VecLayout = getByteLayout(V->getType());
+
+ // For sub-element accesses, try to subdivide the vector into smaller
+ // elements.
+ if (VecLayout.NumBytesPerElement > NumTargetBytes) {
+ if (VecLayout.NumBytesPerElement % NumTargetBytes != 0)
+ return false;
+
+ const unsigned SplitFactor =
+ VecLayout.NumBytesPerElement / NumTargetBytes;
+ auto *NewTy = FixedVectorType::get(TargetIntTy, VecLayout.NumVecElements *
+ SplitFactor);
+ V = pushCast(Instruction::BitCast, V, NewTy);
+ VecLayout = getByteLayout(V->getType());
+ }
+
+ // Give up if bytes are obtained from a strange offset.
+ if (CB.SignedShrByteOffset % VecLayout.NumBytesPerElement != 0)
+ return false;
+
+ int ShrEltOffset =
+ CB.SignedShrByteOffset / static_cast<int>(VecLayout.NumBytesPerElement);
+
+ // Give up if the target integer does not decompose naturally into vector
+ // elements.
+ if (NumTargetBytes % VecLayout.NumBytesPerElement != 0)
+ return false;
+ const unsigned NumTargetElts =
+ NumTargetBytes / VecLayout.NumBytesPerElement;
+
+ auto *I32Ty = IntegerType::getInt32Ty(V->getContext());
+
+ // Coarsely isolate elements of interest, and use a bitmask to clean up the
+ // rest.
+ const bool NeedsBitMask = [&] {
+ if (NumTargetElts == 1) {
+ // Extract the unique relevant element
+ const int ExtractIdx = ShrEltOffset;
+ assert(ExtractIdx >= 0);
+ V = pushInst(ExtractElementInst::Create(
+ V, ConstantInt::get(I32Ty, ExtractIdx), Name + ".extract"));
+ ShrEltOffset = 0;
+ return !CB.Mask.all();
+ }
+
+ if (NumTargetElts != VecLayout.NumVecElements) {
+ bool IsVectorAligned = true;
+
+ // Extract all relevant elements into a shufflevector
+ SmallVector<int> ShuffleMask;
+ ShuffleMask.reserve(NumTargetElts);
+
+ for (unsigned EltIdx = 0; EltIdx < NumTargetElts; ++EltIdx) {
+ const std::optional<bool> EltMask =
+ checkAllBits(CB.Mask, EltIdx * VecLayout.NumBytesPerElement,
+ VecLayout.NumBytesPerElement);
+
+ IsVectorAligned &= EltMask.has_value();
+ if (!EltMask || *EltMask) {
+ const int ExtractIdx = static_cast<int>(EltIdx) + ShrEltOffset;
+ assert(ExtractIdx >= 0);
+ ShuffleMask.push_back(ExtractIdx);
+ } else {
+ ShuffleMask.push_back(VecLayout.NumVecElements);
+ }
+ }
+
+ V = pushInst(
+ new ShuffleVectorInst(V, ConstantVector::getNullValue(V->getType()),
+ ShuffleMask, Name + ".shuffle"));
+ V = pushCast(Instruction::BitCast, V, TargetIntTy);
+
+ ShrEltOffset = 0;
+ return !IsVectorAligned;
+ }
+
+ V = pushCast(Instruction::BitCast, V, TargetIntTy);
+ return !CB.Mask.all();
+ }();
+
+ assert(V->getType() == TargetIntTy);
+
+ const int ShrBitOffset = ShrEltOffset *
+ static_cast<int>(VecLayout.NumBytesPerElement) *
+ static_cast<int>(Byte::BitWidth);
+ if (ShrBitOffset > 0)
+ V = pushInst(BinaryOperator::CreateLShr(
+ V, ConstantInt::get(V->getType(), ShrBitOffset), Name + ".shift"));
+ else if (ShrBitOffset < 0)
+ V = pushInst(BinaryOperator::CreateShl(
+ V, ConstantInt::get(V->getType(), -ShrBitOffset), Name + ".shift"));
+
+ if (NeedsBitMask) {
+ // Mask out unwanted bytes.
+ Constant *Mask = ConstantInt::get(
+ TargetIntTy, createMaskConstant(TargetIntTy->getBitWidth(), CB.Mask));
+ V = pushInst(BinaryOperator::CreateAnd(V, Mask, Name + ".mask"));
+ }
+
+ WorkList.emplace_back(V, CB.Mask);
+ return true;
+ }
+
+ bool pushVectorToVector(CoalescedBytes CB) {
+ assert(isa<FixedVectorType>(CB.Base->getType()));
+ auto *TargetVecTy = cast<FixedVectorType>(TargetInst->getType());
+ Type *TargetEltTy = TargetVecTy->getElementType();
+
+ const ByteLayout SrcLayout = getByteLayout(CB.Base->getType());
+ Value *V = CB.Base;
+ const StringRef &Name = V->getName();
+
+ // Give up if the source vector cannot be converted to match the elements of
+ // the target vector.
+ if (SrcLayout.getNumBytes() % Layout.NumBytesPerElement != 0)
+      return false;
+ const unsigned NumSrcElts =
+ SrcLayout.getNumBytes() / Layout.NumBytesPerElement;
+
+    // Give up if the byte offset is not aligned to the target element size.
+    if (CB.SignedShrByteOffset % Layout.NumBytesPerElement != 0)
+      return false;
+
+ const int ShrEltOffset =
+ CB.SignedShrByteOffset / static_cast<int>(Layout.NumBytesPerElement);
+
+ Type *SrcTy;
+ if (NumSrcElts > 1)
+ SrcTy = FixedVectorType::get(TargetEltTy, NumSrcElts);
+ else
+ SrcTy = TargetEltTy;
+
+ V = pushCast(Instruction::BitCast, V, SrcTy);
+
+ return pushToVectorImpl(V, CB.Mask, NumSrcElts, ShrEltOffset, Name);
+ }
+
+ PartialBytePack mergeIntegerPacks(PartialBytePack &Lhs,
+ PartialBytePack &Rhs) {
+ assert(isa<IntegerType>(Lhs.BytePack->getType()) &&
+ isa<IntegerType>(Rhs.BytePack->getType()));
+ Value *Merge = pushInst(BinaryOperator::CreateDisjointOr(
+ Lhs.BytePack, Rhs.BytePack, TargetInst->getName() + ".merge", nullptr));
+ return {Merge, Lhs.SetBytes | Rhs.SetBytes};
+ }
+
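+  // Merge two vector-typed partial packs. Roughly: when every element is
+  // wholly provided by one side, a shufflevector selects it from that side;
+  // otherwise, fall back to a disjoint `or` of the two packs.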
+ PartialBytePack mergeVectorPacks(PartialBytePack &Lhs, PartialBytePack &Rhs) {
+ assert(isa<FixedVectorType>(Lhs.BytePack->getType()) &&
+ isa<FixedVectorType>(Rhs.BytePack->getType()));
+ SmallVector<int> ShuffleMask;
+ ShuffleMask.reserve(Layout.NumVecElements);
+ for (unsigned EltIdx = 0; EltIdx < Layout.NumVecElements; ++EltIdx) {
+ const unsigned Lo = EltIdx * Layout.NumBytesPerElement;
+ const std::optional<bool> LhsBits =
+ checkAllBits(Lhs.SetBytes, Lo, Layout.NumBytesPerElement);
+ if (!LhsBits) {
+ const std::optional<bool> RhsBits =
+ checkAllBits(Rhs.SetBytes, Lo, Layout.NumBytesPerElement);
+ if (!RhsBits) {
+ ShuffleMask.clear();
+ break;
+ }
+ ShuffleMask.push_back(*RhsBits ? EltIdx + Layout.NumVecElements
+ : EltIdx);
+ continue;
+ }
+
+ ShuffleMask.push_back(*LhsBits ? EltIdx : EltIdx + Layout.NumVecElements);
+ }
+
+    // Build the name eagerly; a Twine must not outlive the temporaries it
+    // refers to.
+    const std::string Name = (TargetInst->getName() + ".merge").str();
+ Value *Merge;
+ if (ShuffleMask.empty())
+ Merge = pushInst(BinaryOperator::CreateDisjointOr(
+ Lhs.BytePack, Rhs.BytePack, Name, nullptr));
+ else
+ Merge = pushInst(
+ new ShuffleVectorInst(Lhs.BytePack, Rhs.BytePack, ShuffleMask, Name));
+
+ return {Merge, Lhs.SetBytes | Rhs.SetBytes};
+ }
+
+public:
+ BytePackFolder(Instruction *TargetV)
+ : TargetInst(TargetV), Layout(getByteLayout(TargetV->getType())),
+ VectorAlignedPack(PartialBytePack::invalid()) {}
+
+ ~BytePackFolder() {
+    // If the generated instructions were never committed by foldBytePacks,
+    // they must be deleted here.
+
+ for (auto &[_, I] : CastCache) {
+ LLVM_DEBUG({
+ dbgs() << "PICP [";
+ TargetInst->printAsOperand(dbgs());
+ dbgs() << "]: Dequeuing cast " << *I << "\n";
+ });
+ I->replaceAllUsesWith(PoisonValue::get(I->getType()));
+ I->deleteValue();
+ }
+
+ while (!Insts.empty()) {
+ LLVM_DEBUG({
+ dbgs() << "PICP [";
+ TargetInst->printAsOperand(dbgs());
+ dbgs() << "]: Dequeuing inst " << *Insts.back() << "\n";
+ });
+ Insts.back()->deleteValue();
+ Insts.pop_back();
+ }
+ }
+
+ /// Try to generate instructions for coalescing the given bytes and aligning
+ /// them to the target value. Returns true iff this is successful.
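+  /// Illustrative example: if the target is an i32 whose upper three bytes
+  /// are the low three bytes of some i32 %src, this pushes roughly
+  ///   %src.shift = shl i32 %src, 8
+  /// (plus a mask if not all of %src's bytes are wanted) as a partial pack
+  /// covering bytes 1..3 of the target.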
+ bool pushCoalescedBytes(CoalescedBytes CB) {
+ if (isa<Constant>(CB.Base) && CB.SignedShrByteOffset == 0) {
+ WorkList.emplace_back(CB.Base, CB.Mask);
+ return true;
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "PICP [";
+ TargetInst->printAsOperand(dbgs());
+ dbgs() << "]: Preparing bytes " << CB << "\n";
+ });
+ if (isa<FixedVectorType>(TargetInst->getType())) {
+ if (isa<FixedVectorType>(CB.Base->getType()))
+ return pushVectorToVector(CB);
+
+ return pushIntegerToVector(CB);
+ }
+
+ if (isa<FixedVectorType>(CB.Base->getType()))
+ return pushVectorToInteger(CB);
+
+ return pushIntegerToInteger(CB);
+ }
+
+ /// After coalescing all byte packs individually, this folds the coalesced
+ /// byte packs together to (re)produce the final value and return it.
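+  /// Partial packs are merged pairwise (integers via a disjoint `or`,
+  /// vectors via a shufflevector where element boundaries allow), so e.g.
+  /// four packs A, B, C, D are combined as (A | B) | (C | D).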
+ Value *foldBytePacks(IRBuilder<> &IRB) {
+ Type *TargetTy = TargetInst->getType();
+
+ if (VectorAlignedPack.isValid()) {
+ WorkList.push_back(VectorAlignedPack);
+ VectorAlignedPack = PartialBytePack::invalid();
+ }
+
+ while (WorkList.size() > 1) {
+ std::vector<PartialBytePack> NewWorkList;
+ NewWorkList.reserve((WorkList.size() + 1) / 2);
+
+ for (unsigned Item = 0; Item + 1 < WorkList.size(); Item += 2) {
+ PartialBytePack &Lhs = WorkList[Item];
+ PartialBytePack &Rhs = WorkList[Item + 1];
+ NewWorkList.push_back(isa<FixedVectorType>(TargetTy)
+ ? mergeVectorPacks(Lhs, Rhs)
+ : mergeIntegerPacks(Lhs, Rhs));
+ }
+ if (WorkList.size() % 2 == 1)
+ NewWorkList.push_back(WorkList.back());
+
+ WorkList.swap(NewWorkList);
+ }
+
+ IRB.SetInsertPoint(TargetInst);
+ for (Value *I : Insts)
+ IRB.Insert(I, I->getName());
+
+ Insts.clear();
+
+ for (auto &[E, I] : CastCache) {
+ if (auto *BaseI = dyn_cast<Instruction>(E.Base))
+ I->insertInto(BaseI->getParent(), *BaseI->getInsertionPointAfterDef());
+ else {
+ BasicBlock &BB = TargetInst->getFunction()->getEntryBlock();
+ I->insertInto(&BB, BB.getFirstInsertionPt());
+ }
+ }
+
+ CastCache.clear();
+
+ // Note: WorkList may be empty if the value is known to be zero.
+ return WorkList.empty() ? Constant::getNullValue(TargetTy)
+ : WorkList.back().BytePack;
+ }
+};
+
+/// A final value (or an operand thereof, if the rewriter is not aggressive)
+/// queued up to be reconstructed.
+struct PackedIntInstruction {
+ /// The target value to reconstruct.
+ Instruction *TargetInst;
+ /// The chosen partitioning of its bytes.
+ SmallVector<CoalescedBytes, 8> CBV;
+
+  PackedIntInstruction(Instruction &I, SmallVector<CoalescedBytes, 8> &&CBV)
+      : TargetInst(&I), CBV(std::move(CBV)) {}
+
+  /// Try to reconstruct a value given its coalesced byte partitioning,
+  /// returning the reconstructed value on success, or nullptr on failure.
+ Value *rewrite(IRBuilder<> &IRB) const {
+ BytePackFolder BPF(TargetInst);
+ for (const CoalescedBytes &CB : CBV) {
+ if (!BPF.pushCoalescedBytes(CB)) {
+ LLVM_DEBUG(dbgs() << "PICP: Coalescing rejected!\n");
+ return nullptr;
+ }
+ }
+
+ return BPF.foldBytePacks(IRB);
+ }
+};
+
+/// Coalesce the bytes in a definition into a partition for rewriting.
+/// If the rewriter is non-aggressive, return nullopt if the rewriting is
+/// determined to be unnecessary.
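+/// Illustrative example: a 4-byte definition whose bytes are
+/// { %a[1], %a[2], 0x2A, %b[0] } yields one CoalescedBytes entry for %a
+/// (offset by one byte), one entry for %b, and a trailing constant entry
+/// holding the 0x2A byte; all-zero constant bytes are simply left unset.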
+static std::optional<SmallVector<CoalescedBytes, 8>>
+getCoalescingOpportunity(Type *Ty, const ByteVector &BV) {
+ const ByteLayout Layout = getByteLayout(Ty);
+ assert(Layout.getNumBytes() == BV.size() &&
+ "Byte definition has unexpected width.");
+
+ SmallVector<CoalescedBytes, 8> CBV;
+ SmallVector<int, 8> CBVOperands;
+ const unsigned BitWidth = Layout.getNumBytes() * Byte::BitWidth;
+ APInt ConstBits(BitWidth, 0);
+ SmallBitVector ConstBytes(BV.size());
+
+ bool OperandsAlreadyCoalesced = true;
+ bool UsesSingleSource = true;
+ for (unsigned ByteIdx = 0; ByteIdx < BV.size(); ++ByteIdx) {
+ const ByteUse &BU = BV[ByteIdx];
+ const Byte &B = BU.getByte();
+ if (B.isConstant()) {
+ const unsigned Const = B.getConstant();
+ if (!Const)
+ continue;
+
+ ConstBits.insertBits(Const, ByteIdx * Byte::BitWidth, Byte::BitWidth);
+ ConstBytes.set(ByteIdx);
+ } else {
+ CoalescedBytes *CB = nullptr;
+ Value *Base = B.getBase();
+ const signed Offset =
+ static_cast<signed>(B.getIndex()) - static_cast<signed>(ByteIdx);
+ for (unsigned CBIdx = 0; CBIdx < CBV.size(); ++CBIdx) {
+ if (CBV[CBIdx].alignsWith(Base, Offset)) {
+ CB = &CBV[CBIdx];
+ int &OpIdx = CBVOperands[CBIdx];
+ if (OpIdx < 0)
+ OpIdx = BU.getOperandIndex();
+ else if (BU.getOperandIndex() >= 0 && OpIdx != BU.getOperandIndex()) {
+ LLVM_DEBUG(dbgs()
+ << "PICP: Bytes " << *CB << " from operand " << OpIdx
+ << " can be coalesced with byte " << B
+ << " from operand " << BU.getOperandIndex() << "\n");
+ OperandsAlreadyCoalesced = false;
+ }
+ }
+ }
+
+ if (!CB) {
+ CB = &CBV.emplace_back(*Base, Offset, BV.size());
+ CBVOperands.push_back(BU.getOperandIndex());
+ }
+
+ UsesSingleSource &= CB->Base == CBV.front().Base;
+ CB->Mask.set(ByteIdx);
+ }
+ }
+
+ if (!AggressiveRewriting) {
+ if (OperandsAlreadyCoalesced && !CBV.empty()) {
+ // If packed bytes from the same source and offset are not split between
+ // operands, then this instruction does not need to be rewritten.
+ LLVM_DEBUG(dbgs() << "PICP: Operands are already coalesced.\n");
+ return std::nullopt;
+ }
+ if (UsesSingleSource && CBV.size() > 1) {
+      // If packed bytes come from the same source but cannot be coalesced
+      // (e.g., bytes from one operand are shuffled), then rewriting this
+      // instruction is unlikely to produce simpler IR.
+ LLVM_DEBUG(
+ dbgs()
+ << "PICP: Instruction rearranges bytes from a single source.\n");
+ return std::nullopt;
+ }
+ }
+
+ // The CBV will be used for rewriting; append the constant value that was also
+ // accumulated, if nonzero.
+ if (ConstBytes.any()) {
+ // Create initial constant as desired type.
+ if (auto *VecTy = dyn_cast<FixedVectorType>(Ty)) {
+ SmallVector<Constant *> EltwiseMask;
+ const unsigned NumBitsPerElt = Layout.NumBytesPerElement * Byte::BitWidth;
+ EltwiseMask.reserve(Layout.NumVecElements);
+ for (unsigned EltIdx = 0; EltIdx < Layout.NumVecElements; ++EltIdx)
+ EltwiseMask.push_back(ConstantInt::get(
+ VecTy->getElementType(),
+ ConstBits.extractBits(NumBitsPerElt, EltIdx * NumBitsPerElt)));
+
+ CBV.emplace_back(*ConstantVector::get(EltwiseMask), 0, ConstBytes);
+ } else
+ CBV.emplace_back(*ConstantInt::get(Ty, ConstBits), 0, ConstBytes);
+ }
+
+ return CBV;
+}
+
+/// Queue into \p PIIV the set of final values (or operands thereof, if the
+/// rewriter is non-aggressive) which are deemed beneficial to rewrite.
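+/// The walk starts at \p FinalInst and, whenever an instruction is not worth
+/// rewriting on its own, descends into its instruction operands so that a
+/// profitable rewrite deeper in the use-def chain is still found.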
+static void queueRewriting(std::vector<PackedIntInstruction> &PIIV,
+ Instruction &FinalInst, ByteExpander &BE) {
+ SmallVector<Instruction *, 8> WorkList{&FinalInst};
+ SmallPtrSet<Instruction *, 8> Seen{&FinalInst};
+
+ do {
+ Instruction *I = WorkList.back();
+ WorkList.pop_back();
+
+ const ByteVector *BV = BE.expandByteDefinition(I);
+ if (!BV)
+ // This instruction is beyond the analysis scope of PICP.
+ continue;
+
+ LLVM_DEBUG(dbgs() << "PICP rewrite candidate: " << *I << "\n"
+ << " byte pack: " << BE.getByteDefinition(I)
+ << "\n");
+ auto CBV = [&]() -> std::optional<SmallVector<CoalescedBytes, 8>> {
+ // Short-circuit check for casts.
+ if (!AggressiveRewriting && I->getNumOperands() == 1)
+ return std::nullopt;
+
+ return getCoalescingOpportunity(I->getType(), *BV);
+ }();
+
+ if (!CBV) {
+ // Narrow rewriting to the operands of this instruction instead.
+ for (Use &U : I->operands())
+ if (auto *Op = dyn_cast<Instruction>(U.get()))
+ if (Seen.insert(Op).second)
+ WorkList.push_back(Op);
+ continue;
+ }
+
+ PIIV.emplace_back(*I, std::move(*CBV));
+ } while (!WorkList.empty());
+}
+
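+// Drive the pass over a single function: collect the candidate instructions,
+// queue rewrites for the non-intermediate ones, materialize the replacement
+// values, replace all uses, and finally erase candidates that ended up
+// unused (walking the candidate list in reverse).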
+static bool runImpl(Function &F) {
+ ByteExpander BE;
+
+ std::vector<Instruction *> PIICandidates = BE.collectPIICandidates(F);
+ std::vector<PackedIntInstruction> PIIV;
+
+ for (Instruction *I : PIICandidates) {
+ if (!BE.checkIfIntermediate(I))
+ queueRewriting(PIIV, *I, BE);
+ else
+ LLVM_DEBUG(dbgs() << "PICP intermediate inst: " << *I << "\n"
+ << " final user: "
+ << **BE.getFinalUsers(I).begin() << "\n");
+ }
+
+ DenseMap<Instruction *, Value *> InstSubs;
+ IRBuilder<> IRB(F.getContext());
+ for (const PackedIntInstruction &PII : PIIV)
+ if (Value *V = PII.rewrite(IRB)) {
+ LLVM_DEBUG(dbgs() << "PICP rewrite successful for " << *PII.TargetInst
+ << "\n");
+ InstSubs[PII.TargetInst] = V;
+ }
+
+ if (InstSubs.empty())
+ return false;
+
+ for (auto &[OldI, NewV] : InstSubs)
+ OldI->replaceAllUsesWith(NewV);
+
+ for (auto RIt = PIICandidates.rbegin(); RIt != PIICandidates.rend(); ++RIt) {
+ Instruction *I = *RIt;
+    if (I->use_empty())
+ I->eraseFromParent();
+ }
+ return true;
+}
+
+class PackedIntegerCombineLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ PackedIntegerCombineLegacyPass() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override { return runImpl(F); }
+};
+char PackedIntegerCombineLegacyPass::ID = 0;
+
+} // namespace
+
+PreservedAnalyses PackedIntegerCombinePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ if (!runImpl(F))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+INITIALIZE_PASS(PackedIntegerCombineLegacyPass, DEBUG_TYPE,
+ "Packed Integer Combine", false, false)
+
+FunctionPass *llvm::createPackedIntegerCombinePass() {
+ return new PackedIntegerCombineLegacyPass();
+}
diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
index c7e4a3e824700..2c2b8964af8d5 100644
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -36,6 +36,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLowerAtomicLegacyPassPass(Registry);
initializeMergeICmpsLegacyPassPass(Registry);
initializeNaryReassociateLegacyPassPass(Registry);
+ initializePackedIntegerCombineLegacyPassPass(Registry);
initializePartiallyInlineLibCallsLegacyPassPass(Registry);
initializeReassociateLegacyPassPass(Registry);
initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry);
diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
index 93b5f155fc81e..2cd541363d44d 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
@@ -205,12 +205,12 @@ define i64 @load_3xi16_combine(ptr addrspace(1) %p) #0 {
; GCN-LABEL: load_3xi16_combine:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: global_load_dword v2, v[0:1], off
; GCN-NEXT: global_load_ushort v3, v[0:1], off offset:4
+; GCN-NEXT: global_load_dword v2, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, ptr addrspace(1) %p, i32 1
%gep.2p = getelementptr i16, ptr addrspace(1) %p, i32 2
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index af3241e95e91d..801712ba90988 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -541,6 +541,7 @@
; GCN-O1-OPTS-NEXT: Natural Loop Information
; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis
; GCN-O1-OPTS-NEXT: GPU Load and Store Vectorizer
+; GCN-O1-OPTS-NEXT: Packed Integer Combine
; GCN-O1-OPTS-NEXT: Lazy Value Information Analysis
; GCN-O1-OPTS-NEXT: Lower SwitchInst's to branches
; GCN-O1-OPTS-NEXT: Lower invoke and unwind, for unwindless code generators
@@ -856,6 +857,7 @@
; GCN-O2-NEXT: Natural Loop Information
; GCN-O2-NEXT: Scalar Evolution Analysis
; GCN-O2-NEXT: GPU Load and Store Vectorizer
+; GCN-O2-NEXT: Packed Integer Combine
; GCN-O2-NEXT: Lazy Value Information Analysis
; GCN-O2-NEXT: Lower SwitchInst's to branches
; GCN-O2-NEXT: Lower invoke and unwind, for unwindless code generators
@@ -1186,6 +1188,7 @@
; GCN-O3-NEXT: Natural Loop Information
; GCN-O3-NEXT: Scalar Evolution Analysis
; GCN-O3-NEXT: GPU Load and Store Vectorizer
+; GCN-O3-NEXT: Packed Integer Combine
; GCN-O3-NEXT: Lazy Value Information Analysis
; GCN-O3-NEXT: Lower SwitchInst's to branches
; GCN-O3-NEXT: Lower invoke and unwind, for unwindless code generators
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 38e45042b5ee4..d13f5d569aae6 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -1867,27 +1867,9 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
-; NOSDWA-NEXT: v_readfirstlane_b32 s0, v1
-; NOSDWA-NEXT: v_readfirstlane_b32 s1, v0
-; NOSDWA-NEXT: s_lshr_b32 s3, s1, 24
-; NOSDWA-NEXT: s_lshr_b32 s5, s0, 24
-; NOSDWA-NEXT: s_and_b32 s2, s1, 0xffff
-; NOSDWA-NEXT: s_bfe_u32 s1, s1, 0x80010
-; NOSDWA-NEXT: s_and_b32 s4, s0, 0xffff
-; NOSDWA-NEXT: s_bfe_u32 s0, s0, 0x80010
-; NOSDWA-NEXT: s_lshl_b32 s3, s3, 8
-; NOSDWA-NEXT: s_lshl_b32 s5, s5, 8
-; NOSDWA-NEXT: s_or_b32 s1, s1, s3
-; NOSDWA-NEXT: s_or_b32 s0, s0, s5
-; NOSDWA-NEXT: s_lshl_b32 s1, s1, 16
-; NOSDWA-NEXT: s_lshl_b32 s0, s0, 16
-; NOSDWA-NEXT: s_or_b32 s1, s2, s1
-; NOSDWA-NEXT: s_or_b32 s0, s4, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s1
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s0
; NOSDWA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; NOSDWA-NEXT: s_endpgm
;
@@ -1898,85 +1880,21 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
; GFX89-NEXT: v_mov_b32_e32 v0, s0
; GFX89-NEXT: v_mov_b32_e32 v1, s1
; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: v_readfirstlane_b32 s0, v1
-; GFX89-NEXT: v_readfirstlane_b32 s1, v0
-; GFX89-NEXT: s_lshr_b32 s3, s1, 24
-; GFX89-NEXT: s_lshr_b32 s5, s0, 24
-; GFX89-NEXT: s_and_b32 s2, s1, 0xffff
-; GFX89-NEXT: s_bfe_u32 s1, s1, 0x80010
-; GFX89-NEXT: s_and_b32 s4, s0, 0xffff
-; GFX89-NEXT: s_bfe_u32 s0, s0, 0x80010
-; GFX89-NEXT: s_lshl_b32 s3, s3, 8
-; GFX89-NEXT: s_lshl_b32 s5, s5, 8
-; GFX89-NEXT: s_or_b32 s1, s1, s3
-; GFX89-NEXT: s_or_b32 s0, s0, s5
-; GFX89-NEXT: s_lshl_b32 s1, s1, 16
-; GFX89-NEXT: s_lshl_b32 s0, s0, 16
-; GFX89-NEXT: s_or_b32 s1, s2, s1
-; GFX89-NEXT: s_or_b32 s0, s4, s0
-; GFX89-NEXT: v_mov_b32_e32 v0, s1
-; GFX89-NEXT: v_mov_b32_e32 v1, s0
; GFX89-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX89-NEXT: s_endpgm
;
-; GFX9-LABEL: pulled_out_test:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: v_readfirstlane_b32 s1, v0
-; GFX9-NEXT: s_lshr_b32 s5, s1, 24
-; GFX9-NEXT: s_lshr_b32 s7, s0, 24
-; GFX9-NEXT: s_and_b32 s4, s1, 0xffff
-; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010
-; GFX9-NEXT: s_and_b32 s6, s0, 0xffff
-; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010
-; GFX9-NEXT: s_lshl_b32 s5, s5, 8
-; GFX9-NEXT: s_lshl_b32 s7, s7, 8
-; GFX9-NEXT: s_or_b32 s1, s1, s5
-; GFX9-NEXT: s_or_b32 s0, s0, s7
-; GFX9-NEXT: s_lshl_b32 s1, s1, 16
-; GFX9-NEXT: s_lshl_b32 s0, s0, 16
-; GFX9-NEXT: s_or_b32 s1, s4, s1
-; GFX9-NEXT: s_or_b32 s0, s6, s0
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: pulled_out_test:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: s_lshr_b32 s5, s0, 24
-; GFX10-NEXT: s_lshr_b32 s7, s1, 24
-; GFX10-NEXT: s_and_b32 s4, s0, 0xffff
-; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010
-; GFX10-NEXT: s_and_b32 s6, s1, 0xffff
-; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010
-; GFX10-NEXT: s_lshl_b32 s5, s5, 8
-; GFX10-NEXT: s_lshl_b32 s7, s7, 8
-; GFX10-NEXT: s_or_b32 s0, s0, s5
-; GFX10-NEXT: s_or_b32 s1, s1, s7
-; GFX10-NEXT: s_lshl_b32 s0, s0, 16
-; GFX10-NEXT: s_lshl_b32 s1, s1, 16
-; GFX10-NEXT: s_or_b32 s0, s4, s0
-; GFX10-NEXT: s_or_b32 s1, s6, s1
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
-; GFX10-NEXT: s_endpgm
+; GFX9_10-LABEL: pulled_out_test:
+; GFX9_10: ; %bb.0: ; %entry
+; GFX9_10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9_10-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_10-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX9_10-NEXT: s_waitcnt vmcnt(0)
+; GFX9_10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9_10-NEXT: s_endpgm
entry:
%idxprom = ashr exact i64 15, 32
%arrayidx = getelementptr inbounds <8 x i8>, ptr addrspace(1) %sourceA, i64 %idxprom
@@ -2297,5 +2215,4 @@ declare i32 @llvm.amdgcn.workitem.id.x()
attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
-; GFX9_10: {{.*}}
; SDWA: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index 5aafb0f576fb4..fc990283b004c 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -31,8 +31,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr10
; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr8
; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4)
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %125:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: KILL undef %125:sgpr_128
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %119:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: KILL undef %119:sgpr_128
; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 4, implicit-def dead $scc
; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc
; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc
@@ -44,37 +44,38 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4)
- ; CHECK-NEXT: KILL undef %74:sreg_64
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.78, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %68:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4)
; CHECK-NEXT: KILL [[S_ADD_U32_]].sub0, [[S_ADD_U32_]].sub1
+ ; CHECK-NEXT: KILL undef %68:sreg_64
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %112:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %83:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: KILL undef %89:sgpr_128
- ; CHECK-NEXT: KILL undef %118:sgpr_128
+ ; CHECK-NEXT: KILL undef %83:sgpr_128
+ ; CHECK-NEXT: KILL undef %112:sgpr_128
; CHECK-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_1]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_2:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.87, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.93, addrspace 4)
- ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.84, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.90, addrspace 4)
; CHECK-NEXT: KILL [[S_ADD_U32_2]].sub0, [[S_ADD_U32_2]].sub1
+ ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1
; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %163:sreg_32, 31, implicit-def dead $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %163:sreg_32, implicit-def $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.96, addrspace 4)
; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %169:sreg_32, implicit-def $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %163:sreg_32, implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
@@ -87,20 +88,21 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %302:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %296:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4)
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %351:sgpr_128, undef %352:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %362:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.104, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.109, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.126, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.114, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.121, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %363:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %346:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc
@@ -114,19 +116,19 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %384:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4)
+ ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %378:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.134, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.162, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.131, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.142, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.147, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.159, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.137, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.154, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc
@@ -138,50 +140,47 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.270, align 8, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.167, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1
; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1
; CHECK-NEXT: [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]]
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.175, addrspace 4)
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.180, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM1:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.279, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.205, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.211, addrspace 4)
- ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.216, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.221, addrspace 4)
- ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc
- ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
- ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_]]
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY16]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.202, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.208, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.213, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]].sub2:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub2
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]].sub3:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub3
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.218, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]].sub1:sgpr_128 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX2_IMM1]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
@@ -191,33 +190,32 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4)
- ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
- ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc
- ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0
- ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]]
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
+ ; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM2:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.287, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]].sub2:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub2
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]].sub3:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub3
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]].sub1:sgpr_128 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX2_IMM2]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.253, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %467:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
+ ; CHECK-NEXT: KILL undef %467:sreg_64
+ ; CHECK-NEXT: KILL [[S_LOAD_DWORDX2_IMM2]].sub0_sub1_sub2, [[S_LOAD_DWORDX2_IMM2]].sub3
; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1
- ; CHECK-NEXT: KILL undef %470:sreg_64
- ; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3
; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.262, addrspace 4)
; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.296, align 8, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]]
- ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc
- ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
- ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]]
- ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]]
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY18]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_]]
+ ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]]
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY16]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc
@@ -226,20 +224,20 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.314, addrspace 4)
; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.329, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.320, addrspace 4)
; CHECK-NEXT: undef [[S_ADD_U32_24:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_24:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.335, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.326, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]]
- ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]]
+ ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]]
+ ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]]
; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
@@ -267,9 +265,9 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_OR_B32_e64_12:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_11]], [[V_SUBREV_U32_e64_6]], implicit $exec
; CHECK-NEXT: [[V_SUBREV_U32_e64_8:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 39, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_13:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_12]], [[V_SUBREV_U32_e64_7]], implicit $exec
- ; CHECK-NEXT: [[V_SUBREV_U32_e64_9:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 50, [[BUFFER_LOAD_FORMAT_X_IDXEN8]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_SUBREV_U32_e64_9:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 50, [[BUFFER_LOAD_FORMAT_X_IDXEN9]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_14:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_13]], [[V_SUBREV_U32_e64_8]], implicit $exec
- ; CHECK-NEXT: [[V_SUBREV_U32_e64_10:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 51, [[BUFFER_LOAD_FORMAT_X_IDXEN9]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_SUBREV_U32_e64_10:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 51, [[BUFFER_LOAD_FORMAT_X_IDXEN8]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_15:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_14]], [[V_SUBREV_U32_e64_9]], implicit $exec
; CHECK-NEXT: [[V_SUBREV_U32_e64_11:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 52, [[BUFFER_LOAD_FORMAT_X_IDXEN10]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_16:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_15]], [[V_SUBREV_U32_e64_10]], implicit $exec
@@ -351,13 +349,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec
- ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %543:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %540:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4)
; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec
; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc
; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec
; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
- ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %557:vgpr_32, undef %559:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+ ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %554:vgpr_32, undef %556:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
; CHECK-NEXT: S_ENDPGM 0
.expVert:
%0 = extractelement <31 x i32> %userData, i64 2
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index c554fdbf4c799..9aba7e030ec19 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -207,6 +207,7 @@
; CHECK-O-NEXT: Running pass: SCCPPass
; CHECK-O-NEXT: Running pass: BDCEPass
; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
+; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass
; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index 62bb02d9b3c40..3ed2950fa9ce2 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -133,6 +133,7 @@
; CHECK-O-NEXT: Running pass: SCCPPass
; CHECK-O-NEXT: Running pass: BDCEPass
; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
+; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index 0da7a9f73bdce..bc04d1b35bb82 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -118,6 +118,7 @@
; CHECK-O-NEXT: Running pass: SCCPPass
; CHECK-O-NEXT: Running pass: BDCEPass
; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
+; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index 38b7890682783..772b3cc4a5fb6 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -127,6 +127,7 @@
; CHECK-O-NEXT: Running pass: SCCPPass
; CHECK-O-NEXT: Running pass: BDCEPass
; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
+; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
index 5aacd26def2be..84d62d56c3a9b 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
@@ -165,6 +165,7 @@
; CHECK-O-NEXT: Running pass: SCCPPass
; CHECK-O-NEXT: Running pass: BDCEPass
; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
+; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
index f6a9406596803..6e7b973213494 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -167,6 +167,7 @@
; CHECK-O-NEXT: Running pass: SCCPPass
; CHECK-O-NEXT: Running pass: BDCEPass
; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
+; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
index 48a9433d24999..10e3f7c6e1fb6 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
@@ -131,6 +131,7 @@
; CHECK-O-NEXT: Running pass: SCCPPass
; CHECK-O-NEXT: Running pass: BDCEPass
; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
+; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Transforms/PackedIntegerCombine/instructions.ll b/llvm/test/Transforms/PackedIntegerCombine/instructions.ll
new file mode 100644
index 0000000000000..e7b6a6bc66fa1
--- /dev/null
+++ b/llvm/test/Transforms/PackedIntegerCombine/instructions.ll
@@ -0,0 +1,601 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
+
+;; u0xff00ff00 = -16711936
+;; u0x00ff00ff = 16711935
+define i32 @add.0(i32 %a, i32 %b) {
+; LAZY-LABEL: define i32 @add.0(
+; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711936
+; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935
+; LAZY-NEXT: [[ADD:%.*]] = add i32 [[A_MASK]], [[B_MASK]]
+; LAZY-NEXT: ret i32 [[ADD]]
+;
+; AGGRESSIVE-LABEL: define i32 @add.0(
+; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; AGGRESSIVE-NEXT: [[B_MASK2:%.*]] = and i32 [[B]], 16711935
+; AGGRESSIVE-NEXT: [[A_MASK4:%.*]] = and i32 [[A]], -16711936
+; AGGRESSIVE-NEXT: [[ADD_MERGE:%.*]] = or disjoint i32 [[B_MASK2]], [[A_MASK4]]
+; AGGRESSIVE-NEXT: ret i32 [[ADD_MERGE]]
+;
+ %a.mask = and i32 %a, u0xff00ff00
+ %b.mask = and i32 %b, u0x00ff00ff
+ %add = add i32 %a.mask, %b.mask
+ ret i32 %add
+}
+
+;; u0xff00ffff = -16711681
+;; u0x00ff00ff = 16711935
+;; Nothing happens in this case because of the overlapping bytes.
+define i32 @add.1(i32 %a, i32 %b) {
+; LAZY-LABEL: define i32 @add.1(
+; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711681
+; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935
+; LAZY-NEXT: [[ADD:%.*]] = add i32 [[A_MASK]], [[B_MASK]]
+; LAZY-NEXT: ret i32 [[ADD]]
+;
+; AGGRESSIVE-LABEL: define i32 @add.1(
+; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; AGGRESSIVE-NEXT: [[A_MASK2:%.*]] = and i32 [[A]], -16711681
+; AGGRESSIVE-NEXT: [[B_MASK4:%.*]] = and i32 [[B]], 16711935
+; AGGRESSIVE-NEXT: [[ADD:%.*]] = add i32 [[A_MASK2]], [[B_MASK4]]
+; AGGRESSIVE-NEXT: ret i32 [[ADD]]
+;
+ %a.mask = and i32 %a, u0xff00ffff
+ %b.mask = and i32 %b, u0x00ff00ff
+ %add = add i32 %a.mask, %b.mask
+ ret i32 %add
+}
+
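+;; The two masks select disjoint bytes, so the and of the masked values folds to zero.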
+define i32 @and.0(i32 %a, i32 %b) {
+; LAZY-LABEL: define i32 @and.0(
+; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; LAZY-NEXT: ret i32 0
+;
+; AGGRESSIVE-LABEL: define i32 @and.0(
+; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; AGGRESSIVE-NEXT: ret i32 0
+;
+ %a.mask = and i32 %a, u0xff00ff00
+ %b.mask = and i32 %b, u0x00ff00ff
+ %and = and i32 %a.mask, %b.mask
+ ret i32 %and
+}
+
+;; u0xff00ffff = -16711681
+;; u0x00ff00ff = 16711935
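+;; Nothing happens in this case because of the overlapping bytes.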
+define i32 @and.1(i32 %a, i32 %b) {
+; LAZY-LABEL: define i32 @and.1(
+; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711681
+; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935
+; LAZY-NEXT: [[AND:%.*]] = and i32 [[A_MASK]], [[B_MASK]]
+; LAZY-NEXT: ret i32 [[AND]]
+;
+; AGGRESSIVE-LABEL: define i32 @and.1(
+; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; AGGRESSIVE-NEXT: [[A_MASK2:%.*]] = and i32 [[A]], -16711681
+; AGGRESSIVE-NEXT: [[B_MASK4:%.*]] = and i32 [[B]], 16711935
+; AGGRESSIVE-NEXT: [[AND:%.*]] = and i32 [[A_MASK2]], [[B_MASK4]]
+; AGGRESSIVE-NEXT: ret i32 [[AND]]
+;
+ %a.mask = and i32 %a, u0xff00ffff
+ %b.mask = and i32 %b, u0x00ff00ff
+ %and = and i32 %a.mask, %b.mask
+ ret i32 %and
+}
+
+define i32 @and.2(i32 %x) {
+; LAZY-LABEL: define i32 @and.2(
+; LAZY-SAME: i32 [[X:%.*]]) {
+; LAZY-NEXT: [[X_0:%.*]] = and i32 [[X]], -16711681
+; LAZY-NEXT: [[X_1:%.*]] = and i32 [[X]], 16711935
+; LAZY-NEXT: [[AND:%.*]] = and i32 [[X_0]], [[X_1]]
+; LAZY-NEXT: ret i32 [[AND]]
+;
+; AGGRESSIVE-LABEL: define i32 @and.2(
+; AGGRESSIVE-SAME: i32 [[X:%.*]]) {
+; AGGRESSIVE-NEXT: [[X_MASK:%.*]] = and i32 [[X]], 255
+; AGGRESSIVE-NEXT: ret i32 [[X_MASK]]
+;
+ %x.0 = and i32 %x, u0xff00ffff
+ %x.1 = and i32 %x, u0x00ff00ff
+ %and = and i32 %x.0, %x.1
+ ret i32 %and
+}
+
+;; u0xff00ff00 = -16711936
+;; u0x00ff00ff = 16711935
+define i32 @or.0(i32 %a, i32 %b) {
+; LAZY-LABEL: define i32 @or.0(
+; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711936
+; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935
+; LAZY-NEXT: [[OR:%.*]] = or i32 [[A_MASK]], [[B_MASK]]
+; LAZY-NEXT: ret i32 [[OR]]
+;
+; AGGRESSIVE-LABEL: define i32 @or.0(
+; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; AGGRESSIVE-NEXT: [[B_MASK2:%.*]] = and i32 [[B]], 16711935
+; AGGRESSIVE-NEXT: [[A_MASK4:%.*]] = and i32 [[A]], -16711936
+; AGGRESSIVE-NEXT: [[OR_MERGE:%.*]] = or disjoint i32 [[B_MASK2]], [[A_MASK4]]
+; AGGRESSIVE-NEXT: ret i32 [[OR_MERGE]]
+;
+ %a.mask = and i32 %a, u0xff00ff00
+ %b.mask = and i32 %b, u0x00ff00ff
+ %or = or i32 %a.mask, %b.mask
+ ret i32 %or
+}
+
+;; u0xff00ffff = -16711681
+;; u0x00ff00ff = 16711935
+;; Nothing happens in this case because of the overlapping bytes.
+define i32 @or.1(i32 %a, i32 %b) {
+; LAZY-LABEL: define i32 @or.1(
+; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711681
+; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935
+; LAZY-NEXT: [[OR:%.*]] = or i32 [[A_MASK]], [[B_MASK]]
+; LAZY-NEXT: ret i32 [[OR]]
+;
+; AGGRESSIVE-LABEL: define i32 @or.1(
+; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; AGGRESSIVE-NEXT: [[A_MASK2:%.*]] = and i32 [[A]], -16711681
+; AGGRESSIVE-NEXT: [[B_MASK4:%.*]] = and i32 [[B]], 16711935
+; AGGRESSIVE-NEXT: [[OR:%.*]] = or i32 [[A_MASK2]], [[B_MASK4]]
+; AGGRESSIVE-NEXT: ret i32 [[OR]]
+;
+ %a.mask = and i32 %a, u0xff00ffff
+ %b.mask = and i32 %b, u0x00ff00ff
+ %or = or i32 %a.mask, %b.mask
+ ret i32 %or
+}
+
+define i32 @or.2(i32 %x) {
+; LAZY-LABEL: define i32 @or.2(
+; LAZY-SAME: i32 [[X:%.*]]) {
+; LAZY-NEXT: ret i32 [[X]]
+;
+; AGGRESSIVE-LABEL: define i32 @or.2(
+; AGGRESSIVE-SAME: i32 [[X:%.*]]) {
+; AGGRESSIVE-NEXT: ret i32 [[X]]
+;
+ %x.0 = and i32 %x, u0xff00ffff
+ %x.1 = and i32 %x, u0x00ff00ff
+ %or = or i32 %x.0, %x.1
+ ret i32 %or
+}
+
+;; u0xff00ff00 = -16711936
+;; u0x00ff00ff = 16711935
+define i32 @or.3(i32 %x) {
+; LAZY-LABEL: define i32 @or.3(
+; LAZY-SAME: i32 [[X:%.*]]) {
+; LAZY-NEXT: [[X_0:%.*]] = or i32 [[X]], 16711935
+; LAZY-NEXT: ret i32 [[X_0]]
+;
+; AGGRESSIVE-LABEL: define i32 @or.3(
+; AGGRESSIVE-SAME: i32 [[X:%.*]]) {
+; AGGRESSIVE-NEXT: [[X_MASK:%.*]] = and i32 [[X]], -16711936
+; AGGRESSIVE-NEXT: [[X_0_MERGE:%.*]] = or disjoint i32 [[X_MASK]], 16711935
+; AGGRESSIVE-NEXT: ret i32 [[X_0_MERGE]]
+;
+ %x.0 = or i32 %x, u0x00ff00ff
+ ret i32 %x.0
+}
+
+;; u0xff00ff00 = -16711936
+;; u0x00ff00ff = 16711935
+define i32 @xor.0(i32 %a, i32 %b) {
+; LAZY-LABEL: define i32 @xor.0(
+; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711936
+; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935
+; LAZY-NEXT: [[XOR:%.*]] = xor i32 [[A_MASK]], [[B_MASK]]
+; LAZY-NEXT: ret i32 [[XOR]]
+;
+; AGGRESSIVE-LABEL: define i32 @xor.0(
+; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; AGGRESSIVE-NEXT: [[B_MASK2:%.*]] = and i32 [[B]], 16711935
+; AGGRESSIVE-NEXT: [[A_MASK4:%.*]] = and i32 [[A]], -16711936
+; AGGRESSIVE-NEXT: [[XOR_MERGE:%.*]] = or disjoint i32 [[B_MASK2]], [[A_MASK4]]
+; AGGRESSIVE-NEXT: ret i32 [[XOR_MERGE]]
+;
+ %a.mask = and i32 %a, u0xff00ff00
+ %b.mask = and i32 %b, u0x00ff00ff
+ %xor = xor i32 %a.mask, %b.mask
+ ret i32 %xor
+}
+
+;; u0xff00ffff = -16711681
+;; u0x00ff00ff = 16711935
+;; Nothing happens in this case because of the overlapping bytes.
+define i32 @xor.1(i32 %a, i32 %b) {
+; LAZY-LABEL: define i32 @xor.1(
+; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711681
+; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935
+; LAZY-NEXT: [[XOR:%.*]] = xor i32 [[A_MASK]], [[B_MASK]]
+; LAZY-NEXT: ret i32 [[XOR]]
+;
+; AGGRESSIVE-LABEL: define i32 @xor.1(
+; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; AGGRESSIVE-NEXT: [[A_MASK2:%.*]] = and i32 [[A]], -16711681
+; AGGRESSIVE-NEXT: [[B_MASK4:%.*]] = and i32 [[B]], 16711935
+; AGGRESSIVE-NEXT: [[XOR:%.*]] = xor i32 [[A_MASK2]], [[B_MASK4]]
+; AGGRESSIVE-NEXT: ret i32 [[XOR]]
+;
+ %a.mask = and i32 %a, u0xff00ffff
+ %b.mask = and i32 %b, u0x00ff00ff
+ %xor = xor i32 %a.mask, %b.mask
+ ret i32 %xor
+}
+
+define i32 @xor.2(i32 %x) {
+; LAZY-LABEL: define i32 @xor.2(
+; LAZY-SAME: i32 [[X:%.*]]) {
+; LAZY-NEXT: [[X_MASK:%.*]] = and i32 [[X]], -256
+; LAZY-NEXT: ret i32 [[X_MASK]]
+;
+; AGGRESSIVE-LABEL: define i32 @xor.2(
+; AGGRESSIVE-SAME: i32 [[X:%.*]]) {
+; AGGRESSIVE-NEXT: [[X_MASK:%.*]] = and i32 [[X]], -256
+; AGGRESSIVE-NEXT: ret i32 [[X_MASK]]
+;
+ %x.0 = and i32 %x, u0xff00ffff
+ %x.1 = and i32 %x, u0x00ff00ff
+ %xor = xor i32 %x.0, %x.1
+ ret i32 %xor
+}
+
+define i32 @shl.0(i32 %base) {
+; LAZY-LABEL: define i32 @shl.0(
+; LAZY-SAME: i32 [[BASE:%.*]]) {
+; LAZY-NEXT: [[PRE:%.*]] = and i32 [[BASE]], 16777215
+; LAZY-NEXT: [[SHL:%.*]] = shl i32 [[PRE]], 8
+; LAZY-NEXT: [[POST:%.*]] = and i32 [[SHL]], -256
+; LAZY-NEXT: ret i32 [[POST]]
+;
+; AGGRESSIVE-LABEL: define i32 @shl.0(
+; AGGRESSIVE-SAME: i32 [[BASE:%.*]]) {
+; AGGRESSIVE-NEXT: [[BASE_SHIFT:%.*]] = shl i32 [[BASE]], 8
+; AGGRESSIVE-NEXT: ret i32 [[BASE_SHIFT]]
+;
+ %pre = and i32 %base, u0x00ffffff
+ %shl = shl i32 %pre, 8
+ %post = and i32 %shl, u0xffffff00
+ ret i32 %post
+}
+
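+;; u0xff00ff00 = -16711936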
+;; u0x0000ff00 = 65280
+define i32 @shl.1(i32 %base) {
+; LAZY-LABEL: define i32 @shl.1(
+; LAZY-SAME: i32 [[BASE:%.*]]) {
+; LAZY-NEXT: [[PRE:%.*]] = and i32 [[BASE]], -16711936
+; LAZY-NEXT: [[SHL:%.*]] = shl i32 [[PRE]], 8
+; LAZY-NEXT: ret i32 [[SHL]]
+;
+; AGGRESSIVE-LABEL: define i32 @shl.1(
+; AGGRESSIVE-SAME: i32 [[BASE:%.*]]) {
+; AGGRESSIVE-NEXT: [[BASE_MASK:%.*]] = and i32 [[BASE]], 65280
+; AGGRESSIVE-NEXT: [[BASE_SHIFT:%.*]] = shl i32 [[BASE_MASK]], 8
+; AGGRESSIVE-NEXT: ret i32 [[BASE_SHIFT]]
+;
+ %pre = and i32 %base, u0xff00ff00
+ %shl = shl i32 %pre, 8
+ ret i32 %shl
+}
+
+;; u0x0fffffff = 268435455
+;; Nothing happens because it is not byte-aligned.
+define i32 @shl.2(i32 %base) {
+; LAZY-LABEL: define i32 @shl.2(
+; LAZY-SAME: i32 [[BASE:%.*]]) {
+; LAZY-NEXT: [[PRE:%.*]] = and i32 [[BASE]], 268435455
+; LAZY-NEXT: [[SHL:%.*]] = shl i32 [[PRE]], 4
+; LAZY-NEXT: [[POST:%.*]] = and i32 [[SHL]], -16
+; LAZY-NEXT: ret i32 [[POST]]
+;
+; AGGRESSIVE-LABEL: define i32 @shl.2(
+; AGGRESSIVE-SAME: i32 [[BASE:%.*]]) {
+; AGGRESSIVE-NEXT: [[PRE:%.*]] = and i32 [[BASE]], 268435455
+; AGGRESSIVE-NEXT: [[SHL:%.*]] = shl i32 [[PRE]], 4
+; AGGRESSIVE-NEXT: [[POST:%.*]] = and i32 [[SHL]], -16
+; AGGRESSIVE-NEXT: ret i32 [[POST]]
+;
+ %pre = and i32 %base, u0x0fffffff
+ %shl = shl i32 %pre, 4
+ %post = and i32 %shl, u0xfffffff0
+ ret i32 %post
+}
+
+define <8 x i8> @shl.vec(<2 x i32> %base) {
+; LAZY-LABEL: define <8 x i8> @shl.vec(
+; LAZY-SAME: <2 x i32> [[BASE:%.*]]) {
+; LAZY-NEXT: [[PRE:%.*]] = and <2 x i32> [[BASE]], <i32 -65281, i32 16777215>
+; LAZY-NEXT: [[SHL:%.*]] = shl <2 x i32> [[PRE]], <i32 16, i32 8>
+; LAZY-NEXT: [[POST:%.*]] = and <2 x i32> [[SHL]], splat (i32 -256)
+; LAZY-NEXT: [[CAST:%.*]] = bitcast <2 x i32> [[POST]] to <8 x i8>
+; LAZY-NEXT: ret <8 x i8> [[CAST]]
+;
+; AGGRESSIVE-LABEL: define <8 x i8> @shl.vec(
+; AGGRESSIVE-SAME: <2 x i32> [[BASE:%.*]]) {
+; AGGRESSIVE-NEXT: [[BASE_CAST:%.*]] = bitcast <2 x i32> [[BASE]] to <8 x i8>
+; AGGRESSIVE-NEXT: [[BASE_SHUFFLE:%.*]] = shufflevector <8 x i8> [[BASE_CAST]], <8 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 0, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AGGRESSIVE-NEXT: [[BASE_SHUFFLE2:%.*]] = shufflevector <8 x i8> [[BASE_CAST]], <8 x i8> [[BASE_SHUFFLE]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 4, i32 5, i32 6>
+; AGGRESSIVE-NEXT: ret <8 x i8> [[BASE_SHUFFLE2]]
+;
+ %pre = and <2 x i32> %base, <i32 u0xffff00ff, i32 u0x00ffffff>
+ %shl = shl <2 x i32> %pre, <i32 16, i32 8>
+ %post = and <2 x i32> %shl, splat(i32 u0xffffff00)
+ %cast = bitcast <2 x i32> %post to <8 x i8>
+ ret <8 x i8> %cast
+}
+
+define i32 @lshr.0(i32 %base) {
+; LAZY-LABEL: define i32 @lshr.0(
+; LAZY-SAME: i32 [[BASE:%.*]]) {
+; LAZY-NEXT: [[PRE:%.*]] = and i32 [[BASE]], -256
+; LAZY-NEXT: [[LSHR:%.*]] = lshr i32 [[PRE]], 8
+; LAZY-NEXT: [[POST:%.*]] = and i32 [[LSHR]], 16777215
+; LAZY-NEXT: ret i32 [[POST]]
+;
+; AGGRESSIVE-LABEL: define i32 @lshr.0(
+; AGGRESSIVE-SAME: i32 [[BASE:%.*]]) {
+; AGGRESSIVE-NEXT: [[BASE_SHIFT:%.*]] = lshr i32 [[BASE]], 8
+; AGGRESSIVE-NEXT: ret i32 [[BASE_SHIFT]]
+;
+ %pre = and i32 %base, u0xffffff00
+ %lshr = lshr i32 %pre, 8
+ %post = and i32 %lshr, u0x00ffffff
+ ret i32 %post
+}
+
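+;; u0x00ff00ff = 16711935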
+;; u0x0000ff00 = 65280
+define i32 @lshr.1(i32 %base) {
+; LAZY-LABEL: define i32 @lshr.1(
+; LAZY-SAME: i32 [[BASE:%.*]]) {
+; LAZY-NEXT: [[PRE:%.*]] = and i32 [[BASE]], 16711935
+; LAZY-NEXT: [[LSHR:%.*]] = lshr i32 [[PRE]], 8
+; LAZY-NEXT: ret i32 [[LSHR]]
+;
+; AGGRESSIVE-LABEL: define i32 @lshr.1(
+; AGGRESSIVE-SAME: i32 [[BASE:%.*]]) {
+; AGGRESSIVE-NEXT: [[BASE_SHIFT:%.*]] = lshr i32 [[BASE]], 8
+; AGGRESSIVE-NEXT: [[BASE_MASK:%.*]] = and i32 [[BASE_SHIFT]], 65280
+; AGGRESSIVE-NEXT: ret i32 [[BASE_MASK]]
+;
+ %pre = and i32 %base, u0x00ff00ff
+ %lshr = lshr i32 %pre, 8
+ ret i32 %lshr
+}
+
+define <8 x i8> @lshr.vec(<2 x i32> %base) {
+; LAZY-LABEL: define <8 x i8> @lshr.vec(
+; LAZY-SAME: <2 x i32> [[BASE:%.*]]) {
+; LAZY-NEXT: [[PRE:%.*]] = and <2 x i32> [[BASE]], <i32 -16711681, i32 -256>
+; LAZY-NEXT: [[LSHR:%.*]] = lshr <2 x i32> [[PRE]], <i32 16, i32 8>
+; LAZY-NEXT: [[POST:%.*]] = and <2 x i32> [[LSHR]], splat (i32 16777215)
+; LAZY-NEXT: [[CAST:%.*]] = bitcast <2 x i32> [[POST]] to <8 x i8>
+; LAZY-NEXT: ret <8 x i8> [[CAST]]
+;
+; AGGRESSIVE-LABEL: define <8 x i8> @lshr.vec(
+; AGGRESSIVE-SAME: <2 x i32> [[BASE:%.*]]) {
+; AGGRESSIVE-NEXT: [[BASE_CAST:%.*]] = bitcast <2 x i32> [[BASE]] to <8 x i8>
+; AGGRESSIVE-NEXT: [[BASE_SHUFFLE:%.*]] = shufflevector <8 x i8> [[BASE_CAST]], <8 x i8> zeroinitializer, <8 x i32> <i32 8, i32 3, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AGGRESSIVE-NEXT: [[BASE_SHUFFLE2:%.*]] = shufflevector <8 x i8> [[BASE_CAST]], <8 x i8> [[BASE_SHUFFLE]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 5, i32 6, i32 7, i32 15>
+; AGGRESSIVE-NEXT: ret <8 x i8> [[BASE_SHUFFLE2]]
+;
+ %pre = and <2 x i32> %base, <i32 u0xff00ffff, i32 u0xffffff00>
+ %lshr = lshr <2 x i32> %pre, <i32 16, i32 8>
+ %post = and <2 x i32> %lshr, splat(i32 u0x00ffffff)
+ %cast = bitcast <2 x i32> %post to <8 x i8>
+ ret <8 x i8> %cast
+}
+
+define i32 @trunc.0(i64 %src) {
+; LAZY-LABEL: define i32 @trunc.0(
+; LAZY-SAME: i64 [[SRC:%.*]]) {
+; LAZY-NEXT: [[MASK:%.*]] = and i64 [[SRC]], 4294967295
+; LAZY-NEXT: [[TRUNC:%.*]] = trunc i64 [[MASK]] to i32
+; LAZY-NEXT: ret i32 [[TRUNC]]
+;
+; AGGRESSIVE-LABEL: define i32 @trunc.0(
+; AGGRESSIVE-SAME: i64 [[SRC:%.*]]) {
+; AGGRESSIVE-NEXT: [[SRC_CAST:%.*]] = trunc i64 [[SRC]] to i32
+; AGGRESSIVE-NEXT: ret i32 [[SRC_CAST]]
+;
+ %mask = and i64 %src, u0x00000000ffffffff
+ %trunc = trunc i64 %mask to i32
+ ret i32 %trunc
+}
+
+;; u0xff00ff00 = -16711936
+define i32 @trunc.1(i64 %src) {
+; LAZY-LABEL: define i32 @trunc.1(
+; LAZY-SAME: i64 [[SRC:%.*]]) {
+; LAZY-NEXT: [[MASK:%.*]] = and i64 [[SRC]], -71777214294589696
+; LAZY-NEXT: [[TRUNC:%.*]] = trunc i64 [[MASK]] to i32
+; LAZY-NEXT: ret i32 [[TRUNC]]
+;
+; AGGRESSIVE-LABEL: define i32 @trunc.1(
+; AGGRESSIVE-SAME: i64 [[SRC:%.*]]) {
+; AGGRESSIVE-NEXT: [[SRC_CAST:%.*]] = trunc i64 [[SRC]] to i32
+; AGGRESSIVE-NEXT: [[SRC_MASK:%.*]] = and i32 [[SRC_CAST]], -16711936
+; AGGRESSIVE-NEXT: ret i32 [[SRC_MASK]]
+;
+ %mask = and i64 %src, u0xff00ff00ff00ff00
+ %trunc = trunc i64 %mask to i32
+ ret i32 %trunc
+}
+
+define <4 x i8> @trunc.vec(<2 x i32> %src) {
+; LAZY-LABEL: define <4 x i8> @trunc.vec(
+; LAZY-SAME: <2 x i32> [[SRC:%.*]]) {
+; LAZY-NEXT: [[TRUNC:%.*]] = trunc <2 x i32> [[SRC]] to <2 x i16>
+; LAZY-NEXT: [[CAST:%.*]] = bitcast <2 x i16> [[TRUNC]] to <4 x i8>
+; LAZY-NEXT: ret <4 x i8> [[CAST]]
+;
+; AGGRESSIVE-LABEL: define <4 x i8> @trunc.vec(
+; AGGRESSIVE-SAME: <2 x i32> [[SRC:%.*]]) {
+; AGGRESSIVE-NEXT: [[SRC_CAST:%.*]] = bitcast <2 x i32> [[SRC]] to <8 x i8>
+; AGGRESSIVE-NEXT: [[SRC_EXTRACT:%.*]] = shufflevector <8 x i8> [[SRC_CAST]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; AGGRESSIVE-NEXT: [[SRC_SHUFFLE:%.*]] = shufflevector <4 x i8> [[SRC_EXTRACT]], <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; AGGRESSIVE-NEXT: [[SRC_EXTRACT2:%.*]] = shufflevector <8 x i8> [[SRC_CAST]], <8 x i8> poison, <4 x i32> <i32 poison, i32 poison, i32 4, i32 5>
+; AGGRESSIVE-NEXT: [[SRC_SHUFFLE4:%.*]] = shufflevector <4 x i8> [[SRC_EXTRACT2]], <4 x i8> [[SRC_SHUFFLE]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; AGGRESSIVE-NEXT: ret <4 x i8> [[SRC_SHUFFLE4]]
+;
+ %trunc = trunc <2 x i32> %src to <2 x i16>
+ %cast = bitcast <2 x i16> %trunc to <4 x i8>
+ ret <4 x i8> %cast
+}
+
+define i32 @bitcast.0(i32 %x) {
+; LAZY-LABEL: define i32 @bitcast.0(
+; LAZY-SAME: i32 [[X:%.*]]) {
+; LAZY-NEXT: [[CAST:%.*]] = bitcast i32 [[X]] to <4 x i8>
+; LAZY-NEXT: [[BACK:%.*]] = bitcast <4 x i8> [[CAST]] to i32
+; LAZY-NEXT: ret i32 [[BACK]]
+;
+; AGGRESSIVE-LABEL: define i32 @bitcast.0(
+; AGGRESSIVE-SAME: i32 [[X:%.*]]) {
+; AGGRESSIVE-NEXT: ret i32 [[X]]
+;
+ %cast = bitcast i32 %x to <4 x i8>
+ %back = bitcast <4 x i8> %cast to i32
+ ret i32 %back
+}
+
+define i8 @extractelement.0(i32 %src) {
+; LAZY-LABEL: define i8 @extractelement.0(
+; LAZY-SAME: i32 [[SRC:%.*]]) {
+; LAZY-NEXT: [[CAST:%.*]] = bitcast i32 [[SRC]] to <4 x i8>
+; LAZY-NEXT: [[ELT:%.*]] = extractelement <4 x i8> [[CAST]], i64 3
+; LAZY-NEXT: ret i8 [[ELT]]
+;
+; AGGRESSIVE-LABEL: define i8 @extractelement.0(
+; AGGRESSIVE-SAME: i32 [[SRC:%.*]]) {
+; AGGRESSIVE-NEXT: [[SRC_SHIFT:%.*]] = lshr i32 [[SRC]], 24
+; AGGRESSIVE-NEXT: [[SRC_SHIFT_CAST:%.*]] = trunc i32 [[SRC_SHIFT]] to i8
+; AGGRESSIVE-NEXT: ret i8 [[SRC_SHIFT_CAST]]
+;
+ %cast = bitcast i32 %src to <4 x i8>
+ %elt = extractelement <4 x i8> %cast, i64 3
+ ret i8 %elt
+}
+
+define i32 @insertelement.0(i8 %src) {
+; LAZY-LABEL: define i32 @insertelement.0(
+; LAZY-SAME: i8 [[SRC:%.*]]) {
+; LAZY-NEXT: [[INSERT:%.*]] = insertelement <4 x i8> zeroinitializer, i8 [[SRC]], i64 3
+; LAZY-NEXT: [[CAST:%.*]] = bitcast <4 x i8> [[INSERT]] to i32
+; LAZY-NEXT: ret i32 [[CAST]]
+;
+; AGGRESSIVE-LABEL: define i32 @insertelement.0(
+; AGGRESSIVE-SAME: i8 [[SRC:%.*]]) {
+; AGGRESSIVE-NEXT: [[SRC_CAST:%.*]] = zext i8 [[SRC]] to i32
+; AGGRESSIVE-NEXT: [[SRC_SHIFT:%.*]] = shl i32 [[SRC_CAST]], 24
+; AGGRESSIVE-NEXT: ret i32 [[SRC_SHIFT]]
+;
+ %insert = insertelement <4 x i8> zeroinitializer, i8 %src, i64 3
+ %cast = bitcast <4 x i8> %insert to i32
+ ret i32 %cast
+}
+
+define i32 @insertelement.1(i8 %a, i8 %b) {
+; LAZY-LABEL: define i32 @insertelement.1(
+; LAZY-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; LAZY-NEXT: [[INSERT_A:%.*]] = insertelement <4 x i8> zeroinitializer, i8 [[A]], i64 3
+; LAZY-NEXT: [[INSERT_B:%.*]] = insertelement <4 x i8> [[INSERT_A]], i8 [[B]], i64 1
+; LAZY-NEXT: [[CAST:%.*]] = bitcast <4 x i8> [[INSERT_B]] to i32
+; LAZY-NEXT: ret i32 [[CAST]]
+;
+; AGGRESSIVE-LABEL: define i32 @insertelement.1(
+; AGGRESSIVE-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; AGGRESSIVE-NEXT: [[A_CAST:%.*]] = zext i8 [[A]] to i32
+; AGGRESSIVE-NEXT: [[B_CAST:%.*]] = zext i8 [[B]] to i32
+; AGGRESSIVE-NEXT: [[B_SHIFT:%.*]] = shl i32 [[B_CAST]], 8
+; AGGRESSIVE-NEXT: [[A_SHIFT:%.*]] = shl i32 [[A_CAST]], 24
+; AGGRESSIVE-NEXT: [[CAST_MERGE:%.*]] = or disjoint i32 [[B_SHIFT]], [[A_SHIFT]]
+; AGGRESSIVE-NEXT: ret i32 [[CAST_MERGE]]
+;
+ %insert.a = insertelement <4 x i8> zeroinitializer, i8 %a, i64 3
+ %insert.b = insertelement <4 x i8> %insert.a, i8 %b, i64 1
+ %cast = bitcast <4 x i8> %insert.b to i32
+ ret i32 %cast
+}
+
+define i64 @shufflevector.0(i32 %a, i32 %b) {
+; LAZY-LABEL: define i64 @shufflevector.0(
+; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; LAZY-NEXT: [[A_CAST:%.*]] = bitcast i32 [[A]] to <4 x i8>
+; LAZY-NEXT: [[B_CAST:%.*]] = bitcast i32 [[B]] to <4 x i8>
+; LAZY-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i8> [[A_CAST]], <4 x i8> [[B_CAST]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; LAZY-NEXT: [[CAST:%.*]] = bitcast <8 x i8> [[SHUFFLE]] to i64
+; LAZY-NEXT: ret i64 [[CAST]]
+;
+; AGGRESSIVE-LABEL: define i64 @shufflevector.0(
+; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; AGGRESSIVE-NEXT: [[B_CAST2:%.*]] = zext i32 [[B]] to i64
+; AGGRESSIVE-NEXT: [[A_CAST1:%.*]] = zext i32 [[A]] to i64
+; AGGRESSIVE-NEXT: [[B_SHIFT:%.*]] = shl i64 [[B_CAST2]], 32
+; AGGRESSIVE-NEXT: [[CAST_MERGE:%.*]] = or disjoint i64 [[A_CAST1]], [[B_SHIFT]]
+; AGGRESSIVE-NEXT: ret i64 [[CAST_MERGE]]
+;
+ %a.cast = bitcast i32 %a to <4 x i8>
+ %b.cast = bitcast i32 %b to <4 x i8>
+ %shuffle = shufflevector <4 x i8> %a.cast, <4 x i8> %b.cast, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %cast = bitcast <8 x i8> %shuffle to i64
+ ret i64 %cast
+}
+
+define i32 @shufflevector.1(i32 %a, i32 %b) {
+; LAZY-LABEL: define i32 @shufflevector.1(
+; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; LAZY-NEXT: [[A_CAST:%.*]] = bitcast i32 [[A]] to <4 x i8>
+; LAZY-NEXT: [[B_CAST:%.*]] = bitcast i32 [[B]] to <4 x i8>
+; LAZY-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i8> [[A_CAST]], <4 x i8> [[B_CAST]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; LAZY-NEXT: [[CAST:%.*]] = bitcast <4 x i8> [[SHUFFLE]] to i32
+; LAZY-NEXT: ret i32 [[CAST]]
+;
+; AGGRESSIVE-LABEL: define i32 @shufflevector.1(
+; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; AGGRESSIVE-NEXT: [[A_MASK:%.*]] = and i32 [[A]], 16711935
+; AGGRESSIVE-NEXT: [[B_MASK:%.*]] = and i32 [[B]], -16711936
+; AGGRESSIVE-NEXT: [[CAST_MERGE:%.*]] = or disjoint i32 [[A_MASK]], [[B_MASK]]
+; AGGRESSIVE-NEXT: ret i32 [[CAST_MERGE]]
+;
+ %a.cast = bitcast i32 %a to <4 x i8>
+ %b.cast = bitcast i32 %b to <4 x i8>
+ %shuffle = shufflevector <4 x i8> %a.cast, <4 x i8> %b.cast, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ %cast = bitcast <4 x i8> %shuffle to i32
+ ret i32 %cast
+}
+
+define i32 @shufflevector.2(i32 %a, i64 %b) {
+; LAZY-LABEL: define i32 @shufflevector.2(
+; LAZY-SAME: i32 [[A:%.*]], i64 [[B:%.*]]) {
+; LAZY-NEXT: [[A_CAST:%.*]] = bitcast i32 [[A]] to <4 x i8>
+; LAZY-NEXT: [[B_CAST:%.*]] = bitcast i64 [[B]] to <8 x i8>
+; LAZY-NEXT: [[SHUFFLE_0:%.*]] = shufflevector <8 x i8> [[B_CAST]], <8 x i8> poison, <4 x i32> <i32 6, i32 7, i32 poison, i32 poison>
+; LAZY-NEXT: [[SHUFFLE_1:%.*]] = shufflevector <4 x i8> [[SHUFFLE_0]], <4 x i8> [[A_CAST]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; LAZY-NEXT: [[CAST:%.*]] = bitcast <4 x i8> [[SHUFFLE_1]] to i32
+; LAZY-NEXT: ret i32 [[CAST]]
+;
+; AGGRESSIVE-LABEL: define i32 @shufflevector.2(
+; AGGRESSIVE-SAME: i32 [[A:%.*]], i64 [[B:%.*]]) {
+; AGGRESSIVE-NEXT: [[B_SHIFT:%.*]] = lshr i64 [[B]], 48
+; AGGRESSIVE-NEXT: [[B_SHIFT_CAST:%.*]] = trunc i64 [[B_SHIFT]] to i32
+; AGGRESSIVE-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -65536
+; AGGRESSIVE-NEXT: [[CAST_MERGE:%.*]] = or disjoint i32 [[B_SHIFT_CAST]], [[A_MASK]]
+; AGGRESSIVE-NEXT: ret i32 [[CAST_MERGE]]
+;
+ %a.cast = bitcast i32 %a to <4 x i8>
+ %b.cast = bitcast i64 %b to <8 x i8>
+ %shuffle.0 = shufflevector <8 x i8> %b.cast, <8 x i8> poison, <4 x i32> <i32 6, i32 7, i32 poison, i32 poison>
+ %shuffle.1 = shufflevector <4 x i8> %shuffle.0, <4 x i8> %a.cast, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ %cast = bitcast <4 x i8> %shuffle.1 to i32
+ ret i32 %cast
+}
diff --git a/llvm/test/Transforms/PackedIntegerCombine/int2int.ll b/llvm/test/Transforms/PackedIntegerCombine/int2int.ll
new file mode 100644
index 0000000000000..fe08ce93719d0
--- /dev/null
+++ b/llvm/test/Transforms/PackedIntegerCombine/int2int.ll
@@ -0,0 +1,302 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
+
+define i16 @top_bytes(i32 %a, i32 %b) {
+; LAZY-LABEL: define i16 @top_bytes(
+; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16777216
+; LAZY-NEXT: [[A_LSHR:%.*]] = lshr i32 [[A_MASK]], 16
+; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], -16777216
+; LAZY-NEXT: [[B_LSHR:%.*]] = lshr i32 [[B_MASK]], 24
+; LAZY-NEXT: [[RES:%.*]] = or i32 [[A_LSHR]], [[B_LSHR]]
+; LAZY-NEXT: [[TRUNC:%.*]] = trunc i32 [[RES]] to i16
+; LAZY-NEXT: ret i16 [[TRUNC]]
+;
+; AGGRESSIVE-LABEL: define i16 @top_bytes(
+; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; AGGRESSIVE-NEXT: [[B_SHIFT:%.*]] = lshr i32 [[B]], 24
+; AGGRESSIVE-NEXT: [[B_SHIFT_CAST:%.*]] = trunc i32 [[B_SHIFT]] to i16
+; AGGRESSIVE-NEXT: [[A_SHIFT:%.*]] = lshr i32 [[A]], 16
+; AGGRESSIVE-NEXT: [[A_SHIFT_CAST:%.*]] = trunc i32 [[A_SHIFT]] to i16
+; AGGRESSIVE-NEXT: [[A_MASK2:%.*]] = and i16 [[A_SHIFT_CAST]], -256
+; AGGRESSIVE-NEXT: [[TRUNC_MERGE:%.*]] = or disjoint i16 [[B_SHIFT_CAST]], [[A_MASK2]]
+; AGGRESSIVE-NEXT: ret i16 [[TRUNC_MERGE]]
+;
+ %a.mask = and i32 %a, u0xff000000
+ %a.lshr = lshr i32 %a.mask, 16
+ %b.mask = and i32 %b, u0xff000000
+ %b.lshr = lshr i32 %b.mask, 24
+ %res = or i32 %a.lshr, %b.lshr
+ %trunc = trunc i32 %res to i16
+ ret i16 %trunc
+}
+
+define i32 @bottom_bytes(i16 %a, i16 %b) {
+; LAZY-LABEL: define i32 @bottom_bytes(
+; LAZY-SAME: i16 [[A:%.*]], i16 [[B:%.*]]) {
+; LAZY-NEXT: [[A_MASK:%.*]] = and i16 [[A]], 255
+; LAZY-NEXT: [[A_SHL:%.*]] = shl i16 [[A_MASK]], 8
+; LAZY-NEXT: [[B_MASK:%.*]] = and i16 [[B]], 255
+; LAZY-NEXT: [[RES:%.*]] = or i16 [[A_SHL]], [[B_MASK]]
+; LAZY-NEXT: [[ZEXT:%.*]] = zext i16 [[RES]] to i32
+; LAZY-NEXT: [[SHL:%.*]] = shl i32 [[ZEXT]], 16
+; LAZY-NEXT: ret i32 [[SHL]]
+;
+; AGGRESSIVE-LABEL: define i32 @bottom_bytes(
+; AGGRESSIVE-SAME: i16 [[A:%.*]], i16 [[B:%.*]]) {
+; AGGRESSIVE-NEXT: [[A_CAST:%.*]] = zext i16 [[A]] to i32
+; AGGRESSIVE-NEXT: [[B_MASK2:%.*]] = and i16 [[B]], 255
+; AGGRESSIVE-NEXT: [[B_MASK_CAST:%.*]] = zext i16 [[B_MASK2]] to i32
+; AGGRESSIVE-NEXT: [[B_SHIFT:%.*]] = shl i32 [[B_MASK_CAST]], 16
+; AGGRESSIVE-NEXT: [[A_SHIFT:%.*]] = shl i32 [[A_CAST]], 24
+; AGGRESSIVE-NEXT: [[SHL_MERGE:%.*]] = or disjoint i32 [[B_SHIFT]], [[A_SHIFT]]
+; AGGRESSIVE-NEXT: ret i32 [[SHL_MERGE]]
+;
+ %a.mask = and i16 %a, u0x00ff
+ %a.shl = shl i16 %a.mask, 8
+ %b.mask = and i16 %b, u0x00ff
+ %res = or i16 %a.shl, %b.mask
+ %zext = zext i16 %res to i32
+ %shl = shl i32 %zext, 16
+ ret i32 %shl
+}
+
+define i32 @obtain_i32(i32 %from) {
+; LAZY-LABEL: define i32 @obtain_i32(
+; LAZY-SAME: i32 [[FROM:%.*]]) {
+; LAZY-NEXT: ret i32 [[FROM]]
+;
+; AGGRESSIVE-LABEL: define i32 @obtain_i32(
+; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: ret i32 [[FROM]]
+;
+ %get.0 = and i32 %from, 255
+ %shr.1 = lshr i32 %from, 8
+ %mask.1 = and i32 %shr.1, 255
+ %get.1 = shl i32 %mask.1, 8
+ %out.1 = or i32 %get.0, %get.1
+
+ %shr.2 = lshr i32 %from, 16
+ %mask.2 = and i32 %shr.2, 255
+ %get.2 = shl i32 %mask.2, 16
+ %shr.3 = lshr i32 %from, 24
+ %mask.3 = and i32 %shr.3, 255
+ %get.3 = shl i32 %mask.3, 24
+ %out.2 = or i32 %get.2, %get.3
+
+ %out = or i32 %out.1, %out.2
+ ret i32 %out
+}
+
+;; u0xff00ffff = -16711681
+define i32 @obtain_i32_masked(i32 %from) {
+; LAZY-LABEL: define i32 @obtain_i32_masked(
+; LAZY-SAME: i32 [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_MASK:%.*]] = and i32 [[FROM]], -16711681
+; LAZY-NEXT: ret i32 [[FROM_MASK]]
+;
+; AGGRESSIVE-LABEL: define i32 @obtain_i32_masked(
+; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_MASK:%.*]] = and i32 [[FROM]], -16711681
+; AGGRESSIVE-NEXT: ret i32 [[FROM_MASK]]
+;
+ %get.0 = and i32 %from, 255
+ %shr.1 = lshr i32 %from, 8
+ %mask.1 = and i32 %shr.1, 255
+ %get.1 = shl i32 %mask.1, 8
+ %out.1 = or i32 %get.0, %get.1
+
+ %shr.3 = lshr i32 %from, 24
+ %mask.3 = and i32 %shr.3, 255
+ %get.3 = shl i32 %mask.3, 24
+ %out.2 = or i32 %out.1, %get.3
+
+ ret i32 %out.2
+}
+
+define i64 @obtain_i64(i64 %from) {
+; LAZY-LABEL: define i64 @obtain_i64(
+; LAZY-SAME: i64 [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_MASK:%.*]] = and i64 [[FROM]], 4294967295
+; LAZY-NEXT: ret i64 [[FROM_MASK]]
+;
+; AGGRESSIVE-LABEL: define i64 @obtain_i64(
+; AGGRESSIVE-SAME: i64 [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_MASK:%.*]] = and i64 [[FROM]], 4294967295
+; AGGRESSIVE-NEXT: ret i64 [[FROM_MASK]]
+;
+ %mask.0 = and i64 %from, 255
+ %get.0 = shl i64 %mask.0, 0
+ %shr.1 = lshr i64 %from, 8
+ %mask.1 = and i64 %shr.1, 255
+ %get.1 = shl i64 %mask.1, 8
+ %out.1 = or i64 %get.0, %get.1
+
+ %shr.2 = lshr i64 %from, 16
+ %mask.2 = and i64 %shr.2, 255
+ %get.2 = shl i64 %mask.2, 16
+ %shr.3 = lshr i64 %from, 24
+ %mask.3 = and i64 %shr.3, 255
+ %get.3 = shl i64 %mask.3, 24
+ %out.2 = or i64 %get.2, %get.3
+
+ %out = or i64 %out.1, %out.2
+ ret i64 %out
+}
+
+define i64 @obtain_i64_shifted(i64 %from) {
+; LAZY-LABEL: define i64 @obtain_i64_shifted(
+; LAZY-SAME: i64 [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_SHIFT:%.*]] = shl i64 [[FROM]], 32
+; LAZY-NEXT: ret i64 [[FROM_SHIFT]]
+;
+; AGGRESSIVE-LABEL: define i64 @obtain_i64_shifted(
+; AGGRESSIVE-SAME: i64 [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_SHIFT:%.*]] = shl i64 [[FROM]], 32
+; AGGRESSIVE-NEXT: ret i64 [[FROM_SHIFT]]
+;
+ %mask.0 = and i64 %from, 255
+ %get.0 = shl i64 %mask.0, 32
+ %shr.1 = lshr i64 %from, 8
+ %mask.1 = and i64 %shr.1, 255
+ %get.1 = shl i64 %mask.1, 40
+ %out.1 = or i64 %get.0, %get.1
+
+ %shr.2 = lshr i64 %from, 16
+ %mask.2 = and i64 %shr.2, 255
+ %get.2 = shl i64 %mask.2, 48
+ %shr.3 = lshr i64 %from, 24
+ %mask.3 = and i64 %shr.3, 255
+ %get.3 = shl i64 %mask.3, 56
+ %out.2 = or i64 %get.2, %get.3
+
+ %out = or i64 %out.1, %out.2
+ ret i64 %out
+}
+
+define i64 @obtain_i64_zext(i32 %from) {
+; LAZY-LABEL: define i64 @obtain_i64_zext(
+; LAZY-SAME: i32 [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = zext i32 [[FROM]] to i64
+; LAZY-NEXT: [[FROM_SHIFT:%.*]] = shl i64 [[FROM_CAST]], 32
+; LAZY-NEXT: ret i64 [[FROM_SHIFT]]
+;
+; AGGRESSIVE-LABEL: define i64 @obtain_i64_zext(
+; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = zext i32 [[FROM]] to i64
+; AGGRESSIVE-NEXT: [[FROM_SHIFT:%.*]] = shl i64 [[FROM_CAST]], 32
+; AGGRESSIVE-NEXT: ret i64 [[FROM_SHIFT]]
+;
+ %mask.0 = and i32 %from, 255
+ %zext.0 = zext i32 %mask.0 to i64
+ %get.0 = shl i64 %zext.0, 32
+ %shr.1 = lshr i32 %from, 8
+ %mask.1 = and i32 %shr.1, 255
+ %zext.1 = zext i32 %mask.1 to i64
+ %get.1 = shl i64 %zext.1, 40
+ %out.1 = or i64 %get.0, %get.1
+
+ %shr.2 = lshr i32 %from, 16
+ %mask.2 = and i32 %shr.2, 255
+ %zext.2 = zext i32 %mask.2 to i64
+ %get.2 = shl i64 %zext.2, 48
+ %shr.3 = lshr i32 %from, 24
+ %mask.3 = and i32 %shr.3, 255
+ %zext.3 = zext i32 %mask.3 to i64
+ %get.3 = shl i64 %zext.3, 56
+ %out.2 = or i64 %get.2, %get.3
+
+ %out = or i64 %out.1, %out.2
+ ret i64 %out
+}
+
+define i64 @combine(i32 %bot, i32 %top) {
+; LAZY-LABEL: define i64 @combine(
+; LAZY-SAME: i32 [[BOT:%.*]], i32 [[TOP:%.*]]) {
+; LAZY-NEXT: [[TOP_CAST:%.*]] = zext i32 [[TOP]] to i64
+; LAZY-NEXT: [[BOT_CAST:%.*]] = zext i32 [[BOT]] to i64
+; LAZY-NEXT: [[TOP_SHIFT:%.*]] = shl i64 [[TOP_CAST]], 32
+; LAZY-NEXT: [[OUT_3_MERGE:%.*]] = or disjoint i64 [[BOT_CAST]], [[TOP_SHIFT]]
+; LAZY-NEXT: ret i64 [[OUT_3_MERGE]]
+;
+; AGGRESSIVE-LABEL: define i64 @combine(
+; AGGRESSIVE-SAME: i32 [[BOT:%.*]], i32 [[TOP:%.*]]) {
+; AGGRESSIVE-NEXT: [[TOP_CAST:%.*]] = zext i32 [[TOP]] to i64
+; AGGRESSIVE-NEXT: [[BOT_CAST:%.*]] = zext i32 [[BOT]] to i64
+; AGGRESSIVE-NEXT: [[TOP_SHIFT:%.*]] = shl i64 [[TOP_CAST]], 32
+; AGGRESSIVE-NEXT: [[OUT_3_MERGE:%.*]] = or disjoint i64 [[BOT_CAST]], [[TOP_SHIFT]]
+; AGGRESSIVE-NEXT: ret i64 [[OUT_3_MERGE]]
+;
+ %base = zext i32 %bot to i64
+
+ %mask.0 = and i32 %top, 255
+ %zext.0 = zext i32 %mask.0 to i64
+ %get.0 = shl i64 %zext.0, 32
+ %out.0 = or i64 %base, %get.0
+
+ %shr.1 = lshr i32 %top, 8
+ %mask.1 = and i32 %shr.1, 255
+ %zext.1 = zext i32 %mask.1 to i64
+ %get.1 = shl i64 %zext.1, 40
+ %out.1 = or i64 %out.0, %get.1
+
+ %shr.2 = lshr i32 %top, 16
+ %mask.2 = and i32 %shr.2, 255
+ %zext.2 = zext i32 %mask.2 to i64
+ %get.2 = shl i64 %zext.2, 48
+ %out.2 = or i64 %out.1, %get.2
+
+ %shr.3 = lshr i32 %top, 24
+ %mask.3 = and i32 %shr.3, 255
+ %zext.3 = zext i32 %mask.3 to i64
+ %get.3 = shl i64 %zext.3, 56
+ %out.3 = or i64 %out.2, %get.3
+
+ ret i64 %out.3
+}
+
+;; u0x0000ff00 = 65280
+;; u0x00ff0000 = 16711680
+;; u0xff000000 = -16777216
+;; u0xff0000ff = -16776961
+define i32 @shuffle_elts(i32 %x) {
+; LAZY-LABEL: define i32 @shuffle_elts(
+; LAZY-SAME: i32 [[X:%.*]]) {
+; LAZY-NEXT: [[X_0:%.*]] = and i32 [[X]], 255
+; LAZY-NEXT: [[X_1:%.*]] = and i32 [[X]], 65280
+; LAZY-NEXT: [[X_2:%.*]] = and i32 [[X]], 16711680
+; LAZY-NEXT: [[X_3:%.*]] = and i32 [[X]], -16777216
+; LAZY-NEXT: [[SHL_1:%.*]] = shl i32 [[X_1]], 8
+; LAZY-NEXT: [[OUT_1:%.*]] = or i32 [[X_0]], [[SHL_1]]
+; LAZY-NEXT: [[SHR_2:%.*]] = lshr i32 [[X_2]], 8
+; LAZY-NEXT: [[OUT_2:%.*]] = or i32 [[SHR_2]], [[X_3]]
+; LAZY-NEXT: [[OUT:%.*]] = or i32 [[OUT_1]], [[OUT_2]]
+; LAZY-NEXT: ret i32 [[OUT]]
+;
+; AGGRESSIVE-LABEL: define i32 @shuffle_elts(
+; AGGRESSIVE-SAME: i32 [[X:%.*]]) {
+; AGGRESSIVE-NEXT: [[X_MASK:%.*]] = and i32 [[X]], -16776961
+; AGGRESSIVE-NEXT: [[X_SHIFT:%.*]] = lshr i32 [[X]], 8
+; AGGRESSIVE-NEXT: [[X_MASK2:%.*]] = and i32 [[X_SHIFT]], 65280
+; AGGRESSIVE-NEXT: [[X_MASK4:%.*]] = and i32 [[X]], 65280
+; AGGRESSIVE-NEXT: [[X_SHIFT6:%.*]] = shl i32 [[X_MASK4]], 8
+; AGGRESSIVE-NEXT: [[OUT_MERGE:%.*]] = or disjoint i32 [[X_MASK]], [[X_MASK2]]
+; AGGRESSIVE-NEXT: [[OUT_MERGE8:%.*]] = or disjoint i32 [[OUT_MERGE]], [[X_SHIFT6]]
+; AGGRESSIVE-NEXT: ret i32 [[OUT_MERGE8]]
+;
+ %x.0 = and i32 %x, u0x000000ff
+ %x.1 = and i32 %x, u0x0000ff00
+ %x.2 = and i32 %x, u0x00ff0000
+ %x.3 = and i32 %x, u0xff000000
+
+ %shl.1 = shl i32 %x.1, 8
+ %out.1 = or i32 %x.0, %shl.1
+
+ %shr.2 = lshr i32 %x.2, 8
+ %out.2 = or i32 %shr.2, %x.3
+
+ %out = or i32 %out.1, %out.2
+
+ ret i32 %out
+}
diff --git a/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll b/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll
new file mode 100644
index 0000000000000..e4c1538826e0f
--- /dev/null
+++ b/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll
@@ -0,0 +1,393 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
+
+define <2 x i8> @top_bytes(i32 %a, i32 %b) {
+; LAZY-LABEL: define <2 x i8> @top_bytes(
+; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16777216
+; LAZY-NEXT: [[A_LSHR:%.*]] = lshr i32 [[A_MASK]], 24
+; LAZY-NEXT: [[A_TRUNC:%.*]] = trunc i32 [[A_LSHR]] to i8
+; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], -16777216
+; LAZY-NEXT: [[B_LSHR:%.*]] = lshr i32 [[B_MASK]], 24
+; LAZY-NEXT: [[B_TRUNC:%.*]] = trunc i32 [[B_LSHR]] to i8
+; LAZY-NEXT: [[BYTES_0:%.*]] = insertelement <2 x i8> poison, i8 [[A_TRUNC]], i32 1
+; LAZY-NEXT: [[BYTES_1:%.*]] = insertelement <2 x i8> [[BYTES_0]], i8 [[B_TRUNC]], i32 0
+; LAZY-NEXT: ret <2 x i8> [[BYTES_1]]
+;
+; AGGRESSIVE-LABEL: define <2 x i8> @top_bytes(
+; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; AGGRESSIVE-NEXT: [[A_CAST:%.*]] = bitcast i32 [[A]] to <4 x i8>
+; AGGRESSIVE-NEXT: [[B_CAST:%.*]] = bitcast i32 [[B]] to <4 x i8>
+; AGGRESSIVE-NEXT: [[B_EXTRACT:%.*]] = shufflevector <4 x i8> [[B_CAST]], <4 x i8> poison, <2 x i32> <i32 3, i32 poison>
+; AGGRESSIVE-NEXT: [[B_SHUFFLE:%.*]] = shufflevector <2 x i8> [[B_EXTRACT]], <2 x i8> zeroinitializer, <2 x i32> <i32 0, i32 3>
+; AGGRESSIVE-NEXT: [[A_EXTRACT:%.*]] = shufflevector <4 x i8> [[A_CAST]], <4 x i8> poison, <2 x i32> <i32 poison, i32 3>
+; AGGRESSIVE-NEXT: [[A_SHUFFLE:%.*]] = shufflevector <2 x i8> [[A_EXTRACT]], <2 x i8> [[B_SHUFFLE]], <2 x i32> <i32 2, i32 1>
+; AGGRESSIVE-NEXT: ret <2 x i8> [[A_SHUFFLE]]
+;
+ %a.mask = and i32 %a, u0xff000000
+ %a.lshr = lshr i32 %a.mask, 24
+ %a.trunc = trunc i32 %a.lshr to i8
+ %b.mask = and i32 %b, u0xff000000
+ %b.lshr = lshr i32 %b.mask, 24
+ %b.trunc = trunc i32 %b.lshr to i8
+ %bytes.0 = insertelement <2 x i8> poison, i8 %a.trunc, i32 1
+ %bytes.1 = insertelement <2 x i8> %bytes.0, i8 %b.trunc, i32 0
+ ret <2 x i8> %bytes.1
+}
+
+define <2 x i16> @top_bytes.i16(i32 %a, i32 %b) {
+; LAZY-LABEL: define <2 x i16> @top_bytes.i16(
+; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -65536
+; LAZY-NEXT: [[A_LSHR:%.*]] = lshr i32 [[A_MASK]], 16
+; LAZY-NEXT: [[A_TRUNC:%.*]] = trunc i32 [[A_LSHR]] to i16
+; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], -65536
+; LAZY-NEXT: [[B_LSHR:%.*]] = lshr i32 [[B_MASK]], 16
+; LAZY-NEXT: [[B_TRUNC:%.*]] = trunc i32 [[B_LSHR]] to i16
+; LAZY-NEXT: [[BYTES_0:%.*]] = insertelement <2 x i16> poison, i16 [[A_TRUNC]], i32 1
+; LAZY-NEXT: [[BYTES_1:%.*]] = insertelement <2 x i16> [[BYTES_0]], i16 [[B_TRUNC]], i32 0
+; LAZY-NEXT: ret <2 x i16> [[BYTES_1]]
+;
+; AGGRESSIVE-LABEL: define <2 x i16> @top_bytes.i16(
+; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; AGGRESSIVE-NEXT: [[A_CAST:%.*]] = bitcast i32 [[A]] to <2 x i16>
+; AGGRESSIVE-NEXT: [[B_CAST:%.*]] = bitcast i32 [[B]] to <2 x i16>
+; AGGRESSIVE-NEXT: [[B_SHUFFLE:%.*]] = shufflevector <2 x i16> [[B_CAST]], <2 x i16> zeroinitializer, <2 x i32> <i32 1, i32 3>
+; AGGRESSIVE-NEXT: [[A_SHUFFLE:%.*]] = shufflevector <2 x i16> [[A_CAST]], <2 x i16> [[B_SHUFFLE]], <2 x i32> <i32 2, i32 1>
+; AGGRESSIVE-NEXT: ret <2 x i16> [[A_SHUFFLE]]
+;
+ %a.mask = and i32 %a, u0xffff0000
+ %a.lshr = lshr i32 %a.mask, 16
+ %a.trunc = trunc i32 %a.lshr to i16
+ %b.mask = and i32 %b, u0xffff0000
+ %b.lshr = lshr i32 %b.mask, 16
+ %b.trunc = trunc i32 %b.lshr to i16
+ %bytes.0 = insertelement <2 x i16> poison, i16 %a.trunc, i32 1
+ %bytes.1 = insertelement <2 x i16> %bytes.0, i16 %b.trunc, i32 0
+ ret <2 x i16> %bytes.1
+}
+
+define <4 x i8> @obtain_i32(i32 %from) {
+; LAZY-LABEL: define <4 x i8> @obtain_i32(
+; LAZY-SAME: i32 [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <4 x i8>
+; LAZY-NEXT: ret <4 x i8> [[FROM_CAST]]
+;
+; AGGRESSIVE-LABEL: define <4 x i8> @obtain_i32(
+; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <4 x i8>
+; AGGRESSIVE-NEXT: ret <4 x i8> [[FROM_CAST]]
+;
+ %get.0 = trunc i32 %from to i8
+
+ %shr.1 = lshr i32 %from, 8
+ %get.1 = trunc i32 %shr.1 to i8
+
+ %shr.2 = lshr i32 %from, 16
+ %get.2 = trunc i32 %shr.2 to i8
+
+ %shr.3 = lshr i32 %from, 24
+ %get.3 = trunc i32 %shr.3 to i8
+
+ %build.0 = insertelement <4 x i8> poison, i8 %get.0, i32 0
+ %build.1 = insertelement <4 x i8> %build.0, i8 %get.1, i32 1
+ %build.2 = insertelement <4 x i8> %build.1, i8 %get.2, i32 2
+ %build.3 = insertelement <4 x i8> %build.2, i8 %get.3, i32 3
+ ret <4 x i8> %build.3
+}
+
+define <2 x i16> @obtain_i32.i16(i32 %from) {
+; LAZY-LABEL: define <2 x i16> @obtain_i32.i16(
+; LAZY-SAME: i32 [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <2 x i16>
+; LAZY-NEXT: ret <2 x i16> [[FROM_CAST]]
+;
+; AGGRESSIVE-LABEL: define <2 x i16> @obtain_i32.i16(
+; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <2 x i16>
+; AGGRESSIVE-NEXT: ret <2 x i16> [[FROM_CAST]]
+;
+ %get.0 = trunc i32 %from to i16
+
+ %shr.1 = lshr i32 %from, 16
+ %get.1 = trunc i32 %shr.1 to i16
+
+ %build.0 = insertelement <2 x i16> poison, i16 %get.0, i32 0
+ %build.1 = insertelement <2 x i16> %build.0, i16 %get.1, i32 1
+ ret <2 x i16> %build.1
+}
+
+define <4 x i8> @obtain_i32_masked(i32 %from) {
+; LAZY-LABEL: define <4 x i8> @obtain_i32_masked(
+; LAZY-SAME: i32 [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <4 x i8>
+; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM_CAST]], <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; LAZY-NEXT: ret <4 x i8> [[FROM_SHUFFLE]]
+;
+; AGGRESSIVE-LABEL: define <4 x i8> @obtain_i32_masked(
+; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <4 x i8>
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM_CAST]], <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; AGGRESSIVE-NEXT: ret <4 x i8> [[FROM_SHUFFLE]]
+;
+ %get.0 = trunc i32 %from to i8
+
+ %shr.1 = lshr i32 %from, 8
+ %get.1 = trunc i32 %shr.1 to i8
+
+ %shr.3 = lshr i32 %from, 24
+ %get.3 = trunc i32 %shr.3 to i8
+
+ %build.0 = insertelement <4 x i8> <i8 poison, i8 poison, i8 0, i8 poison>, i8 %get.0, i32 0
+ %build.1 = insertelement <4 x i8> %build.0, i8 %get.1, i32 1
+ %build.3 = insertelement <4 x i8> %build.1, i8 %get.3, i32 3
+ ret <4 x i8> %build.3
+}
+
+define <2 x i16> @obtain_i32_masked.i16(i32 %from) {
+; LAZY-LABEL: define <2 x i16> @obtain_i32_masked.i16(
+; LAZY-SAME: i32 [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <2 x i16>
+; LAZY-NEXT: [[FROM_MASK:%.*]] = and <2 x i16> [[FROM_CAST]], <i16 -1, i16 -256>
+; LAZY-NEXT: ret <2 x i16> [[FROM_MASK]]
+;
+; AGGRESSIVE-LABEL: define <2 x i16> @obtain_i32_masked.i16(
+; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <2 x i16>
+; AGGRESSIVE-NEXT: [[FROM_MASK:%.*]] = and <2 x i16> [[FROM_CAST]], <i16 -1, i16 -256>
+; AGGRESSIVE-NEXT: ret <2 x i16> [[FROM_MASK]]
+;
+ %get.0 = trunc i32 %from to i16
+
+ %shr.1 = lshr i32 %from, 16
+ %trunc.1 = trunc i32 %shr.1 to i16
+ %get.1 = and i16 %trunc.1, u0xff00
+
+ %build.0 = insertelement <2 x i16> poison, i16 %get.0, i32 0
+ %build.1 = insertelement <2 x i16> %build.0, i16 %get.1, i32 1
+ ret <2 x i16> %build.1
+}
+
+define <8 x i8> @obtain_i64(i64 %from) {
+; LAZY-LABEL: define <8 x i8> @obtain_i64(
+; LAZY-SAME: i64 [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <8 x i8>
+; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM_CAST]], <8 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; LAZY-NEXT: ret <8 x i8> [[FROM_SHUFFLE]]
+;
+; AGGRESSIVE-LABEL: define <8 x i8> @obtain_i64(
+; AGGRESSIVE-SAME: i64 [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <8 x i8>
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM_CAST]], <8 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AGGRESSIVE-NEXT: ret <8 x i8> [[FROM_SHUFFLE]]
+;
+ %get.0 = trunc i64 %from to i8
+
+ %shr.1 = lshr i64 %from, 8
+ %get.1 = trunc i64 %shr.1 to i8
+
+ %shr.2 = lshr i64 %from, 16
+ %get.2 = trunc i64 %shr.2 to i8
+
+ %shr.3 = lshr i64 %from, 24
+ %get.3 = trunc i64 %shr.3 to i8
+
+ %build.0 = insertelement <8 x i8> zeroinitializer, i8 %get.0, i32 0
+ %build.1 = insertelement <8 x i8> %build.0, i8 %get.1, i32 1
+ %build.2 = insertelement <8 x i8> %build.1, i8 %get.2, i32 2
+ %build.3 = insertelement <8 x i8> %build.2, i8 %get.3, i32 3
+ ret <8 x i8> %build.3
+}
+
+define <4 x i16> @obtain_i64.i16(i64 %from) {
+; LAZY-LABEL: define <4 x i16> @obtain_i64.i16(
+; LAZY-SAME: i64 [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <4 x i16>
+; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM_CAST]], <4 x i16> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; LAZY-NEXT: ret <4 x i16> [[FROM_SHUFFLE]]
+;
+; AGGRESSIVE-LABEL: define <4 x i16> @obtain_i64.i16(
+; AGGRESSIVE-SAME: i64 [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <4 x i16>
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM_CAST]], <4 x i16> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; AGGRESSIVE-NEXT: ret <4 x i16> [[FROM_SHUFFLE]]
+;
+ %get.0 = trunc i64 %from to i16
+
+ %shr.1 = lshr i64 %from, 16
+ %get.1 = trunc i64 %shr.1 to i16
+
+ %build.0 = insertelement <4 x i16> zeroinitializer, i16 %get.0, i32 0
+ %build.1 = insertelement <4 x i16> %build.0, i16 %get.1, i32 1
+ ret <4 x i16> %build.1
+}
+
+define <8 x i8> @obtain_i64_shifted(i64 %from) {
+; LAZY-LABEL: define <8 x i8> @obtain_i64_shifted(
+; LAZY-SAME: i64 [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <8 x i8>
+; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM_CAST]], <8 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
+; LAZY-NEXT: ret <8 x i8> [[FROM_SHUFFLE]]
+;
+; AGGRESSIVE-LABEL: define <8 x i8> @obtain_i64_shifted(
+; AGGRESSIVE-SAME: i64 [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <8 x i8>
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM_CAST]], <8 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
+; AGGRESSIVE-NEXT: ret <8 x i8> [[FROM_SHUFFLE]]
+;
+ %get.0 = trunc i64 %from to i8
+
+ %shr.1 = lshr i64 %from, 8
+ %get.1 = trunc i64 %shr.1 to i8
+
+ %shr.2 = lshr i64 %from, 16
+ %get.2 = trunc i64 %shr.2 to i8
+
+ %shr.3 = lshr i64 %from, 24
+ %get.3 = trunc i64 %shr.3 to i8
+
+ %build.0 = insertelement <8 x i8> zeroinitializer, i8 %get.0, i32 4
+ %build.1 = insertelement <8 x i8> %build.0, i8 %get.1, i32 5
+ %build.2 = insertelement <8 x i8> %build.1, i8 %get.2, i32 6
+ %build.3 = insertelement <8 x i8> %build.2, i8 %get.3, i32 7
+ ret <8 x i8> %build.3
+}
+
+define <4 x i16> @obtain_i64_shifted.i16(i64 %from) {
+; LAZY-LABEL: define <4 x i16> @obtain_i64_shifted.i16(
+; LAZY-SAME: i64 [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <4 x i16>
+; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM_CAST]], <4 x i16> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+; LAZY-NEXT: ret <4 x i16> [[FROM_SHUFFLE]]
+;
+; AGGRESSIVE-LABEL: define <4 x i16> @obtain_i64_shifted.i16(
+; AGGRESSIVE-SAME: i64 [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <4 x i16>
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM_CAST]], <4 x i16> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+; AGGRESSIVE-NEXT: ret <4 x i16> [[FROM_SHUFFLE]]
+;
+ %get.0 = trunc i64 %from to i16
+
+ %shr.1 = lshr i64 %from, 16
+ %get.1 = trunc i64 %shr.1 to i16
+
+ %build.0 = insertelement <4 x i16> zeroinitializer, i16 %get.0, i32 2
+ %build.1 = insertelement <4 x i16> %build.0, i16 %get.1, i32 3
+ ret <4 x i16> %build.1
+}
+
+define <8 x i8> @combine(<4 x i8> %bot, i32 %top) {
+; LAZY-LABEL: define <8 x i8> @combine(
+; LAZY-SAME: <4 x i8> [[BOT:%.*]], i32 [[TOP:%.*]]) {
+; LAZY-NEXT: [[TOP_CAST:%.*]] = bitcast i32 [[TOP]] to <4 x i8>
+; LAZY-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <4 x i8> [[BOT]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; LAZY-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <8 x i8> [[BOT_EXTRACT]], <8 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; LAZY-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <4 x i8> [[TOP_CAST]], <4 x i8> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3>
+; LAZY-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <8 x i8> [[TOP_EXTRACT]], <8 x i8> [[BOT_SHUFFLE]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; LAZY-NEXT: ret <8 x i8> [[TOP_SHUFFLE]]
+;
+; AGGRESSIVE-LABEL: define <8 x i8> @combine(
+; AGGRESSIVE-SAME: <4 x i8> [[BOT:%.*]], i32 [[TOP:%.*]]) {
+; AGGRESSIVE-NEXT: [[TOP_CAST:%.*]] = bitcast i32 [[TOP]] to <4 x i8>
+; AGGRESSIVE-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <4 x i8> [[BOT]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AGGRESSIVE-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <8 x i8> [[BOT_EXTRACT]], <8 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AGGRESSIVE-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <4 x i8> [[TOP_CAST]], <4 x i8> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3>
+; AGGRESSIVE-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <8 x i8> [[TOP_EXTRACT]], <8 x i8> [[BOT_SHUFFLE]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AGGRESSIVE-NEXT: ret <8 x i8> [[TOP_SHUFFLE]]
+;
+ %base = shufflevector <4 x i8> %bot, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+
+ %get.0 = trunc i32 %top to i8
+
+ %shr.1 = lshr i32 %top, 8
+ %get.1 = trunc i32 %shr.1 to i8
+
+ %shr.2 = lshr i32 %top, 16
+ %get.2 = trunc i32 %shr.2 to i8
+
+ %shr.3 = lshr i32 %top, 24
+ %get.3 = trunc i32 %shr.3 to i8
+
+ %build.0 = insertelement <8 x i8> %base, i8 %get.0, i32 4
+ %build.1 = insertelement <8 x i8> %build.0, i8 %get.1, i32 5
+ %build.2 = insertelement <8 x i8> %build.1, i8 %get.2, i32 6
+ %build.3 = insertelement <8 x i8> %build.2, i8 %get.3, i32 7
+
+ ret <8 x i8> %build.3
+}
+
+define <4 x i16> @combine.i16(<2 x i16> %bot, i32 %top) {
+; LAZY-LABEL: define <4 x i16> @combine.i16(
+; LAZY-SAME: <2 x i16> [[BOT:%.*]], i32 [[TOP:%.*]]) {
+; LAZY-NEXT: [[TOP_CAST:%.*]] = bitcast i32 [[TOP]] to <2 x i16>
+; LAZY-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <2 x i16> [[BOT]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; LAZY-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <4 x i16> [[BOT_EXTRACT]], <4 x i16> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; LAZY-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <2 x i16> [[TOP_CAST]], <2 x i16> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; LAZY-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <4 x i16> [[TOP_EXTRACT]], <4 x i16> [[BOT_SHUFFLE]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; LAZY-NEXT: ret <4 x i16> [[TOP_SHUFFLE]]
+;
+; AGGRESSIVE-LABEL: define <4 x i16> @combine.i16(
+; AGGRESSIVE-SAME: <2 x i16> [[BOT:%.*]], i32 [[TOP:%.*]]) {
+; AGGRESSIVE-NEXT: [[TOP_CAST:%.*]] = bitcast i32 [[TOP]] to <2 x i16>
+; AGGRESSIVE-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <2 x i16> [[BOT]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; AGGRESSIVE-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <4 x i16> [[BOT_EXTRACT]], <4 x i16> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; AGGRESSIVE-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <2 x i16> [[TOP_CAST]], <2 x i16> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; AGGRESSIVE-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <4 x i16> [[TOP_EXTRACT]], <4 x i16> [[BOT_SHUFFLE]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; AGGRESSIVE-NEXT: ret <4 x i16> [[TOP_SHUFFLE]]
+;
+ %base = shufflevector <2 x i16> %bot, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+
+ %get.0 = trunc i32 %top to i16
+
+ %shr.1 = lshr i32 %top, 16
+ %get.1 = trunc i32 %shr.1 to i16
+
+ %build.0 = insertelement <4 x i16> %base, i16 %get.0, i32 2
+ %build.1 = insertelement <4 x i16> %build.0, i16 %get.1, i32 3
+
+ ret <4 x i16> %build.1
+}
+
+define <4 x i8> @shuffle_elts(i32 %x) {
+; LAZY-LABEL: define <4 x i8> @shuffle_elts(
+; LAZY-SAME: i32 [[X:%.*]]) {
+; LAZY-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8
+; LAZY-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 8
+; LAZY-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8
+; LAZY-NEXT: [[SHR_2:%.*]] = lshr i32 [[X]], 16
+; LAZY-NEXT: [[X_2:%.*]] = trunc i32 [[SHR_2]] to i8
+; LAZY-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24
+; LAZY-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8
+; LAZY-NEXT: [[BUILD_0:%.*]] = insertelement <4 x i8> poison, i8 [[X_0]], i32 0
+; LAZY-NEXT: [[BUILD_1:%.*]] = insertelement <4 x i8> [[BUILD_0]], i8 [[X_1]], i32 2
+; LAZY-NEXT: [[BUILD_2:%.*]] = insertelement <4 x i8> [[BUILD_1]], i8 [[X_2]], i32 1
+; LAZY-NEXT: [[X_SHUFFLE4:%.*]] = insertelement <4 x i8> [[BUILD_2]], i8 [[X_3]], i32 3
+; LAZY-NEXT: ret <4 x i8> [[X_SHUFFLE4]]
+;
+; AGGRESSIVE-LABEL: define <4 x i8> @shuffle_elts(
+; AGGRESSIVE-SAME: i32 [[X:%.*]]) {
+; AGGRESSIVE-NEXT: [[X_CAST:%.*]] = bitcast i32 [[X]] to <4 x i8>
+; AGGRESSIVE-NEXT: [[X_SHUFFLE:%.*]] = shufflevector <4 x i8> [[X_CAST]], <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; AGGRESSIVE-NEXT: [[X_SHUFFLE2:%.*]] = shufflevector <4 x i8> [[X_CAST]], <4 x i8> [[X_SHUFFLE]], <4 x i32> <i32 4, i32 2, i32 6, i32 7>
+; AGGRESSIVE-NEXT: [[X_SHUFFLE4:%.*]] = shufflevector <4 x i8> [[X_CAST]], <4 x i8> [[X_SHUFFLE2]], <4 x i32> <i32 4, i32 5, i32 1, i32 7>
+; AGGRESSIVE-NEXT: ret <4 x i8> [[X_SHUFFLE4]]
+;
+ %x.0 = trunc i32 %x to i8
+
+ %shr.1 = lshr i32 %x, 8
+ %x.1 = trunc i32 %shr.1 to i8
+
+ %shr.2 = lshr i32 %x, 16
+ %x.2 = trunc i32 %shr.2 to i8
+
+ %shr.3 = lshr i32 %x, 24
+ %x.3 = trunc i32 %shr.3 to i8
+
+ %build.0 = insertelement <4 x i8> poison, i8 %x.0, i32 0
+ %build.1 = insertelement <4 x i8> %build.0, i8 %x.1, i32 2
+ %build.2 = insertelement <4 x i8> %build.1, i8 %x.2, i32 1
+ %build.3 = insertelement <4 x i8> %build.2, i8 %x.3, i32 3
+
+ ret <4 x i8> %build.3
+}
diff --git a/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll b/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll
new file mode 100644
index 0000000000000..22ee7fcf5b4c6
--- /dev/null
+++ b/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll
@@ -0,0 +1,480 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
+
+define i32 @extract_i32(<4 x i8> %from) {
+; LAZY-LABEL: define i32 @extract_i32(
+; LAZY-SAME: <4 x i8> [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to i32
+; LAZY-NEXT: ret i32 [[FROM_CAST]]
+;
+; AGGRESSIVE-LABEL: define i32 @extract_i32(
+; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to i32
+; AGGRESSIVE-NEXT: ret i32 [[FROM_CAST]]
+;
+ %mask.0 = extractelement <4 x i8> %from, i64 0
+ %get.0 = zext i8 %mask.0 to i32
+
+ %mask.1 = extractelement <4 x i8> %from, i64 1
+ %zext.1 = zext i8 %mask.1 to i32
+ %get.1 = shl i32 %zext.1, 8
+ %out.1 = or i32 %get.0, %get.1
+
+ %mask.2 = extractelement <4 x i8> %from, i64 2
+ %zext.2 = zext i8 %mask.2 to i32
+ %get.2 = shl i32 %zext.2, 16
+
+ %mask.3 = extractelement <4 x i8> %from, i64 3
+ %zext.3 = zext i8 %mask.3 to i32
+ %get.3 = shl i32 %zext.3, 24
+ %out.2 = or i32 %get.2, %get.3
+
+ %out = or i32 %out.1, %out.2
+ ret i32 %out
+}
+
+define i32 @extract_i32.i16(<2 x i16> %from) {
+; LAZY-LABEL: define i32 @extract_i32.i16(
+; LAZY-SAME: <2 x i16> [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to i32
+; LAZY-NEXT: ret i32 [[FROM_CAST]]
+;
+; AGGRESSIVE-LABEL: define i32 @extract_i32.i16(
+; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to i32
+; AGGRESSIVE-NEXT: ret i32 [[FROM_CAST]]
+;
+ %mask.0 = extractelement <2 x i16> %from, i64 0
+ %get.0 = zext i16 %mask.0 to i32
+
+ %mask.1 = extractelement <2 x i16> %from, i64 1
+ %zext.1 = zext i16 %mask.1 to i32
+ %get.1 = shl i32 %zext.1, 16
+
+ %out = or i32 %get.0, %get.1
+ ret i32 %out
+}
+
+define i32 @extract_i32_lower(<8 x i8> %from) {
+; LAZY-LABEL: define i32 @extract_i32_lower(
+; LAZY-SAME: <8 x i8> [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM]], <8 x i8> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; LAZY-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <4 x i8> [[FROM_SHUFFLE]] to i32
+; LAZY-NEXT: ret i32 [[FROM_SHUFFLE_CAST]]
+;
+; AGGRESSIVE-LABEL: define i32 @extract_i32_lower(
+; AGGRESSIVE-SAME: <8 x i8> [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM]], <8 x i8> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <4 x i8> [[FROM_SHUFFLE]] to i32
+; AGGRESSIVE-NEXT: ret i32 [[FROM_SHUFFLE_CAST]]
+;
+ %mask.0 = extractelement <8 x i8> %from, i64 4
+ %get.0 = zext i8 %mask.0 to i32
+
+ %mask.1 = extractelement <8 x i8> %from, i64 5
+ %zext.1 = zext i8 %mask.1 to i32
+ %get.1 = shl i32 %zext.1, 8
+ %out.1 = or i32 %get.0, %get.1
+
+ %mask.2 = extractelement <8 x i8> %from, i64 6
+ %zext.2 = zext i8 %mask.2 to i32
+ %get.2 = shl i32 %zext.2, 16
+
+ %mask.3 = extractelement <8 x i8> %from, i64 7
+ %zext.3 = zext i8 %mask.3 to i32
+ %get.3 = shl i32 %zext.3, 24
+ %out.2 = or i32 %get.2, %get.3
+
+ %out = or i32 %out.1, %out.2
+ ret i32 %out
+}
+
+define i32 @extract_i32_lower.i16(<4 x i16> %from) {
+; LAZY-LABEL: define i32 @extract_i32_lower.i16(
+; LAZY-SAME: <4 x i16> [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM]], <4 x i16> zeroinitializer, <2 x i32> <i32 2, i32 3>
+; LAZY-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <2 x i16> [[FROM_SHUFFLE]] to i32
+; LAZY-NEXT: ret i32 [[FROM_SHUFFLE_CAST]]
+;
+; AGGRESSIVE-LABEL: define i32 @extract_i32_lower.i16(
+; AGGRESSIVE-SAME: <4 x i16> [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM]], <4 x i16> zeroinitializer, <2 x i32> <i32 2, i32 3>
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <2 x i16> [[FROM_SHUFFLE]] to i32
+; AGGRESSIVE-NEXT: ret i32 [[FROM_SHUFFLE_CAST]]
+;
+ %mask.0 = extractelement <4 x i16> %from, i64 2
+ %get.0 = zext i16 %mask.0 to i32
+
+ %mask.1 = extractelement <4 x i16> %from, i64 3
+ %zext.1 = zext i16 %mask.1 to i32
+ %get.1 = shl i32 %zext.1, 16
+
+ %out = or i32 %get.0, %get.1
+ ret i32 %out
+}
+
+;; u0xff00ffff = -16711681
+define i32 @extract_i32_masked(<4 x i8> %from) {
+; LAZY-LABEL: define i32 @extract_i32_masked(
+; LAZY-SAME: <4 x i8> [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to i32
+; LAZY-NEXT: [[FROM_MASK:%.*]] = and i32 [[FROM_CAST]], -16711681
+; LAZY-NEXT: ret i32 [[FROM_MASK]]
+;
+; AGGRESSIVE-LABEL: define i32 @extract_i32_masked(
+; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to i32
+; AGGRESSIVE-NEXT: [[FROM_MASK:%.*]] = and i32 [[FROM_CAST]], -16711681
+; AGGRESSIVE-NEXT: ret i32 [[FROM_MASK]]
+;
+ %mask.0 = extractelement <4 x i8> %from, i64 0
+ %get.0 = zext i8 %mask.0 to i32
+
+ %mask.1 = extractelement <4 x i8> %from, i64 1
+ %zext.1 = zext i8 %mask.1 to i32
+ %get.1 = shl i32 %zext.1, 8
+ %out.1 = or i32 %get.0, %get.1
+
+ %mask.3 = extractelement <4 x i8> %from, i64 3
+ %zext.3 = zext i8 %mask.3 to i32
+ %get.3 = shl i32 %zext.3, 24
+ %out.2 = or i32 %out.1, %get.3
+
+ ret i32 %out.2
+}
+
+;; u0xff00ffff = -16711681
+define i32 @extract_i32_masked.i16(<2 x i16> %from) {
+; LAZY-LABEL: define i32 @extract_i32_masked.i16(
+; LAZY-SAME: <2 x i16> [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to i32
+; LAZY-NEXT: [[FROM_MASK:%.*]] = and i32 [[FROM_CAST]], -16711681
+; LAZY-NEXT: ret i32 [[FROM_MASK]]
+;
+; AGGRESSIVE-LABEL: define i32 @extract_i32_masked.i16(
+; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to i32
+; AGGRESSIVE-NEXT: [[FROM_MASK:%.*]] = and i32 [[FROM_CAST]], -16711681
+; AGGRESSIVE-NEXT: ret i32 [[FROM_MASK]]
+;
+ %mask.0 = extractelement <2 x i16> %from, i64 0
+ %get.0 = zext i16 %mask.0 to i32
+
+ %mask.1 = extractelement <2 x i16> %from, i64 1
+ %mask.1.1 = and i16 %mask.1, u0xff00
+ %zext.1 = zext i16 %mask.1.1 to i32
+ %get.1 = shl i32 %zext.1, 16
+
+ %out = or i32 %get.0, %get.1
+ ret i32 %out
+}
+
+define i64 @extract_i64(<4 x i8> %from) {
+; LAZY-LABEL: define i64 @extract_i64(
+; LAZY-SAME: <4 x i8> [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM]], <4 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+; LAZY-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <8 x i8> [[FROM_SHUFFLE]] to i64
+; LAZY-NEXT: ret i64 [[FROM_SHUFFLE_CAST]]
+;
+; AGGRESSIVE-LABEL: define i64 @extract_i64(
+; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM]], <4 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <8 x i8> [[FROM_SHUFFLE]] to i64
+; AGGRESSIVE-NEXT: ret i64 [[FROM_SHUFFLE_CAST]]
+;
+ %mask.0 = extractelement <4 x i8> %from, i64 0
+ %get.0 = zext i8 %mask.0 to i64
+
+ %mask.1 = extractelement <4 x i8> %from, i64 1
+ %zext.1 = zext i8 %mask.1 to i64
+ %get.1 = shl i64 %zext.1, 8
+ %out.1 = or i64 %get.0, %get.1
+
+ %mask.2 = extractelement <4 x i8> %from, i64 2
+ %zext.2 = zext i8 %mask.2 to i64
+ %get.2 = shl i64 %zext.2, 16
+
+ %mask.3 = extractelement <4 x i8> %from, i64 3
+ %zext.3 = zext i8 %mask.3 to i64
+ %get.3 = shl i64 %zext.3, 24
+ %out.2 = or i64 %get.2, %get.3
+
+ %out = or i64 %out.1, %out.2
+ ret i64 %out
+}
+
+define i64 @extract_i64.i16(<2 x i16> %from) {
+; LAZY-LABEL: define i64 @extract_i64.i16(
+; LAZY-SAME: <2 x i16> [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <2 x i16> [[FROM]], <2 x i16> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; LAZY-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <4 x i16> [[FROM_SHUFFLE]] to i64
+; LAZY-NEXT: ret i64 [[FROM_SHUFFLE_CAST]]
+;
+; AGGRESSIVE-LABEL: define i64 @extract_i64.i16(
+; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <2 x i16> [[FROM]], <2 x i16> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <4 x i16> [[FROM_SHUFFLE]] to i64
+; AGGRESSIVE-NEXT: ret i64 [[FROM_SHUFFLE_CAST]]
+;
+ %mask.0 = extractelement <2 x i16> %from, i64 0
+ %get.0 = zext i16 %mask.0 to i64
+
+ %mask.1 = extractelement <2 x i16> %from, i64 1
+ %zext.1 = zext i16 %mask.1 to i64
+ %get.1 = shl i64 %zext.1, 16
+
+ %out = or i64 %get.0, %get.1
+ ret i64 %out
+}
+
+define i64 @extract_i64_shifted(<4 x i8> %from) {
+; LAZY-LABEL: define i64 @extract_i64_shifted(
+; LAZY-SAME: <4 x i8> [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM]], <4 x i8> zeroinitializer, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 1, i32 2, i32 3>
+; LAZY-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <8 x i8> [[FROM_SHUFFLE]] to i64
+; LAZY-NEXT: ret i64 [[FROM_SHUFFLE_CAST]]
+;
+; AGGRESSIVE-LABEL: define i64 @extract_i64_shifted(
+; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM]], <4 x i8> zeroinitializer, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 1, i32 2, i32 3>
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <8 x i8> [[FROM_SHUFFLE]] to i64
+; AGGRESSIVE-NEXT: ret i64 [[FROM_SHUFFLE_CAST]]
+;
+ %mask.0 = extractelement <4 x i8> %from, i64 0
+ %zext.0 = zext i8 %mask.0 to i64
+ %get.0 = shl i64 %zext.0, 32
+
+ %mask.1 = extractelement <4 x i8> %from, i64 1
+ %zext.1 = zext i8 %mask.1 to i64
+ %get.1 = shl i64 %zext.1, 40
+ %out.1 = or i64 %get.0, %get.1
+
+ %mask.2 = extractelement <4 x i8> %from, i64 2
+ %zext.2 = zext i8 %mask.2 to i64
+ %get.2 = shl i64 %zext.2, 48
+
+ %mask.3 = extractelement <4 x i8> %from, i64 3
+ %zext.3 = zext i8 %mask.3 to i64
+ %get.3 = shl i64 %zext.3, 56
+ %out.2 = or i64 %get.2, %get.3
+
+ %out = or i64 %out.1, %out.2
+ ret i64 %out
+}
+
+define i64 @extract_i64_shifted.i16(<2 x i16> %from) {
+; LAZY-LABEL: define i64 @extract_i64_shifted.i16(
+; LAZY-SAME: <2 x i16> [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <2 x i16> [[FROM]], <2 x i16> zeroinitializer, <4 x i32> <i32 2, i32 2, i32 0, i32 1>
+; LAZY-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <4 x i16> [[FROM_SHUFFLE]] to i64
+; LAZY-NEXT: ret i64 [[FROM_SHUFFLE_CAST]]
+;
+; AGGRESSIVE-LABEL: define i64 @extract_i64_shifted.i16(
+; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <2 x i16> [[FROM]], <2 x i16> zeroinitializer, <4 x i32> <i32 2, i32 2, i32 0, i32 1>
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <4 x i16> [[FROM_SHUFFLE]] to i64
+; AGGRESSIVE-NEXT: ret i64 [[FROM_SHUFFLE_CAST]]
+;
+ %mask.0 = extractelement <2 x i16> %from, i64 0
+ %zext.0 = zext i16 %mask.0 to i64
+ %get.0 = shl i64 %zext.0, 32
+
+ %mask.1 = extractelement <2 x i16> %from, i64 1
+ %zext.1 = zext i16 %mask.1 to i64
+ %get.1 = shl i64 %zext.1, 48
+
+ %out = or i64 %get.0, %get.1
+ ret i64 %out
+}
+
+define i64 @extract_combine(i32 %bot, <4 x i8> %top) {
+; LAZY-LABEL: define i64 @extract_combine(
+; LAZY-SAME: i32 [[BOT:%.*]], <4 x i8> [[TOP:%.*]]) {
+; LAZY-NEXT: [[BOT_CAST:%.*]] = zext i32 [[BOT]] to i64
+; LAZY-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <4 x i8> [[TOP]], <4 x i8> zeroinitializer, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 1, i32 2, i32 3>
+; LAZY-NEXT: [[TOP_SHUFFLE_CAST:%.*]] = bitcast <8 x i8> [[TOP_SHUFFLE]] to i64
+; LAZY-NEXT: [[OUT_3_MERGE:%.*]] = or disjoint i64 [[BOT_CAST]], [[TOP_SHUFFLE_CAST]]
+; LAZY-NEXT: ret i64 [[OUT_3_MERGE]]
+;
+; AGGRESSIVE-LABEL: define i64 @extract_combine(
+; AGGRESSIVE-SAME: i32 [[BOT:%.*]], <4 x i8> [[TOP:%.*]]) {
+; AGGRESSIVE-NEXT: [[BOT_CAST:%.*]] = zext i32 [[BOT]] to i64
+; AGGRESSIVE-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <4 x i8> [[TOP]], <4 x i8> zeroinitializer, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 1, i32 2, i32 3>
+; AGGRESSIVE-NEXT: [[TOP_SHUFFLE_CAST:%.*]] = bitcast <8 x i8> [[TOP_SHUFFLE]] to i64
+; AGGRESSIVE-NEXT: [[OUT_3_MERGE:%.*]] = or disjoint i64 [[BOT_CAST]], [[TOP_SHUFFLE_CAST]]
+; AGGRESSIVE-NEXT: ret i64 [[OUT_3_MERGE]]
+;
+ %base = zext i32 %bot to i64
+
+ %mask.0 = extractelement <4 x i8> %top, i64 0
+ %zext.0 = zext i8 %mask.0 to i64
+ %get.0 = shl i64 %zext.0, 32
+ %out.0 = or i64 %base, %get.0
+
+ %mask.1 = extractelement <4 x i8> %top, i64 1
+ %zext.1 = zext i8 %mask.1 to i64
+ %get.1 = shl i64 %zext.1, 40
+ %out.1 = or i64 %out.0, %get.1
+
+ %mask.2 = extractelement <4 x i8> %top, i64 2
+ %zext.2 = zext i8 %mask.2 to i64
+ %get.2 = shl i64 %zext.2, 48
+ %out.2 = or i64 %out.1, %get.2
+
+ %mask.3 = extractelement <4 x i8> %top, i64 3
+ %zext.3 = zext i8 %mask.3 to i64
+ %get.3 = shl i64 %zext.3, 56
+ %out.3 = or i64 %out.2, %get.3
+
+ ret i64 %out.3
+}
+
+define i64 @extract_combine.i16(i32 %bot, <2 x i16> %top) {
+; LAZY-LABEL: define i64 @extract_combine.i16(
+; LAZY-SAME: i32 [[BOT:%.*]], <2 x i16> [[TOP:%.*]]) {
+; LAZY-NEXT: [[BOT_CAST:%.*]] = zext i32 [[BOT]] to i64
+; LAZY-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <2 x i16> [[TOP]], <2 x i16> zeroinitializer, <4 x i32> <i32 2, i32 2, i32 0, i32 1>
+; LAZY-NEXT: [[TOP_SHUFFLE_CAST:%.*]] = bitcast <4 x i16> [[TOP_SHUFFLE]] to i64
+; LAZY-NEXT: [[OUT_1_MERGE:%.*]] = or disjoint i64 [[BOT_CAST]], [[TOP_SHUFFLE_CAST]]
+; LAZY-NEXT: ret i64 [[OUT_1_MERGE]]
+;
+; AGGRESSIVE-LABEL: define i64 @extract_combine.i16(
+; AGGRESSIVE-SAME: i32 [[BOT:%.*]], <2 x i16> [[TOP:%.*]]) {
+; AGGRESSIVE-NEXT: [[BOT_CAST:%.*]] = zext i32 [[BOT]] to i64
+; AGGRESSIVE-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <2 x i16> [[TOP]], <2 x i16> zeroinitializer, <4 x i32> <i32 2, i32 2, i32 0, i32 1>
+; AGGRESSIVE-NEXT: [[TOP_SHUFFLE_CAST:%.*]] = bitcast <4 x i16> [[TOP_SHUFFLE]] to i64
+; AGGRESSIVE-NEXT: [[OUT_1_MERGE:%.*]] = or disjoint i64 [[BOT_CAST]], [[TOP_SHUFFLE_CAST]]
+; AGGRESSIVE-NEXT: ret i64 [[OUT_1_MERGE]]
+;
+ %base = zext i32 %bot to i64
+
+ %mask.0 = extractelement <2 x i16> %top, i64 0
+ %zext.0 = zext i16 %mask.0 to i64
+ %get.0 = shl i64 %zext.0, 32
+ %out.0 = or i64 %base, %get.0
+
+ %mask.1 = extractelement <2 x i16> %top, i64 1
+ %zext.1 = zext i16 %mask.1 to i64
+ %get.1 = shl i64 %zext.1, 48
+ %out.1 = or i64 %out.0, %get.1
+
+ ret i64 %out.1
+}
+
+define i32 @extract_bigelt.0(<4 x i64> %src) {
+; LAZY-LABEL: define i32 @extract_bigelt.0(
+; LAZY-SAME: <4 x i64> [[SRC:%.*]]) {
+; LAZY-NEXT: [[SRC_0:%.*]] = extractelement <4 x i64> [[SRC]], i64 3
+; LAZY-NEXT: [[TRUNC:%.*]] = trunc i64 [[SRC_0]] to i32
+; LAZY-NEXT: ret i32 [[TRUNC]]
+;
+; AGGRESSIVE-LABEL: define i32 @extract_bigelt.0(
+; AGGRESSIVE-SAME: <4 x i64> [[SRC:%.*]]) {
+; AGGRESSIVE-NEXT: [[SRC_CAST:%.*]] = bitcast <4 x i64> [[SRC]] to <8 x i32>
+; AGGRESSIVE-NEXT: [[SRC_EXTRACT:%.*]] = extractelement <8 x i32> [[SRC_CAST]], i32 6
+; AGGRESSIVE-NEXT: ret i32 [[SRC_EXTRACT]]
+;
+ %src.0 = extractelement <4 x i64> %src, i64 3
+ %trunc = trunc i64 %src.0 to i32
+ ret i32 %trunc
+}
+
+define i32 @extract_bigelt.1(<4 x i64> %src) {
+; LAZY-LABEL: define i32 @extract_bigelt.1(
+; LAZY-SAME: <4 x i64> [[SRC:%.*]]) {
+; LAZY-NEXT: [[SRC_0:%.*]] = extractelement <4 x i64> [[SRC]], i64 3
+; LAZY-NEXT: [[SHR:%.*]] = lshr i64 [[SRC_0]], 32
+; LAZY-NEXT: [[TRUNC:%.*]] = trunc i64 [[SHR]] to i32
+; LAZY-NEXT: ret i32 [[TRUNC]]
+;
+; AGGRESSIVE-LABEL: define i32 @extract_bigelt.1(
+; AGGRESSIVE-SAME: <4 x i64> [[SRC:%.*]]) {
+; AGGRESSIVE-NEXT: [[SRC_CAST:%.*]] = bitcast <4 x i64> [[SRC]] to <8 x i32>
+; AGGRESSIVE-NEXT: [[SRC_EXTRACT:%.*]] = extractelement <8 x i32> [[SRC_CAST]], i32 7
+; AGGRESSIVE-NEXT: ret i32 [[SRC_EXTRACT]]
+;
+ %src.0 = extractelement <4 x i64> %src, i64 3
+ %shr = lshr i64 %src.0, 32
+ %trunc = trunc i64 %shr to i32
+ ret i32 %trunc
+}
+
+;; Nothing happens because the shifted value does not line up with a 32-bit element boundary.
+define i32 @extract_bigelt.2(<4 x i64> %src) {
+; LAZY-LABEL: define i32 @extract_bigelt.2(
+; LAZY-SAME: <4 x i64> [[SRC:%.*]]) {
+; LAZY-NEXT: [[SRC_0:%.*]] = extractelement <4 x i64> [[SRC]], i64 3
+; LAZY-NEXT: [[SHR:%.*]] = lshr i64 [[SRC_0]], 16
+; LAZY-NEXT: [[TRUNC:%.*]] = trunc i64 [[SHR]] to i32
+; LAZY-NEXT: ret i32 [[TRUNC]]
+;
+; AGGRESSIVE-LABEL: define i32 @extract_bigelt.2(
+; AGGRESSIVE-SAME: <4 x i64> [[SRC:%.*]]) {
+; AGGRESSIVE-NEXT: [[SRC_0:%.*]] = extractelement <4 x i64> [[SRC]], i64 3
+; AGGRESSIVE-NEXT: [[SHR:%.*]] = lshr i64 [[SRC_0]], 16
+; AGGRESSIVE-NEXT: [[TRUNC:%.*]] = trunc i64 [[SHR]] to i32
+; AGGRESSIVE-NEXT: ret i32 [[TRUNC]]
+;
+ %src.0 = extractelement <4 x i64> %src, i64 3
+ %shr = lshr i64 %src.0, 16
+ %trunc = trunc i64 %shr to i32
+ ret i32 %trunc
+}
+
+;; u0x0000ff00 = 65280
+;; u0x00ff0000 = 16711680
+;; u0xff0000ff = -16776961
+define i32 @shuffle_elts(<4 x i8> %vec) {
+; LAZY-LABEL: define i32 @shuffle_elts(
+; LAZY-SAME: <4 x i8> [[VEC:%.*]]) {
+; LAZY-NEXT: [[VEC_0:%.*]] = extractelement <4 x i8> [[VEC]], i32 0
+; LAZY-NEXT: [[VEC_1:%.*]] = extractelement <4 x i8> [[VEC]], i32 1
+; LAZY-NEXT: [[VEC_2:%.*]] = extractelement <4 x i8> [[VEC]], i32 2
+; LAZY-NEXT: [[VEC_3:%.*]] = extractelement <4 x i8> [[VEC]], i32 3
+; LAZY-NEXT: [[ZEXT_0:%.*]] = zext i8 [[VEC_0]] to i32
+; LAZY-NEXT: [[ZEXT_1:%.*]] = zext i8 [[VEC_1]] to i32
+; LAZY-NEXT: [[SHL_1:%.*]] = shl i32 [[ZEXT_1]], 16
+; LAZY-NEXT: [[OUT_1:%.*]] = or i32 [[ZEXT_0]], [[SHL_1]]
+; LAZY-NEXT: [[ZEXT_2:%.*]] = zext i8 [[VEC_2]] to i32
+; LAZY-NEXT: [[SHL_2:%.*]] = shl i32 [[ZEXT_2]], 8
+; LAZY-NEXT: [[ZEXT_3:%.*]] = zext i8 [[VEC_3]] to i32
+; LAZY-NEXT: [[SHL_3:%.*]] = shl i32 [[ZEXT_3]], 24
+; LAZY-NEXT: [[OUT_2:%.*]] = or i32 [[SHL_2]], [[SHL_3]]
+; LAZY-NEXT: [[OUT:%.*]] = or i32 [[OUT_1]], [[OUT_2]]
+; LAZY-NEXT: ret i32 [[OUT]]
+;
+; AGGRESSIVE-LABEL: define i32 @shuffle_elts(
+; AGGRESSIVE-SAME: <4 x i8> [[VEC:%.*]]) {
+; AGGRESSIVE-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x i8> [[VEC]] to i32
+; AGGRESSIVE-NEXT: [[VEC_MASK:%.*]] = and i32 [[VEC_CAST]], -16776961
+; AGGRESSIVE-NEXT: [[VEC_SHIFT:%.*]] = lshr i32 [[VEC_CAST]], 8
+; AGGRESSIVE-NEXT: [[VEC_MASK2:%.*]] = and i32 [[VEC_SHIFT]], 65280
+; AGGRESSIVE-NEXT: [[VEC_SHIFT4:%.*]] = shl i32 [[VEC_CAST]], 8
+; AGGRESSIVE-NEXT: [[VEC_MASK6:%.*]] = and i32 [[VEC_SHIFT4]], 16711680
+; AGGRESSIVE-NEXT: [[OUT_MERGE:%.*]] = or disjoint i32 [[VEC_MASK]], [[VEC_MASK2]]
+; AGGRESSIVE-NEXT: [[OUT_MERGE8:%.*]] = or disjoint i32 [[OUT_MERGE]], [[VEC_MASK6]]
+; AGGRESSIVE-NEXT: ret i32 [[OUT_MERGE8]]
+;
+ %vec.0 = extractelement <4 x i8> %vec, i32 0
+ %vec.1 = extractelement <4 x i8> %vec, i32 1
+ %vec.2 = extractelement <4 x i8> %vec, i32 2
+ %vec.3 = extractelement <4 x i8> %vec, i32 3
+
+ %zext.0 = zext i8 %vec.0 to i32
+
+ %zext.1 = zext i8 %vec.1 to i32
+ %shl.1 = shl i32 %zext.1, 16
+ %out.1 = or i32 %zext.0, %shl.1
+
+ %zext.2 = zext i8 %vec.2 to i32
+ %shl.2 = shl i32 %zext.2, 8
+
+ %zext.3 = zext i8 %vec.3 to i32
+ %shl.3 = shl i32 %zext.3, 24
+ %out.2 = or i32 %shl.2, %shl.3
+
+ %out = or i32 %out.1, %out.2
+
+ ret i32 %out
+}
diff --git a/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll b/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll
new file mode 100644
index 0000000000000..5baefc7fb6cda
--- /dev/null
+++ b/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll
@@ -0,0 +1,294 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
+
+define <4 x i8> @obtain_v4i8(<2 x i16> %from) {
+; LAZY-LABEL: define <4 x i8> @obtain_v4i8(
+; LAZY-SAME: <2 x i16> [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to <4 x i8>
+; LAZY-NEXT: ret <4 x i8> [[FROM_CAST]]
+;
+; AGGRESSIVE-LABEL: define <4 x i8> @obtain_v4i8(
+; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to <4 x i8>
+; AGGRESSIVE-NEXT: ret <4 x i8> [[FROM_CAST]]
+;
+ %from.0 = extractelement <2 x i16> %from, i64 0
+ %mask.0 = trunc i16 %from.0 to i8
+ %shr.1 = lshr i16 %from.0, 8
+ %mask.1 = trunc i16 %shr.1 to i8
+
+ %from.1 = extractelement <2 x i16> %from, i64 1
+ %mask.2 = trunc i16 %from.1 to i8
+ %shr.3 = lshr i16 %from.1, 8
+ %mask.3 = trunc i16 %shr.3 to i8
+
+ %build.0 = insertelement <4 x i8> poison, i8 %mask.0, i64 0
+ %build.1 = insertelement <4 x i8> %build.0, i8 %mask.1, i64 1
+ %build.2 = insertelement <4 x i8> %build.1, i8 %mask.2, i64 2
+ %build.3 = insertelement <4 x i8> %build.2, i8 %mask.3, i64 3
+
+ ret <4 x i8> %build.3
+}
+
+define <2 x i16> @obtain_v2i16(<4 x i8> %from) {
+; LAZY-LABEL: define <2 x i16> @obtain_v2i16(
+; LAZY-SAME: <4 x i8> [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to <2 x i16>
+; LAZY-NEXT: ret <2 x i16> [[FROM_CAST]]
+;
+; AGGRESSIVE-LABEL: define <2 x i16> @obtain_v2i16(
+; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to <2 x i16>
+; AGGRESSIVE-NEXT: ret <2 x i16> [[FROM_CAST]]
+;
+ %from.0 = extractelement <4 x i8> %from, i64 0
+ %zext.0 = zext i8 %from.0 to i16
+
+ %from.1 = extractelement <4 x i8> %from, i64 1
+ %zext.1 = zext i8 %from.1 to i16
+ %shl.1 = shl i16 %zext.1, 8
+ %out.1 = or i16 %zext.0, %shl.1
+
+ %from.2 = extractelement <4 x i8> %from, i64 2
+ %zext.2 = zext i8 %from.2 to i16
+
+ %from.3 = extractelement <4 x i8> %from, i64 3
+ %zext.3 = zext i8 %from.3 to i16
+ %shl.3 = shl i16 %zext.3, 8
+ %out.2 = or i16 %zext.2, %shl.3
+
+ %build.0 = insertelement <2 x i16> poison, i16 %out.1, i64 0
+ %build.1 = insertelement <2 x i16> %build.0, i16 %out.2, i64 1
+
+ ret <2 x i16> %build.1
+}
+
+define <4 x i8> @obtain_v4i8_masked(<2 x i16> %from) {
+; LAZY-LABEL: define <4 x i8> @obtain_v4i8_masked(
+; LAZY-SAME: <2 x i16> [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to <4 x i8>
+; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM_CAST]], <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; LAZY-NEXT: ret <4 x i8> [[FROM_SHUFFLE]]
+;
+; AGGRESSIVE-LABEL: define <4 x i8> @obtain_v4i8_masked(
+; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to <4 x i8>
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM_CAST]], <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; AGGRESSIVE-NEXT: ret <4 x i8> [[FROM_SHUFFLE]]
+;
+ %from.0 = extractelement <2 x i16> %from, i64 0
+ %mask.0 = trunc i16 %from.0 to i8
+ %shr.1 = lshr i16 %from.0, 8
+ %mask.1 = trunc i16 %shr.1 to i8
+
+ %from.1 = extractelement <2 x i16> %from, i64 1
+ %shr.3 = lshr i16 %from.1, 8
+ %mask.3 = trunc i16 %shr.3 to i8
+
+ %build.0 = insertelement <4 x i8> <i8 poison, i8 poison, i8 0, i8 poison>, i8 %mask.0, i64 0
+ %build.1 = insertelement <4 x i8> %build.0, i8 %mask.1, i64 1
+ %build.3 = insertelement <4 x i8> %build.1, i8 %mask.3, i64 3
+
+ ret <4 x i8> %build.3
+}
+
+define <2 x i16> @obtain_v2i16_masked(<4 x i8> %from) {
+; LAZY-LABEL: define <2 x i16> @obtain_v2i16_masked(
+; LAZY-SAME: <4 x i8> [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to <2 x i16>
+; LAZY-NEXT: [[FROM_MASK:%.*]] = and <2 x i16> [[FROM_CAST]], <i16 -1, i16 -256>
+; LAZY-NEXT: ret <2 x i16> [[FROM_MASK]]
+;
+; AGGRESSIVE-LABEL: define <2 x i16> @obtain_v2i16_masked(
+; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to <2 x i16>
+; AGGRESSIVE-NEXT: [[FROM_MASK:%.*]] = and <2 x i16> [[FROM_CAST]], <i16 -1, i16 -256>
+; AGGRESSIVE-NEXT: ret <2 x i16> [[FROM_MASK]]
+;
+ %from.0 = extractelement <4 x i8> %from, i64 0
+ %zext.0 = zext i8 %from.0 to i16
+
+ %from.1 = extractelement <4 x i8> %from, i64 1
+ %zext.1 = zext i8 %from.1 to i16
+ %shl.1 = shl i16 %zext.1, 8
+ %out.1 = or i16 %zext.0, %shl.1
+
+ %from.3 = extractelement <4 x i8> %from, i64 3
+ %zext.3 = zext i8 %from.3 to i16
+ %shl.3 = shl i16 %zext.3, 8
+
+ %build.0 = insertelement <2 x i16> poison, i16 %out.1, i64 0
+ %build.1 = insertelement <2 x i16> %build.0, i16 %shl.3, i64 1
+
+ ret <2 x i16> %build.1
+}
+
+define <8 x i8> @obtain_v4i8_shifted(<2 x i16> %from) {
+; LAZY-LABEL: define <8 x i8> @obtain_v4i8_shifted(
+; LAZY-SAME: <2 x i16> [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to <4 x i8>
+; LAZY-NEXT: [[FROM_EXTRACT:%.*]] = shufflevector <4 x i8> [[FROM_CAST]], <4 x i8> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3>
+; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM_EXTRACT]], <8 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; LAZY-NEXT: ret <8 x i8> [[FROM_SHUFFLE]]
+;
+; AGGRESSIVE-LABEL: define <8 x i8> @obtain_v4i8_shifted(
+; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to <4 x i8>
+; AGGRESSIVE-NEXT: [[FROM_EXTRACT:%.*]] = shufflevector <4 x i8> [[FROM_CAST]], <4 x i8> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3>
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM_EXTRACT]], <8 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AGGRESSIVE-NEXT: ret <8 x i8> [[FROM_SHUFFLE]]
+;
+ %from.0 = extractelement <2 x i16> %from, i64 0
+ %mask.0 = trunc i16 %from.0 to i8
+ %shr.1 = lshr i16 %from.0, 8
+ %mask.1 = trunc i16 %shr.1 to i8
+
+ %from.1 = extractelement <2 x i16> %from, i64 1
+ %mask.2 = trunc i16 %from.1 to i8
+ %shr.3 = lshr i16 %from.1, 8
+ %mask.3 = trunc i16 %shr.3 to i8
+
+ %build.0 = insertelement <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison>, i8 %mask.0, i64 4
+ %build.1 = insertelement <8 x i8> %build.0, i8 %mask.1, i64 5
+ %build.2 = insertelement <8 x i8> %build.1, i8 %mask.2, i64 6
+ %build.3 = insertelement <8 x i8> %build.2, i8 %mask.3, i64 7
+
+ ret <8 x i8> %build.3
+}
+
+define <4 x i16> @obtain_v2i16_shifted(<4 x i8> %from) {
+; LAZY-LABEL: define <4 x i16> @obtain_v2i16_shifted(
+; LAZY-SAME: <4 x i8> [[FROM:%.*]]) {
+; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to <2 x i16>
+; LAZY-NEXT: [[FROM_EXTRACT:%.*]] = shufflevector <2 x i16> [[FROM_CAST]], <2 x i16> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM_EXTRACT]], <4 x i16> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; LAZY-NEXT: ret <4 x i16> [[FROM_SHUFFLE]]
+;
+; AGGRESSIVE-LABEL: define <4 x i16> @obtain_v2i16_shifted(
+; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) {
+; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to <2 x i16>
+; AGGRESSIVE-NEXT: [[FROM_EXTRACT:%.*]] = shufflevector <2 x i16> [[FROM_CAST]], <2 x i16> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM_EXTRACT]], <4 x i16> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; AGGRESSIVE-NEXT: ret <4 x i16> [[FROM_SHUFFLE]]
+;
+ %from.0 = extractelement <4 x i8> %from, i64 0
+ %zext.0 = zext i8 %from.0 to i16
+
+ %from.1 = extractelement <4 x i8> %from, i64 1
+ %zext.1 = zext i8 %from.1 to i16
+ %shl.1 = shl i16 %zext.1, 8
+ %out.1 = or i16 %zext.0, %shl.1
+
+ %from.2 = extractelement <4 x i8> %from, i64 2
+ %zext.2 = zext i8 %from.2 to i16
+
+ %from.3 = extractelement <4 x i8> %from, i64 3
+ %zext.3 = zext i8 %from.3 to i16
+ %shl.3 = shl i16 %zext.3, 8
+ %out.2 = or i16 %zext.2, %shl.3
+
+ %build.0 = insertelement <4 x i16> <i16 0, i16 0, i16 poison, i16 poison>, i16 %out.1, i64 2
+ %build.1 = insertelement <4 x i16> %build.0, i16 %out.2, i64 3
+
+ ret <4 x i16> %build.1
+}
+
+define <8 x i8> @combine_v4i8(<4 x i8> %bot, <2 x i16> %top) {
+; LAZY-LABEL: define <8 x i8> @combine_v4i8(
+; LAZY-SAME: <4 x i8> [[BOT:%.*]], <2 x i16> [[TOP:%.*]]) {
+; LAZY-NEXT: [[TOP_CAST:%.*]] = bitcast <2 x i16> [[TOP]] to <4 x i8>
+; LAZY-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <4 x i8> [[BOT]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; LAZY-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <8 x i8> [[BOT_EXTRACT]], <8 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; LAZY-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <4 x i8> [[TOP_CAST]], <4 x i8> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3>
+; LAZY-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <8 x i8> [[TOP_EXTRACT]], <8 x i8> [[BOT_SHUFFLE]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; LAZY-NEXT: ret <8 x i8> [[TOP_SHUFFLE]]
+;
+; AGGRESSIVE-LABEL: define <8 x i8> @combine_v4i8(
+; AGGRESSIVE-SAME: <4 x i8> [[BOT:%.*]], <2 x i16> [[TOP:%.*]]) {
+; AGGRESSIVE-NEXT: [[TOP_CAST:%.*]] = bitcast <2 x i16> [[TOP]] to <4 x i8>
+; AGGRESSIVE-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <4 x i8> [[BOT]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AGGRESSIVE-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <8 x i8> [[BOT_EXTRACT]], <8 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AGGRESSIVE-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <4 x i8> [[TOP_CAST]], <4 x i8> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3>
+; AGGRESSIVE-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <8 x i8> [[TOP_EXTRACT]], <8 x i8> [[BOT_SHUFFLE]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AGGRESSIVE-NEXT: ret <8 x i8> [[TOP_SHUFFLE]]
+;
+ %base = shufflevector <4 x i8> %bot, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+
+ %top.0 = extractelement <2 x i16> %top, i64 0
+ %mask.0 = trunc i16 %top.0 to i8
+ %shr.1 = lshr i16 %top.0, 8
+ %mask.1 = trunc i16 %shr.1 to i8
+
+ %top.1 = extractelement <2 x i16> %top, i64 1
+ %mask.2 = trunc i16 %top.1 to i8
+ %shr.3 = lshr i16 %top.1, 8
+ %mask.3 = trunc i16 %shr.3 to i8
+
+ %build.0 = insertelement <8 x i8> %base, i8 %mask.0, i64 4
+ %build.1 = insertelement <8 x i8> %build.0, i8 %mask.1, i64 5
+ %build.2 = insertelement <8 x i8> %build.1, i8 %mask.2, i64 6
+ %build.3 = insertelement <8 x i8> %build.2, i8 %mask.3, i64 7
+
+ ret <8 x i8> %build.3
+}
+
+define <4 x i16> @combine_v2i16(<2 x i16> %bot, <4 x i8> %top) {
+; LAZY-LABEL: define <4 x i16> @combine_v2i16(
+; LAZY-SAME: <2 x i16> [[BOT:%.*]], <4 x i8> [[TOP:%.*]]) {
+; LAZY-NEXT: [[TOP_CAST:%.*]] = bitcast <4 x i8> [[TOP]] to <2 x i16>
+; LAZY-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <2 x i16> [[BOT]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; LAZY-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <4 x i16> [[BOT_EXTRACT]], <4 x i16> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; LAZY-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <2 x i16> [[TOP_CAST]], <2 x i16> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; LAZY-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <4 x i16> [[TOP_EXTRACT]], <4 x i16> [[BOT_SHUFFLE]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; LAZY-NEXT: ret <4 x i16> [[TOP_SHUFFLE]]
+;
+; AGGRESSIVE-LABEL: define <4 x i16> @combine_v2i16(
+; AGGRESSIVE-SAME: <2 x i16> [[BOT:%.*]], <4 x i8> [[TOP:%.*]]) {
+; AGGRESSIVE-NEXT: [[TOP_CAST:%.*]] = bitcast <4 x i8> [[TOP]] to <2 x i16>
+; AGGRESSIVE-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <2 x i16> [[BOT]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; AGGRESSIVE-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <4 x i16> [[BOT_EXTRACT]], <4 x i16> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; AGGRESSIVE-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <2 x i16> [[TOP_CAST]], <2 x i16> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; AGGRESSIVE-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <4 x i16> [[TOP_EXTRACT]], <4 x i16> [[BOT_SHUFFLE]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; AGGRESSIVE-NEXT: ret <4 x i16> [[TOP_SHUFFLE]]
+;
+ %base = shufflevector <2 x i16> %bot, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+
+ %top.0 = extractelement <4 x i8> %top, i64 0
+ %zext.0 = zext i8 %top.0 to i16
+
+ %top.1 = extractelement <4 x i8> %top, i64 1
+ %zext.1 = zext i8 %top.1 to i16
+ %shl.1 = shl i16 %zext.1, 8
+ %out.1 = or i16 %zext.0, %shl.1
+
+ %top.2 = extractelement <4 x i8> %top, i64 2
+ %zext.2 = zext i8 %top.2 to i16
+
+ %top.3 = extractelement <4 x i8> %top, i64 3
+ %zext.3 = zext i8 %top.3 to i16
+ %shl.3 = shl i16 %zext.3, 8
+ %out.2 = or i16 %zext.2, %shl.3
+
+ %build.0 = insertelement <4 x i16> %base, i16 %out.1, i64 2
+ %build.1 = insertelement <4 x i16> %build.0, i16 %out.2, i64 3
+
+ ret <4 x i16> %build.1
+}
+
+define <4 x i8> @shuffle_elts(<4 x i8> %vec) {
+; LAZY-LABEL: define <4 x i8> @shuffle_elts(
+; LAZY-SAME: <4 x i8> [[VEC:%.*]]) {
+; LAZY-NEXT: [[SHUFFLED:%.*]] = shufflevector <4 x i8> [[VEC]], <4 x i8> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; LAZY-NEXT: ret <4 x i8> [[SHUFFLED]]
+;
+; AGGRESSIVE-LABEL: define <4 x i8> @shuffle_elts(
+; AGGRESSIVE-SAME: <4 x i8> [[VEC:%.*]]) {
+; AGGRESSIVE-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <4 x i8> [[VEC]], <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; AGGRESSIVE-NEXT: [[VEC_SHUFFLE2:%.*]] = shufflevector <4 x i8> [[VEC]], <4 x i8> [[VEC_SHUFFLE]], <4 x i32> <i32 4, i32 2, i32 6, i32 7>
+; AGGRESSIVE-NEXT: [[VEC_SHUFFLE4:%.*]] = shufflevector <4 x i8> [[VEC]], <4 x i8> [[VEC_SHUFFLE2]], <4 x i32> <i32 4, i32 5, i32 1, i32 7>
+; AGGRESSIVE-NEXT: ret <4 x i8> [[VEC_SHUFFLE4]]
+;
+ %shuffled = shufflevector <4 x i8> %vec, <4 x i8> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x i8> %shuffled
+}
>From 932bc6bf48264f927a3e051d5273808ee891ba8d Mon Sep 17 00:00:00 2001
From: Zach Goldthorpe <Zach.Goldthorpe at amd.com>
Date: Thu, 3 Jul 2025 18:42:57 -0500
Subject: [PATCH 2/2] Address a subset of the reviewer feedback.
---
llvm/include/llvm/Transforms/Scalar.h | 4 +-
.../Scalar/PackedIntegerCombinePass.h | 18 +-
llvm/lib/Passes/PassBuilder.cpp | 29 ++
llvm/lib/Passes/PassBuilderPipelines.cpp | 2 +-
llvm/lib/Passes/PassRegistry.def | 7 +-
.../Scalar/PackedIntegerCombinePass.cpp | 392 ++++++++----------
.../PackedIntegerCombine/instructions.ll | 51 +--
.../PackedIntegerCombine/int2int.ll | 4 +-
.../PackedIntegerCombine/int2vec.ll | 4 +-
.../PackedIntegerCombine/vec2int.ll | 4 +-
.../PackedIntegerCombine/vec2vec.ll | 4 +-
11 files changed, 228 insertions(+), 291 deletions(-)
diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h
index ec9d89507c375..bd5b112ed6105 100644
--- a/llvm/include/llvm/Transforms/Scalar.h
+++ b/llvm/include/llvm/Transforms/Scalar.h
@@ -159,7 +159,9 @@ LLVM_ABI extern char &InferAddressSpacesID;
// PackedIntegerCombinePass - Tracks individual bytes through instructions to
// systematically identify redundant byte packing or unpacking operations.
//
-LLVM_ABI FunctionPass *createPackedIntegerCombinePass();
+LLVM_ABI FunctionPass *
+createPackedIntegerCombinePass(unsigned MaxCollectionIterations = 2,
+ bool AggressiveRewriting = false);
//===----------------------------------------------------------------------===//
//
diff --git a/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h b/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h
index a5916e2e611cf..2ca56c06784dc 100644
--- a/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h
+++ b/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h
@@ -21,10 +21,26 @@
namespace llvm {
+struct PackedIntegerCombineOptions {
+ /// Maximum number of iterations to isolate final packed instructions.
+ unsigned MaxCollectionIterations = 2;
+ /// Aggressively rewrite packed instructions.
+ bool AggressiveRewriting = false;
+};
+
class PackedIntegerCombinePass
: public PassInfoMixin<PackedIntegerCombinePass> {
+
+ PackedIntegerCombineOptions Options;
+
public:
- PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ PackedIntegerCombinePass(PackedIntegerCombineOptions Options = {})
+ : Options(Options) {}
+
+ LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ LLVM_ABI void
+ printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
};
} // end namespace llvm
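
A minimal sketch of how the new options struct can be consumed from C++ under the new pass manager. The helper name addPackedIntCombine is hypothetical; the option type, its fields, and the pass constructor come from the header above, and addPass is the standard FunctionPassManager API.

#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/PackedIntegerCombinePass.h"
using namespace llvm;

// Hypothetical helper: schedule the pass with non-default options.
static void addPackedIntCombine(FunctionPassManager &FPM) {
  PackedIntegerCombineOptions Opts;
  Opts.MaxCollectionIterations = 4; // raise the cap from the default of 2
  Opts.AggressiveRewriting = true;  // aggressively rewrite packed instructions
  FPM.addPass(PackedIntegerCombinePass(Opts));
}

The legacy-PM entry point createPackedIntegerCombinePass(MaxCollectionIterations, AggressiveRewriting) exposes the same two knobs.
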
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 7a382ace34dbc..911a043fbae53 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1170,6 +1170,35 @@ Expected<bool> parseMergedLoadStoreMotionOptions(StringRef Params) {
return Result;
}
+Expected<PackedIntegerCombineOptions>
+parsePackedIntegerCombineOptions(StringRef Params) {
+ PackedIntegerCombineOptions Options;
+ while (!Params.empty()) {
+ StringRef ParamName;
+ std::tie(ParamName, Params) = Params.split(';');
+
+ if (ParamName.consume_front("max-iterations=")) {
+ if (ParamName.getAsInteger(0, Options.MaxCollectionIterations))
+ return make_error<StringError>(
+ formatv("invalid max iteration count for PackedIntegerCombine "
+ "pass: '{}'",
+ ParamName)
+ .str(),
+ inconvertibleErrorCode());
+ } else if (ParamName == "aggressive") {
+ Options.AggressiveRewriting = true;
+ } else {
+ return make_error<StringError>(
+ formatv("invalid argument for PackedIntegerCombinePass: '{}'",
+ ParamName)
+ .str(),
+ inconvertibleErrorCode());
+ }
+ }
+
+ return Options;
+}
+
Expected<GVNOptions> parseGVNOptions(StringRef Params) {
GVNOptions Result;
while (!Params.empty()) {
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 2da72606bc47a..9b2254607d954 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -2361,4 +2361,4 @@ AAManager PassBuilder::buildDefaultAAPipeline() {
bool PassBuilder::isInstrumentedPGOUse() const {
return (PGOOpt && PGOOpt->Action == PGOOptions::IRUse) ||
!UseCtxProfile.empty();
-}
\ No newline at end of file
+}
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 6f1c405a5efa7..812eb5d3c15c4 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -476,7 +476,6 @@ FUNCTION_PASS("objc-arc", ObjCARCOptPass())
FUNCTION_PASS("objc-arc-contract", ObjCARCContractPass())
FUNCTION_PASS("objc-arc-expand", ObjCARCExpandPass())
FUNCTION_PASS("pa-eval", PAEvalPass())
-FUNCTION_PASS("packedintcombine", PackedIntegerCombinePass())
FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass())
FUNCTION_PASS("pgo-memop-opt", PGOMemOPSizeOpt())
FUNCTION_PASS("place-safepoints", PlaceSafepointsPass())
@@ -628,6 +627,12 @@ FUNCTION_PASS_WITH_PARAMS(
return MergedLoadStoreMotionPass(Opts);
},
parseMergedLoadStoreMotionOptions, "no-split-footer-bb;split-footer-bb")
+FUNCTION_PASS_WITH_PARAMS(
+ "packed-integer-combine", "PackedIntegerCombinePass",
+ [](PackedIntegerCombineOptions Options) {
+ return PackedIntegerCombinePass(Options);
+ },
+ parsePackedIntegerCombineOptions, "max-iterations=N;aggressive")
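
With this registration, the parameters should also be reachable from the textual pipeline syntax, along the lines of opt -S -passes='packed-integer-combine<max-iterations=4;aggressive>' input.ll, where input.ll is a placeholder and the parameter spellings follow the parser added in PassBuilder.cpp above.
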
FUNCTION_PASS_WITH_PARAMS(
"print<access-info>", "LoopAccessInfoPrinterPass",
[](bool AllowPartial) {
diff --git a/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp b/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp
index 31edd28069a2b..fc3b57b71bf45 100644
--- a/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp
+++ b/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
///
-/// This file provides the interface for LLVM's Packed Integer Combine pass.
+/// This file implements LLVM's Packed Integer Combine pass.
/// This pass tries to treat integers as packed chunks of individual bytes,
/// and leverage this to coalesce needlessly fragmented
/// computations.
@@ -24,21 +24,11 @@
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
+#include <variant>
using namespace llvm;
-#define DEBUG_TYPE "packedintcombine"
-
-static cl::opt<unsigned> MaxCollectionIterations(
- "packedint-max-iterations",
- cl::desc("Maximum number of iterations to isolate final packed "
- "instructions. Set to 0 to iterate until convergence."),
- cl::init(2), cl::Hidden);
-
-static cl::opt<bool>
- AggressiveRewriting("packedint-aggressive-rewriter",
- cl::desc("Aggressively rewrite packed instructions."),
- cl::init(false), cl::Hidden);
+#define DEBUG_TYPE "packed-integer-combine"
namespace {
@@ -101,10 +91,10 @@ class Byte {
else
Base->printAsOperand(ROS, false);
- ROS << "[" << Integer << "]";
+ ROS << '[' << Integer << ']';
if (NewLine)
- ROS << "\n";
+ ROS << '\n';
}
LLVM_DUMP_METHOD void dump() const { print(errs(), true); }
@@ -128,7 +118,7 @@ struct ByteLayout {
};
/// Interpret the given type as a number of packed bytes, if possible.
-static std::optional<ByteLayout> tryGetByteLayout(const Type *Ty) {
+static std::optional<ByteLayout> getByteLayout(const Type *Ty) {
unsigned IntBitWidth, NumElts;
if (const auto *IntTy = dyn_cast<IntegerType>(Ty)) {
IntBitWidth = IntTy->getBitWidth();
@@ -148,13 +138,6 @@ static std::optional<ByteLayout> tryGetByteLayout(const Type *Ty) {
return ByteLayout{IntBitWidth / Byte::BitWidth, NumElts};
}
-/// Interpret the given type as a number of backed bytes (aborts if impossible).
-static ByteLayout getByteLayout(const Type *Ty) {
- const std::optional<ByteLayout> Layout = tryGetByteLayout(Ty);
- assert(Layout);
- return *Layout;
-}
-
/// A convenience class for combining Byte instances obtained from the same base
/// value, and with a common relative offset, which can hence be obtained
/// simultaneously.
@@ -167,7 +150,7 @@ struct CoalescedBytes {
///
/// For instance, if bytes 3, 4, 5 of some value %val are coalesced to provide
/// bytes 0, 1, 2 of the target %tgt, then ShrByteOffset = 3.
- signed SignedShrByteOffset;
+ int SignedShrByteOffset;
/// The bitmask identifying which bytes of the target value are covered by
/// these coalesced bytes.
///
@@ -176,12 +159,12 @@ struct CoalescedBytes {
/// be set, corresponding to the first three bytes of %tgt.
SmallBitVector Mask;
- explicit CoalescedBytes(Value &Base, signed Offset, SmallBitVector Mask)
+ explicit CoalescedBytes(Value &Base, int Offset, SmallBitVector Mask)
: Base(&Base), SignedShrByteOffset(Offset), Mask(Mask) {}
- explicit CoalescedBytes(Value &Base, signed Offset, unsigned NumBytes)
+ explicit CoalescedBytes(Value &Base, int Offset, unsigned NumBytes)
: Base(&Base), SignedShrByteOffset(Offset), Mask(NumBytes) {}
- bool alignsWith(Value *V, signed VOffset) const {
+ bool alignsWith(Value *V, int VOffset) const {
return Base == V && SignedShrByteOffset == VOffset;
}
@@ -206,16 +189,16 @@ struct CoalescedBytes {
for (unsigned Idx = 0; Idx < Mask.size(); ++Idx) {
if (Mask.test(Idx)) {
Base->printAsOperand(ROS, false);
- ROS << "[" << (static_cast<int>(Idx) + SignedShrByteOffset) << "]";
+ ROS << '[' << (static_cast<int>(Idx) + SignedShrByteOffset) << ']';
} else
ROS << 0;
ROS << "; ";
}
- ROS << "}";
+ ROS << '}';
if (NewLine)
- ROS << "\n";
+ ROS << '\n';
}
LLVM_DUMP_METHOD void dump() const { print(errs(), true); }
@@ -254,67 +237,23 @@ using ByteVector = SmallVector<ByteUse, 8>;
/// The decomposition of an IR value into its individual bytes, tracking where
/// each byte is obtained.
-class ByteDefinition {
- /// Enum classifying what Ptr points to.
- enum ByteType : uint8_t {
- /// Ptr's value is undefined.
- INVALID,
- /// The byte definition is given by a ByteVector, which is referenced (but
- /// not captured) by Ptr.
- VECTOR,
- /// The bytes are obtained from a (currently opaque) IR value, held by Ptr.
- VALUE,
- /// The bytes are obtained from a constant integer, held by Ptr.
- CONST_INT,
- /// The bytes are obtained from a constant vector of integers, held by Ptr.
- CONST_VEC,
- };
-
- ByteType DefType;
- void *Ptr;
+struct ByteDefinition {
+ std::variant<std::nullopt_t, ByteVector *, Value *> Ptr;
ByteLayout Layout;
- ByteDefinition(ByteType DefType, void *Ptr, ByteLayout Layout)
- : DefType(DefType), Ptr(Ptr), Layout(Layout) {}
public:
/// Indicate that a value cannot be decomposed into bytes in a known way.
- static ByteDefinition invalid() { return {INVALID, nullptr, {0, 0}}; }
+ static ByteDefinition invalid() { return {std::nullopt, {0, 0}}; }
/// Indicate that a value's bytes are known, and track their producers.
static ByteDefinition vector(ByteVector &Ref, ByteLayout Layout) {
- return {VECTOR, &Ref, Layout};
+ return {&Ref, Layout};
}
/// Indicate that a value's bytes are opaque.
static ByteDefinition value(Value &V) {
- return {VALUE, &V, getByteLayout(V.getType())};
- }
- /// Indicate that the bytes come from a constant integer.
- static ByteDefinition constInt(ConstantInt &Int) {
- return {CONST_INT, &Int, getByteLayout(Int.getType())};
- }
- /// Indicate that the bytes come from a constant vector of integers.
- static ByteDefinition constVec(Constant &Vec) {
- assert(Vec.getType()->isVectorTy());
- return {CONST_VEC, &Vec, getByteLayout(Vec.getType())};
- }
-
- ByteVector &getVector() const {
- assert(DefType == VECTOR);
- return *static_cast<ByteVector *>(Ptr);
- }
- Value &getValue() const {
- assert(DefType == VALUE);
- return *static_cast<Value *>(Ptr);
- }
- ConstantInt &getConstInt() const {
- assert(DefType == CONST_INT);
- return *static_cast<ConstantInt *>(Ptr);
- }
- Constant &getConstVec() const {
- assert(DefType == CONST_VEC);
- return *static_cast<Constant *>(Ptr);
+ return {&V, *getByteLayout(V.getType())};
}
- bool isValid() const { return DefType != INVALID; }
+ bool isValid() const { return !std::holds_alternative<std::nullopt_t>(Ptr); }
/// Return true iff the byte definition is valid.
operator bool() const { return isValid(); }
@@ -322,62 +261,65 @@ class ByteDefinition {
/// Get the definition of the byte at the specified byte offset, where 0 is
/// the least significant byte.
Byte getByte(unsigned Idx) const {
- switch (DefType) {
- default:
- llvm_unreachable("Invalid byte definition");
- case VECTOR:
- return getVector()[Idx].getByte();
- case VALUE:
- return Byte(getValue(), Idx);
- case CONST_INT:
- return Byte(getConstInt().getValue().extractBitsAsZExtValue(
- Byte::BitWidth, Idx * Byte::BitWidth));
- case CONST_VEC: {
- const auto &Vec = getConstVec();
- const ByteLayout Layout = getByteLayout(Vec.getType());
- const unsigned VecIdx = Idx / Layout.NumBytesPerElement;
- const unsigned EltIdx = Idx % Layout.NumBytesPerElement;
-
- Constant *Elt = Vec.getAggregateElement(VecIdx);
- if (const auto *Int = dyn_cast<ConstantInt>(Elt))
- return Byte(Int->getValue().extractBitsAsZExtValue(
- Byte::BitWidth, EltIdx * Byte::BitWidth));
-
- return Byte(*Elt, EltIdx);
- }
- }
+ struct Visitor {
+ unsigned Idx;
+
+ Byte operator()(std::nullopt_t) {
+ llvm_unreachable("Invalid byte definition");
+ }
+ Byte operator()(ByteVector *BV) { return (*BV)[Idx].getByte(); }
+ Byte operator()(Value *V) {
+ if (auto *Int = dyn_cast<ConstantInt>(V))
+ return Byte(Int->getValue().extractBitsAsZExtValue(
+ Byte::BitWidth, Idx * Byte::BitWidth));
+
+ if (V->getType()->isVectorTy()) {
+ if (auto *Vec = dyn_cast<Constant>(V)) {
+ const ByteLayout Layout = *getByteLayout(Vec->getType());
+ const unsigned VecIdx = Idx / Layout.NumBytesPerElement;
+ const unsigned EltIdx = Idx % Layout.NumBytesPerElement;
+
+ if (Constant *Elt = Vec->getAggregateElement(VecIdx)) {
+ if (const auto *Int = dyn_cast<ConstantInt>(Elt))
+ return Byte(Int->getValue().extractBitsAsZExtValue(
+ Byte::BitWidth, EltIdx * Byte::BitWidth));
+
+ return Byte(*Elt, EltIdx);
+ }
+ }
+ }
+
+ return Byte(*V, Idx);
+ }
+ };
+
+ return std::visit(Visitor{Idx}, Ptr);
}
const ByteLayout &getLayout() const { return Layout; }
void print(raw_ostream &ROS, bool NewLine = true) const {
- switch (DefType) {
- default:
- ROS << "[INVALID]";
- break;
- case VECTOR: {
- ByteVector &BV = getVector();
- ROS << "{ ";
- for (unsigned ByteIdx = 0; ByteIdx < BV.size(); ++ByteIdx)
- ROS << ByteIdx << ": " << BV[ByteIdx].getByte() << "; ";
- ROS << "}";
- break;
- }
- case VALUE:
- ROS << "(";
- getValue().printAsOperand(ROS);
- ROS << ")[0:" << Layout.getNumBytes() << "]";
- break;
- case CONST_INT:
- ROS << getConstInt();
- break;
- case CONST_VEC:
- ROS << getConstVec();
- break;
- }
+ struct Visitor {
+ raw_ostream &ROS;
+ const ByteLayout &Layout;
+
+ void operator()(std::nullopt_t) { ROS << "[INVALID]"; }
+ void operator()(ByteVector *BV) {
+ ROS << "{ ";
+ for (unsigned ByteIdx = 0; ByteIdx < BV->size(); ++ByteIdx)
+ ROS << ByteIdx << ": " << (*BV)[ByteIdx].getByte() << "; ";
+ ROS << '}';
+ }
+ void operator()(Value *V) {
+ ROS << '(';
+ V->printAsOperand(ROS);
+ ROS << ")[0:" << Layout.getNumBytes() << ']';
+ }
+ };
+ std::visit(Visitor{ROS, Layout}, Ptr);
if (NewLine)
- ROS << "\n";
+ ROS << '\n';
}
LLVM_DUMP_METHOD void dump() const { print(errs(), true); }
@@ -415,7 +357,6 @@ class ByteExpander final : public InstVisitor<ByteExpander, ByteVector> {
// Visitation implementations return `true` iff a new byte definition was
// successfully constructed.
- ByteVector visitAdd(BinaryOperator &I);
ByteVector visitAnd(BinaryOperator &I);
ByteVector visitOr(BinaryOperator &I);
ByteVector visitXor(BinaryOperator &I);
@@ -455,47 +396,18 @@ class ByteExpander final : public InstVisitor<ByteExpander, ByteVector> {
/// Iterate over all instructions in a function over several passes to
/// identify all final values and their byte definitions.
- std::vector<Instruction *> collectPIICandidates(Function &F);
+ std::vector<Instruction *>
+ collectPIICandidates(Function &F, unsigned MaxCollectionIterations);
};
-ByteVector ByteExpander::visitAdd(BinaryOperator &I) {
- const ByteDefinition LhsDef =
- getByteDefinitionIfIntermediateOperand(I.getOperand(0));
+ByteVector ByteExpander::visitAnd(BinaryOperator &I) {
const ByteDefinition RhsDef =
getByteDefinitionIfIntermediateOperand(I.getOperand(1));
- if (!LhsDef || !RhsDef)
+ if (!RhsDef)
return {};
-
- const ByteLayout &Layout = LhsDef.getLayout();
- const unsigned NumBytes = Layout.getNumBytes();
-
- ByteVector BV;
- BV.reserve(NumBytes);
-
- for (unsigned ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx) {
- const Byte Lhs = LhsDef.getByte(ByteIdx);
- const Byte Rhs = RhsDef.getByte(ByteIdx);
-
- const bool LhsIsZero = Lhs.isConstant() && Lhs.getConstant() == 0;
- const bool RhsIsZero = Rhs.isConstant() && Rhs.getConstant() == 0;
- if (LhsIsZero)
- BV.emplace_back(Rhs, RhsIsZero ? ByteUse::AllOperands : 1);
- else if (RhsIsZero)
- BV.emplace_back(Lhs, 0);
- else
- return {};
- }
-
- assert(BV.size() == NumBytes);
- return BV;
-}
-
-ByteVector ByteExpander::visitAnd(BinaryOperator &I) {
const ByteDefinition LhsDef =
getByteDefinitionIfIntermediateOperand(I.getOperand(0));
- const ByteDefinition RhsDef =
- getByteDefinitionIfIntermediateOperand(I.getOperand(1));
- if (!LhsDef || !RhsDef)
+ if (!LhsDef)
return {};
const ByteLayout &Layout = LhsDef.getLayout();
@@ -546,11 +458,13 @@ ByteVector ByteExpander::visitAnd(BinaryOperator &I) {
}
ByteVector ByteExpander::visitOr(BinaryOperator &I) {
- const ByteDefinition LhsDef =
- getByteDefinitionIfIntermediateOperand(I.getOperand(0));
const ByteDefinition RhsDef =
getByteDefinitionIfIntermediateOperand(I.getOperand(1));
- if (!LhsDef || !RhsDef)
+ if (!RhsDef)
+ return {};
+ const ByteDefinition LhsDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(0));
+ if (!LhsDef)
return {};
const ByteLayout &Layout = LhsDef.getLayout();
@@ -598,11 +512,13 @@ ByteVector ByteExpander::visitOr(BinaryOperator &I) {
}
ByteVector ByteExpander::visitXor(BinaryOperator &I) {
- const ByteDefinition LhsDef =
- getByteDefinitionIfIntermediateOperand(I.getOperand(0));
const ByteDefinition RhsDef =
getByteDefinitionIfIntermediateOperand(I.getOperand(1));
- if (!LhsDef || !RhsDef)
+ if (!RhsDef)
+ return {};
+ const ByteDefinition LhsDef =
+ getByteDefinitionIfIntermediateOperand(I.getOperand(0));
+ if (!LhsDef)
return {};
const ByteLayout &Layout = LhsDef.getLayout();
@@ -629,6 +545,10 @@ ByteVector ByteExpander::visitXor(BinaryOperator &I) {
}
ByteVector ByteExpander::visitShl(BinaryOperator &I) {
+ const auto *Const = dyn_cast<Constant>(I.getOperand(1));
+ if (!Const)
+ return {};
+
const ByteDefinition BaseDef =
getByteDefinitionIfIntermediateOperand(I.getOperand(0));
if (!BaseDef)
@@ -636,10 +556,6 @@ ByteVector ByteExpander::visitShl(BinaryOperator &I) {
const unsigned NumBytes = BaseDef.getLayout().getNumBytes();
- const auto *Const = dyn_cast<Constant>(I.getOperand(1));
- if (!Const)
- return {};
-
if (isa<ConstantInt>(Const)) {
const unsigned ShAmt = Const->getUniqueInteger().getLimitedValue();
if (ShAmt % Byte::BitWidth != 0)
@@ -683,6 +599,10 @@ ByteVector ByteExpander::visitShl(BinaryOperator &I) {
}
ByteVector ByteExpander::visitLShr(BinaryOperator &I) {
+ const auto *Const = dyn_cast<Constant>(I.getOperand(1));
+ if (!Const)
+ return {};
+
const ByteDefinition BaseDef =
getByteDefinitionIfIntermediateOperand(I.getOperand(0));
if (!BaseDef)
@@ -690,10 +610,6 @@ ByteVector ByteExpander::visitLShr(BinaryOperator &I) {
const unsigned NumBytes = BaseDef.getLayout().getNumBytes();
- const auto *Const = dyn_cast<Constant>(I.getOperand(1));
- if (!Const)
- return {};
-
if (isa<ConstantInt>(Const)) {
const unsigned ShAmt = Const->getUniqueInteger().getLimitedValue();
if (ShAmt % Byte::BitWidth != 0)
@@ -739,12 +655,12 @@ ByteVector ByteExpander::visitLShr(BinaryOperator &I) {
}
ByteVector ByteExpander::visitTruncInst(TruncInst &I) {
- const std::optional<ByteLayout> Layout = tryGetByteLayout(I.getType());
+ const std::optional<ByteLayout> Layout = getByteLayout(I.getType());
if (!Layout)
return {};
const std::optional<ByteLayout> SrcLayout =
- tryGetByteLayout(I.getOperand(0)->getType());
+ getByteLayout(I.getOperand(0)->getType());
if (!SrcLayout)
return {};
@@ -768,7 +684,7 @@ ByteVector ByteExpander::visitTruncInst(TruncInst &I) {
}
ByteVector ByteExpander::visitZExtInst(ZExtInst &I) {
- const std::optional<ByteLayout> Layout = tryGetByteLayout(I.getType());
+ const std::optional<ByteLayout> Layout = getByteLayout(I.getType());
if (!Layout)
return {};
@@ -798,7 +714,7 @@ ByteVector ByteExpander::visitZExtInst(ZExtInst &I) {
}
ByteVector ByteExpander::visitBitCastInst(BitCastInst &I) {
- const std::optional<ByteLayout> Layout = tryGetByteLayout(I.getType());
+ const std::optional<ByteLayout> Layout = getByteLayout(I.getType());
if (!Layout)
return {};
@@ -875,7 +791,7 @@ ByteVector ByteExpander::visitInsertElementInst(InsertElementInst &I) {
}
ByteVector ByteExpander::visitShuffleVectorInst(ShuffleVectorInst &I) {
- const std::optional<ByteLayout> Layout = tryGetByteLayout(I.getType());
+ const std::optional<ByteLayout> Layout = getByteLayout(I.getType());
if (!Layout)
return {};
@@ -961,16 +877,10 @@ ByteVector *ByteExpander::expandByteDefinition(Value *V) {
}
ByteDefinition ByteExpander::getByteDefinition(Value *V, bool ExpandDef) {
- const std::optional<ByteLayout> Layout = tryGetByteLayout(V->getType());
+ const std::optional<ByteLayout> Layout = getByteLayout(V->getType());
if (!Layout)
return ByteDefinition::invalid();
- if (auto *ConstInt = dyn_cast<ConstantInt>(V))
- return ByteDefinition::constInt(*ConstInt);
- if (auto *Const = dyn_cast<Constant>(V))
- if (Const->getType()->isVectorTy())
- return ByteDefinition::constVec(*Const);
-
if (ExpandDef)
if (ByteVector *BV = expandByteDefinition(V))
return ByteDefinition::vector(*BV, *Layout);
@@ -986,7 +896,7 @@ bool ByteExpander::checkIfIntermediate(Value *V, bool IsOperand) {
if (isa<Constant>(V))
return true;
- /// Short-circuit check.
+ // Short-circuit check.
if (IsOperand && V->hasOneUse())
return true;
@@ -997,13 +907,14 @@ bool ByteExpander::checkIfIntermediate(Value *V, bool IsOperand) {
return Definitions.contains(*FU.begin());
}
-std::vector<Instruction *> ByteExpander::collectPIICandidates(Function &F) {
+std::vector<Instruction *>
+ByteExpander::collectPIICandidates(Function &F,
+ unsigned MaxCollectionIterations) {
std::vector<Instruction *> PackedIntInsts;
- LLVM_DEBUG(dbgs() << "PICP: Entering function " << F.getName() << "\n");
unsigned NumIterations = 1;
for (;;) {
- LLVM_DEBUG(dbgs() << "PICP: Iteration " << NumIterations << "\n");
+ LLVM_DEBUG(dbgs() << "PICP: Iteration " << NumIterations << '\n');
bool Converged = true;
std::vector<Instruction *> CollectedInsts;
@@ -1034,7 +945,7 @@ std::vector<Instruction *> ByteExpander::collectPIICandidates(Function &F) {
LLVM_DEBUG({
dbgs() << "PICP: Updating definition: ";
I.printAsOperand(dbgs());
- dbgs() << " = " << getByteDefinition(&I) << "\n";
+ dbgs() << " = " << getByteDefinition(&I) << '\n';
});
}
}
@@ -1056,7 +967,7 @@ std::vector<Instruction *> ByteExpander::collectPIICandidates(Function &F) {
++NumIterations;
}
- LLVM_DEBUG(dbgs() << "PICP: Total iterations: " << NumIterations << "\n");
+ LLVM_DEBUG(dbgs() << "PICP: Total iterations: " << NumIterations << '\n');
return PackedIntInsts;
}
@@ -1203,7 +1114,7 @@ class BytePackFolder {
LLVM_DEBUG({
dbgs() << "PICP [";
TargetInst->printAsOperand(dbgs());
- dbgs() << "]: Queuing cast " << *CI << "\n";
+ dbgs() << "]: Queuing cast " << *CI << '\n';
});
return CI;
}
@@ -1216,7 +1127,7 @@ class BytePackFolder {
LLVM_DEBUG({
dbgs() << "PICP [";
TargetInst->printAsOperand(dbgs());
- dbgs() << "]: Queuing inst " << *I << "\n";
+ dbgs() << "]: Queuing inst " << *I << '\n';
});
return I;
}
@@ -1346,7 +1257,7 @@ class BytePackFolder {
const unsigned NumTargetBytes = Layout.getNumBytes();
Value *V = CB.Base;
- const unsigned NumSrcBytes = getByteLayout(V->getType()).getNumBytes();
+ const unsigned NumSrcBytes = getByteLayout(V->getType())->getNumBytes();
const StringRef &Name = V->getName();
// Transformation: shr -> trunc -> mask -> zext -> shl
@@ -1418,7 +1329,7 @@ class BytePackFolder {
const unsigned NumTargetBytes = Layout.getNumBytes();
Value *V = CB.Base;
const StringRef &Name = V->getName();
- ByteLayout VecLayout = getByteLayout(V->getType());
+ ByteLayout VecLayout = *getByteLayout(V->getType());
// For sub-element accesses, try to subdivide the vector into smaller
// elements.
@@ -1431,7 +1342,7 @@ class BytePackFolder {
auto *NewTy = FixedVectorType::get(TargetIntTy, VecLayout.NumVecElements *
SplitFactor);
V = pushCast(Instruction::BitCast, V, NewTy);
- VecLayout = getByteLayout(V->getType());
+ VecLayout = *getByteLayout(V->getType());
}
// Give up if bytes are obtained from a strange offset.
@@ -1526,7 +1437,7 @@ class BytePackFolder {
auto *TargetVecTy = cast<FixedVectorType>(TargetInst->getType());
Type *TargetEltTy = TargetVecTy->getElementType();
- const ByteLayout SrcLayout = getByteLayout(CB.Base->getType());
+ const ByteLayout SrcLayout = *getByteLayout(CB.Base->getType());
Value *V = CB.Base;
const StringRef &Name = V->getName();
@@ -1602,7 +1513,7 @@ class BytePackFolder {
public:
BytePackFolder(Instruction *TargetV)
- : TargetInst(TargetV), Layout(getByteLayout(TargetV->getType())),
+ : TargetInst(TargetV), Layout(*getByteLayout(TargetV->getType())),
VectorAlignedPack(PartialBytePack::invalid()) {}
~BytePackFolder() {
@@ -1612,7 +1523,7 @@ class BytePackFolder {
LLVM_DEBUG({
dbgs() << "PICP [";
TargetInst->printAsOperand(dbgs());
- dbgs() << "]: Dequeuing cast " << *I << "\n";
+ dbgs() << "]: Dequeuing cast " << *I << '\n';
});
I->replaceAllUsesWith(PoisonValue::get(I->getType()));
I->deleteValue();
@@ -1622,7 +1533,7 @@ class BytePackFolder {
LLVM_DEBUG({
dbgs() << "PICP [";
TargetInst->printAsOperand(dbgs());
- dbgs() << "]: Dequeuing inst " << *Insts.back() << "\n";
+ dbgs() << "]: Dequeuing inst " << *Insts.back() << '\n';
});
Insts.back()->deleteValue();
Insts.pop_back();
@@ -1632,15 +1543,17 @@ class BytePackFolder {
/// Try to generate instructions for coalescing the given bytes and aligning
/// them to the target value. Returns true iff this is successful.
bool pushCoalescedBytes(CoalescedBytes CB) {
- if (isa<Constant>(CB.Base) && CB.SignedShrByteOffset == 0) {
- WorkList.emplace_back(CB.Base, CB.Mask);
- return true;
- }
+ if (CB.SignedShrByteOffset == 0)
+ if (auto *Const = dyn_cast<Constant>(CB.Base)) {
+ WorkList.emplace_back(
+ ConstantExpr::getBitCast(Const, TargetInst->getType()), CB.Mask);
+ return true;
+ }
LLVM_DEBUG({
dbgs() << "PICP [";
TargetInst->printAsOperand(dbgs());
- dbgs() << "]: Preparing bytes " << CB << "\n";
+ dbgs() << "]: Preparing bytes " << CB << '\n';
});
if (isa<FixedVectorType>(TargetInst->getType())) {
if (isa<FixedVectorType>(CB.Base->getType()))
@@ -1735,8 +1648,9 @@ struct PackedIntInstruction {
/// If the rewriter is non-aggressive, return nullopt if the rewriting is
/// determined to be unnecessary.
static std::optional<SmallVector<CoalescedBytes, 8>>
-getCoalescingOpportunity(Type *Ty, const ByteVector &BV) {
- const ByteLayout Layout = getByteLayout(Ty);
+getCoalescingOpportunity(Type *Ty, const ByteVector &BV,
+ bool AggressiveRewriting) {
+ const ByteLayout Layout = *getByteLayout(Ty);
assert(Layout.getNumBytes() == BV.size() &&
"Byte definition has unexpected width.");
@@ -1761,8 +1675,8 @@ getCoalescingOpportunity(Type *Ty, const ByteVector &BV) {
} else {
CoalescedBytes *CB = nullptr;
Value *Base = B.getBase();
- const signed Offset =
- static_cast<signed>(B.getIndex()) - static_cast<signed>(ByteIdx);
+ const int Offset =
+ static_cast<int>(B.getIndex()) - static_cast<int>(ByteIdx);
for (unsigned CBIdx = 0; CBIdx < CBV.size(); ++CBIdx) {
if (CBV[CBIdx].alignsWith(Base, Offset)) {
CB = &CBV[CBIdx];
@@ -1773,7 +1687,7 @@ getCoalescingOpportunity(Type *Ty, const ByteVector &BV) {
LLVM_DEBUG(dbgs()
<< "PICP: Bytes " << *CB << " from operand " << OpIdx
<< " can be coalesced with byte " << B
- << " from operand " << BU.getOperandIndex() << "\n");
+ << " from operand " << BU.getOperandIndex() << '\n');
OperandsAlreadyCoalesced = false;
}
}
@@ -1831,7 +1745,8 @@ getCoalescingOpportunity(Type *Ty, const ByteVector &BV) {
/// Queue into \p PIIV the set of final values (or operands thereof, if the
/// rewriter is non-aggressive) which are deemed beneficial to rewrite.
static void queueRewriting(std::vector<PackedIntInstruction> &PIIV,
- Instruction &FinalInst, ByteExpander &BE) {
+ Instruction &FinalInst, ByteExpander &BE,
+ bool AggressiveRewriting) {
SmallVector<Instruction *, 8> WorkList{&FinalInst};
SmallPtrSet<Instruction *, 8> Seen{&FinalInst};
@@ -1844,15 +1759,15 @@ static void queueRewriting(std::vector<PackedIntInstruction> &PIIV,
// This instruction is beyond the analysis scope of PICP.
continue;
- LLVM_DEBUG(dbgs() << "PICP rewrite candidate: " << *I << "\n"
+ LLVM_DEBUG(dbgs() << "PICP rewrite candidate: " << *I << '\n'
<< " byte pack: " << BE.getByteDefinition(I)
- << "\n");
+ << '\n');
auto CBV = [&]() -> std::optional<SmallVector<CoalescedBytes, 8>> {
// Short-circuit check for casts.
if (!AggressiveRewriting && I->getNumOperands() == 1)
return std::nullopt;
- return getCoalescingOpportunity(I->getType(), *BV);
+ return getCoalescingOpportunity(I->getType(), *BV, AggressiveRewriting);
}();
if (!CBV) {
@@ -1868,19 +1783,20 @@ static void queueRewriting(std::vector<PackedIntInstruction> &PIIV,
} while (!WorkList.empty());
}
-static bool runImpl(Function &F) {
+static bool runImpl(Function &F, PackedIntegerCombineOptions Options) {
ByteExpander BE;
- std::vector<Instruction *> PIICandidates = BE.collectPIICandidates(F);
+ std::vector<Instruction *> PIICandidates =
+ BE.collectPIICandidates(F, Options.MaxCollectionIterations);
std::vector<PackedIntInstruction> PIIV;
for (Instruction *I : PIICandidates) {
if (!BE.checkIfIntermediate(I))
- queueRewriting(PIIV, *I, BE);
+ queueRewriting(PIIV, *I, BE, Options.AggressiveRewriting);
else
- LLVM_DEBUG(dbgs() << "PICP intermediate inst: " << *I << "\n"
+ LLVM_DEBUG(dbgs() << "PICP intermediate inst: " << *I << '\n'
<< " final user: "
- << **BE.getFinalUsers(I).begin() << "\n");
+ << **BE.getFinalUsers(I).begin() << '\n');
}
DenseMap<Instruction *, Value *> InstSubs;
@@ -1888,7 +1804,7 @@ static bool runImpl(Function &F) {
for (const PackedIntInstruction &PII : PIIV)
if (Value *V = PII.rewrite(IRB)) {
LLVM_DEBUG(dbgs() << "PICP rewrite successful for " << *PII.TargetInst
- << "\n");
+ << '\n');
InstSubs[PII.TargetInst] = V;
}
@@ -1907,12 +1823,15 @@ static bool runImpl(Function &F) {
}
class PackedIntegerCombineLegacyPass : public FunctionPass {
+ PackedIntegerCombineOptions Options;
+
public:
static char ID;
- PackedIntegerCombineLegacyPass() : FunctionPass(ID) {}
+ PackedIntegerCombineLegacyPass(PackedIntegerCombineOptions Options)
+ : FunctionPass(ID), Options(Options) {}
- bool runOnFunction(Function &F) override { return runImpl(F); }
+ bool runOnFunction(Function &F) override { return runImpl(F, Options); }
};
char PackedIntegerCombineLegacyPass::ID = 0;
@@ -1920,7 +1839,7 @@ char PackedIntegerCombineLegacyPass::ID = 0;
PreservedAnalyses PackedIntegerCombinePass::run(Function &F,
FunctionAnalysisManager &AM) {
- if (!runImpl(F))
+ if (!runImpl(F, Options))
return PreservedAnalyses::all();
PreservedAnalyses PA;
@@ -1928,9 +1847,22 @@ PreservedAnalyses PackedIntegerCombinePass::run(Function &F,
return PA;
}
+void PackedIntegerCombinePass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<PackedIntegerCombinePass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+ OS << '<';
+ if (Options.AggressiveRewriting)
+ OS << "aggressive;";
+ OS << "max-iterations=" << Options.MaxCollectionIterations << '>';
+}
+
INITIALIZE_PASS(PackedIntegerCombineLegacyPass, DEBUG_TYPE,
"Packed Integer Combine", false, false)
-FunctionPass *llvm::createPackedIntegerCombinePass() {
- return new PackedIntegerCombineLegacyPass();
+FunctionPass *
+llvm::createPackedIntegerCombinePass(unsigned MaxCollectionIterations,
+ bool AggressiveRewriting) {
+ return new PackedIntegerCombineLegacyPass(
+ {MaxCollectionIterations, AggressiveRewriting});
}
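
For reference, the new pass-pipeline spelling exercised by the updated tests is -passes="packed-integer-combine<aggressive>", and printPipeline above also emits a max-iterations=<N> parameter, so that form presumably round-trips through the pipeline parser as well. Below is a minimal usage sketch (not part of this patch) of the legacy entry point, assuming only the factory signature shown in this diff; the option values are illustrative.

// Sketch: build the legacy pass with explicit options, mirroring the
// factory signature introduced above. Values chosen here are arbitrary.
#include "llvm/Transforms/Scalar.h"

llvm::FunctionPass *createConfiguredPICP() {
  // 2 collection iterations, aggressive rewriting enabled.
  return llvm::createPackedIntegerCombinePass(/*MaxCollectionIterations=*/2,
                                              /*AggressiveRewriting=*/true);
}
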
diff --git a/llvm/test/Transforms/PackedIntegerCombine/instructions.ll b/llvm/test/Transforms/PackedIntegerCombine/instructions.ll
index e7b6a6bc66fa1..c3804eeff6891 100644
--- a/llvm/test/Transforms/PackedIntegerCombine/instructions.ll
+++ b/llvm/test/Transforms/PackedIntegerCombine/instructions.ll
@@ -1,53 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
-; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
-
-;; u0xff00ff00 = -16711936
-;; u0x00ff00ff = 16711935
-define i32 @add.0(i32 %a, i32 %b) {
-; LAZY-LABEL: define i32 @add.0(
-; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
-; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711936
-; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935
-; LAZY-NEXT: [[ADD:%.*]] = add i32 [[A_MASK]], [[B_MASK]]
-; LAZY-NEXT: ret i32 [[ADD]]
-;
-; AGGRESSIVE-LABEL: define i32 @add.0(
-; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
-; AGGRESSIVE-NEXT: [[B_MASK2:%.*]] = and i32 [[B]], 16711935
-; AGGRESSIVE-NEXT: [[A_MASK4:%.*]] = and i32 [[A]], -16711936
-; AGGRESSIVE-NEXT: [[ADD_MERGE:%.*]] = or disjoint i32 [[B_MASK2]], [[A_MASK4]]
-; AGGRESSIVE-NEXT: ret i32 [[ADD_MERGE]]
-;
- %a.mask = and i32 %a, u0xff00ff00
- %b.mask = and i32 %b, u0x00ff00ff
- %add = add i32 %a.mask, %b.mask
- ret i32 %add
-}
-
-;; u0xff00ffff = -16711681
-;; u0x00ff00ff = 16711935
-;; Nothing happens in this case because of the overlapping bytes.
-define i32 @add.1(i32 %a, i32 %b) {
-; LAZY-LABEL: define i32 @add.1(
-; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
-; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711681
-; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935
-; LAZY-NEXT: [[ADD:%.*]] = add i32 [[A_MASK]], [[B_MASK]]
-; LAZY-NEXT: ret i32 [[ADD]]
-;
-; AGGRESSIVE-LABEL: define i32 @add.1(
-; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
-; AGGRESSIVE-NEXT: [[A_MASK2:%.*]] = and i32 [[A]], -16711681
-; AGGRESSIVE-NEXT: [[B_MASK4:%.*]] = and i32 [[B]], 16711935
-; AGGRESSIVE-NEXT: [[ADD:%.*]] = add i32 [[A_MASK2]], [[B_MASK4]]
-; AGGRESSIVE-NEXT: ret i32 [[ADD]]
-;
- %a.mask = and i32 %a, u0xff00ffff
- %b.mask = and i32 %b, u0x00ff00ff
- %add = add i32 %a.mask, %b.mask
- ret i32 %add
-}
+; RUN: opt -S -passes=packed-integer-combine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes="packed-integer-combine<aggressive>" %s | FileCheck %s --check-prefix=AGGRESSIVE
define i32 @and.0(i32 %a, i32 %b) {
; LAZY-LABEL: define i32 @and.0(
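
For context on the tests dropped above: add.0 and add.1 covered the add-merging rewrite handled by the previous revision's visitAdd, which this update deletes along with its checks. Restating the removed check lines, the aggressive rewriter used to fold an add of byte-disjoint masked values into a disjoint or, while the overlapping-byte case (add.1) was left untouched:

  %a.mask = and i32 %a, -16711936   ; 0xff00ff00
  %b.mask = and i32 %b, 16711935    ; 0x00ff00ff
  %add = add i32 %a.mask, %b.mask
  ; previously rewritten (aggressive mode) to:
  %add.merge = or disjoint i32 %b.mask, %a.mask
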
diff --git a/llvm/test/Transforms/PackedIntegerCombine/int2int.ll b/llvm/test/Transforms/PackedIntegerCombine/int2int.ll
index fe08ce93719d0..274d01f4ad70f 100644
--- a/llvm/test/Transforms/PackedIntegerCombine/int2int.ll
+++ b/llvm/test/Transforms/PackedIntegerCombine/int2int.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
-; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
+; RUN: opt -S -passes=packed-integer-combine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes="packed-integer-combine<aggressive>" %s | FileCheck %s --check-prefix=AGGRESSIVE
define i16 @top_bytes(i32 %a, i32 %b) {
; LAZY-LABEL: define i16 @top_bytes(
diff --git a/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll b/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll
index e4c1538826e0f..ad4424a84a6cc 100644
--- a/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll
+++ b/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
-; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
+; RUN: opt -S -passes=packed-integer-combine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes="packed-integer-combine<aggressive>" %s | FileCheck %s --check-prefix=AGGRESSIVE
define <2 x i8> @top_bytes(i32 %a, i32 %b) {
; LAZY-LABEL: define <2 x i8> @top_bytes(
diff --git a/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll b/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll
index 22ee7fcf5b4c6..624f2b4f8acac 100644
--- a/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll
+++ b/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
-; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
+; RUN: opt -S -passes=packed-integer-combine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes="packed-integer-combine<aggressive>" %s | FileCheck %s --check-prefix=AGGRESSIVE
define i32 @extract_i32(<4 x i8> %from) {
; LAZY-LABEL: define i32 @extract_i32(
diff --git a/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll b/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll
index 5baefc7fb6cda..b6dbaa546c718 100644
--- a/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll
+++ b/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
-; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
+; RUN: opt -S -passes=packed-integer-combine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes="packed-integer-combine<aggressive>" %s | FileCheck %s --check-prefix=AGGRESSIVE
define <4 x i8> @obtain_v4i8(<2 x i16> %from) {
; LAZY-LABEL: define <4 x i8> @obtain_v4i8(