[llvm] reapply "[VectorCombine] Fold scalar selects from bitcast into vector select" (PR #174762)
Pankaj Dwivedi via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 7 04:48:37 PST 2026
https://github.com/PankajDwivedi-25 created https://github.com/llvm/llvm-project/pull/174762
Reapply https://github.com/llvm/llvm-project/pull/173990
>From a5260ed026e6431a21f41897b8da3a4b59a876a2 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Tue, 30 Dec 2025 18:53:14 +0530
Subject: [PATCH 01/11] Combine scalarized selects back into vector selects
---
.../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 149 ++++
.../CodeGen/AMDGPU/combine-scalar-selects.ll | 638 ++++++++++++++++++
2 files changed, 787 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 63e265612cbf7..9e5b076409862 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -14,6 +14,7 @@
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -21,14 +22,17 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"
#define DEBUG_TYPE "amdgpu-late-codegenprepare"
using namespace llvm;
+using namespace llvm::PatternMatch;
// Scalar load widening needs running after load-store-vectorizer as that pass
// doesn't handle overlapping cases. In addition, this pass enhances the
@@ -40,6 +44,12 @@ static cl::opt<bool>
"AMDGPULateCodeGenPrepare"),
cl::ReallyHidden, cl::init(true));
+static cl::opt<bool> CombineScalarSelects(
+ "amdgpu-late-codegenprepare-combine-scalar-selects",
+ cl::desc("Combine scalarized selects back into vector selects in "
+ "AMDGPULateCodeGenPrepare"),
+ cl::ReallyHidden, cl::init(true));
+
namespace {
class AMDGPULateCodeGenPrepare
@@ -68,6 +78,24 @@ class AMDGPULateCodeGenPrepare
bool canWidenScalarExtLoad(LoadInst &LI) const;
bool visitLoadInst(LoadInst &LI);
+
+ /// Combine scalarized selects from a bitcast back into a vector select.
+ ///
+ /// This optimization addresses VGPR bloat from patterns like:
+ /// %vec = bitcast <4 x i32> %src to <16 x i8>
+ /// %e0 = extractelement <16 x i8> %vec, i64 0
+ /// %s0 = select i1 %cond, i8 %e0, i8 0
+ /// ... (repeated for all 16 elements)
+ ///
+ /// Which generates 16 separate v_cndmask_b32 instructions. Instead, we
+ /// transform it to:
+ /// %sel = select i1 %cond, <4 x i32> %src, <4 x i32> zeroinitializer
+ /// %vec = bitcast <4 x i32> %sel to <16 x i8>
+ /// %e0 = extractelement <16 x i8> %vec, i64 0
+ /// ...
+ ///
+ /// This produces only 4 v_cndmask_b32 instructions operating on dwords.
+ bool tryCombineSelectsFromBitcast(BitCastInst &BC);
};
using ValueToValueMap = DenseMap<const Value *, Value *>;
@@ -225,6 +253,20 @@ bool AMDGPULateCodeGenPrepare::run() {
Changed |= LRO.optimizeLiveType(&I, DeadInsts);
}
+ // Combine scalarized selects back into vector selects.
+ // This uses a top-down approach: iterate over bitcasts (i32 vec -> i8 vec)
+ // and collect all select instructions that use extracted elements with a
+ // zero false value. By starting from the bitcast, we process each source
+ // exactly once, avoiding redundant work when multiple selects share a source.
+ if (CombineScalarSelects) {
+ for (auto &BB : F) {
+ for (Instruction &I : make_early_inc_range(BB)) {
+ if (auto *BC = dyn_cast<BitCastInst>(&I))
+ Changed |= tryCombineSelectsFromBitcast(*BC);
+ }
+ }
+ }
+
RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
return Changed;
}
@@ -551,6 +593,113 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
return true;
}
+bool AMDGPULateCodeGenPrepare::tryCombineSelectsFromBitcast(BitCastInst &BC) {
+ auto *SrcVecTy = dyn_cast<FixedVectorType>(BC.getSrcTy());
+ auto *DstVecTy = dyn_cast<FixedVectorType>(BC.getDestTy());
+ if (!SrcVecTy || !DstVecTy)
+ return false;
+
+ // Only handle bitcast <N x i32> -> <M x i8>; any other element widths bail.
+ if (!SrcVecTy->getElementType()->isIntegerTy(32) ||
+ !DstVecTy->getElementType()->isIntegerTy(8))
+ return false;
+
+ unsigned NumDstElts = DstVecTy->getNumElements();
+ BasicBlock *BB = BC.getParent();
+
+ // Require at least half the elements to have matching selects.
+ // For v16i8 (from v4i32), this means at least 8 selects must match.
+ // Heuristic: below this, the new vector select is unlikely to pay off.
+ unsigned MinRequired = NumDstElts / 2;
+
+ // Early exit: fewer bitcast users than MinRequired means no group can qualify.
+ if (BC.getNumUses() < MinRequired)
+ return false;
+
+ // Group selects by their condition value. Different conditions selecting
+ // from the same bitcast are handled as independent groups, allowing us to
+ // optimize multiple select patterns from a single bitcast.
+ struct SelectGroup {
+ // Map from element index to (select, extractelement) pair.
+ SmallDenseMap<unsigned, std::pair<SelectInst *, ExtractElementInst *>, 16>
+ Selects;
+ // Track the earliest select instruction for correct insertion point.
+ SelectInst *FirstSelect = nullptr;
+ };
+ DenseMap<Value *, SelectGroup> ConditionGroups;
+
+ // Collect all matching select patterns in a single pass.
+ // Pattern: select i1 %cond, i8 (extractelement %bc, idx), i8 0
+ for (User *U : BC.users()) {
+ auto *Ext = dyn_cast<ExtractElementInst>(U);
+ if (!Ext || Ext->getParent() != BB)
+ continue;
+
+ auto *IdxC = dyn_cast<ConstantInt>(Ext->getIndexOperand());
+ if (!IdxC || IdxC->getZExtValue() >= NumDstElts)
+ continue;
+
+ unsigned Idx = IdxC->getZExtValue();
+
+ for (User *EU : Ext->users()) {
+ auto *Sel = dyn_cast<SelectInst>(EU);
+ // Must be: select %cond, %extract, 0 (in same BB)
+ if (!Sel || Sel->getParent() != BB || Sel->getTrueValue() != Ext ||
+ !match(Sel->getFalseValue(), m_Zero()))
+ continue;
+
+ auto &Group = ConditionGroups[Sel->getCondition()];
+ Group.Selects[Idx] = {Sel, Ext};
+
+ // Track earliest select to ensure correct dominance for insertion.
+ if (!Group.FirstSelect || Sel->comesBefore(Group.FirstSelect))
+ Group.FirstSelect = Sel;
+ }
+ }
+
+ bool Changed = false;
+
+ // Process qualifying groups. NOTE(review): DenseMap iteration over pointer keys is nondeterministic, so emission order of the new extracts can vary run-to-run (visible in the scrambled test CHECK order) — consider MapVector; TODO confirm.
+ for (auto &[Cond, Group] : ConditionGroups) {
+ if (Group.Selects.size() < MinRequired)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "AMDGPULateCodeGenPrepare: Combining "
+ << Group.Selects.size()
+ << " scalar selects into vector select\n");
+
+ // Insert at the earliest select so the new values dominate every replaced use.
+ IRBuilder<> Builder(Group.FirstSelect);
+
+ // Create vector select: select i1 %cond, <N x i32> %src, zeroinitializer
+ Value *VecSel =
+ Builder.CreateSelect(Cond, BC.getOperand(0),
+ Constant::getNullValue(SrcVecTy), "combined.sel");
+
+ // Bitcast the selected vector back to the byte vector type.
+ Value *NewBC = Builder.CreateBitCast(VecSel, DstVecTy, "combined.bc");
+
+ // Replace each scalar select with an extract from the combined result.
+ for (auto &[Idx, Pair] : Group.Selects) {
+ Value *NewExt = Builder.CreateExtractElement(NewBC, Idx);
+ Pair.first->replaceAllUsesWith(NewExt);
+ DeadInsts.emplace_back(Pair.first);
+
+ // Mark the original extract as dead if it has no remaining uses.
+ if (Pair.second->use_empty())
+ DeadInsts.emplace_back(Pair.second);
+ }
+
+ Changed = true;
+ }
+
+ // Mark the original bitcast as dead if all its users were replaced.
+ if (Changed && BC.use_empty())
+ DeadInsts.emplace_back(&BC);
+
+ return Changed;
+}
+
PreservedAnalyses
AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
diff --git a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
new file mode 100644
index 0000000000000..973da749d9daf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
@@ -0,0 +1,638 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-late-codegenprepare -S %s | FileCheck %s --check-prefix=CHECK-OPT
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-late-codegenprepare -amdgpu-late-codegenprepare-combine-scalar-selects=false -S %s | FileCheck %s --check-prefix=CHECK-NOOPT
+
+; Test that multiple scalar selects from the same vector source are combined
+; back into a vector select when the optimization is enabled, and remain as
+; individual scalar selects when disabled.
+
+; This pattern occurs when buffer_load_dwordx4 results are bitcast to v16i8,
+; then each byte is extracted and conditionally selected with zero.
+
+define amdgpu_kernel void @combine_scalar_selects_v16i8(
+;
+; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_scalar_selects_v16i8(
+; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[BUFFER_RESOURCE:%.*]], i32 [[OFFSET:%.*]], i1 [[VALID:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-OPT-NEXT: [[ENTRY:.*:]]
+; CHECK-OPT-NEXT: [[LOADED:%.*]] = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> [[BUFFER_RESOURCE]], i32 [[OFFSET]], i32 0, i32 0)
+; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[VALID]], <4 x i32> [[LOADED]], <4 x i32> zeroinitializer
+; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <4 x i32> [[COMBINED_SEL]] to <16 x i8>
+; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 0
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 14
+; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 2
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
+; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 4
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 11
+; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 6
+; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 13
+; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
+; CHECK-OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 8
+; CHECK-OPT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 15
+; CHECK-OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 10
+; CHECK-OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 12
+; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-OPT-NEXT: store i8 [[TMP9]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-OPT-NEXT: store i8 [[TMP3]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-OPT-NEXT: store i8 [[TMP5]], ptr addrspace(1) [[PTR4]], align 1
+; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-OPT-NEXT: store i8 [[TMP14]], ptr addrspace(1) [[PTR5]], align 1
+; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-OPT-NEXT: store i8 [[TMP7]], ptr addrspace(1) [[PTR6]], align 1
+; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-OPT-NEXT: store i8 [[TMP1]], ptr addrspace(1) [[PTR7]], align 1
+; CHECK-OPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
+; CHECK-OPT-NEXT: store i8 [[TMP10]], ptr addrspace(1) [[PTR8]], align 1
+; CHECK-OPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
+; CHECK-OPT-NEXT: store i8 [[TMP4]], ptr addrspace(1) [[PTR9]], align 1
+; CHECK-OPT-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 10
+; CHECK-OPT-NEXT: store i8 [[TMP13]], ptr addrspace(1) [[PTR10]], align 1
+; CHECK-OPT-NEXT: [[PTR11:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 11
+; CHECK-OPT-NEXT: store i8 [[TMP6]], ptr addrspace(1) [[PTR11]], align 1
+; CHECK-OPT-NEXT: [[PTR12:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 12
+; CHECK-OPT-NEXT: store i8 [[TMP15]], ptr addrspace(1) [[PTR12]], align 1
+; CHECK-OPT-NEXT: [[PTR13:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 13
+; CHECK-OPT-NEXT: store i8 [[TMP8]], ptr addrspace(1) [[PTR13]], align 1
+; CHECK-OPT-NEXT: [[PTR14:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 14
+; CHECK-OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[PTR14]], align 1
+; CHECK-OPT-NEXT: [[PTR15:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 15
+; CHECK-OPT-NEXT: store i8 [[TMP11]], ptr addrspace(1) [[PTR15]], align 1
+; CHECK-OPT-NEXT: ret void
+;
+; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_scalar_selects_v16i8(
+; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[BUFFER_RESOURCE:%.*]], i32 [[OFFSET:%.*]], i1 [[VALID:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
+; CHECK-NOOPT-NEXT: [[LOADED:%.*]] = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> [[BUFFER_RESOURCE]], i32 [[OFFSET]], i32 0, i32 0)
+; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <4 x i32> [[LOADED]] to <16 x i8>
+; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
+; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[BYTES]], i64 1
+; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <16 x i8> [[BYTES]], i64 2
+; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[BYTES]], i64 3
+; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <16 x i8> [[BYTES]], i64 4
+; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <16 x i8> [[BYTES]], i64 5
+; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <16 x i8> [[BYTES]], i64 6
+; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <16 x i8> [[BYTES]], i64 7
+; CHECK-NOOPT-NEXT: [[E8:%.*]] = extractelement <16 x i8> [[BYTES]], i64 8
+; CHECK-NOOPT-NEXT: [[E9:%.*]] = extractelement <16 x i8> [[BYTES]], i64 9
+; CHECK-NOOPT-NEXT: [[E10:%.*]] = extractelement <16 x i8> [[BYTES]], i64 10
+; CHECK-NOOPT-NEXT: [[E11:%.*]] = extractelement <16 x i8> [[BYTES]], i64 11
+; CHECK-NOOPT-NEXT: [[E12:%.*]] = extractelement <16 x i8> [[BYTES]], i64 12
+; CHECK-NOOPT-NEXT: [[E13:%.*]] = extractelement <16 x i8> [[BYTES]], i64 13
+; CHECK-NOOPT-NEXT: [[E14:%.*]] = extractelement <16 x i8> [[BYTES]], i64 14
+; CHECK-NOOPT-NEXT: [[E15:%.*]] = extractelement <16 x i8> [[BYTES]], i64 15
+; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[VALID]], i8 [[E0]], i8 0
+; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[VALID]], i8 [[E1]], i8 0
+; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[VALID]], i8 [[E2]], i8 0
+; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[VALID]], i8 [[E3]], i8 0
+; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[VALID]], i8 [[E4]], i8 0
+; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[VALID]], i8 [[E5]], i8 0
+; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[VALID]], i8 [[E6]], i8 0
+; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[VALID]], i8 [[E7]], i8 0
+; CHECK-NOOPT-NEXT: [[S8:%.*]] = select i1 [[VALID]], i8 [[E8]], i8 0
+; CHECK-NOOPT-NEXT: [[S9:%.*]] = select i1 [[VALID]], i8 [[E9]], i8 0
+; CHECK-NOOPT-NEXT: [[S10:%.*]] = select i1 [[VALID]], i8 [[E10]], i8 0
+; CHECK-NOOPT-NEXT: [[S11:%.*]] = select i1 [[VALID]], i8 [[E11]], i8 0
+; CHECK-NOOPT-NEXT: [[S12:%.*]] = select i1 [[VALID]], i8 [[E12]], i8 0
+; CHECK-NOOPT-NEXT: [[S13:%.*]] = select i1 [[VALID]], i8 [[E13]], i8 0
+; CHECK-NOOPT-NEXT: [[S14:%.*]] = select i1 [[VALID]], i8 [[E14]], i8 0
+; CHECK-NOOPT-NEXT: [[S15:%.*]] = select i1 [[VALID]], i8 [[E15]], i8 0
+; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NOOPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-NOOPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-NOOPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-NOOPT-NEXT: store i8 [[S4]], ptr addrspace(1) [[PTR4]], align 1
+; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-NOOPT-NEXT: store i8 [[S5]], ptr addrspace(1) [[PTR5]], align 1
+; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-NOOPT-NEXT: store i8 [[S6]], ptr addrspace(1) [[PTR6]], align 1
+; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-NOOPT-NEXT: store i8 [[S7]], ptr addrspace(1) [[PTR7]], align 1
+; CHECK-NOOPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
+; CHECK-NOOPT-NEXT: store i8 [[S8]], ptr addrspace(1) [[PTR8]], align 1
+; CHECK-NOOPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
+; CHECK-NOOPT-NEXT: store i8 [[S9]], ptr addrspace(1) [[PTR9]], align 1
+; CHECK-NOOPT-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 10
+; CHECK-NOOPT-NEXT: store i8 [[S10]], ptr addrspace(1) [[PTR10]], align 1
+; CHECK-NOOPT-NEXT: [[PTR11:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 11
+; CHECK-NOOPT-NEXT: store i8 [[S11]], ptr addrspace(1) [[PTR11]], align 1
+; CHECK-NOOPT-NEXT: [[PTR12:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 12
+; CHECK-NOOPT-NEXT: store i8 [[S12]], ptr addrspace(1) [[PTR12]], align 1
+; CHECK-NOOPT-NEXT: [[PTR13:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 13
+; CHECK-NOOPT-NEXT: store i8 [[S13]], ptr addrspace(1) [[PTR13]], align 1
+; CHECK-NOOPT-NEXT: [[PTR14:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 14
+; CHECK-NOOPT-NEXT: store i8 [[S14]], ptr addrspace(1) [[PTR14]], align 1
+; CHECK-NOOPT-NEXT: [[PTR15:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 15
+; CHECK-NOOPT-NEXT: store i8 [[S15]], ptr addrspace(1) [[PTR15]], align 1
+; CHECK-NOOPT-NEXT: ret void
+;
+ ptr addrspace(1) %out,
+ <4 x i32> %buffer_resource,
+ i32 %offset,
+ i1 %valid
+) {
+entry:
+ %loaded = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %buffer_resource, i32 %offset, i32 0, i32 0)
+ %bytes = bitcast <4 x i32> %loaded to <16 x i8>
+
+ %e0 = extractelement <16 x i8> %bytes, i64 0
+ %e1 = extractelement <16 x i8> %bytes, i64 1
+ %e2 = extractelement <16 x i8> %bytes, i64 2
+ %e3 = extractelement <16 x i8> %bytes, i64 3
+ %e4 = extractelement <16 x i8> %bytes, i64 4
+ %e5 = extractelement <16 x i8> %bytes, i64 5
+ %e6 = extractelement <16 x i8> %bytes, i64 6
+ %e7 = extractelement <16 x i8> %bytes, i64 7
+ %e8 = extractelement <16 x i8> %bytes, i64 8
+ %e9 = extractelement <16 x i8> %bytes, i64 9
+ %e10 = extractelement <16 x i8> %bytes, i64 10
+ %e11 = extractelement <16 x i8> %bytes, i64 11
+ %e12 = extractelement <16 x i8> %bytes, i64 12
+ %e13 = extractelement <16 x i8> %bytes, i64 13
+ %e14 = extractelement <16 x i8> %bytes, i64 14
+ %e15 = extractelement <16 x i8> %bytes, i64 15
+
+ %s0 = select i1 %valid, i8 %e0, i8 0
+ %s1 = select i1 %valid, i8 %e1, i8 0
+ %s2 = select i1 %valid, i8 %e2, i8 0
+ %s3 = select i1 %valid, i8 %e3, i8 0
+ %s4 = select i1 %valid, i8 %e4, i8 0
+ %s5 = select i1 %valid, i8 %e5, i8 0
+ %s6 = select i1 %valid, i8 %e6, i8 0
+ %s7 = select i1 %valid, i8 %e7, i8 0
+ %s8 = select i1 %valid, i8 %e8, i8 0
+ %s9 = select i1 %valid, i8 %e9, i8 0
+ %s10 = select i1 %valid, i8 %e10, i8 0
+ %s11 = select i1 %valid, i8 %e11, i8 0
+ %s12 = select i1 %valid, i8 %e12, i8 0
+ %s13 = select i1 %valid, i8 %e13, i8 0
+ %s14 = select i1 %valid, i8 %e14, i8 0
+ %s15 = select i1 %valid, i8 %e15, i8 0
+
+ store i8 %s0, ptr addrspace(1) %out, align 1
+ %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
+ store i8 %s1, ptr addrspace(1) %ptr1, align 1
+ %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
+ store i8 %s2, ptr addrspace(1) %ptr2, align 1
+ %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
+ store i8 %s3, ptr addrspace(1) %ptr3, align 1
+ %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
+ store i8 %s4, ptr addrspace(1) %ptr4, align 1
+ %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
+ store i8 %s5, ptr addrspace(1) %ptr5, align 1
+ %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
+ store i8 %s6, ptr addrspace(1) %ptr6, align 1
+ %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
+ store i8 %s7, ptr addrspace(1) %ptr7, align 1
+ %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
+ store i8 %s8, ptr addrspace(1) %ptr8, align 1
+ %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
+ store i8 %s9, ptr addrspace(1) %ptr9, align 1
+ %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
+ store i8 %s10, ptr addrspace(1) %ptr10, align 1
+ %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
+ store i8 %s11, ptr addrspace(1) %ptr11, align 1
+ %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
+ store i8 %s12, ptr addrspace(1) %ptr12, align 1
+ %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
+ store i8 %s13, ptr addrspace(1) %ptr13, align 1
+ %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
+ store i8 %s14, ptr addrspace(1) %ptr14, align 1
+ %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
+ store i8 %s15, ptr addrspace(1) %ptr15, align 1
+
+ ret void
+}
+
+; Test with v8i8 from v2i32 (smaller vector; threshold is 4 of 8 matching selects)
+define amdgpu_kernel void @combine_scalar_selects_v8i8(
+;
+; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_scalar_selects_v8i8(
+; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-OPT-NEXT: [[ENTRY:.*:]]
+; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x i32> [[SRC]], <2 x i32> zeroinitializer
+; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x i32> [[COMBINED_SEL]] to <8 x i8>
+; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 0
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 7
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 4
+; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 1
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 2
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 6
+; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-OPT-NEXT: store i8 [[TMP3]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-OPT-NEXT: store i8 [[TMP5]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-OPT-NEXT: store i8 [[TMP7]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[PTR4]], align 1
+; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-OPT-NEXT: store i8 [[TMP4]], ptr addrspace(1) [[PTR5]], align 1
+; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-OPT-NEXT: store i8 [[TMP6]], ptr addrspace(1) [[PTR6]], align 1
+; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-OPT-NEXT: store i8 [[TMP1]], ptr addrspace(1) [[PTR7]], align 1
+; CHECK-OPT-NEXT: ret void
+;
+; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_scalar_selects_v8i8(
+; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
+; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <2 x i32> [[SRC]] to <8 x i8>
+; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <8 x i8> [[BYTES]], i64 0
+; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <8 x i8> [[BYTES]], i64 1
+; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <8 x i8> [[BYTES]], i64 2
+; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <8 x i8> [[BYTES]], i64 3
+; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <8 x i8> [[BYTES]], i64 4
+; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <8 x i8> [[BYTES]], i64 5
+; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <8 x i8> [[BYTES]], i64 6
+; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <8 x i8> [[BYTES]], i64 7
+; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 [[E0]], i8 0
+; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 [[E1]], i8 0
+; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 [[E2]], i8 0
+; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 [[E3]], i8 0
+; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i8 [[E4]], i8 0
+; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i8 [[E5]], i8 0
+; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i8 [[E6]], i8 0
+; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i8 [[E7]], i8 0
+; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NOOPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-NOOPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-NOOPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-NOOPT-NEXT: store i8 [[S4]], ptr addrspace(1) [[PTR4]], align 1
+; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-NOOPT-NEXT: store i8 [[S5]], ptr addrspace(1) [[PTR5]], align 1
+; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-NOOPT-NEXT: store i8 [[S6]], ptr addrspace(1) [[PTR6]], align 1
+; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-NOOPT-NEXT: store i8 [[S7]], ptr addrspace(1) [[PTR7]], align 1
+; CHECK-NOOPT-NEXT: ret void
+;
+ ptr addrspace(1) %out,
+ <2 x i32> %src,
+ i1 %cond
+) {
+entry:
+ %bytes = bitcast <2 x i32> %src to <8 x i8>
+ %e0 = extractelement <8 x i8> %bytes, i64 0
+ %e1 = extractelement <8 x i8> %bytes, i64 1
+ %e2 = extractelement <8 x i8> %bytes, i64 2
+ %e3 = extractelement <8 x i8> %bytes, i64 3
+ %e4 = extractelement <8 x i8> %bytes, i64 4
+ %e5 = extractelement <8 x i8> %bytes, i64 5
+ %e6 = extractelement <8 x i8> %bytes, i64 6
+ %e7 = extractelement <8 x i8> %bytes, i64 7
+ %s0 = select i1 %cond, i8 %e0, i8 0
+ %s1 = select i1 %cond, i8 %e1, i8 0
+ %s2 = select i1 %cond, i8 %e2, i8 0
+ %s3 = select i1 %cond, i8 %e3, i8 0
+ %s4 = select i1 %cond, i8 %e4, i8 0
+ %s5 = select i1 %cond, i8 %e5, i8 0
+ %s6 = select i1 %cond, i8 %e6, i8 0
+ %s7 = select i1 %cond, i8 %e7, i8 0
+ store i8 %s0, ptr addrspace(1) %out, align 1
+ %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
+ store i8 %s1, ptr addrspace(1) %ptr1, align 1
+ %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
+ store i8 %s2, ptr addrspace(1) %ptr2, align 1
+ %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
+ store i8 %s3, ptr addrspace(1) %ptr3, align 1
+ %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
+ store i8 %s4, ptr addrspace(1) %ptr4, align 1
+ %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
+ store i8 %s5, ptr addrspace(1) %ptr5, align 1
+ %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
+ store i8 %s6, ptr addrspace(1) %ptr6, align 1
+ %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
+ store i8 %s7, ptr addrspace(1) %ptr7, align 1
+ ret void
+}
+
+; Test partial coverage: 10 of 16 elements have matching selects; still combines since 10 >= 16/2 threshold
+define amdgpu_kernel void @combine_partial_selects(
+;
+; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_partial_selects(
+; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-OPT-NEXT: [[ENTRY:.*:]]
+; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <4 x i32> [[SRC]], <4 x i32> zeroinitializer
+; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <4 x i32> [[COMBINED_SEL]] to <16 x i8>
+; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 0
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 4
+; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 8
+; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 2
+; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
+; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 6
+; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-OPT-NEXT: store i8 [[TMP3]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-OPT-NEXT: store i8 [[TMP6]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-OPT-NEXT: store i8 [[TMP9]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[PTR4]], align 1
+; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-OPT-NEXT: store i8 [[TMP5]], ptr addrspace(1) [[PTR5]], align 1
+; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-OPT-NEXT: store i8 [[TMP8]], ptr addrspace(1) [[PTR6]], align 1
+; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-OPT-NEXT: store i8 [[TMP1]], ptr addrspace(1) [[PTR7]], align 1
+; CHECK-OPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
+; CHECK-OPT-NEXT: store i8 [[TMP4]], ptr addrspace(1) [[PTR8]], align 1
+; CHECK-OPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
+; CHECK-OPT-NEXT: store i8 [[TMP7]], ptr addrspace(1) [[PTR9]], align 1
+; CHECK-OPT-NEXT: ret void
+;
+; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_partial_selects(
+; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
+; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <4 x i32> [[SRC]] to <16 x i8>
+; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
+; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[BYTES]], i64 1
+; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <16 x i8> [[BYTES]], i64 2
+; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[BYTES]], i64 3
+; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <16 x i8> [[BYTES]], i64 4
+; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <16 x i8> [[BYTES]], i64 5
+; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <16 x i8> [[BYTES]], i64 6
+; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <16 x i8> [[BYTES]], i64 7
+; CHECK-NOOPT-NEXT: [[E8:%.*]] = extractelement <16 x i8> [[BYTES]], i64 8
+; CHECK-NOOPT-NEXT: [[E9:%.*]] = extractelement <16 x i8> [[BYTES]], i64 9
+; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 [[E0]], i8 0
+; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 [[E1]], i8 0
+; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 [[E2]], i8 0
+; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 [[E3]], i8 0
+; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i8 [[E4]], i8 0
+; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i8 [[E5]], i8 0
+; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i8 [[E6]], i8 0
+; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i8 [[E7]], i8 0
+; CHECK-NOOPT-NEXT: [[S8:%.*]] = select i1 [[COND]], i8 [[E8]], i8 0
+; CHECK-NOOPT-NEXT: [[S9:%.*]] = select i1 [[COND]], i8 [[E9]], i8 0
+; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NOOPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-NOOPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-NOOPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-NOOPT-NEXT: store i8 [[S4]], ptr addrspace(1) [[PTR4]], align 1
+; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-NOOPT-NEXT: store i8 [[S5]], ptr addrspace(1) [[PTR5]], align 1
+; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-NOOPT-NEXT: store i8 [[S6]], ptr addrspace(1) [[PTR6]], align 1
+; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-NOOPT-NEXT: store i8 [[S7]], ptr addrspace(1) [[PTR7]], align 1
+; CHECK-NOOPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
+; CHECK-NOOPT-NEXT: store i8 [[S8]], ptr addrspace(1) [[PTR8]], align 1
+; CHECK-NOOPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
+; CHECK-NOOPT-NEXT: store i8 [[S9]], ptr addrspace(1) [[PTR9]], align 1
+; CHECK-NOOPT-NEXT: ret void
+;
+ ptr addrspace(1) %out,
+ <4 x i32> %src,
+ i1 %cond
+) {
+entry:
+ %bytes = bitcast <4 x i32> %src to <16 x i8>
+ ; Only extract and select 10 elements (indices 0-9)
+ %e0 = extractelement <16 x i8> %bytes, i64 0
+ %e1 = extractelement <16 x i8> %bytes, i64 1
+ %e2 = extractelement <16 x i8> %bytes, i64 2
+ %e3 = extractelement <16 x i8> %bytes, i64 3
+ %e4 = extractelement <16 x i8> %bytes, i64 4
+ %e5 = extractelement <16 x i8> %bytes, i64 5
+ %e6 = extractelement <16 x i8> %bytes, i64 6
+ %e7 = extractelement <16 x i8> %bytes, i64 7
+ %e8 = extractelement <16 x i8> %bytes, i64 8
+ %e9 = extractelement <16 x i8> %bytes, i64 9
+ %s0 = select i1 %cond, i8 %e0, i8 0
+ %s1 = select i1 %cond, i8 %e1, i8 0
+ %s2 = select i1 %cond, i8 %e2, i8 0
+ %s3 = select i1 %cond, i8 %e3, i8 0
+ %s4 = select i1 %cond, i8 %e4, i8 0
+ %s5 = select i1 %cond, i8 %e5, i8 0
+ %s6 = select i1 %cond, i8 %e6, i8 0
+ %s7 = select i1 %cond, i8 %e7, i8 0
+ %s8 = select i1 %cond, i8 %e8, i8 0
+ %s9 = select i1 %cond, i8 %e9, i8 0
+ store i8 %s0, ptr addrspace(1) %out, align 1
+ %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
+ store i8 %s1, ptr addrspace(1) %ptr1, align 1
+ %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
+ store i8 %s2, ptr addrspace(1) %ptr2, align 1
+ %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
+ store i8 %s3, ptr addrspace(1) %ptr3, align 1
+ %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
+ store i8 %s4, ptr addrspace(1) %ptr4, align 1
+ %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
+ store i8 %s5, ptr addrspace(1) %ptr5, align 1
+ %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
+ store i8 %s6, ptr addrspace(1) %ptr6, align 1
+ %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
+ store i8 %s7, ptr addrspace(1) %ptr7, align 1
+ %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
+ store i8 %s8, ptr addrspace(1) %ptr8, align 1
+ %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
+ store i8 %s9, ptr addrspace(1) %ptr9, align 1
+ ret void
+}
+
+; Negative test: should not combine if false value is not zero
+define amdgpu_kernel void @no_combine_non_zero_false(
+;
+; CHECK-OPT-LABEL: define amdgpu_kernel void @no_combine_non_zero_false(
+; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[BUFFER_RESOURCE:%.*]], i32 [[OFFSET:%.*]], i1 [[VALID:%.*]]) #[[ATTR0]] {
+; CHECK-OPT-NEXT: [[ENTRY:.*:]]
+; CHECK-OPT-NEXT: [[LOADED:%.*]] = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> [[BUFFER_RESOURCE]], i32 [[OFFSET]], i32 0, i32 0)
+; CHECK-OPT-NEXT: [[BYTES:%.*]] = bitcast <4 x i32> [[LOADED]] to <16 x i8>
+; CHECK-OPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
+; CHECK-OPT-NEXT: [[S0:%.*]] = select i1 [[VALID]], i8 [[E0]], i8 1
+; CHECK-OPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-OPT-NEXT: ret void
+;
+; CHECK-NOOPT-LABEL: define amdgpu_kernel void @no_combine_non_zero_false(
+; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[BUFFER_RESOURCE:%.*]], i32 [[OFFSET:%.*]], i1 [[VALID:%.*]]) #[[ATTR0]] {
+; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
+; CHECK-NOOPT-NEXT: [[LOADED:%.*]] = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> [[BUFFER_RESOURCE]], i32 [[OFFSET]], i32 0, i32 0)
+; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <4 x i32> [[LOADED]] to <16 x i8>
+; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
+; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[VALID]], i8 [[E0]], i8 1
+; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NOOPT-NEXT: ret void
+;
+ ptr addrspace(1) %out,
+ <4 x i32> %buffer_resource,
+ i32 %offset,
+ i1 %valid
+) {
+entry:
+ %loaded = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %buffer_resource, i32 %offset, i32 0, i32 0)
+ %bytes = bitcast <4 x i32> %loaded to <16 x i8>
+ %e0 = extractelement <16 x i8> %bytes, i64 0
+ %s0 = select i1 %valid, i8 %e0, i8 1 ; false value is 1, not 0
+ store i8 %s0, ptr addrspace(1) %out, align 1
+ ret void
+}
+
+; Negative test: too few selects (only 4 out of 16, less than half)
+define amdgpu_kernel void @no_combine_too_few_selects(
+;
+; CHECK-OPT-LABEL: define amdgpu_kernel void @no_combine_too_few_selects(
+; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-OPT-NEXT: [[ENTRY:.*:]]
+; CHECK-OPT-NEXT: [[BYTES:%.*]] = bitcast <4 x i32> [[SRC]] to <16 x i8>
+; CHECK-OPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
+; CHECK-OPT-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[BYTES]], i64 1
+; CHECK-OPT-NEXT: [[E2:%.*]] = extractelement <16 x i8> [[BYTES]], i64 2
+; CHECK-OPT-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[BYTES]], i64 3
+; CHECK-OPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 [[E0]], i8 0
+; CHECK-OPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 [[E1]], i8 0
+; CHECK-OPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 [[E2]], i8 0
+; CHECK-OPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 [[E3]], i8 0
+; CHECK-OPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-OPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-OPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-OPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-OPT-NEXT: ret void
+;
+; CHECK-NOOPT-LABEL: define amdgpu_kernel void @no_combine_too_few_selects(
+; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
+; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <4 x i32> [[SRC]] to <16 x i8>
+; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
+; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[BYTES]], i64 1
+; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <16 x i8> [[BYTES]], i64 2
+; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[BYTES]], i64 3
+; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 [[E0]], i8 0
+; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 [[E1]], i8 0
+; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 [[E2]], i8 0
+; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 [[E3]], i8 0
+; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NOOPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-NOOPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-NOOPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-NOOPT-NEXT: ret void
+;
+ ptr addrspace(1) %out,
+ <4 x i32> %src,
+ i1 %cond
+) {
+entry:
+ %bytes = bitcast <4 x i32> %src to <16 x i8>
+ ; Only 4 selects - less than half of 16
+ %e0 = extractelement <16 x i8> %bytes, i64 0
+ %e1 = extractelement <16 x i8> %bytes, i64 1
+ %e2 = extractelement <16 x i8> %bytes, i64 2
+ %e3 = extractelement <16 x i8> %bytes, i64 3
+ %s0 = select i1 %cond, i8 %e0, i8 0
+ %s1 = select i1 %cond, i8 %e1, i8 0
+ %s2 = select i1 %cond, i8 %e2, i8 0
+ %s3 = select i1 %cond, i8 %e3, i8 0
+ store i8 %s0, ptr addrspace(1) %out, align 1
+ %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
+ store i8 %s1, ptr addrspace(1) %ptr1, align 1
+ %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
+ store i8 %s2, ptr addrspace(1) %ptr2, align 1
+ %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
+ store i8 %s3, ptr addrspace(1) %ptr3, align 1
+ ret void
+}
+
+; Negative test: select with extract as false value (wrong operand position)
+define amdgpu_kernel void @no_combine_wrong_operand_order(
+;
+; CHECK-OPT-LABEL: define amdgpu_kernel void @no_combine_wrong_operand_order(
+; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-OPT-NEXT: [[ENTRY:.*:]]
+; CHECK-OPT-NEXT: [[BYTES:%.*]] = bitcast <2 x i32> [[SRC]] to <8 x i8>
+; CHECK-OPT-NEXT: [[E0:%.*]] = extractelement <8 x i8> [[BYTES]], i64 0
+; CHECK-OPT-NEXT: [[E1:%.*]] = extractelement <8 x i8> [[BYTES]], i64 1
+; CHECK-OPT-NEXT: [[E2:%.*]] = extractelement <8 x i8> [[BYTES]], i64 2
+; CHECK-OPT-NEXT: [[E3:%.*]] = extractelement <8 x i8> [[BYTES]], i64 3
+; CHECK-OPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 0, i8 [[E0]]
+; CHECK-OPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 0, i8 [[E1]]
+; CHECK-OPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 0, i8 [[E2]]
+; CHECK-OPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 0, i8 [[E3]]
+; CHECK-OPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-OPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-OPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-OPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-OPT-NEXT: ret void
+;
+; CHECK-NOOPT-LABEL: define amdgpu_kernel void @no_combine_wrong_operand_order(
+; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
+; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <2 x i32> [[SRC]] to <8 x i8>
+; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <8 x i8> [[BYTES]], i64 0
+; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <8 x i8> [[BYTES]], i64 1
+; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <8 x i8> [[BYTES]], i64 2
+; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <8 x i8> [[BYTES]], i64 3
+; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 0, i8 [[E0]]
+; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 0, i8 [[E1]]
+; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 0, i8 [[E2]]
+; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 0, i8 [[E3]]
+; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NOOPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-NOOPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-NOOPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-NOOPT-NEXT: ret void
+;
+ ptr addrspace(1) %out,
+ <2 x i32> %src,
+ i1 %cond
+) {
+entry:
+ %bytes = bitcast <2 x i32> %src to <8 x i8>
+ %e0 = extractelement <8 x i8> %bytes, i64 0
+ %e1 = extractelement <8 x i8> %bytes, i64 1
+ %e2 = extractelement <8 x i8> %bytes, i64 2
+ %e3 = extractelement <8 x i8> %bytes, i64 3
+ ; Extract is false value, 0 is true value - should not combine
+ %s0 = select i1 %cond, i8 0, i8 %e0
+ %s1 = select i1 %cond, i8 0, i8 %e1
+ %s2 = select i1 %cond, i8 0, i8 %e2
+ %s3 = select i1 %cond, i8 0, i8 %e3
+ store i8 %s0, ptr addrspace(1) %out, align 1
+ %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
+ store i8 %s1, ptr addrspace(1) %ptr1, align 1
+ %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
+ store i8 %s2, ptr addrspace(1) %ptr2, align 1
+ %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
+ store i8 %s3, ptr addrspace(1) %ptr3, align 1
+ ret void
+}
+
+declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg)
+
>From f7c2a62a81fc78d17830e2c55b92217dc698d2ce Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Tue, 30 Dec 2025 20:12:56 +0530
Subject: [PATCH 02/11] added asm test file
---
.../AMDGPU/combine-scalar-selects-asm.ll | 268 ++++++++++++++++++
1 file changed, 268 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
diff --git a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
new file mode 100644
index 0000000000000..0be2a7f95b11c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
@@ -0,0 +1,268 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s --check-prefix=CHECK-OPT
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-late-codegenprepare-combine-scalar-selects=false < %s | FileCheck %s --check-prefix=CHECK-NOOPT
+
+define amdgpu_kernel void @combine_scalar_selects_v16i8(
+; CHECK-OPT-LABEL: combine_scalar_selects_v16i8:
+; CHECK-OPT: ; %bb.0: ; %entry
+; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-OPT-NEXT: s_endpgm
+;
+; CHECK-NOOPT-LABEL: combine_scalar_selects_v16i8:
+; CHECK-NOOPT: ; %bb.0: ; %entry
+; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v5, 8, v0
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v7, 24, v0
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v10, 24, v1
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v11, 8, v2
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v13, 24, v2
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v14, 8, v3
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v16, 24, v3
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v14, 8, v14
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v16, 8, v16
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v11, 8, v11
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v13, 8, v13
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v8, 8, v8
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v10, 8, v10
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v7, 8, v7
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v14, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v11, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v8, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-NOOPT-NEXT: s_endpgm
+ ptr addrspace(1) %in,
+ ptr addrspace(1) %out,
+ i1 %valid
+) {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = zext i32 %tid to i64
+ %gep = getelementptr <4 x i32>, ptr addrspace(1) %in, i64 %tid.ext
+ %loaded = load <4 x i32>, ptr addrspace(1) %gep, align 16
+ %bytes = bitcast <4 x i32> %loaded to <16 x i8>
+
+ %e0 = extractelement <16 x i8> %bytes, i64 0
+ %e1 = extractelement <16 x i8> %bytes, i64 1
+ %e2 = extractelement <16 x i8> %bytes, i64 2
+ %e3 = extractelement <16 x i8> %bytes, i64 3
+ %e4 = extractelement <16 x i8> %bytes, i64 4
+ %e5 = extractelement <16 x i8> %bytes, i64 5
+ %e6 = extractelement <16 x i8> %bytes, i64 6
+ %e7 = extractelement <16 x i8> %bytes, i64 7
+ %e8 = extractelement <16 x i8> %bytes, i64 8
+ %e9 = extractelement <16 x i8> %bytes, i64 9
+ %e10 = extractelement <16 x i8> %bytes, i64 10
+ %e11 = extractelement <16 x i8> %bytes, i64 11
+ %e12 = extractelement <16 x i8> %bytes, i64 12
+ %e13 = extractelement <16 x i8> %bytes, i64 13
+ %e14 = extractelement <16 x i8> %bytes, i64 14
+ %e15 = extractelement <16 x i8> %bytes, i64 15
+
+ %s0 = select i1 %valid, i8 %e0, i8 0
+ %s1 = select i1 %valid, i8 %e1, i8 0
+ %s2 = select i1 %valid, i8 %e2, i8 0
+ %s3 = select i1 %valid, i8 %e3, i8 0
+ %s4 = select i1 %valid, i8 %e4, i8 0
+ %s5 = select i1 %valid, i8 %e5, i8 0
+ %s6 = select i1 %valid, i8 %e6, i8 0
+ %s7 = select i1 %valid, i8 %e7, i8 0
+ %s8 = select i1 %valid, i8 %e8, i8 0
+ %s9 = select i1 %valid, i8 %e9, i8 0
+ %s10 = select i1 %valid, i8 %e10, i8 0
+ %s11 = select i1 %valid, i8 %e11, i8 0
+ %s12 = select i1 %valid, i8 %e12, i8 0
+ %s13 = select i1 %valid, i8 %e13, i8 0
+ %s14 = select i1 %valid, i8 %e14, i8 0
+ %s15 = select i1 %valid, i8 %e15, i8 0
+
+ store i8 %s0, ptr addrspace(1) %out, align 1
+ %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
+ store i8 %s1, ptr addrspace(1) %ptr1, align 1
+ %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
+ store i8 %s2, ptr addrspace(1) %ptr2, align 1
+ %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
+ store i8 %s3, ptr addrspace(1) %ptr3, align 1
+ %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
+ store i8 %s4, ptr addrspace(1) %ptr4, align 1
+ %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
+ store i8 %s5, ptr addrspace(1) %ptr5, align 1
+ %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
+ store i8 %s6, ptr addrspace(1) %ptr6, align 1
+ %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
+ store i8 %s7, ptr addrspace(1) %ptr7, align 1
+ %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
+ store i8 %s8, ptr addrspace(1) %ptr8, align 1
+ %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
+ store i8 %s9, ptr addrspace(1) %ptr9, align 1
+ %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
+ store i8 %s10, ptr addrspace(1) %ptr10, align 1
+ %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
+ store i8 %s11, ptr addrspace(1) %ptr11, align 1
+ %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
+ store i8 %s12, ptr addrspace(1) %ptr12, align 1
+ %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
+ store i8 %s13, ptr addrspace(1) %ptr13, align 1
+ %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
+ store i8 %s14, ptr addrspace(1) %ptr14, align 1
+ %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
+ store i8 %s15, ptr addrspace(1) %ptr15, align 1
+
+ ret void
+}
+
+; Test with v8i8 from v2i32 (smaller vector)
+define amdgpu_kernel void @combine_scalar_selects_v8i8(
+; CHECK-OPT-LABEL: combine_scalar_selects_v8i8:
+; CHECK-OPT: ; %bb.0: ; %entry
+; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; CHECK-OPT-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-OPT-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
+; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-OPT-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; CHECK-OPT-NEXT: s_endpgm
+;
+; CHECK-NOOPT-LABEL: combine_scalar_selects_v8i8:
+; CHECK-NOOPT: ; %bb.0: ; %entry
+; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; CHECK-NOOPT-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NOOPT-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
+; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v3, 8, v0
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v5, 24, v0
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v6, 8, v1
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v8, 24, v1
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v6, 8, v6
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v8, 8, v8
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v6, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; CHECK-NOOPT-NEXT: s_endpgm
+ ptr addrspace(1) %in,
+ ptr addrspace(1) %out,
+ i1 %cond
+) {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = zext i32 %tid to i64
+ %gep = getelementptr <2 x i32>, ptr addrspace(1) %in, i64 %tid.ext
+ %loaded = load <2 x i32>, ptr addrspace(1) %gep, align 8
+ %bytes = bitcast <2 x i32> %loaded to <8 x i8>
+ %e0 = extractelement <8 x i8> %bytes, i64 0
+ %e1 = extractelement <8 x i8> %bytes, i64 1
+ %e2 = extractelement <8 x i8> %bytes, i64 2
+ %e3 = extractelement <8 x i8> %bytes, i64 3
+ %e4 = extractelement <8 x i8> %bytes, i64 4
+ %e5 = extractelement <8 x i8> %bytes, i64 5
+ %e6 = extractelement <8 x i8> %bytes, i64 6
+ %e7 = extractelement <8 x i8> %bytes, i64 7
+ %s0 = select i1 %cond, i8 %e0, i8 0
+ %s1 = select i1 %cond, i8 %e1, i8 0
+ %s2 = select i1 %cond, i8 %e2, i8 0
+ %s3 = select i1 %cond, i8 %e3, i8 0
+ %s4 = select i1 %cond, i8 %e4, i8 0
+ %s5 = select i1 %cond, i8 %e5, i8 0
+ %s6 = select i1 %cond, i8 %e6, i8 0
+ %s7 = select i1 %cond, i8 %e7, i8 0
+ store i8 %s0, ptr addrspace(1) %out, align 1
+ %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
+ store i8 %s1, ptr addrspace(1) %ptr1, align 1
+ %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
+ store i8 %s2, ptr addrspace(1) %ptr2, align 1
+ %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
+ store i8 %s3, ptr addrspace(1) %ptr3, align 1
+ %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
+ store i8 %s4, ptr addrspace(1) %ptr4, align 1
+ %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
+ store i8 %s5, ptr addrspace(1) %ptr5, align 1
+ %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
+ store i8 %s6, ptr addrspace(1) %ptr6, align 1
+ %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
+ store i8 %s7, ptr addrspace(1) %ptr7, align 1
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
>From 5a2162c0e3ff63b41ba897439244fabce07aebb9 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Tue, 30 Dec 2025 22:02:32 +0530
Subject: [PATCH 03/11] incorporate handling of other data type casting
---
.../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 23 +-
.../AMDGPU/combine-scalar-selects-asm.ll | 979 ++++++++++++++
.../CodeGen/AMDGPU/combine-scalar-selects.ll | 1157 +++++++++++++++++
3 files changed, 2152 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 9e5b076409862..3bcad6dd9cfff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -599,16 +599,24 @@ bool AMDGPULateCodeGenPrepare::tryCombineSelectsFromBitcast(BitCastInst &BC) {
if (!SrcVecTy || !DstVecTy)
return false;
- // Must be: bitcast <N x i32> to <M x i8>
- if (!SrcVecTy->getElementType()->isIntegerTy(32) ||
- !DstVecTy->getElementType()->isIntegerTy(8))
+ // Source can be any 32-bit or 64-bit element type (i32, i64, float, double).
+ // Destination must be smaller integer elements (i8, i16, or i32 from i64).
+ // Zero in all these types is all-bits-zero, so the transformation is valid.
+ Type *SrcEltTy = SrcVecTy->getElementType();
+ Type *DstEltTy = DstVecTy->getElementType();
+ unsigned SrcEltBits = SrcEltTy->getPrimitiveSizeInBits();
+ unsigned DstEltBits = DstEltTy->getPrimitiveSizeInBits();
+
+ if (SrcEltBits != 32 && SrcEltBits != 64)
+ return false;
+
+ if (!DstEltTy->isIntegerTy() || DstEltBits >= SrcEltBits)
return false;
unsigned NumDstElts = DstVecTy->getNumElements();
BasicBlock *BB = BC.getParent();
// Require at least half the elements to have matching selects.
- // For v16i8 (from v4i32), this means at least 8 selects must match.
// This threshold ensures the transformation is profitable.
unsigned MinRequired = NumDstElts / 2;
@@ -642,10 +650,11 @@ bool AMDGPULateCodeGenPrepare::tryCombineSelectsFromBitcast(BitCastInst &BC) {
unsigned Idx = IdxC->getZExtValue();
for (User *EU : Ext->users()) {
- auto *Sel = dyn_cast<SelectInst>(EU);
// Must be: select %cond, %extract, 0 (in same BB)
- if (!Sel || Sel->getParent() != BB || Sel->getTrueValue() != Ext ||
- !match(Sel->getFalseValue(), m_Zero()))
+ if (!match(EU, m_Select(m_Value(), m_Specific(Ext), m_Zero())))
+ continue;
+ SelectInst *Sel = cast<SelectInst>(EU);
+ if (Sel->getParent() != BB)
continue;
auto &Group = ConditionGroups[Sel->getCondition()];
diff --git a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
index 0be2a7f95b11c..4c60a78ccc716 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
@@ -265,4 +265,983 @@ entry:
ret void
}
+; Test <4 x i32> to <8 x i16> (32-bit elements to 16-bit elements)
+define amdgpu_kernel void @combine_v4i32_to_v8i16_asm(
+; CHECK-OPT-LABEL: combine_v4i32_to_v8i16_asm:
+; CHECK-OPT: ; %bb.0: ; %entry
+; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-OPT-NEXT: s_endpgm
+;
+; CHECK-NOOPT-LABEL: combine_v4i32_to_v8i16_asm:
+; CHECK-NOOPT: ; %bb.0: ; %entry
+; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-NOOPT-NEXT: s_mov_b32 s0, 0x5040100
+; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v2, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v0, v4, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v1, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v3, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; CHECK-NOOPT-NEXT: v_perm_b32 v1, v1, v6, s0
+; CHECK-NOOPT-NEXT: v_perm_b32 v2, v2, v7, s0
+; CHECK-NOOPT-NEXT: v_perm_b32 v3, v3, v8, s0
+; CHECK-NOOPT-NEXT: v_perm_b32 v0, v0, v5, s0
+; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-NOOPT-NEXT: s_endpgm
+ ptr addrspace(1) %in,
+ ptr addrspace(1) %out,
+ i1 %cond
+) {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = zext i32 %tid to i64
+ %gep = getelementptr <4 x i32>, ptr addrspace(1) %in, i64 %tid.ext
+ %loaded = load <4 x i32>, ptr addrspace(1) %gep, align 16
+ %halves = bitcast <4 x i32> %loaded to <8 x i16>
+ %e0 = extractelement <8 x i16> %halves, i64 0
+ %e1 = extractelement <8 x i16> %halves, i64 1
+ %e2 = extractelement <8 x i16> %halves, i64 2
+ %e3 = extractelement <8 x i16> %halves, i64 3
+ %e4 = extractelement <8 x i16> %halves, i64 4
+ %e5 = extractelement <8 x i16> %halves, i64 5
+ %e6 = extractelement <8 x i16> %halves, i64 6
+ %e7 = extractelement <8 x i16> %halves, i64 7
+ %s0 = select i1 %cond, i16 %e0, i16 0
+ %s1 = select i1 %cond, i16 %e1, i16 0
+ %s2 = select i1 %cond, i16 %e2, i16 0
+ %s3 = select i1 %cond, i16 %e3, i16 0
+ %s4 = select i1 %cond, i16 %e4, i16 0
+ %s5 = select i1 %cond, i16 %e5, i16 0
+ %s6 = select i1 %cond, i16 %e6, i16 0
+ %s7 = select i1 %cond, i16 %e7, i16 0
+ store i16 %s0, ptr addrspace(1) %out, align 2
+ %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
+ store i16 %s1, ptr addrspace(1) %ptr1, align 2
+ %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
+ store i16 %s2, ptr addrspace(1) %ptr2, align 2
+ %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
+ store i16 %s3, ptr addrspace(1) %ptr3, align 2
+ %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
+ store i16 %s4, ptr addrspace(1) %ptr4, align 2
+ %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
+ store i16 %s5, ptr addrspace(1) %ptr5, align 2
+ %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
+ store i16 %s6, ptr addrspace(1) %ptr6, align 2
+ %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
+ store i16 %s7, ptr addrspace(1) %ptr7, align 2
+ ret void
+}
+
+; Test <4 x float> to <16 x i8> (float elements to byte elements)
+define amdgpu_kernel void @combine_v4f32_to_v16i8_asm(
+; CHECK-OPT-LABEL: combine_v4f32_to_v16i8_asm:
+; CHECK-OPT: ; %bb.0: ; %entry
+; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-OPT-NEXT: s_endpgm
+;
+; CHECK-NOOPT-LABEL: combine_v4f32_to_v16i8_asm:
+; CHECK-NOOPT: ; %bb.0: ; %entry
+; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v5, 8, v0
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v7, 24, v0
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v10, 24, v1
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v11, 8, v2
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v13, 24, v2
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v14, 8, v3
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v16, 24, v3
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v14, 8, v14
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v16, 8, v16
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v11, 8, v11
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v13, 8, v13
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v8, 8, v8
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v10, 8, v10
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v7, 8, v7
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v14, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v11, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v8, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-NOOPT-NEXT: s_endpgm
+ ptr addrspace(1) %in,
+ ptr addrspace(1) %out,
+ i1 %cond
+) {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = zext i32 %tid to i64
+ %gep = getelementptr <4 x float>, ptr addrspace(1) %in, i64 %tid.ext
+ %loaded = load <4 x float>, ptr addrspace(1) %gep, align 16
+ %bytes = bitcast <4 x float> %loaded to <16 x i8>
+ %e0 = extractelement <16 x i8> %bytes, i64 0
+ %e1 = extractelement <16 x i8> %bytes, i64 1
+ %e2 = extractelement <16 x i8> %bytes, i64 2
+ %e3 = extractelement <16 x i8> %bytes, i64 3
+ %e4 = extractelement <16 x i8> %bytes, i64 4
+ %e5 = extractelement <16 x i8> %bytes, i64 5
+ %e6 = extractelement <16 x i8> %bytes, i64 6
+ %e7 = extractelement <16 x i8> %bytes, i64 7
+ %e8 = extractelement <16 x i8> %bytes, i64 8
+ %e9 = extractelement <16 x i8> %bytes, i64 9
+ %e10 = extractelement <16 x i8> %bytes, i64 10
+ %e11 = extractelement <16 x i8> %bytes, i64 11
+ %e12 = extractelement <16 x i8> %bytes, i64 12
+ %e13 = extractelement <16 x i8> %bytes, i64 13
+ %e14 = extractelement <16 x i8> %bytes, i64 14
+ %e15 = extractelement <16 x i8> %bytes, i64 15
+ %s0 = select i1 %cond, i8 %e0, i8 0
+ %s1 = select i1 %cond, i8 %e1, i8 0
+ %s2 = select i1 %cond, i8 %e2, i8 0
+ %s3 = select i1 %cond, i8 %e3, i8 0
+ %s4 = select i1 %cond, i8 %e4, i8 0
+ %s5 = select i1 %cond, i8 %e5, i8 0
+ %s6 = select i1 %cond, i8 %e6, i8 0
+ %s7 = select i1 %cond, i8 %e7, i8 0
+ %s8 = select i1 %cond, i8 %e8, i8 0
+ %s9 = select i1 %cond, i8 %e9, i8 0
+ %s10 = select i1 %cond, i8 %e10, i8 0
+ %s11 = select i1 %cond, i8 %e11, i8 0
+ %s12 = select i1 %cond, i8 %e12, i8 0
+ %s13 = select i1 %cond, i8 %e13, i8 0
+ %s14 = select i1 %cond, i8 %e14, i8 0
+ %s15 = select i1 %cond, i8 %e15, i8 0
+ store i8 %s0, ptr addrspace(1) %out, align 1
+ %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
+ store i8 %s1, ptr addrspace(1) %ptr1, align 1
+ %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
+ store i8 %s2, ptr addrspace(1) %ptr2, align 1
+ %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
+ store i8 %s3, ptr addrspace(1) %ptr3, align 1
+ %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
+ store i8 %s4, ptr addrspace(1) %ptr4, align 1
+ %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
+ store i8 %s5, ptr addrspace(1) %ptr5, align 1
+ %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
+ store i8 %s6, ptr addrspace(1) %ptr6, align 1
+ %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
+ store i8 %s7, ptr addrspace(1) %ptr7, align 1
+ %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
+ store i8 %s8, ptr addrspace(1) %ptr8, align 1
+ %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
+ store i8 %s9, ptr addrspace(1) %ptr9, align 1
+ %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
+ store i8 %s10, ptr addrspace(1) %ptr10, align 1
+ %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
+ store i8 %s11, ptr addrspace(1) %ptr11, align 1
+ %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
+ store i8 %s12, ptr addrspace(1) %ptr12, align 1
+ %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
+ store i8 %s13, ptr addrspace(1) %ptr13, align 1
+ %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
+ store i8 %s14, ptr addrspace(1) %ptr14, align 1
+ %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
+ store i8 %s15, ptr addrspace(1) %ptr15, align 1
+ ret void
+}
+
+; Test <4 x float> to <8 x i16> (float elements to 16-bit elements)
+define amdgpu_kernel void @combine_v4f32_to_v8i16_asm(
+; CHECK-OPT-LABEL: combine_v4f32_to_v8i16_asm:
+; CHECK-OPT: ; %bb.0: ; %entry
+; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-OPT-NEXT: s_endpgm
+;
+; CHECK-NOOPT-LABEL: combine_v4f32_to_v8i16_asm:
+; CHECK-NOOPT: ; %bb.0: ; %entry
+; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-NOOPT-NEXT: s_mov_b32 s0, 0x5040100
+; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v2, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v0, v4, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v1, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v3, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; CHECK-NOOPT-NEXT: v_perm_b32 v1, v1, v6, s0
+; CHECK-NOOPT-NEXT: v_perm_b32 v2, v2, v7, s0
+; CHECK-NOOPT-NEXT: v_perm_b32 v3, v3, v8, s0
+; CHECK-NOOPT-NEXT: v_perm_b32 v0, v0, v5, s0
+; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-NOOPT-NEXT: s_endpgm
+ ptr addrspace(1) %in,
+ ptr addrspace(1) %out,
+ i1 %cond
+) {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = zext i32 %tid to i64
+ %gep = getelementptr <4 x float>, ptr addrspace(1) %in, i64 %tid.ext
+ %loaded = load <4 x float>, ptr addrspace(1) %gep, align 16
+ %halves = bitcast <4 x float> %loaded to <8 x i16>
+ %e0 = extractelement <8 x i16> %halves, i64 0
+ %e1 = extractelement <8 x i16> %halves, i64 1
+ %e2 = extractelement <8 x i16> %halves, i64 2
+ %e3 = extractelement <8 x i16> %halves, i64 3
+ %e4 = extractelement <8 x i16> %halves, i64 4
+ %e5 = extractelement <8 x i16> %halves, i64 5
+ %e6 = extractelement <8 x i16> %halves, i64 6
+ %e7 = extractelement <8 x i16> %halves, i64 7
+ %s0 = select i1 %cond, i16 %e0, i16 0
+ %s1 = select i1 %cond, i16 %e1, i16 0
+ %s2 = select i1 %cond, i16 %e2, i16 0
+ %s3 = select i1 %cond, i16 %e3, i16 0
+ %s4 = select i1 %cond, i16 %e4, i16 0
+ %s5 = select i1 %cond, i16 %e5, i16 0
+ %s6 = select i1 %cond, i16 %e6, i16 0
+ %s7 = select i1 %cond, i16 %e7, i16 0
+ store i16 %s0, ptr addrspace(1) %out, align 2
+ %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
+ store i16 %s1, ptr addrspace(1) %ptr1, align 2
+ %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
+ store i16 %s2, ptr addrspace(1) %ptr2, align 2
+ %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
+ store i16 %s3, ptr addrspace(1) %ptr3, align 2
+ %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
+ store i16 %s4, ptr addrspace(1) %ptr4, align 2
+ %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
+ store i16 %s5, ptr addrspace(1) %ptr5, align 2
+ %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
+ store i16 %s6, ptr addrspace(1) %ptr6, align 2
+ %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
+ store i16 %s7, ptr addrspace(1) %ptr7, align 2
+ ret void
+}
+
+; Test <2 x i64> to <16 x i8> (64-bit elements to byte elements)
+define amdgpu_kernel void @combine_v2i64_to_v16i8_asm(
+; CHECK-OPT-LABEL: combine_v2i64_to_v16i8_asm:
+; CHECK-OPT: ; %bb.0: ; %entry
+; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-OPT-NEXT: s_endpgm
+;
+; CHECK-NOOPT-LABEL: combine_v2i64_to_v16i8_asm:
+; CHECK-NOOPT: ; %bb.0: ; %entry
+; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v5, 8, v0
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v7, 24, v0
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v10, 24, v1
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v11, 8, v2
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v13, 24, v2
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v14, 8, v3
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v16, 24, v3
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v14, 8, v14
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v16, 8, v16
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v11, 8, v11
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v13, 8, v13
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v8, 8, v8
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v10, 8, v10
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v7, 8, v7
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v14, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v11, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v8, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-NOOPT-NEXT: s_endpgm
+ ptr addrspace(1) %in,
+ ptr addrspace(1) %out,
+ i1 %cond
+) {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = zext i32 %tid to i64
+ %gep = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext
+ %loaded = load <2 x i64>, ptr addrspace(1) %gep, align 16
+ %bytes = bitcast <2 x i64> %loaded to <16 x i8>
+ %e0 = extractelement <16 x i8> %bytes, i64 0
+ %e1 = extractelement <16 x i8> %bytes, i64 1
+ %e2 = extractelement <16 x i8> %bytes, i64 2
+ %e3 = extractelement <16 x i8> %bytes, i64 3
+ %e4 = extractelement <16 x i8> %bytes, i64 4
+ %e5 = extractelement <16 x i8> %bytes, i64 5
+ %e6 = extractelement <16 x i8> %bytes, i64 6
+ %e7 = extractelement <16 x i8> %bytes, i64 7
+ %e8 = extractelement <16 x i8> %bytes, i64 8
+ %e9 = extractelement <16 x i8> %bytes, i64 9
+ %e10 = extractelement <16 x i8> %bytes, i64 10
+ %e11 = extractelement <16 x i8> %bytes, i64 11
+ %e12 = extractelement <16 x i8> %bytes, i64 12
+ %e13 = extractelement <16 x i8> %bytes, i64 13
+ %e14 = extractelement <16 x i8> %bytes, i64 14
+ %e15 = extractelement <16 x i8> %bytes, i64 15
+ %s0 = select i1 %cond, i8 %e0, i8 0
+ %s1 = select i1 %cond, i8 %e1, i8 0
+ %s2 = select i1 %cond, i8 %e2, i8 0
+ %s3 = select i1 %cond, i8 %e3, i8 0
+ %s4 = select i1 %cond, i8 %e4, i8 0
+ %s5 = select i1 %cond, i8 %e5, i8 0
+ %s6 = select i1 %cond, i8 %e6, i8 0
+ %s7 = select i1 %cond, i8 %e7, i8 0
+ %s8 = select i1 %cond, i8 %e8, i8 0
+ %s9 = select i1 %cond, i8 %e9, i8 0
+ %s10 = select i1 %cond, i8 %e10, i8 0
+ %s11 = select i1 %cond, i8 %e11, i8 0
+ %s12 = select i1 %cond, i8 %e12, i8 0
+ %s13 = select i1 %cond, i8 %e13, i8 0
+ %s14 = select i1 %cond, i8 %e14, i8 0
+ %s15 = select i1 %cond, i8 %e15, i8 0
+ store i8 %s0, ptr addrspace(1) %out, align 1
+ %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
+ store i8 %s1, ptr addrspace(1) %ptr1, align 1
+ %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
+ store i8 %s2, ptr addrspace(1) %ptr2, align 1
+ %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
+ store i8 %s3, ptr addrspace(1) %ptr3, align 1
+ %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
+ store i8 %s4, ptr addrspace(1) %ptr4, align 1
+ %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
+ store i8 %s5, ptr addrspace(1) %ptr5, align 1
+ %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
+ store i8 %s6, ptr addrspace(1) %ptr6, align 1
+ %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
+ store i8 %s7, ptr addrspace(1) %ptr7, align 1
+ %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
+ store i8 %s8, ptr addrspace(1) %ptr8, align 1
+ %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
+ store i8 %s9, ptr addrspace(1) %ptr9, align 1
+ %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
+ store i8 %s10, ptr addrspace(1) %ptr10, align 1
+ %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
+ store i8 %s11, ptr addrspace(1) %ptr11, align 1
+ %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
+ store i8 %s12, ptr addrspace(1) %ptr12, align 1
+ %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
+ store i8 %s13, ptr addrspace(1) %ptr13, align 1
+ %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
+ store i8 %s14, ptr addrspace(1) %ptr14, align 1
+ %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
+ store i8 %s15, ptr addrspace(1) %ptr15, align 1
+ ret void
+}
+
+; Test <2 x i64> to <8 x i16> (64-bit elements to 16-bit elements)
+define amdgpu_kernel void @combine_v2i64_to_v8i16_asm(
+; CHECK-OPT-LABEL: combine_v2i64_to_v8i16_asm:
+; CHECK-OPT: ; %bb.0: ; %entry
+; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-OPT-NEXT: s_endpgm
+;
+; CHECK-NOOPT-LABEL: combine_v2i64_to_v8i16_asm:
+; CHECK-NOOPT: ; %bb.0: ; %entry
+; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-NOOPT-NEXT: s_mov_b32 s0, 0x5040100
+; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v2, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v0, v4, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v1, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v3, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; CHECK-NOOPT-NEXT: v_perm_b32 v1, v1, v6, s0
+; CHECK-NOOPT-NEXT: v_perm_b32 v2, v2, v7, s0
+; CHECK-NOOPT-NEXT: v_perm_b32 v3, v3, v8, s0
+; CHECK-NOOPT-NEXT: v_perm_b32 v0, v0, v5, s0
+; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-NOOPT-NEXT: s_endpgm
+ ptr addrspace(1) %in,
+ ptr addrspace(1) %out,
+ i1 %cond
+) {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = zext i32 %tid to i64
+ %gep = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext
+ %loaded = load <2 x i64>, ptr addrspace(1) %gep, align 16
+ %halves = bitcast <2 x i64> %loaded to <8 x i16>
+ %e0 = extractelement <8 x i16> %halves, i64 0
+ %e1 = extractelement <8 x i16> %halves, i64 1
+ %e2 = extractelement <8 x i16> %halves, i64 2
+ %e3 = extractelement <8 x i16> %halves, i64 3
+ %e4 = extractelement <8 x i16> %halves, i64 4
+ %e5 = extractelement <8 x i16> %halves, i64 5
+ %e6 = extractelement <8 x i16> %halves, i64 6
+ %e7 = extractelement <8 x i16> %halves, i64 7
+ %s0 = select i1 %cond, i16 %e0, i16 0
+ %s1 = select i1 %cond, i16 %e1, i16 0
+ %s2 = select i1 %cond, i16 %e2, i16 0
+ %s3 = select i1 %cond, i16 %e3, i16 0
+ %s4 = select i1 %cond, i16 %e4, i16 0
+ %s5 = select i1 %cond, i16 %e5, i16 0
+ %s6 = select i1 %cond, i16 %e6, i16 0
+ %s7 = select i1 %cond, i16 %e7, i16 0
+ store i16 %s0, ptr addrspace(1) %out, align 2
+ %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
+ store i16 %s1, ptr addrspace(1) %ptr1, align 2
+ %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
+ store i16 %s2, ptr addrspace(1) %ptr2, align 2
+ %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
+ store i16 %s3, ptr addrspace(1) %ptr3, align 2
+ %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
+ store i16 %s4, ptr addrspace(1) %ptr4, align 2
+ %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
+ store i16 %s5, ptr addrspace(1) %ptr5, align 2
+ %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
+ store i16 %s6, ptr addrspace(1) %ptr6, align 2
+ %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
+ store i16 %s7, ptr addrspace(1) %ptr7, align 2
+ ret void
+}
+
+; Test <2 x i64> to <4 x i32> (64-bit elements to 32-bit elements)
+define amdgpu_kernel void @combine_v2i64_to_v4i32_asm(
+; CHECK-OPT-LABEL: combine_v2i64_to_v4i32_asm:
+; CHECK-OPT: ; %bb.0: ; %entry
+; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-OPT-NEXT: s_endpgm
+;
+; CHECK-NOOPT-LABEL: combine_v2i64_to_v4i32_asm:
+; CHECK-NOOPT: ; %bb.0: ; %entry
+; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-NOOPT-NEXT: s_endpgm
+ ptr addrspace(1) %in,
+ ptr addrspace(1) %out,
+ i1 %cond
+) {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = zext i32 %tid to i64
+ %gep = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext
+ %loaded = load <2 x i64>, ptr addrspace(1) %gep, align 16
+ %words = bitcast <2 x i64> %loaded to <4 x i32>
+ %e0 = extractelement <4 x i32> %words, i64 0
+ %e1 = extractelement <4 x i32> %words, i64 1
+ %e2 = extractelement <4 x i32> %words, i64 2
+ %e3 = extractelement <4 x i32> %words, i64 3
+ %s0 = select i1 %cond, i32 %e0, i32 0
+ %s1 = select i1 %cond, i32 %e1, i32 0
+ %s2 = select i1 %cond, i32 %e2, i32 0
+ %s3 = select i1 %cond, i32 %e3, i32 0
+ store i32 %s0, ptr addrspace(1) %out, align 4
+ %ptr1 = getelementptr i32, ptr addrspace(1) %out, i64 1
+ store i32 %s1, ptr addrspace(1) %ptr1, align 4
+ %ptr2 = getelementptr i32, ptr addrspace(1) %out, i64 2
+ store i32 %s2, ptr addrspace(1) %ptr2, align 4
+ %ptr3 = getelementptr i32, ptr addrspace(1) %out, i64 3
+ store i32 %s3, ptr addrspace(1) %ptr3, align 4
+ ret void
+}
+
+; Test <2 x double> to <16 x i8> (double elements to byte elements)
+define amdgpu_kernel void @combine_v2f64_to_v16i8_asm(
+; CHECK-OPT-LABEL: combine_v2f64_to_v16i8_asm:
+; CHECK-OPT: ; %bb.0: ; %entry
+; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-OPT-NEXT: s_endpgm
+;
+; CHECK-NOOPT-LABEL: combine_v2f64_to_v16i8_asm:
+; CHECK-NOOPT: ; %bb.0: ; %entry
+; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v5, 8, v0
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v7, 24, v0
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v10, 24, v1
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v11, 8, v2
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v13, 24, v2
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v14, 8, v3
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v16, 24, v3
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v14, 8, v14
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v16, 8, v16
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v11, 8, v11
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v13, 8, v13
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v8, 8, v8
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v10, 8, v10
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v7, 8, v7
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v14, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v11, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v8, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-NOOPT-NEXT: s_endpgm
+ ptr addrspace(1) %in,
+ ptr addrspace(1) %out,
+ i1 %cond
+) {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = zext i32 %tid to i64
+ %gep = getelementptr <2 x double>, ptr addrspace(1) %in, i64 %tid.ext
+ %loaded = load <2 x double>, ptr addrspace(1) %gep, align 16
+ %bytes = bitcast <2 x double> %loaded to <16 x i8>
+ %e0 = extractelement <16 x i8> %bytes, i64 0
+ %e1 = extractelement <16 x i8> %bytes, i64 1
+ %e2 = extractelement <16 x i8> %bytes, i64 2
+ %e3 = extractelement <16 x i8> %bytes, i64 3
+ %e4 = extractelement <16 x i8> %bytes, i64 4
+ %e5 = extractelement <16 x i8> %bytes, i64 5
+ %e6 = extractelement <16 x i8> %bytes, i64 6
+ %e7 = extractelement <16 x i8> %bytes, i64 7
+ %e8 = extractelement <16 x i8> %bytes, i64 8
+ %e9 = extractelement <16 x i8> %bytes, i64 9
+ %e10 = extractelement <16 x i8> %bytes, i64 10
+ %e11 = extractelement <16 x i8> %bytes, i64 11
+ %e12 = extractelement <16 x i8> %bytes, i64 12
+ %e13 = extractelement <16 x i8> %bytes, i64 13
+ %e14 = extractelement <16 x i8> %bytes, i64 14
+ %e15 = extractelement <16 x i8> %bytes, i64 15
+ %s0 = select i1 %cond, i8 %e0, i8 0
+ %s1 = select i1 %cond, i8 %e1, i8 0
+ %s2 = select i1 %cond, i8 %e2, i8 0
+ %s3 = select i1 %cond, i8 %e3, i8 0
+ %s4 = select i1 %cond, i8 %e4, i8 0
+ %s5 = select i1 %cond, i8 %e5, i8 0
+ %s6 = select i1 %cond, i8 %e6, i8 0
+ %s7 = select i1 %cond, i8 %e7, i8 0
+ %s8 = select i1 %cond, i8 %e8, i8 0
+ %s9 = select i1 %cond, i8 %e9, i8 0
+ %s10 = select i1 %cond, i8 %e10, i8 0
+ %s11 = select i1 %cond, i8 %e11, i8 0
+ %s12 = select i1 %cond, i8 %e12, i8 0
+ %s13 = select i1 %cond, i8 %e13, i8 0
+ %s14 = select i1 %cond, i8 %e14, i8 0
+ %s15 = select i1 %cond, i8 %e15, i8 0
+ store i8 %s0, ptr addrspace(1) %out, align 1
+ %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
+ store i8 %s1, ptr addrspace(1) %ptr1, align 1
+ %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
+ store i8 %s2, ptr addrspace(1) %ptr2, align 1
+ %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
+ store i8 %s3, ptr addrspace(1) %ptr3, align 1
+ %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
+ store i8 %s4, ptr addrspace(1) %ptr4, align 1
+ %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
+ store i8 %s5, ptr addrspace(1) %ptr5, align 1
+ %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
+ store i8 %s6, ptr addrspace(1) %ptr6, align 1
+ %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
+ store i8 %s7, ptr addrspace(1) %ptr7, align 1
+ %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
+ store i8 %s8, ptr addrspace(1) %ptr8, align 1
+ %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
+ store i8 %s9, ptr addrspace(1) %ptr9, align 1
+ %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
+ store i8 %s10, ptr addrspace(1) %ptr10, align 1
+ %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
+ store i8 %s11, ptr addrspace(1) %ptr11, align 1
+ %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
+ store i8 %s12, ptr addrspace(1) %ptr12, align 1
+ %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
+ store i8 %s13, ptr addrspace(1) %ptr13, align 1
+ %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
+ store i8 %s14, ptr addrspace(1) %ptr14, align 1
+ %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
+ store i8 %s15, ptr addrspace(1) %ptr15, align 1
+ ret void
+}
+
+; Test <2 x double> to <8 x i16> (double elements to 16-bit elements)
+define amdgpu_kernel void @combine_v2f64_to_v8i16_asm(
+; CHECK-OPT-LABEL: combine_v2f64_to_v8i16_asm:
+; CHECK-OPT: ; %bb.0: ; %entry
+; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-OPT-NEXT: s_endpgm
+;
+; CHECK-NOOPT-LABEL: combine_v2f64_to_v8i16_asm:
+; CHECK-NOOPT: ; %bb.0: ; %entry
+; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-NOOPT-NEXT: s_mov_b32 s0, 0x5040100
+; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v2, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v0, v4, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v1, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v3, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; CHECK-NOOPT-NEXT: v_perm_b32 v1, v1, v6, s0
+; CHECK-NOOPT-NEXT: v_perm_b32 v2, v2, v7, s0
+; CHECK-NOOPT-NEXT: v_perm_b32 v3, v3, v8, s0
+; CHECK-NOOPT-NEXT: v_perm_b32 v0, v0, v5, s0
+; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-NOOPT-NEXT: s_endpgm
+ ptr addrspace(1) %in,
+ ptr addrspace(1) %out,
+ i1 %cond
+) {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = zext i32 %tid to i64
+ %gep = getelementptr <2 x double>, ptr addrspace(1) %in, i64 %tid.ext
+ %loaded = load <2 x double>, ptr addrspace(1) %gep, align 16
+ %halves = bitcast <2 x double> %loaded to <8 x i16>
+ %e0 = extractelement <8 x i16> %halves, i64 0
+ %e1 = extractelement <8 x i16> %halves, i64 1
+ %e2 = extractelement <8 x i16> %halves, i64 2
+ %e3 = extractelement <8 x i16> %halves, i64 3
+ %e4 = extractelement <8 x i16> %halves, i64 4
+ %e5 = extractelement <8 x i16> %halves, i64 5
+ %e6 = extractelement <8 x i16> %halves, i64 6
+ %e7 = extractelement <8 x i16> %halves, i64 7
+ %s0 = select i1 %cond, i16 %e0, i16 0
+ %s1 = select i1 %cond, i16 %e1, i16 0
+ %s2 = select i1 %cond, i16 %e2, i16 0
+ %s3 = select i1 %cond, i16 %e3, i16 0
+ %s4 = select i1 %cond, i16 %e4, i16 0
+ %s5 = select i1 %cond, i16 %e5, i16 0
+ %s6 = select i1 %cond, i16 %e6, i16 0
+ %s7 = select i1 %cond, i16 %e7, i16 0
+ store i16 %s0, ptr addrspace(1) %out, align 2
+ %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
+ store i16 %s1, ptr addrspace(1) %ptr1, align 2
+ %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
+ store i16 %s2, ptr addrspace(1) %ptr2, align 2
+ %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
+ store i16 %s3, ptr addrspace(1) %ptr3, align 2
+ %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
+ store i16 %s4, ptr addrspace(1) %ptr4, align 2
+ %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
+ store i16 %s5, ptr addrspace(1) %ptr5, align 2
+ %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
+ store i16 %s6, ptr addrspace(1) %ptr6, align 2
+ %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
+ store i16 %s7, ptr addrspace(1) %ptr7, align 2
+ ret void
+}
+
+; Test <2 x double> to <4 x i32> (double elements to 32-bit elements)
+define amdgpu_kernel void @combine_v2f64_to_v4i32_asm(
+; CHECK-OPT-LABEL: combine_v2f64_to_v4i32_asm:
+; CHECK-OPT: ; %bb.0: ; %entry
+; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-OPT-NEXT: s_endpgm
+;
+; CHECK-NOOPT-LABEL: combine_v2f64_to_v4i32_asm:
+; CHECK-NOOPT: ; %bb.0: ; %entry
+; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
+; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; CHECK-NOOPT-NEXT: s_endpgm
+ ptr addrspace(1) %in,
+ ptr addrspace(1) %out,
+ i1 %cond
+) {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = zext i32 %tid to i64
+ %gep = getelementptr <2 x double>, ptr addrspace(1) %in, i64 %tid.ext
+ %loaded = load <2 x double>, ptr addrspace(1) %gep, align 16
+ %words = bitcast <2 x double> %loaded to <4 x i32>
+ %e0 = extractelement <4 x i32> %words, i64 0
+ %e1 = extractelement <4 x i32> %words, i64 1
+ %e2 = extractelement <4 x i32> %words, i64 2
+ %e3 = extractelement <4 x i32> %words, i64 3
+ %s0 = select i1 %cond, i32 %e0, i32 0
+ %s1 = select i1 %cond, i32 %e1, i32 0
+ %s2 = select i1 %cond, i32 %e2, i32 0
+ %s3 = select i1 %cond, i32 %e3, i32 0
+ store i32 %s0, ptr addrspace(1) %out, align 4
+ %ptr1 = getelementptr i32, ptr addrspace(1) %out, i64 1
+ store i32 %s1, ptr addrspace(1) %ptr1, align 4
+ %ptr2 = getelementptr i32, ptr addrspace(1) %out, i64 2
+ store i32 %s2, ptr addrspace(1) %ptr2, align 4
+ %ptr3 = getelementptr i32, ptr addrspace(1) %out, i64 3
+ store i32 %s3, ptr addrspace(1) %ptr3, align 4
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
index 973da749d9daf..38e20647d4214 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
@@ -634,5 +634,1162 @@ entry:
ret void
}
+; Test <4 x i32> to <8 x i16> (32-bit elements to 16-bit elements)
+define amdgpu_kernel void @combine_v4i32_to_v8i16(
+; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_v4i32_to_v8i16(
+; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-OPT-NEXT: [[ENTRY:.*:]]
+; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <4 x i32> [[SRC]], <4 x i32> zeroinitializer
+; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <4 x i32> [[COMBINED_SEL]] to <8 x i16>
+; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 0
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 7
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 4
+; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 1
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 2
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 6
+; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: store i16 [[TMP0]], ptr addrspace(1) [[OUT]], align 2
+; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-OPT-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[PTR1]], align 2
+; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-OPT-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[PTR2]], align 2
+; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-OPT-NEXT: store i16 [[TMP7]], ptr addrspace(1) [[PTR3]], align 2
+; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[PTR4]], align 2
+; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-OPT-NEXT: store i16 [[TMP4]], ptr addrspace(1) [[PTR5]], align 2
+; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-OPT-NEXT: store i16 [[TMP6]], ptr addrspace(1) [[PTR6]], align 2
+; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-OPT-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[PTR7]], align 2
+; CHECK-OPT-NEXT: ret void
+;
+; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_v4i32_to_v8i16(
+; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
+; CHECK-NOOPT-NEXT: [[HALVES:%.*]] = bitcast <4 x i32> [[SRC]] to <8 x i16>
+; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <8 x i16> [[HALVES]], i64 0
+; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <8 x i16> [[HALVES]], i64 1
+; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <8 x i16> [[HALVES]], i64 2
+; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <8 x i16> [[HALVES]], i64 3
+; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <8 x i16> [[HALVES]], i64 4
+; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <8 x i16> [[HALVES]], i64 5
+; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <8 x i16> [[HALVES]], i64 6
+; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <8 x i16> [[HALVES]], i64 7
+; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i16 [[E0]], i16 0
+; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i16 [[E1]], i16 0
+; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i16 [[E2]], i16 0
+; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i16 [[E3]], i16 0
+; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i16 [[E4]], i16 0
+; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i16 [[E5]], i16 0
+; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i16 [[E6]], i16 0
+; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i16 [[E7]], i16 0
+; CHECK-NOOPT-NEXT: store i16 [[S0]], ptr addrspace(1) [[OUT]], align 2
+; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NOOPT-NEXT: store i16 [[S1]], ptr addrspace(1) [[PTR1]], align 2
+; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-NOOPT-NEXT: store i16 [[S2]], ptr addrspace(1) [[PTR2]], align 2
+; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-NOOPT-NEXT: store i16 [[S3]], ptr addrspace(1) [[PTR3]], align 2
+; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-NOOPT-NEXT: store i16 [[S4]], ptr addrspace(1) [[PTR4]], align 2
+; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-NOOPT-NEXT: store i16 [[S5]], ptr addrspace(1) [[PTR5]], align 2
+; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-NOOPT-NEXT: store i16 [[S6]], ptr addrspace(1) [[PTR6]], align 2
+; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-NOOPT-NEXT: store i16 [[S7]], ptr addrspace(1) [[PTR7]], align 2
+; CHECK-NOOPT-NEXT: ret void
+;
+ ptr addrspace(1) %out,
+ <4 x i32> %src,
+ i1 %cond
+) {
+entry:
+ %halves = bitcast <4 x i32> %src to <8 x i16>
+ %e0 = extractelement <8 x i16> %halves, i64 0
+ %e1 = extractelement <8 x i16> %halves, i64 1
+ %e2 = extractelement <8 x i16> %halves, i64 2
+ %e3 = extractelement <8 x i16> %halves, i64 3
+ %e4 = extractelement <8 x i16> %halves, i64 4
+ %e5 = extractelement <8 x i16> %halves, i64 5
+ %e6 = extractelement <8 x i16> %halves, i64 6
+ %e7 = extractelement <8 x i16> %halves, i64 7
+ %s0 = select i1 %cond, i16 %e0, i16 0
+ %s1 = select i1 %cond, i16 %e1, i16 0
+ %s2 = select i1 %cond, i16 %e2, i16 0
+ %s3 = select i1 %cond, i16 %e3, i16 0
+ %s4 = select i1 %cond, i16 %e4, i16 0
+ %s5 = select i1 %cond, i16 %e5, i16 0
+ %s6 = select i1 %cond, i16 %e6, i16 0
+ %s7 = select i1 %cond, i16 %e7, i16 0
+ store i16 %s0, ptr addrspace(1) %out, align 2
+ %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
+ store i16 %s1, ptr addrspace(1) %ptr1, align 2
+ %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
+ store i16 %s2, ptr addrspace(1) %ptr2, align 2
+ %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
+ store i16 %s3, ptr addrspace(1) %ptr3, align 2
+ %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
+ store i16 %s4, ptr addrspace(1) %ptr4, align 2
+ %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
+ store i16 %s5, ptr addrspace(1) %ptr5, align 2
+ %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
+ store i16 %s6, ptr addrspace(1) %ptr6, align 2
+ %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
+ store i16 %s7, ptr addrspace(1) %ptr7, align 2
+ ret void
+}
+
+; Test <4 x float> to <16 x i8> (float elements to byte elements)
+define amdgpu_kernel void @combine_v4f32_to_v16i8(
+; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_v4f32_to_v16i8(
+; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x float> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-OPT-NEXT: [[ENTRY:.*:]]
+; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <4 x float> [[SRC]], <4 x float> zeroinitializer
+; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <4 x float> [[COMBINED_SEL]] to <16 x i8>
+; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 0
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 14
+; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 2
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
+; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 4
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 11
+; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 6
+; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 13
+; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
+; CHECK-OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 8
+; CHECK-OPT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 15
+; CHECK-OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 10
+; CHECK-OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 12
+; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-OPT-NEXT: store i8 [[TMP9]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-OPT-NEXT: store i8 [[TMP3]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-OPT-NEXT: store i8 [[TMP5]], ptr addrspace(1) [[PTR4]], align 1
+; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-OPT-NEXT: store i8 [[TMP14]], ptr addrspace(1) [[PTR5]], align 1
+; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-OPT-NEXT: store i8 [[TMP7]], ptr addrspace(1) [[PTR6]], align 1
+; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-OPT-NEXT: store i8 [[TMP1]], ptr addrspace(1) [[PTR7]], align 1
+; CHECK-OPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
+; CHECK-OPT-NEXT: store i8 [[TMP10]], ptr addrspace(1) [[PTR8]], align 1
+; CHECK-OPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
+; CHECK-OPT-NEXT: store i8 [[TMP4]], ptr addrspace(1) [[PTR9]], align 1
+; CHECK-OPT-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 10
+; CHECK-OPT-NEXT: store i8 [[TMP13]], ptr addrspace(1) [[PTR10]], align 1
+; CHECK-OPT-NEXT: [[PTR11:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 11
+; CHECK-OPT-NEXT: store i8 [[TMP6]], ptr addrspace(1) [[PTR11]], align 1
+; CHECK-OPT-NEXT: [[PTR12:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 12
+; CHECK-OPT-NEXT: store i8 [[TMP15]], ptr addrspace(1) [[PTR12]], align 1
+; CHECK-OPT-NEXT: [[PTR13:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 13
+; CHECK-OPT-NEXT: store i8 [[TMP8]], ptr addrspace(1) [[PTR13]], align 1
+; CHECK-OPT-NEXT: [[PTR14:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 14
+; CHECK-OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[PTR14]], align 1
+; CHECK-OPT-NEXT: [[PTR15:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 15
+; CHECK-OPT-NEXT: store i8 [[TMP11]], ptr addrspace(1) [[PTR15]], align 1
+; CHECK-OPT-NEXT: ret void
+;
+; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_v4f32_to_v16i8(
+; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x float> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
+; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <4 x float> [[SRC]] to <16 x i8>
+; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
+; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[BYTES]], i64 1
+; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <16 x i8> [[BYTES]], i64 2
+; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[BYTES]], i64 3
+; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <16 x i8> [[BYTES]], i64 4
+; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <16 x i8> [[BYTES]], i64 5
+; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <16 x i8> [[BYTES]], i64 6
+; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <16 x i8> [[BYTES]], i64 7
+; CHECK-NOOPT-NEXT: [[E8:%.*]] = extractelement <16 x i8> [[BYTES]], i64 8
+; CHECK-NOOPT-NEXT: [[E9:%.*]] = extractelement <16 x i8> [[BYTES]], i64 9
+; CHECK-NOOPT-NEXT: [[E10:%.*]] = extractelement <16 x i8> [[BYTES]], i64 10
+; CHECK-NOOPT-NEXT: [[E11:%.*]] = extractelement <16 x i8> [[BYTES]], i64 11
+; CHECK-NOOPT-NEXT: [[E12:%.*]] = extractelement <16 x i8> [[BYTES]], i64 12
+; CHECK-NOOPT-NEXT: [[E13:%.*]] = extractelement <16 x i8> [[BYTES]], i64 13
+; CHECK-NOOPT-NEXT: [[E14:%.*]] = extractelement <16 x i8> [[BYTES]], i64 14
+; CHECK-NOOPT-NEXT: [[E15:%.*]] = extractelement <16 x i8> [[BYTES]], i64 15
+; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 [[E0]], i8 0
+; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 [[E1]], i8 0
+; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 [[E2]], i8 0
+; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 [[E3]], i8 0
+; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i8 [[E4]], i8 0
+; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i8 [[E5]], i8 0
+; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i8 [[E6]], i8 0
+; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i8 [[E7]], i8 0
+; CHECK-NOOPT-NEXT: [[S8:%.*]] = select i1 [[COND]], i8 [[E8]], i8 0
+; CHECK-NOOPT-NEXT: [[S9:%.*]] = select i1 [[COND]], i8 [[E9]], i8 0
+; CHECK-NOOPT-NEXT: [[S10:%.*]] = select i1 [[COND]], i8 [[E10]], i8 0
+; CHECK-NOOPT-NEXT: [[S11:%.*]] = select i1 [[COND]], i8 [[E11]], i8 0
+; CHECK-NOOPT-NEXT: [[S12:%.*]] = select i1 [[COND]], i8 [[E12]], i8 0
+; CHECK-NOOPT-NEXT: [[S13:%.*]] = select i1 [[COND]], i8 [[E13]], i8 0
+; CHECK-NOOPT-NEXT: [[S14:%.*]] = select i1 [[COND]], i8 [[E14]], i8 0
+; CHECK-NOOPT-NEXT: [[S15:%.*]] = select i1 [[COND]], i8 [[E15]], i8 0
+; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NOOPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-NOOPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-NOOPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-NOOPT-NEXT: store i8 [[S4]], ptr addrspace(1) [[PTR4]], align 1
+; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-NOOPT-NEXT: store i8 [[S5]], ptr addrspace(1) [[PTR5]], align 1
+; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-NOOPT-NEXT: store i8 [[S6]], ptr addrspace(1) [[PTR6]], align 1
+; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-NOOPT-NEXT: store i8 [[S7]], ptr addrspace(1) [[PTR7]], align 1
+; CHECK-NOOPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
+; CHECK-NOOPT-NEXT: store i8 [[S8]], ptr addrspace(1) [[PTR8]], align 1
+; CHECK-NOOPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
+; CHECK-NOOPT-NEXT: store i8 [[S9]], ptr addrspace(1) [[PTR9]], align 1
+; CHECK-NOOPT-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 10
+; CHECK-NOOPT-NEXT: store i8 [[S10]], ptr addrspace(1) [[PTR10]], align 1
+; CHECK-NOOPT-NEXT: [[PTR11:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 11
+; CHECK-NOOPT-NEXT: store i8 [[S11]], ptr addrspace(1) [[PTR11]], align 1
+; CHECK-NOOPT-NEXT: [[PTR12:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 12
+; CHECK-NOOPT-NEXT: store i8 [[S12]], ptr addrspace(1) [[PTR12]], align 1
+; CHECK-NOOPT-NEXT: [[PTR13:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 13
+; CHECK-NOOPT-NEXT: store i8 [[S13]], ptr addrspace(1) [[PTR13]], align 1
+; CHECK-NOOPT-NEXT: [[PTR14:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 14
+; CHECK-NOOPT-NEXT: store i8 [[S14]], ptr addrspace(1) [[PTR14]], align 1
+; CHECK-NOOPT-NEXT: [[PTR15:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 15
+; CHECK-NOOPT-NEXT: store i8 [[S15]], ptr addrspace(1) [[PTR15]], align 1
+; CHECK-NOOPT-NEXT: ret void
+;
+ ptr addrspace(1) %out,
+ <4 x float> %src,
+ i1 %cond
+) {
+entry:
+ %bytes = bitcast <4 x float> %src to <16 x i8>
+ %e0 = extractelement <16 x i8> %bytes, i64 0
+ %e1 = extractelement <16 x i8> %bytes, i64 1
+ %e2 = extractelement <16 x i8> %bytes, i64 2
+ %e3 = extractelement <16 x i8> %bytes, i64 3
+ %e4 = extractelement <16 x i8> %bytes, i64 4
+ %e5 = extractelement <16 x i8> %bytes, i64 5
+ %e6 = extractelement <16 x i8> %bytes, i64 6
+ %e7 = extractelement <16 x i8> %bytes, i64 7
+ %e8 = extractelement <16 x i8> %bytes, i64 8
+ %e9 = extractelement <16 x i8> %bytes, i64 9
+ %e10 = extractelement <16 x i8> %bytes, i64 10
+ %e11 = extractelement <16 x i8> %bytes, i64 11
+ %e12 = extractelement <16 x i8> %bytes, i64 12
+ %e13 = extractelement <16 x i8> %bytes, i64 13
+ %e14 = extractelement <16 x i8> %bytes, i64 14
+ %e15 = extractelement <16 x i8> %bytes, i64 15
+ %s0 = select i1 %cond, i8 %e0, i8 0
+ %s1 = select i1 %cond, i8 %e1, i8 0
+ %s2 = select i1 %cond, i8 %e2, i8 0
+ %s3 = select i1 %cond, i8 %e3, i8 0
+ %s4 = select i1 %cond, i8 %e4, i8 0
+ %s5 = select i1 %cond, i8 %e5, i8 0
+ %s6 = select i1 %cond, i8 %e6, i8 0
+ %s7 = select i1 %cond, i8 %e7, i8 0
+ %s8 = select i1 %cond, i8 %e8, i8 0
+ %s9 = select i1 %cond, i8 %e9, i8 0
+ %s10 = select i1 %cond, i8 %e10, i8 0
+ %s11 = select i1 %cond, i8 %e11, i8 0
+ %s12 = select i1 %cond, i8 %e12, i8 0
+ %s13 = select i1 %cond, i8 %e13, i8 0
+ %s14 = select i1 %cond, i8 %e14, i8 0
+ %s15 = select i1 %cond, i8 %e15, i8 0
+ store i8 %s0, ptr addrspace(1) %out, align 1
+ %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
+ store i8 %s1, ptr addrspace(1) %ptr1, align 1
+ %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
+ store i8 %s2, ptr addrspace(1) %ptr2, align 1
+ %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
+ store i8 %s3, ptr addrspace(1) %ptr3, align 1
+ %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
+ store i8 %s4, ptr addrspace(1) %ptr4, align 1
+ %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
+ store i8 %s5, ptr addrspace(1) %ptr5, align 1
+ %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
+ store i8 %s6, ptr addrspace(1) %ptr6, align 1
+ %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
+ store i8 %s7, ptr addrspace(1) %ptr7, align 1
+ %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
+ store i8 %s8, ptr addrspace(1) %ptr8, align 1
+ %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
+ store i8 %s9, ptr addrspace(1) %ptr9, align 1
+ %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
+ store i8 %s10, ptr addrspace(1) %ptr10, align 1
+ %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
+ store i8 %s11, ptr addrspace(1) %ptr11, align 1
+ %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
+ store i8 %s12, ptr addrspace(1) %ptr12, align 1
+ %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
+ store i8 %s13, ptr addrspace(1) %ptr13, align 1
+ %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
+ store i8 %s14, ptr addrspace(1) %ptr14, align 1
+ %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
+ store i8 %s15, ptr addrspace(1) %ptr15, align 1
+ ret void
+}
+
+; Test <4 x float> to <8 x i16> (float elements to 16-bit elements)
+define amdgpu_kernel void @combine_v4f32_to_v8i16(
+; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_v4f32_to_v8i16(
+; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x float> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-OPT-NEXT: [[ENTRY:.*:]]
+; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <4 x float> [[SRC]], <4 x float> zeroinitializer
+; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <4 x float> [[COMBINED_SEL]] to <8 x i16>
+; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 0
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 7
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 4
+; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 1
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 2
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 6
+; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: store i16 [[TMP0]], ptr addrspace(1) [[OUT]], align 2
+; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-OPT-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[PTR1]], align 2
+; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-OPT-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[PTR2]], align 2
+; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-OPT-NEXT: store i16 [[TMP7]], ptr addrspace(1) [[PTR3]], align 2
+; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[PTR4]], align 2
+; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-OPT-NEXT: store i16 [[TMP4]], ptr addrspace(1) [[PTR5]], align 2
+; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-OPT-NEXT: store i16 [[TMP6]], ptr addrspace(1) [[PTR6]], align 2
+; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-OPT-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[PTR7]], align 2
+; CHECK-OPT-NEXT: ret void
+;
+; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_v4f32_to_v8i16(
+; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x float> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
+; CHECK-NOOPT-NEXT: [[HALVES:%.*]] = bitcast <4 x float> [[SRC]] to <8 x i16>
+; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <8 x i16> [[HALVES]], i64 0
+; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <8 x i16> [[HALVES]], i64 1
+; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <8 x i16> [[HALVES]], i64 2
+; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <8 x i16> [[HALVES]], i64 3
+; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <8 x i16> [[HALVES]], i64 4
+; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <8 x i16> [[HALVES]], i64 5
+; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <8 x i16> [[HALVES]], i64 6
+; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <8 x i16> [[HALVES]], i64 7
+; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i16 [[E0]], i16 0
+; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i16 [[E1]], i16 0
+; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i16 [[E2]], i16 0
+; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i16 [[E3]], i16 0
+; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i16 [[E4]], i16 0
+; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i16 [[E5]], i16 0
+; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i16 [[E6]], i16 0
+; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i16 [[E7]], i16 0
+; CHECK-NOOPT-NEXT: store i16 [[S0]], ptr addrspace(1) [[OUT]], align 2
+; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NOOPT-NEXT: store i16 [[S1]], ptr addrspace(1) [[PTR1]], align 2
+; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-NOOPT-NEXT: store i16 [[S2]], ptr addrspace(1) [[PTR2]], align 2
+; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-NOOPT-NEXT: store i16 [[S3]], ptr addrspace(1) [[PTR3]], align 2
+; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-NOOPT-NEXT: store i16 [[S4]], ptr addrspace(1) [[PTR4]], align 2
+; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-NOOPT-NEXT: store i16 [[S5]], ptr addrspace(1) [[PTR5]], align 2
+; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-NOOPT-NEXT: store i16 [[S6]], ptr addrspace(1) [[PTR6]], align 2
+; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-NOOPT-NEXT: store i16 [[S7]], ptr addrspace(1) [[PTR7]], align 2
+; CHECK-NOOPT-NEXT: ret void
+;
+    ptr addrspace(1) %out,
+    <4 x float> %src,
+    i1 %cond
+) {
+entry:
+  ; All eight scalar selects below share the same %cond and a zero false value.
+  ; The CHECK-OPT prefix verifies they are combined into a single <4 x float>
+  ; vector select placed ahead of the bitcast; CHECK-NOOPT verifies the
+  ; scalarized form is kept when the combine is disabled.
+  %halves = bitcast <4 x float> %src to <8 x i16>
+  %e0 = extractelement <8 x i16> %halves, i64 0
+  %e1 = extractelement <8 x i16> %halves, i64 1
+  %e2 = extractelement <8 x i16> %halves, i64 2
+  %e3 = extractelement <8 x i16> %halves, i64 3
+  %e4 = extractelement <8 x i16> %halves, i64 4
+  %e5 = extractelement <8 x i16> %halves, i64 5
+  %e6 = extractelement <8 x i16> %halves, i64 6
+  %e7 = extractelement <8 x i16> %halves, i64 7
+  %s0 = select i1 %cond, i16 %e0, i16 0
+  %s1 = select i1 %cond, i16 %e1, i16 0
+  %s2 = select i1 %cond, i16 %e2, i16 0
+  %s3 = select i1 %cond, i16 %e3, i16 0
+  %s4 = select i1 %cond, i16 %e4, i16 0
+  %s5 = select i1 %cond, i16 %e5, i16 0
+  %s6 = select i1 %cond, i16 %e6, i16 0
+  %s7 = select i1 %cond, i16 %e7, i16 0
+  store i16 %s0, ptr addrspace(1) %out, align 2
+  %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
+  store i16 %s1, ptr addrspace(1) %ptr1, align 2
+  %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
+  store i16 %s2, ptr addrspace(1) %ptr2, align 2
+  %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
+  store i16 %s3, ptr addrspace(1) %ptr3, align 2
+  %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
+  store i16 %s4, ptr addrspace(1) %ptr4, align 2
+  %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
+  store i16 %s5, ptr addrspace(1) %ptr5, align 2
+  %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
+  store i16 %s6, ptr addrspace(1) %ptr6, align 2
+  %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
+  store i16 %s7, ptr addrspace(1) %ptr7, align 2
+  ret void
+}
+
+; Test <2 x i64> to <16 x i8> (64-bit elements to byte elements)
+define amdgpu_kernel void @combine_v2i64_to_v16i8(
+; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_v2i64_to_v16i8(
+; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i64> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-OPT-NEXT: [[ENTRY:.*:]]
+; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x i64> [[SRC]], <2 x i64> zeroinitializer
+; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x i64> [[COMBINED_SEL]] to <16 x i8>
+; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 0
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 14
+; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 2
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
+; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 4
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 11
+; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 6
+; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 13
+; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
+; CHECK-OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 8
+; CHECK-OPT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 15
+; CHECK-OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 10
+; CHECK-OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 12
+; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-OPT-NEXT: store i8 [[TMP9]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-OPT-NEXT: store i8 [[TMP3]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-OPT-NEXT: store i8 [[TMP5]], ptr addrspace(1) [[PTR4]], align 1
+; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-OPT-NEXT: store i8 [[TMP14]], ptr addrspace(1) [[PTR5]], align 1
+; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-OPT-NEXT: store i8 [[TMP7]], ptr addrspace(1) [[PTR6]], align 1
+; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-OPT-NEXT: store i8 [[TMP1]], ptr addrspace(1) [[PTR7]], align 1
+; CHECK-OPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
+; CHECK-OPT-NEXT: store i8 [[TMP10]], ptr addrspace(1) [[PTR8]], align 1
+; CHECK-OPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
+; CHECK-OPT-NEXT: store i8 [[TMP4]], ptr addrspace(1) [[PTR9]], align 1
+; CHECK-OPT-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 10
+; CHECK-OPT-NEXT: store i8 [[TMP13]], ptr addrspace(1) [[PTR10]], align 1
+; CHECK-OPT-NEXT: [[PTR11:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 11
+; CHECK-OPT-NEXT: store i8 [[TMP6]], ptr addrspace(1) [[PTR11]], align 1
+; CHECK-OPT-NEXT: [[PTR12:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 12
+; CHECK-OPT-NEXT: store i8 [[TMP15]], ptr addrspace(1) [[PTR12]], align 1
+; CHECK-OPT-NEXT: [[PTR13:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 13
+; CHECK-OPT-NEXT: store i8 [[TMP8]], ptr addrspace(1) [[PTR13]], align 1
+; CHECK-OPT-NEXT: [[PTR14:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 14
+; CHECK-OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[PTR14]], align 1
+; CHECK-OPT-NEXT: [[PTR15:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 15
+; CHECK-OPT-NEXT: store i8 [[TMP11]], ptr addrspace(1) [[PTR15]], align 1
+; CHECK-OPT-NEXT: ret void
+;
+; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_v2i64_to_v16i8(
+; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i64> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
+; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <2 x i64> [[SRC]] to <16 x i8>
+; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
+; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[BYTES]], i64 1
+; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <16 x i8> [[BYTES]], i64 2
+; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[BYTES]], i64 3
+; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <16 x i8> [[BYTES]], i64 4
+; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <16 x i8> [[BYTES]], i64 5
+; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <16 x i8> [[BYTES]], i64 6
+; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <16 x i8> [[BYTES]], i64 7
+; CHECK-NOOPT-NEXT: [[E8:%.*]] = extractelement <16 x i8> [[BYTES]], i64 8
+; CHECK-NOOPT-NEXT: [[E9:%.*]] = extractelement <16 x i8> [[BYTES]], i64 9
+; CHECK-NOOPT-NEXT: [[E10:%.*]] = extractelement <16 x i8> [[BYTES]], i64 10
+; CHECK-NOOPT-NEXT: [[E11:%.*]] = extractelement <16 x i8> [[BYTES]], i64 11
+; CHECK-NOOPT-NEXT: [[E12:%.*]] = extractelement <16 x i8> [[BYTES]], i64 12
+; CHECK-NOOPT-NEXT: [[E13:%.*]] = extractelement <16 x i8> [[BYTES]], i64 13
+; CHECK-NOOPT-NEXT: [[E14:%.*]] = extractelement <16 x i8> [[BYTES]], i64 14
+; CHECK-NOOPT-NEXT: [[E15:%.*]] = extractelement <16 x i8> [[BYTES]], i64 15
+; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 [[E0]], i8 0
+; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 [[E1]], i8 0
+; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 [[E2]], i8 0
+; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 [[E3]], i8 0
+; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i8 [[E4]], i8 0
+; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i8 [[E5]], i8 0
+; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i8 [[E6]], i8 0
+; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i8 [[E7]], i8 0
+; CHECK-NOOPT-NEXT: [[S8:%.*]] = select i1 [[COND]], i8 [[E8]], i8 0
+; CHECK-NOOPT-NEXT: [[S9:%.*]] = select i1 [[COND]], i8 [[E9]], i8 0
+; CHECK-NOOPT-NEXT: [[S10:%.*]] = select i1 [[COND]], i8 [[E10]], i8 0
+; CHECK-NOOPT-NEXT: [[S11:%.*]] = select i1 [[COND]], i8 [[E11]], i8 0
+; CHECK-NOOPT-NEXT: [[S12:%.*]] = select i1 [[COND]], i8 [[E12]], i8 0
+; CHECK-NOOPT-NEXT: [[S13:%.*]] = select i1 [[COND]], i8 [[E13]], i8 0
+; CHECK-NOOPT-NEXT: [[S14:%.*]] = select i1 [[COND]], i8 [[E14]], i8 0
+; CHECK-NOOPT-NEXT: [[S15:%.*]] = select i1 [[COND]], i8 [[E15]], i8 0
+; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NOOPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-NOOPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-NOOPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-NOOPT-NEXT: store i8 [[S4]], ptr addrspace(1) [[PTR4]], align 1
+; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-NOOPT-NEXT: store i8 [[S5]], ptr addrspace(1) [[PTR5]], align 1
+; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-NOOPT-NEXT: store i8 [[S6]], ptr addrspace(1) [[PTR6]], align 1
+; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-NOOPT-NEXT: store i8 [[S7]], ptr addrspace(1) [[PTR7]], align 1
+; CHECK-NOOPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
+; CHECK-NOOPT-NEXT: store i8 [[S8]], ptr addrspace(1) [[PTR8]], align 1
+; CHECK-NOOPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
+; CHECK-NOOPT-NEXT: store i8 [[S9]], ptr addrspace(1) [[PTR9]], align 1
+; CHECK-NOOPT-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 10
+; CHECK-NOOPT-NEXT: store i8 [[S10]], ptr addrspace(1) [[PTR10]], align 1
+; CHECK-NOOPT-NEXT: [[PTR11:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 11
+; CHECK-NOOPT-NEXT: store i8 [[S11]], ptr addrspace(1) [[PTR11]], align 1
+; CHECK-NOOPT-NEXT: [[PTR12:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 12
+; CHECK-NOOPT-NEXT: store i8 [[S12]], ptr addrspace(1) [[PTR12]], align 1
+; CHECK-NOOPT-NEXT: [[PTR13:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 13
+; CHECK-NOOPT-NEXT: store i8 [[S13]], ptr addrspace(1) [[PTR13]], align 1
+; CHECK-NOOPT-NEXT: [[PTR14:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 14
+; CHECK-NOOPT-NEXT: store i8 [[S14]], ptr addrspace(1) [[PTR14]], align 1
+; CHECK-NOOPT-NEXT: [[PTR15:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 15
+; CHECK-NOOPT-NEXT: store i8 [[S15]], ptr addrspace(1) [[PTR15]], align 1
+; CHECK-NOOPT-NEXT: ret void
+;
+    ptr addrspace(1) %out,
+    <2 x i64> %src,
+    i1 %cond
+) {
+entry:
+  ; Sixteen per-byte selects sharing one %cond with zero false operands.
+  ; CHECK-OPT verifies they collapse to a single <2 x i64> vector select
+  ; before the <16 x i8> bitcast (the scattered extract indices above are
+  ; just the autogenerated value-numbering order); CHECK-NOOPT verifies the
+  ; scalar selects survive unchanged when the combine is disabled.
+  %bytes = bitcast <2 x i64> %src to <16 x i8>
+  %e0 = extractelement <16 x i8> %bytes, i64 0
+  %e1 = extractelement <16 x i8> %bytes, i64 1
+  %e2 = extractelement <16 x i8> %bytes, i64 2
+  %e3 = extractelement <16 x i8> %bytes, i64 3
+  %e4 = extractelement <16 x i8> %bytes, i64 4
+  %e5 = extractelement <16 x i8> %bytes, i64 5
+  %e6 = extractelement <16 x i8> %bytes, i64 6
+  %e7 = extractelement <16 x i8> %bytes, i64 7
+  %e8 = extractelement <16 x i8> %bytes, i64 8
+  %e9 = extractelement <16 x i8> %bytes, i64 9
+  %e10 = extractelement <16 x i8> %bytes, i64 10
+  %e11 = extractelement <16 x i8> %bytes, i64 11
+  %e12 = extractelement <16 x i8> %bytes, i64 12
+  %e13 = extractelement <16 x i8> %bytes, i64 13
+  %e14 = extractelement <16 x i8> %bytes, i64 14
+  %e15 = extractelement <16 x i8> %bytes, i64 15
+  %s0 = select i1 %cond, i8 %e0, i8 0
+  %s1 = select i1 %cond, i8 %e1, i8 0
+  %s2 = select i1 %cond, i8 %e2, i8 0
+  %s3 = select i1 %cond, i8 %e3, i8 0
+  %s4 = select i1 %cond, i8 %e4, i8 0
+  %s5 = select i1 %cond, i8 %e5, i8 0
+  %s6 = select i1 %cond, i8 %e6, i8 0
+  %s7 = select i1 %cond, i8 %e7, i8 0
+  %s8 = select i1 %cond, i8 %e8, i8 0
+  %s9 = select i1 %cond, i8 %e9, i8 0
+  %s10 = select i1 %cond, i8 %e10, i8 0
+  %s11 = select i1 %cond, i8 %e11, i8 0
+  %s12 = select i1 %cond, i8 %e12, i8 0
+  %s13 = select i1 %cond, i8 %e13, i8 0
+  %s14 = select i1 %cond, i8 %e14, i8 0
+  %s15 = select i1 %cond, i8 %e15, i8 0
+  store i8 %s0, ptr addrspace(1) %out, align 1
+  %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
+  store i8 %s1, ptr addrspace(1) %ptr1, align 1
+  %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
+  store i8 %s2, ptr addrspace(1) %ptr2, align 1
+  %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
+  store i8 %s3, ptr addrspace(1) %ptr3, align 1
+  %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
+  store i8 %s4, ptr addrspace(1) %ptr4, align 1
+  %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
+  store i8 %s5, ptr addrspace(1) %ptr5, align 1
+  %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
+  store i8 %s6, ptr addrspace(1) %ptr6, align 1
+  %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
+  store i8 %s7, ptr addrspace(1) %ptr7, align 1
+  %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
+  store i8 %s8, ptr addrspace(1) %ptr8, align 1
+  %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
+  store i8 %s9, ptr addrspace(1) %ptr9, align 1
+  %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
+  store i8 %s10, ptr addrspace(1) %ptr10, align 1
+  %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
+  store i8 %s11, ptr addrspace(1) %ptr11, align 1
+  %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
+  store i8 %s12, ptr addrspace(1) %ptr12, align 1
+  %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
+  store i8 %s13, ptr addrspace(1) %ptr13, align 1
+  %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
+  store i8 %s14, ptr addrspace(1) %ptr14, align 1
+  %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
+  store i8 %s15, ptr addrspace(1) %ptr15, align 1
+  ret void
+}
+
+; Test <2 x i64> to <8 x i16> (64-bit elements to 16-bit elements)
+define amdgpu_kernel void @combine_v2i64_to_v8i16(
+; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_v2i64_to_v8i16(
+; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i64> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-OPT-NEXT: [[ENTRY:.*:]]
+; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x i64> [[SRC]], <2 x i64> zeroinitializer
+; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x i64> [[COMBINED_SEL]] to <8 x i16>
+; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 0
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 7
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 4
+; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 1
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 2
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 6
+; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: store i16 [[TMP0]], ptr addrspace(1) [[OUT]], align 2
+; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-OPT-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[PTR1]], align 2
+; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-OPT-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[PTR2]], align 2
+; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-OPT-NEXT: store i16 [[TMP7]], ptr addrspace(1) [[PTR3]], align 2
+; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[PTR4]], align 2
+; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-OPT-NEXT: store i16 [[TMP4]], ptr addrspace(1) [[PTR5]], align 2
+; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-OPT-NEXT: store i16 [[TMP6]], ptr addrspace(1) [[PTR6]], align 2
+; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-OPT-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[PTR7]], align 2
+; CHECK-OPT-NEXT: ret void
+;
+; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_v2i64_to_v8i16(
+; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i64> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
+; CHECK-NOOPT-NEXT: [[HALVES:%.*]] = bitcast <2 x i64> [[SRC]] to <8 x i16>
+; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <8 x i16> [[HALVES]], i64 0
+; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <8 x i16> [[HALVES]], i64 1
+; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <8 x i16> [[HALVES]], i64 2
+; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <8 x i16> [[HALVES]], i64 3
+; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <8 x i16> [[HALVES]], i64 4
+; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <8 x i16> [[HALVES]], i64 5
+; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <8 x i16> [[HALVES]], i64 6
+; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <8 x i16> [[HALVES]], i64 7
+; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i16 [[E0]], i16 0
+; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i16 [[E1]], i16 0
+; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i16 [[E2]], i16 0
+; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i16 [[E3]], i16 0
+; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i16 [[E4]], i16 0
+; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i16 [[E5]], i16 0
+; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i16 [[E6]], i16 0
+; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i16 [[E7]], i16 0
+; CHECK-NOOPT-NEXT: store i16 [[S0]], ptr addrspace(1) [[OUT]], align 2
+; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NOOPT-NEXT: store i16 [[S1]], ptr addrspace(1) [[PTR1]], align 2
+; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-NOOPT-NEXT: store i16 [[S2]], ptr addrspace(1) [[PTR2]], align 2
+; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-NOOPT-NEXT: store i16 [[S3]], ptr addrspace(1) [[PTR3]], align 2
+; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-NOOPT-NEXT: store i16 [[S4]], ptr addrspace(1) [[PTR4]], align 2
+; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-NOOPT-NEXT: store i16 [[S5]], ptr addrspace(1) [[PTR5]], align 2
+; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-NOOPT-NEXT: store i16 [[S6]], ptr addrspace(1) [[PTR6]], align 2
+; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-NOOPT-NEXT: store i16 [[S7]], ptr addrspace(1) [[PTR7]], align 2
+; CHECK-NOOPT-NEXT: ret void
+;
+    ptr addrspace(1) %out,
+    <2 x i64> %src,
+    i1 %cond
+) {
+entry:
+  ; Eight i16 selects sharing one %cond and zero false operands. CHECK-OPT
+  ; verifies the fold to a single <2 x i64> vector select feeding the
+  ; <8 x i16> bitcast; CHECK-NOOPT verifies the scalarized selects remain
+  ; when the combine is disabled.
+  %halves = bitcast <2 x i64> %src to <8 x i16>
+  %e0 = extractelement <8 x i16> %halves, i64 0
+  %e1 = extractelement <8 x i16> %halves, i64 1
+  %e2 = extractelement <8 x i16> %halves, i64 2
+  %e3 = extractelement <8 x i16> %halves, i64 3
+  %e4 = extractelement <8 x i16> %halves, i64 4
+  %e5 = extractelement <8 x i16> %halves, i64 5
+  %e6 = extractelement <8 x i16> %halves, i64 6
+  %e7 = extractelement <8 x i16> %halves, i64 7
+  %s0 = select i1 %cond, i16 %e0, i16 0
+  %s1 = select i1 %cond, i16 %e1, i16 0
+  %s2 = select i1 %cond, i16 %e2, i16 0
+  %s3 = select i1 %cond, i16 %e3, i16 0
+  %s4 = select i1 %cond, i16 %e4, i16 0
+  %s5 = select i1 %cond, i16 %e5, i16 0
+  %s6 = select i1 %cond, i16 %e6, i16 0
+  %s7 = select i1 %cond, i16 %e7, i16 0
+  store i16 %s0, ptr addrspace(1) %out, align 2
+  %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
+  store i16 %s1, ptr addrspace(1) %ptr1, align 2
+  %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
+  store i16 %s2, ptr addrspace(1) %ptr2, align 2
+  %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
+  store i16 %s3, ptr addrspace(1) %ptr3, align 2
+  %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
+  store i16 %s4, ptr addrspace(1) %ptr4, align 2
+  %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
+  store i16 %s5, ptr addrspace(1) %ptr5, align 2
+  %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
+  store i16 %s6, ptr addrspace(1) %ptr6, align 2
+  %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
+  store i16 %s7, ptr addrspace(1) %ptr7, align 2
+  ret void
+}
+
+; Test <2 x i64> to <4 x i32> (64-bit elements to 32-bit elements)
+define amdgpu_kernel void @combine_v2i64_to_v4i32(
+; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_v2i64_to_v4i32(
+; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i64> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-OPT-NEXT: [[ENTRY:.*:]]
+; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x i64> [[SRC]], <2 x i64> zeroinitializer
+; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x i64> [[COMBINED_SEL]] to <4 x i32>
+; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[COMBINED_BC]], i64 0
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[COMBINED_BC]], i64 1
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[COMBINED_BC]], i64 2
+; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-OPT-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[PTR1]], align 4
+; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[PTR2]], align 4
+; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-OPT-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[PTR3]], align 4
+; CHECK-OPT-NEXT: ret void
+;
+; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_v2i64_to_v4i32(
+; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i64> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
+; CHECK-NOOPT-NEXT: [[WORDS:%.*]] = bitcast <2 x i64> [[SRC]] to <4 x i32>
+; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[WORDS]], i64 0
+; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[WORDS]], i64 1
+; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[WORDS]], i64 2
+; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[WORDS]], i64 3
+; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i32 [[E0]], i32 0
+; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i32 [[E1]], i32 0
+; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i32 [[E2]], i32 0
+; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i32 [[E3]], i32 0
+; CHECK-NOOPT-NEXT: store i32 [[S0]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NOOPT-NEXT: store i32 [[S1]], ptr addrspace(1) [[PTR1]], align 4
+; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-NOOPT-NEXT: store i32 [[S2]], ptr addrspace(1) [[PTR2]], align 4
+; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-NOOPT-NEXT: store i32 [[S3]], ptr addrspace(1) [[PTR3]], align 4
+; CHECK-NOOPT-NEXT: ret void
+;
+    ptr addrspace(1) %out,
+    <2 x i64> %src,
+    i1 %cond
+) {
+entry:
+  ; Four i32 selects sharing one %cond with zero false operands. CHECK-OPT
+  ; verifies the fold to a single <2 x i64> vector select ahead of the
+  ; <4 x i32> bitcast; CHECK-NOOPT verifies the scalar form is preserved
+  ; when the combine is disabled.
+  %words = bitcast <2 x i64> %src to <4 x i32>
+  %e0 = extractelement <4 x i32> %words, i64 0
+  %e1 = extractelement <4 x i32> %words, i64 1
+  %e2 = extractelement <4 x i32> %words, i64 2
+  %e3 = extractelement <4 x i32> %words, i64 3
+  %s0 = select i1 %cond, i32 %e0, i32 0
+  %s1 = select i1 %cond, i32 %e1, i32 0
+  %s2 = select i1 %cond, i32 %e2, i32 0
+  %s3 = select i1 %cond, i32 %e3, i32 0
+  store i32 %s0, ptr addrspace(1) %out, align 4
+  %ptr1 = getelementptr i32, ptr addrspace(1) %out, i64 1
+  store i32 %s1, ptr addrspace(1) %ptr1, align 4
+  %ptr2 = getelementptr i32, ptr addrspace(1) %out, i64 2
+  store i32 %s2, ptr addrspace(1) %ptr2, align 4
+  %ptr3 = getelementptr i32, ptr addrspace(1) %out, i64 3
+  store i32 %s3, ptr addrspace(1) %ptr3, align 4
+  ret void
+}
+
+; Test <2 x double> to <16 x i8> (double elements to byte elements)
+define amdgpu_kernel void @combine_v2f64_to_v16i8(
+; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_v2f64_to_v16i8(
+; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x double> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-OPT-NEXT: [[ENTRY:.*:]]
+; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x double> [[SRC]], <2 x double> zeroinitializer
+; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x double> [[COMBINED_SEL]] to <16 x i8>
+; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 0
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 14
+; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 2
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
+; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 4
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 11
+; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 6
+; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 13
+; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
+; CHECK-OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 8
+; CHECK-OPT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 15
+; CHECK-OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 10
+; CHECK-OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 12
+; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-OPT-NEXT: store i8 [[TMP9]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-OPT-NEXT: store i8 [[TMP3]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-OPT-NEXT: store i8 [[TMP5]], ptr addrspace(1) [[PTR4]], align 1
+; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-OPT-NEXT: store i8 [[TMP14]], ptr addrspace(1) [[PTR5]], align 1
+; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-OPT-NEXT: store i8 [[TMP7]], ptr addrspace(1) [[PTR6]], align 1
+; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-OPT-NEXT: store i8 [[TMP1]], ptr addrspace(1) [[PTR7]], align 1
+; CHECK-OPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
+; CHECK-OPT-NEXT: store i8 [[TMP10]], ptr addrspace(1) [[PTR8]], align 1
+; CHECK-OPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
+; CHECK-OPT-NEXT: store i8 [[TMP4]], ptr addrspace(1) [[PTR9]], align 1
+; CHECK-OPT-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 10
+; CHECK-OPT-NEXT: store i8 [[TMP13]], ptr addrspace(1) [[PTR10]], align 1
+; CHECK-OPT-NEXT: [[PTR11:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 11
+; CHECK-OPT-NEXT: store i8 [[TMP6]], ptr addrspace(1) [[PTR11]], align 1
+; CHECK-OPT-NEXT: [[PTR12:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 12
+; CHECK-OPT-NEXT: store i8 [[TMP15]], ptr addrspace(1) [[PTR12]], align 1
+; CHECK-OPT-NEXT: [[PTR13:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 13
+; CHECK-OPT-NEXT: store i8 [[TMP8]], ptr addrspace(1) [[PTR13]], align 1
+; CHECK-OPT-NEXT: [[PTR14:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 14
+; CHECK-OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[PTR14]], align 1
+; CHECK-OPT-NEXT: [[PTR15:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 15
+; CHECK-OPT-NEXT: store i8 [[TMP11]], ptr addrspace(1) [[PTR15]], align 1
+; CHECK-OPT-NEXT: ret void
+;
+; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_v2f64_to_v16i8(
+; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x double> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
+; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <2 x double> [[SRC]] to <16 x i8>
+; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
+; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[BYTES]], i64 1
+; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <16 x i8> [[BYTES]], i64 2
+; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[BYTES]], i64 3
+; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <16 x i8> [[BYTES]], i64 4
+; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <16 x i8> [[BYTES]], i64 5
+; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <16 x i8> [[BYTES]], i64 6
+; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <16 x i8> [[BYTES]], i64 7
+; CHECK-NOOPT-NEXT: [[E8:%.*]] = extractelement <16 x i8> [[BYTES]], i64 8
+; CHECK-NOOPT-NEXT: [[E9:%.*]] = extractelement <16 x i8> [[BYTES]], i64 9
+; CHECK-NOOPT-NEXT: [[E10:%.*]] = extractelement <16 x i8> [[BYTES]], i64 10
+; CHECK-NOOPT-NEXT: [[E11:%.*]] = extractelement <16 x i8> [[BYTES]], i64 11
+; CHECK-NOOPT-NEXT: [[E12:%.*]] = extractelement <16 x i8> [[BYTES]], i64 12
+; CHECK-NOOPT-NEXT: [[E13:%.*]] = extractelement <16 x i8> [[BYTES]], i64 13
+; CHECK-NOOPT-NEXT: [[E14:%.*]] = extractelement <16 x i8> [[BYTES]], i64 14
+; CHECK-NOOPT-NEXT: [[E15:%.*]] = extractelement <16 x i8> [[BYTES]], i64 15
+; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 [[E0]], i8 0
+; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 [[E1]], i8 0
+; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 [[E2]], i8 0
+; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 [[E3]], i8 0
+; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i8 [[E4]], i8 0
+; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i8 [[E5]], i8 0
+; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i8 [[E6]], i8 0
+; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i8 [[E7]], i8 0
+; CHECK-NOOPT-NEXT: [[S8:%.*]] = select i1 [[COND]], i8 [[E8]], i8 0
+; CHECK-NOOPT-NEXT: [[S9:%.*]] = select i1 [[COND]], i8 [[E9]], i8 0
+; CHECK-NOOPT-NEXT: [[S10:%.*]] = select i1 [[COND]], i8 [[E10]], i8 0
+; CHECK-NOOPT-NEXT: [[S11:%.*]] = select i1 [[COND]], i8 [[E11]], i8 0
+; CHECK-NOOPT-NEXT: [[S12:%.*]] = select i1 [[COND]], i8 [[E12]], i8 0
+; CHECK-NOOPT-NEXT: [[S13:%.*]] = select i1 [[COND]], i8 [[E13]], i8 0
+; CHECK-NOOPT-NEXT: [[S14:%.*]] = select i1 [[COND]], i8 [[E14]], i8 0
+; CHECK-NOOPT-NEXT: [[S15:%.*]] = select i1 [[COND]], i8 [[E15]], i8 0
+; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NOOPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-NOOPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-NOOPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-NOOPT-NEXT: store i8 [[S4]], ptr addrspace(1) [[PTR4]], align 1
+; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-NOOPT-NEXT: store i8 [[S5]], ptr addrspace(1) [[PTR5]], align 1
+; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-NOOPT-NEXT: store i8 [[S6]], ptr addrspace(1) [[PTR6]], align 1
+; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-NOOPT-NEXT: store i8 [[S7]], ptr addrspace(1) [[PTR7]], align 1
+; CHECK-NOOPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
+; CHECK-NOOPT-NEXT: store i8 [[S8]], ptr addrspace(1) [[PTR8]], align 1
+; CHECK-NOOPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
+; CHECK-NOOPT-NEXT: store i8 [[S9]], ptr addrspace(1) [[PTR9]], align 1
+; CHECK-NOOPT-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 10
+; CHECK-NOOPT-NEXT: store i8 [[S10]], ptr addrspace(1) [[PTR10]], align 1
+; CHECK-NOOPT-NEXT: [[PTR11:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 11
+; CHECK-NOOPT-NEXT: store i8 [[S11]], ptr addrspace(1) [[PTR11]], align 1
+; CHECK-NOOPT-NEXT: [[PTR12:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 12
+; CHECK-NOOPT-NEXT: store i8 [[S12]], ptr addrspace(1) [[PTR12]], align 1
+; CHECK-NOOPT-NEXT: [[PTR13:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 13
+; CHECK-NOOPT-NEXT: store i8 [[S13]], ptr addrspace(1) [[PTR13]], align 1
+; CHECK-NOOPT-NEXT: [[PTR14:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 14
+; CHECK-NOOPT-NEXT: store i8 [[S14]], ptr addrspace(1) [[PTR14]], align 1
+; CHECK-NOOPT-NEXT: [[PTR15:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 15
+; CHECK-NOOPT-NEXT: store i8 [[S15]], ptr addrspace(1) [[PTR15]], align 1
+; CHECK-NOOPT-NEXT: ret void
+;
+ ptr addrspace(1) %out,
+ <2 x double> %src,
+ i1 %cond
+) {
+entry:
+ %bytes = bitcast <2 x double> %src to <16 x i8>
+ %e0 = extractelement <16 x i8> %bytes, i64 0
+ %e1 = extractelement <16 x i8> %bytes, i64 1
+ %e2 = extractelement <16 x i8> %bytes, i64 2
+ %e3 = extractelement <16 x i8> %bytes, i64 3
+ %e4 = extractelement <16 x i8> %bytes, i64 4
+ %e5 = extractelement <16 x i8> %bytes, i64 5
+ %e6 = extractelement <16 x i8> %bytes, i64 6
+ %e7 = extractelement <16 x i8> %bytes, i64 7
+ %e8 = extractelement <16 x i8> %bytes, i64 8
+ %e9 = extractelement <16 x i8> %bytes, i64 9
+ %e10 = extractelement <16 x i8> %bytes, i64 10
+ %e11 = extractelement <16 x i8> %bytes, i64 11
+ %e12 = extractelement <16 x i8> %bytes, i64 12
+ %e13 = extractelement <16 x i8> %bytes, i64 13
+ %e14 = extractelement <16 x i8> %bytes, i64 14
+ %e15 = extractelement <16 x i8> %bytes, i64 15
+ %s0 = select i1 %cond, i8 %e0, i8 0
+ %s1 = select i1 %cond, i8 %e1, i8 0
+ %s2 = select i1 %cond, i8 %e2, i8 0
+ %s3 = select i1 %cond, i8 %e3, i8 0
+ %s4 = select i1 %cond, i8 %e4, i8 0
+ %s5 = select i1 %cond, i8 %e5, i8 0
+ %s6 = select i1 %cond, i8 %e6, i8 0
+ %s7 = select i1 %cond, i8 %e7, i8 0
+ %s8 = select i1 %cond, i8 %e8, i8 0
+ %s9 = select i1 %cond, i8 %e9, i8 0
+ %s10 = select i1 %cond, i8 %e10, i8 0
+ %s11 = select i1 %cond, i8 %e11, i8 0
+ %s12 = select i1 %cond, i8 %e12, i8 0
+ %s13 = select i1 %cond, i8 %e13, i8 0
+ %s14 = select i1 %cond, i8 %e14, i8 0
+ %s15 = select i1 %cond, i8 %e15, i8 0
+ store i8 %s0, ptr addrspace(1) %out, align 1
+ %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
+ store i8 %s1, ptr addrspace(1) %ptr1, align 1
+ %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
+ store i8 %s2, ptr addrspace(1) %ptr2, align 1
+ %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
+ store i8 %s3, ptr addrspace(1) %ptr3, align 1
+ %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
+ store i8 %s4, ptr addrspace(1) %ptr4, align 1
+ %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
+ store i8 %s5, ptr addrspace(1) %ptr5, align 1
+ %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
+ store i8 %s6, ptr addrspace(1) %ptr6, align 1
+ %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
+ store i8 %s7, ptr addrspace(1) %ptr7, align 1
+ %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
+ store i8 %s8, ptr addrspace(1) %ptr8, align 1
+ %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
+ store i8 %s9, ptr addrspace(1) %ptr9, align 1
+ %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
+ store i8 %s10, ptr addrspace(1) %ptr10, align 1
+ %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
+ store i8 %s11, ptr addrspace(1) %ptr11, align 1
+ %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
+ store i8 %s12, ptr addrspace(1) %ptr12, align 1
+ %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
+ store i8 %s13, ptr addrspace(1) %ptr13, align 1
+ %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
+ store i8 %s14, ptr addrspace(1) %ptr14, align 1
+ %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
+ store i8 %s15, ptr addrspace(1) %ptr15, align 1
+ ret void
+}
+
+; Test <2 x double> to <8 x i16> (double elements to 16-bit elements)
+define amdgpu_kernel void @combine_v2f64_to_v8i16(
+; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_v2f64_to_v8i16(
+; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x double> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-OPT-NEXT: [[ENTRY:.*:]]
+; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x double> [[SRC]], <2 x double> zeroinitializer
+; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x double> [[COMBINED_SEL]] to <8 x i16>
+; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 0
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 7
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 4
+; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 1
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 2
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 6
+; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: store i16 [[TMP0]], ptr addrspace(1) [[OUT]], align 2
+; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-OPT-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[PTR1]], align 2
+; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-OPT-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[PTR2]], align 2
+; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-OPT-NEXT: store i16 [[TMP7]], ptr addrspace(1) [[PTR3]], align 2
+; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[PTR4]], align 2
+; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-OPT-NEXT: store i16 [[TMP4]], ptr addrspace(1) [[PTR5]], align 2
+; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-OPT-NEXT: store i16 [[TMP6]], ptr addrspace(1) [[PTR6]], align 2
+; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-OPT-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[PTR7]], align 2
+; CHECK-OPT-NEXT: ret void
+;
+; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_v2f64_to_v8i16(
+; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x double> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
+; CHECK-NOOPT-NEXT: [[HALVES:%.*]] = bitcast <2 x double> [[SRC]] to <8 x i16>
+; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <8 x i16> [[HALVES]], i64 0
+; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <8 x i16> [[HALVES]], i64 1
+; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <8 x i16> [[HALVES]], i64 2
+; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <8 x i16> [[HALVES]], i64 3
+; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <8 x i16> [[HALVES]], i64 4
+; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <8 x i16> [[HALVES]], i64 5
+; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <8 x i16> [[HALVES]], i64 6
+; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <8 x i16> [[HALVES]], i64 7
+; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i16 [[E0]], i16 0
+; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i16 [[E1]], i16 0
+; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i16 [[E2]], i16 0
+; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i16 [[E3]], i16 0
+; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i16 [[E4]], i16 0
+; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i16 [[E5]], i16 0
+; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i16 [[E6]], i16 0
+; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i16 [[E7]], i16 0
+; CHECK-NOOPT-NEXT: store i16 [[S0]], ptr addrspace(1) [[OUT]], align 2
+; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NOOPT-NEXT: store i16 [[S1]], ptr addrspace(1) [[PTR1]], align 2
+; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-NOOPT-NEXT: store i16 [[S2]], ptr addrspace(1) [[PTR2]], align 2
+; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-NOOPT-NEXT: store i16 [[S3]], ptr addrspace(1) [[PTR3]], align 2
+; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-NOOPT-NEXT: store i16 [[S4]], ptr addrspace(1) [[PTR4]], align 2
+; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-NOOPT-NEXT: store i16 [[S5]], ptr addrspace(1) [[PTR5]], align 2
+; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-NOOPT-NEXT: store i16 [[S6]], ptr addrspace(1) [[PTR6]], align 2
+; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-NOOPT-NEXT: store i16 [[S7]], ptr addrspace(1) [[PTR7]], align 2
+; CHECK-NOOPT-NEXT: ret void
+;
+ ptr addrspace(1) %out,
+ <2 x double> %src,
+ i1 %cond
+) {
+entry:
+ %halves = bitcast <2 x double> %src to <8 x i16>
+ %e0 = extractelement <8 x i16> %halves, i64 0
+ %e1 = extractelement <8 x i16> %halves, i64 1
+ %e2 = extractelement <8 x i16> %halves, i64 2
+ %e3 = extractelement <8 x i16> %halves, i64 3
+ %e4 = extractelement <8 x i16> %halves, i64 4
+ %e5 = extractelement <8 x i16> %halves, i64 5
+ %e6 = extractelement <8 x i16> %halves, i64 6
+ %e7 = extractelement <8 x i16> %halves, i64 7
+ %s0 = select i1 %cond, i16 %e0, i16 0
+ %s1 = select i1 %cond, i16 %e1, i16 0
+ %s2 = select i1 %cond, i16 %e2, i16 0
+ %s3 = select i1 %cond, i16 %e3, i16 0
+ %s4 = select i1 %cond, i16 %e4, i16 0
+ %s5 = select i1 %cond, i16 %e5, i16 0
+ %s6 = select i1 %cond, i16 %e6, i16 0
+ %s7 = select i1 %cond, i16 %e7, i16 0
+ store i16 %s0, ptr addrspace(1) %out, align 2
+ %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
+ store i16 %s1, ptr addrspace(1) %ptr1, align 2
+ %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
+ store i16 %s2, ptr addrspace(1) %ptr2, align 2
+ %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
+ store i16 %s3, ptr addrspace(1) %ptr3, align 2
+ %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
+ store i16 %s4, ptr addrspace(1) %ptr4, align 2
+ %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
+ store i16 %s5, ptr addrspace(1) %ptr5, align 2
+ %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
+ store i16 %s6, ptr addrspace(1) %ptr6, align 2
+ %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
+ store i16 %s7, ptr addrspace(1) %ptr7, align 2
+ ret void
+}
+
+; Test <2 x double> to <4 x i32> (double elements to 32-bit elements)
+define amdgpu_kernel void @combine_v2f64_to_v4i32(
+; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_v2f64_to_v4i32(
+; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x double> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-OPT-NEXT: [[ENTRY:.*:]]
+; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x double> [[SRC]], <2 x double> zeroinitializer
+; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x double> [[COMBINED_SEL]] to <4 x i32>
+; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[COMBINED_BC]], i64 0
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[COMBINED_BC]], i64 1
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[COMBINED_BC]], i64 2
+; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-OPT-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[PTR1]], align 4
+; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[PTR2]], align 4
+; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-OPT-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[PTR3]], align 4
+; CHECK-OPT-NEXT: ret void
+;
+; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_v2f64_to_v4i32(
+; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x double> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
+; CHECK-NOOPT-NEXT: [[WORDS:%.*]] = bitcast <2 x double> [[SRC]] to <4 x i32>
+; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[WORDS]], i64 0
+; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[WORDS]], i64 1
+; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[WORDS]], i64 2
+; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[WORDS]], i64 3
+; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i32 [[E0]], i32 0
+; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i32 [[E1]], i32 0
+; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i32 [[E2]], i32 0
+; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i32 [[E3]], i32 0
+; CHECK-NOOPT-NEXT: store i32 [[S0]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NOOPT-NEXT: store i32 [[S1]], ptr addrspace(1) [[PTR1]], align 4
+; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-NOOPT-NEXT: store i32 [[S2]], ptr addrspace(1) [[PTR2]], align 4
+; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-NOOPT-NEXT: store i32 [[S3]], ptr addrspace(1) [[PTR3]], align 4
+; CHECK-NOOPT-NEXT: ret void
+;
+ ptr addrspace(1) %out,
+ <2 x double> %src,
+ i1 %cond
+) {
+entry:
+ %words = bitcast <2 x double> %src to <4 x i32>
+ %e0 = extractelement <4 x i32> %words, i64 0
+ %e1 = extractelement <4 x i32> %words, i64 1
+ %e2 = extractelement <4 x i32> %words, i64 2
+ %e3 = extractelement <4 x i32> %words, i64 3
+ %s0 = select i1 %cond, i32 %e0, i32 0
+ %s1 = select i1 %cond, i32 %e1, i32 0
+ %s2 = select i1 %cond, i32 %e2, i32 0
+ %s3 = select i1 %cond, i32 %e3, i32 0
+ store i32 %s0, ptr addrspace(1) %out, align 4
+ %ptr1 = getelementptr i32, ptr addrspace(1) %out, i64 1
+ store i32 %s1, ptr addrspace(1) %ptr1, align 4
+ %ptr2 = getelementptr i32, ptr addrspace(1) %out, i64 2
+ store i32 %s2, ptr addrspace(1) %ptr2, align 4
+ %ptr3 = getelementptr i32, ptr addrspace(1) %out, i64 3
+ store i32 %s3, ptr addrspace(1) %ptr3, align 4
+ ret void
+}
+
declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg)
>From 73a77e8133cb3263789843bd3b2d3063dd32a43a Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Tue, 30 Dec 2025 22:05:04 +0530
Subject: [PATCH 04/11] review: transformation is profitable for other uses
---
.../AMDGPU/combine-scalar-selects-asm.ll | 123 ++++++++++++++++
.../CodeGen/AMDGPU/combine-scalar-selects.ll | 131 ++++++++++++++++++
2 files changed, 254 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
index 4c60a78ccc716..12ed63c470db2 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
@@ -265,6 +265,129 @@ entry:
ret void
}
+; Test: extracts have additional unrelated uses (extracts can't be removed)
+; The transformation should still be profitable as we reduce v_cndmask count
+define amdgpu_kernel void @combine_with_extract_other_uses_asm(
+; CHECK-OPT-LABEL: combine_with_extract_other_uses_asm:
+; CHECK-OPT: ; %bb.0: ; %entry
+; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-OPT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-OPT-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
+; CHECK-OPT-NEXT: s_load_dword s0, s[4:5], 0x18
+; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-OPT-NEXT: s_bitcmp1_b32 s0, 0
+; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-OPT-NEXT: v_lshrrev_b32_e32 v5, 8, v0
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc
+; CHECK-OPT-NEXT: v_add_u16_e32 v1, v0, v5
+; CHECK-OPT-NEXT: v_add_u16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
+; CHECK-OPT-NEXT: v_add_u16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; CHECK-OPT-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3]
+; CHECK-OPT-NEXT: global_store_byte v4, v0, s[6:7]
+; CHECK-OPT-NEXT: s_endpgm
+;
+; CHECK-NOOPT-LABEL: combine_with_extract_other_uses_asm:
+; CHECK-NOOPT: ; %bb.0: ; %entry
+; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NOOPT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NOOPT-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
+; CHECK-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x18
+; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s0, 0
+; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NOOPT-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v5, 8, v0
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v8, 24, v1
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v10, 0, v5, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v12, 0, v2, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v11, 0, v6, vcc
+; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
+; CHECK-NOOPT-NEXT: v_add_u16_e32 v0, v0, v5
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v5, 8, v8
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v8, 8, v10
+; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v10, 8, v12
+; CHECK-NOOPT-NEXT: v_add_u16_e32 v0, v0, v6
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v5, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v6, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_add_u16_e32 v2, v0, v2
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NOOPT-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3]
+; CHECK-NOOPT-NEXT: global_store_byte v4, v2, s[6:7]
+; CHECK-NOOPT-NEXT: s_endpgm
+ ptr addrspace(1) %in,
+ ptr addrspace(1) %out,
+ ptr addrspace(1) %out2,
+ i1 %cond
+) {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = zext i32 %tid to i64
+ %gep = getelementptr <2 x i32>, ptr addrspace(1) %in, i64 %tid.ext
+ %loaded = load <2 x i32>, ptr addrspace(1) %gep, align 8
+ %bytes = bitcast <2 x i32> %loaded to <8 x i8>
+ %e0 = extractelement <8 x i8> %bytes, i64 0
+ %e1 = extractelement <8 x i8> %bytes, i64 1
+ %e2 = extractelement <8 x i8> %bytes, i64 2
+ %e3 = extractelement <8 x i8> %bytes, i64 3
+ %e4 = extractelement <8 x i8> %bytes, i64 4
+ %e5 = extractelement <8 x i8> %bytes, i64 5
+ %e6 = extractelement <8 x i8> %bytes, i64 6
+ %e7 = extractelement <8 x i8> %bytes, i64 7
+ ; Selects that will be combined
+ %s0 = select i1 %cond, i8 %e0, i8 0
+ %s1 = select i1 %cond, i8 %e1, i8 0
+ %s2 = select i1 %cond, i8 %e2, i8 0
+ %s3 = select i1 %cond, i8 %e3, i8 0
+ %s4 = select i1 %cond, i8 %e4, i8 0
+ %s5 = select i1 %cond, i8 %e5, i8 0
+ %s6 = select i1 %cond, i8 %e6, i8 0
+ %s7 = select i1 %cond, i8 %e7, i8 0
+ ; Store select results
+ store i8 %s0, ptr addrspace(1) %out, align 1
+ %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
+ store i8 %s1, ptr addrspace(1) %ptr1, align 1
+ %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
+ store i8 %s2, ptr addrspace(1) %ptr2, align 1
+ %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
+ store i8 %s3, ptr addrspace(1) %ptr3, align 1
+ %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
+ store i8 %s4, ptr addrspace(1) %ptr4, align 1
+ %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
+ store i8 %s5, ptr addrspace(1) %ptr5, align 1
+ %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
+ store i8 %s6, ptr addrspace(1) %ptr6, align 1
+ %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
+ store i8 %s7, ptr addrspace(1) %ptr7, align 1
+ ; Additional unrelated uses of the extracts - these prevent extract removal
+ %sum = add i8 %e0, %e1
+ %sum2 = add i8 %sum, %e2
+ %sum3 = add i8 %sum2, %e3
+ store i8 %sum3, ptr addrspace(1) %out2, align 1
+ ret void
+}
+
; Test <4 x i32> to <8 x i16> (32-bit elements to 16-bit elements)
define amdgpu_kernel void @combine_v4i32_to_v8i16_asm(
; CHECK-OPT-LABEL: combine_v4i32_to_v8i16_asm:
diff --git a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
index 38e20647d4214..37f1a1055db24 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
@@ -564,6 +564,137 @@ entry:
ret void
}
+; Test: extracts have additional unrelated uses (extracts can't be removed)
+; The transformation should still be profitable as we reduce v_cndmask count
+define amdgpu_kernel void @combine_with_extract_other_uses(
+; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_with_extract_other_uses(
+; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], <2 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-OPT-NEXT: [[ENTRY:.*:]]
+; CHECK-OPT-NEXT: [[BYTES:%.*]] = bitcast <2 x i32> [[SRC]] to <8 x i8>
+; CHECK-OPT-NEXT: [[E0:%.*]] = extractelement <8 x i8> [[BYTES]], i64 0
+; CHECK-OPT-NEXT: [[E1:%.*]] = extractelement <8 x i8> [[BYTES]], i64 1
+; CHECK-OPT-NEXT: [[E2:%.*]] = extractelement <8 x i8> [[BYTES]], i64 2
+; CHECK-OPT-NEXT: [[E3:%.*]] = extractelement <8 x i8> [[BYTES]], i64 3
+; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x i32> [[SRC]], <2 x i32> zeroinitializer
+; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x i32> [[COMBINED_SEL]] to <8 x i8>
+; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 0
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 7
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 4
+; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 1
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 2
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 6
+; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-OPT-NEXT: store i8 [[TMP3]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-OPT-NEXT: store i8 [[TMP5]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-OPT-NEXT: store i8 [[TMP7]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[PTR4]], align 1
+; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-OPT-NEXT: store i8 [[TMP4]], ptr addrspace(1) [[PTR5]], align 1
+; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-OPT-NEXT: store i8 [[TMP6]], ptr addrspace(1) [[PTR6]], align 1
+; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-OPT-NEXT: store i8 [[TMP1]], ptr addrspace(1) [[PTR7]], align 1
+; CHECK-OPT-NEXT: [[SUM:%.*]] = add i8 [[E0]], [[E1]]
+; CHECK-OPT-NEXT: [[SUM2:%.*]] = add i8 [[SUM]], [[E2]]
+; CHECK-OPT-NEXT: [[SUM3:%.*]] = add i8 [[SUM2]], [[E3]]
+; CHECK-OPT-NEXT: store i8 [[SUM3]], ptr addrspace(1) [[OUT2]], align 1
+; CHECK-OPT-NEXT: ret void
+;
+; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_with_extract_other_uses(
+; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], <2 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
+; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <2 x i32> [[SRC]] to <8 x i8>
+; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <8 x i8> [[BYTES]], i64 0
+; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <8 x i8> [[BYTES]], i64 1
+; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <8 x i8> [[BYTES]], i64 2
+; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <8 x i8> [[BYTES]], i64 3
+; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <8 x i8> [[BYTES]], i64 4
+; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <8 x i8> [[BYTES]], i64 5
+; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <8 x i8> [[BYTES]], i64 6
+; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <8 x i8> [[BYTES]], i64 7
+; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 [[E0]], i8 0
+; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 [[E1]], i8 0
+; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 [[E2]], i8 0
+; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 [[E3]], i8 0
+; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i8 [[E4]], i8 0
+; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i8 [[E5]], i8 0
+; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i8 [[E6]], i8 0
+; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i8 [[E7]], i8 0
+; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NOOPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-NOOPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-NOOPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-NOOPT-NEXT: store i8 [[S4]], ptr addrspace(1) [[PTR4]], align 1
+; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
+; CHECK-NOOPT-NEXT: store i8 [[S5]], ptr addrspace(1) [[PTR5]], align 1
+; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
+; CHECK-NOOPT-NEXT: store i8 [[S6]], ptr addrspace(1) [[PTR6]], align 1
+; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
+; CHECK-NOOPT-NEXT: store i8 [[S7]], ptr addrspace(1) [[PTR7]], align 1
+; CHECK-NOOPT-NEXT: [[SUM:%.*]] = add i8 [[E0]], [[E1]]
+; CHECK-NOOPT-NEXT: [[SUM2:%.*]] = add i8 [[SUM]], [[E2]]
+; CHECK-NOOPT-NEXT: [[SUM3:%.*]] = add i8 [[SUM2]], [[E3]]
+; CHECK-NOOPT-NEXT: store i8 [[SUM3]], ptr addrspace(1) [[OUT2]], align 1
+; CHECK-NOOPT-NEXT: ret void
+;
+ ptr addrspace(1) %out,
+ ptr addrspace(1) %out2,
+ <2 x i32> %src,
+ i1 %cond
+) {
+entry:
+ %bytes = bitcast <2 x i32> %src to <8 x i8>
+ %e0 = extractelement <8 x i8> %bytes, i64 0
+ %e1 = extractelement <8 x i8> %bytes, i64 1
+ %e2 = extractelement <8 x i8> %bytes, i64 2
+ %e3 = extractelement <8 x i8> %bytes, i64 3
+ %e4 = extractelement <8 x i8> %bytes, i64 4
+ %e5 = extractelement <8 x i8> %bytes, i64 5
+ %e6 = extractelement <8 x i8> %bytes, i64 6
+ %e7 = extractelement <8 x i8> %bytes, i64 7
+ ; Selects that will be combined
+ %s0 = select i1 %cond, i8 %e0, i8 0
+ %s1 = select i1 %cond, i8 %e1, i8 0
+ %s2 = select i1 %cond, i8 %e2, i8 0
+ %s3 = select i1 %cond, i8 %e3, i8 0
+ %s4 = select i1 %cond, i8 %e4, i8 0
+ %s5 = select i1 %cond, i8 %e5, i8 0
+ %s6 = select i1 %cond, i8 %e6, i8 0
+ %s7 = select i1 %cond, i8 %e7, i8 0
+ ; Store select results
+ store i8 %s0, ptr addrspace(1) %out, align 1
+ %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
+ store i8 %s1, ptr addrspace(1) %ptr1, align 1
+ %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
+ store i8 %s2, ptr addrspace(1) %ptr2, align 1
+ %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
+ store i8 %s3, ptr addrspace(1) %ptr3, align 1
+ %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
+ store i8 %s4, ptr addrspace(1) %ptr4, align 1
+ %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
+ store i8 %s5, ptr addrspace(1) %ptr5, align 1
+ %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
+ store i8 %s6, ptr addrspace(1) %ptr6, align 1
+ %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
+ store i8 %s7, ptr addrspace(1) %ptr7, align 1
+ ; Additional unrelated uses of the extracts - these prevent extract removal
+ %sum = add i8 %e0, %e1
+ %sum2 = add i8 %sum, %e2
+ %sum3 = add i8 %sum2, %e3
+ store i8 %sum3, ptr addrspace(1) %out2, align 1
+ ret void
+}
+
; Negative test: select with extract as false value (wrong operand position)
define amdgpu_kernel void @no_combine_wrong_operand_order(
;
>From 966a82551f3a0e5b3745b584e9084049de0bd669 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Tue, 30 Dec 2025 22:39:47 +0530
Subject: [PATCH 05/11] review: avoid making separate pass over block
---
.../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 27 +++++++------------
1 file changed, 9 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 3bcad6dd9cfff..10921aecc3911 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -78,6 +78,7 @@ class AMDGPULateCodeGenPrepare
bool canWidenScalarExtLoad(LoadInst &LI) const;
bool visitLoadInst(LoadInst &LI);
+ bool visitBitCastInst(BitCastInst &BC);
/// Combine scalarized selects from a bitcast back into a vector select.
///
@@ -245,28 +246,12 @@ bool AMDGPULateCodeGenPrepare::run() {
bool Changed = false;
- bool HasScalarSubwordLoads = ST.hasScalarSubwordLoads();
-
for (auto &BB : reverse(F))
for (Instruction &I : make_early_inc_range(reverse(BB))) {
- Changed |= !HasScalarSubwordLoads && visit(I);
+ Changed |= visit(I);
Changed |= LRO.optimizeLiveType(&I, DeadInsts);
}
- // Combine scalarized selects back into vector selects.
- // This uses a top-down approach: iterate over bitcasts (i32 vec -> i8 vec)
- // and collect all select instructions that use extracted elements with a
- // zero false value. By starting from the bitcast, we process each source
- // exactly once, avoiding redundant work when multiple selects share a source.
- if (CombineScalarSelects) {
- for (auto &BB : F) {
- for (Instruction &I : make_early_inc_range(BB)) {
- if (auto *BC = dyn_cast<BitCastInst>(&I))
- Changed |= tryCombineSelectsFromBitcast(*BC);
- }
- }
- }
-
RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
return Changed;
}
@@ -539,7 +524,7 @@ bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
}
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
- if (!WidenLoads)
+ if (!WidenLoads || ST.hasScalarSubwordLoads())
return false;
// Skip if that load is already aligned on DWORD at least as it's handled in
@@ -593,6 +578,12 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
return true;
}
+bool AMDGPULateCodeGenPrepare::visitBitCastInst(BitCastInst &BC) {
+ if (CombineScalarSelects)
+ return tryCombineSelectsFromBitcast(BC);
+ return false;
+}
+
bool AMDGPULateCodeGenPrepare::tryCombineSelectsFromBitcast(BitCastInst &BC) {
auto *SrcVecTy = dyn_cast<FixedVectorType>(BC.getSrcTy());
auto *DstVecTy = dyn_cast<FixedVectorType>(BC.getDestTy());
>From d6feb62276df19bc86b74edeab6fa1ebf244a858 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Fri, 2 Jan 2026 15:21:54 +0530
Subject: [PATCH 06/11] review: apply the patch to vectorcombine based on
suggestion
---
.../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 157 +-----------------
.../Transforms/Vectorize/VectorCombine.cpp | 122 ++++++++++++++
.../AMDGPU/combine-scalar-selects-asm.ll | 4 +-
.../CodeGen/AMDGPU/combine-scalar-selects.ll | 151 +++++++++--------
4 files changed, 202 insertions(+), 232 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 10921aecc3911..63e265612cbf7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -14,7 +14,6 @@
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -22,17 +21,14 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"
#define DEBUG_TYPE "amdgpu-late-codegenprepare"
using namespace llvm;
-using namespace llvm::PatternMatch;
// Scalar load widening needs running after load-store-vectorizer as that pass
// doesn't handle overlapping cases. In addition, this pass enhances the
@@ -44,12 +40,6 @@ static cl::opt<bool>
"AMDGPULateCodeGenPrepare"),
cl::ReallyHidden, cl::init(true));
-static cl::opt<bool> CombineScalarSelects(
- "amdgpu-late-codegenprepare-combine-scalar-selects",
- cl::desc("Combine scalarized selects back into vector selects in "
- "AMDGPULateCodeGenPrepare"),
- cl::ReallyHidden, cl::init(true));
-
namespace {
class AMDGPULateCodeGenPrepare
@@ -78,25 +68,6 @@ class AMDGPULateCodeGenPrepare
bool canWidenScalarExtLoad(LoadInst &LI) const;
bool visitLoadInst(LoadInst &LI);
- bool visitBitCastInst(BitCastInst &BC);
-
- /// Combine scalarized selects from a bitcast back into a vector select.
- ///
- /// This optimization addresses VGPR bloat from patterns like:
- /// %vec = bitcast <4 x i32> %src to <16 x i8>
- /// %e0 = extractelement <16 x i8> %vec, i64 0
- /// %s0 = select i1 %cond, i8 %e0, i8 0
- /// ... (repeated for all 16 elements)
- ///
- /// Which generates 16 separate v_cndmask_b32 instructions. Instead, we
- /// transform it to:
- /// %sel = select i1 %cond, <4 x i32> %src, <4 x i32> zeroinitializer
- /// %vec = bitcast <4 x i32> %sel to <16 x i8>
- /// %e0 = extractelement <16 x i8> %vec, i64 0
- /// ...
- ///
- /// This produces only 4 v_cndmask_b32 instructions operating on dwords.
- bool tryCombineSelectsFromBitcast(BitCastInst &BC);
};
using ValueToValueMap = DenseMap<const Value *, Value *>;
@@ -246,9 +217,11 @@ bool AMDGPULateCodeGenPrepare::run() {
bool Changed = false;
+ bool HasScalarSubwordLoads = ST.hasScalarSubwordLoads();
+
for (auto &BB : reverse(F))
for (Instruction &I : make_early_inc_range(reverse(BB))) {
- Changed |= visit(I);
+ Changed |= !HasScalarSubwordLoads && visit(I);
Changed |= LRO.optimizeLiveType(&I, DeadInsts);
}
@@ -524,7 +497,7 @@ bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
}
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
- if (!WidenLoads || ST.hasScalarSubwordLoads())
+ if (!WidenLoads)
return false;
// Skip if that load is already aligned on DWORD at least as it's handled in
@@ -578,128 +551,6 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
return true;
}
-bool AMDGPULateCodeGenPrepare::visitBitCastInst(BitCastInst &BC) {
- if (CombineScalarSelects)
- return tryCombineSelectsFromBitcast(BC);
- return false;
-}
-
-bool AMDGPULateCodeGenPrepare::tryCombineSelectsFromBitcast(BitCastInst &BC) {
- auto *SrcVecTy = dyn_cast<FixedVectorType>(BC.getSrcTy());
- auto *DstVecTy = dyn_cast<FixedVectorType>(BC.getDestTy());
- if (!SrcVecTy || !DstVecTy)
- return false;
-
- // Source can be any 32-bit or 64-bit element type (i32, i64, float, double).
- // Destination must be smaller integer elements (i8, i16, or i32 from i64).
- // Zero in all these types is all-bits-zero, so the transformation is valid.
- Type *SrcEltTy = SrcVecTy->getElementType();
- Type *DstEltTy = DstVecTy->getElementType();
- unsigned SrcEltBits = SrcEltTy->getPrimitiveSizeInBits();
- unsigned DstEltBits = DstEltTy->getPrimitiveSizeInBits();
-
- if (SrcEltBits != 32 && SrcEltBits != 64)
- return false;
-
- if (!DstEltTy->isIntegerTy() || DstEltBits >= SrcEltBits)
- return false;
-
- unsigned NumDstElts = DstVecTy->getNumElements();
- BasicBlock *BB = BC.getParent();
-
- // Require at least half the elements to have matching selects.
- // This threshold ensures the transformation is profitable.
- unsigned MinRequired = NumDstElts / 2;
-
- // Early exit: not enough users to possibly meet the threshold.
- if (BC.getNumUses() < MinRequired)
- return false;
-
- // Group selects by their condition value. Different conditions selecting
- // from the same bitcast are handled as independent groups, allowing us to
- // optimize multiple select patterns from a single bitcast.
- struct SelectGroup {
- // Map from element index to (select, extractelement) pair.
- SmallDenseMap<unsigned, std::pair<SelectInst *, ExtractElementInst *>, 16>
- Selects;
- // Track the earliest select instruction for correct insertion point.
- SelectInst *FirstSelect = nullptr;
- };
- DenseMap<Value *, SelectGroup> ConditionGroups;
-
- // Collect all matching select patterns in a single pass.
- // Pattern: select i1 %cond, i8 (extractelement %bc, idx), i8 0
- for (User *U : BC.users()) {
- auto *Ext = dyn_cast<ExtractElementInst>(U);
- if (!Ext || Ext->getParent() != BB)
- continue;
-
- auto *IdxC = dyn_cast<ConstantInt>(Ext->getIndexOperand());
- if (!IdxC || IdxC->getZExtValue() >= NumDstElts)
- continue;
-
- unsigned Idx = IdxC->getZExtValue();
-
- for (User *EU : Ext->users()) {
- // Must be: select %cond, %extract, 0 (in same BB)
- if (!match(EU, m_Select(m_Value(), m_Specific(Ext), m_Zero())))
- continue;
- SelectInst *Sel = cast<SelectInst>(EU);
- if (Sel->getParent() != BB)
- continue;
-
- auto &Group = ConditionGroups[Sel->getCondition()];
- Group.Selects[Idx] = {Sel, Ext};
-
- // Track earliest select to ensure correct dominance for insertion.
- if (!Group.FirstSelect || Sel->comesBefore(Group.FirstSelect))
- Group.FirstSelect = Sel;
- }
- }
-
- bool Changed = false;
-
- // Process each condition group that meets the threshold.
- for (auto &[Cond, Group] : ConditionGroups) {
- if (Group.Selects.size() < MinRequired)
- continue;
-
- LLVM_DEBUG(dbgs() << "AMDGPULateCodeGenPrepare: Combining "
- << Group.Selects.size()
- << " scalar selects into vector select\n");
-
- // Insert before the first select to maintain dominance.
- IRBuilder<> Builder(Group.FirstSelect);
-
- // Create vector select: select i1 %cond, <N x i32> %src, zeroinitializer
- Value *VecSel =
- Builder.CreateSelect(Cond, BC.getOperand(0),
- Constant::getNullValue(SrcVecTy), "combined.sel");
-
- // Bitcast the selected vector back to the byte vector type.
- Value *NewBC = Builder.CreateBitCast(VecSel, DstVecTy, "combined.bc");
-
- // Replace each scalar select with an extract from the combined result.
- for (auto &[Idx, Pair] : Group.Selects) {
- Value *NewExt = Builder.CreateExtractElement(NewBC, Idx);
- Pair.first->replaceAllUsesWith(NewExt);
- DeadInsts.emplace_back(Pair.first);
-
- // Mark the original extract as dead if it has no remaining uses.
- if (Pair.second->use_empty())
- DeadInsts.emplace_back(Pair.second);
- }
-
- Changed = true;
- }
-
- // Mark the original bitcast as dead if all its users were replaced.
- if (Changed && BC.use_empty())
- DeadInsts.emplace_back(&BC);
-
- return Changed;
-}
-
PreservedAnalyses
AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index c9e45a8d05d78..f07908fd00e32 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -127,6 +127,7 @@ class VectorCombine {
bool scalarizeOpOrCmp(Instruction &I);
bool scalarizeVPIntrinsic(Instruction &I);
bool foldExtractedCmps(Instruction &I);
+ bool foldSelectsFromBitcast(Instruction &I);
bool foldBinopOfReductions(Instruction &I);
bool foldSingleElementStore(Instruction &I);
bool scalarizeLoad(Instruction &I);
@@ -1546,6 +1547,123 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
return true;
}
+/// Try to fold a scalar select that selects between an extracted element and
+/// zero into extracting from a vector select.
+///
+/// This pattern arises when a vector is bitcast to a smaller element type,
+/// elements are extracted, and then conditionally selected with zero:
+///
+/// %bc = bitcast <4 x i32> %src to <16 x i8>
+/// %e0 = extractelement <16 x i8> %bc, i32 0
+/// %s0 = select i1 %cond, i8 %e0, i8 0
+///
+/// Transforms to:
+/// %sel = select i1 %cond, <4 x i32> %src, <4 x i32> zeroinitializer
+/// %bc = bitcast <4 x i32> %sel to <16 x i8>
+/// %e0 = extractelement <16 x i8> %bc, i32 0
+///
+/// This is profitable because vector select on wider types produces fewer
+/// select/cndmask instructions than scalar selects on each element.
+bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
+ // Match: select i1 %cond, iN %extractelement, iN 0
+ Value *Cond, *Ext;
+ if (!match(&I, m_Select(m_Value(Cond), m_Value(Ext), m_Zero())))
+ return false;
+
+ // Condition must be scalar i1
+ if (!Cond->getType()->isIntegerTy(1))
+ return false;
+
+ // True value must be an extractelement from a bitcast
+ auto *ExtInst = dyn_cast<ExtractElementInst>(Ext);
+ if (!ExtInst)
+ return false;
+
+ auto *BC = dyn_cast<BitCastInst>(ExtInst->getVectorOperand());
+ if (!BC)
+ return false;
+
+ auto *SrcVecTy = dyn_cast<FixedVectorType>(BC->getSrcTy());
+ auto *DstVecTy = dyn_cast<FixedVectorType>(BC->getDestTy());
+ if (!SrcVecTy || !DstVecTy)
+ return false;
+
+ // Source must be 32-bit or 64-bit elements, destination must be smaller
+ // integer elements. Zero in all these types is all-bits-zero.
+ Type *SrcEltTy = SrcVecTy->getElementType();
+ Type *DstEltTy = DstVecTy->getElementType();
+ unsigned SrcEltBits = SrcEltTy->getPrimitiveSizeInBits();
+ unsigned DstEltBits = DstEltTy->getPrimitiveSizeInBits();
+
+ if (SrcEltBits != 32 && SrcEltBits != 64)
+ return false;
+
+ if (!DstEltTy->isIntegerTy() || DstEltBits >= SrcEltBits)
+ return false;
+
+ // Check if a compatible vector select already exists in this block.
+ // If so, we can reuse it and only create the extract.
+ BasicBlock *BB = BC->getParent();
+ Value *SrcVec = BC->getOperand(0);
+ SelectInst *ExistingVecSel = nullptr;
+ BitCastInst *ExistingBC = nullptr;
+
+ for (User *U : SrcVec->users()) {
+ auto *SI = dyn_cast<SelectInst>(U);
+ if (SI && SI->getParent() == BB && SI->getCondition() == Cond &&
+ SI->getTrueValue() == SrcVec && match(SI->getFalseValue(), m_Zero())) {
+ ExistingVecSel = SI;
+ // Also look for an existing bitcast of this select
+ for (User *SU : SI->users()) {
+ auto *BCI = dyn_cast<BitCastInst>(SU);
+ if (BCI && BCI->getParent() == BB && BCI->getDestTy() == DstVecTy) {
+ ExistingBC = BCI;
+ break;
+ }
+ }
+ break;
+ }
+ }
+
+ // If we already have a vector select, this transformation is always
+ // beneficial - we just extract from it instead of doing a scalar select.
+ // If we don't have one yet, we'll create it and subsequent selects will
+ // find and reuse it.
+ //
+ // This is always profitable because on targets like AMDGPU, a vector
+ // select with scalar condition produces fewer cndmask instructions than
+ // multiple scalar selects (one per 32-bit chunk vs one per scalar element).
+
+ // Create the transformation.
+ Builder.SetInsertPoint(&I);
+
+ Value *VecSel;
+ if (ExistingVecSel) {
+ // Reuse existing vector select
+ VecSel = ExistingVecSel;
+ } else {
+ // Create vector select: select i1 %cond, <N x T> %src, zeroinitializer
+ VecSel = Builder.CreateSelect(Cond, SrcVec,
+ Constant::getNullValue(SrcVecTy), "sel.bc");
+ }
+
+ // Reuse existing bitcast or create a new one
+ Value *NewBC;
+ if (ExistingBC) {
+ NewBC = ExistingBC;
+ } else {
+ NewBC = Builder.CreateBitCast(VecSel, DstVecTy, "sel.bc.cast");
+ }
+
+ // Extract the element from the new bitcast
+ Value *NewExt =
+ Builder.CreateExtractElement(NewBC, ExtInst->getIndexOperand());
+ replaceValue(I, *NewExt);
+
+ LLVM_DEBUG(dbgs() << "VectorCombine: folded select into vector select\n");
+ return true;
+}
+
static void analyzeCostOfVecReduction(const IntrinsicInst &II,
TTI::TargetCostKind CostKind,
const TargetTransformInfo &TTI,
@@ -5077,6 +5195,10 @@ bool VectorCombine::run() {
if (foldExtractExtract(I))
return true;
break;
+ case Instruction::Select:
+ if (foldSelectsFromBitcast(I))
+ return true;
+ break;
case Instruction::Or:
if (foldConcatOfBoolMasks(I))
return true;
diff --git a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
index 12ed63c470db2..193efae4973e4 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s --check-prefix=CHECK-OPT
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-late-codegenprepare-combine-scalar-selects=false < %s | FileCheck %s --check-prefix=CHECK-NOOPT
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=vector-combine -S < %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 | FileCheck %s --check-prefix=CHECK-OPT
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s --check-prefix=CHECK-NOOPT
define amdgpu_kernel void @combine_scalar_selects_v16i8(
; CHECK-OPT-LABEL: combine_scalar_selects_v16i8:
diff --git a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
index 37f1a1055db24..69d1aed790bc9 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-late-codegenprepare -S %s | FileCheck %s --check-prefix=CHECK-OPT
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-late-codegenprepare -amdgpu-late-codegenprepare-combine-scalar-selects=false -S %s | FileCheck %s --check-prefix=CHECK-NOOPT
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=vector-combine -S %s | FileCheck %s --check-prefix=CHECK-OPT
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -S %s | FileCheck %s --check-prefix=CHECK-NOOPT
; Test that multiple scalar selects from the same vector source are combined
; back into a vector select when the optimization is enabled, and remain as
@@ -18,21 +18,21 @@ define amdgpu_kernel void @combine_scalar_selects_v16i8(
; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[VALID]], <4 x i32> [[LOADED]], <4 x i32> zeroinitializer
; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <4 x i32> [[COMBINED_SEL]] to <16 x i8>
; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 14
+; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
+; CHECK-OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 4
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 11
+; CHECK-OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 6
-; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 13
-; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
; CHECK-OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 8
-; CHECK-OPT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 15
-; CHECK-OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
; CHECK-OPT-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 10
-; CHECK-OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 11
; CHECK-OPT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 12
+; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 13
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 14
+; CHECK-OPT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 15
; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
; CHECK-OPT-NEXT: store i8 [[TMP9]], ptr addrspace(1) [[PTR1]], align 1
@@ -223,13 +223,13 @@ define amdgpu_kernel void @combine_scalar_selects_v8i8(
; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x i32> [[SRC]], <2 x i32> zeroinitializer
; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x i32> [[COMBINED_SEL]] to <8 x i8>
; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 4
; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 5
; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 6
; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 4
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 6
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 7
; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
; CHECK-OPT-NEXT: store i8 [[TMP3]], ptr addrspace(1) [[PTR1]], align 1
@@ -333,15 +333,15 @@ define amdgpu_kernel void @combine_partial_selects(
; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <4 x i32> [[SRC]], <4 x i32> zeroinitializer
; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <4 x i32> [[COMBINED_SEL]] to <16 x i8>
; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 4
; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 8
-; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
-; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 6
; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 4
+; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 6
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 8
+; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
; CHECK-OPT-NEXT: store i8 [[TMP3]], ptr addrspace(1) [[PTR1]], align 1
@@ -500,16 +500,13 @@ define amdgpu_kernel void @no_combine_too_few_selects(
; CHECK-OPT-LABEL: define amdgpu_kernel void @no_combine_too_few_selects(
; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
; CHECK-OPT-NEXT: [[ENTRY:.*:]]
-; CHECK-OPT-NEXT: [[BYTES:%.*]] = bitcast <4 x i32> [[SRC]] to <16 x i8>
+; CHECK-OPT-NEXT: [[SEL_BC:%.*]] = select i1 [[COND]], <4 x i32> [[SRC]], <4 x i32> zeroinitializer
+; CHECK-OPT-NEXT: [[BYTES:%.*]] = bitcast <4 x i32> [[SEL_BC]] to <16 x i8>
; CHECK-OPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
-; CHECK-OPT-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[BYTES]], i64 1
-; CHECK-OPT-NEXT: [[E2:%.*]] = extractelement <16 x i8> [[BYTES]], i64 2
-; CHECK-OPT-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[BYTES]], i64 3
-; CHECK-OPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 [[E0]], i8 0
-; CHECK-OPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 [[E1]], i8 0
-; CHECK-OPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 [[E2]], i8 0
-; CHECK-OPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 [[E3]], i8 0
-; CHECK-OPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-OPT-NEXT: [[S1:%.*]] = extractelement <16 x i8> [[BYTES]], i64 1
+; CHECK-OPT-NEXT: [[S2:%.*]] = extractelement <16 x i8> [[BYTES]], i64 2
+; CHECK-OPT-NEXT: [[S3:%.*]] = extractelement <16 x i8> [[BYTES]], i64 3
+; CHECK-OPT-NEXT: store i8 [[E0]], ptr addrspace(1) [[OUT]], align 1
; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
; CHECK-OPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
@@ -578,13 +575,13 @@ define amdgpu_kernel void @combine_with_extract_other_uses(
; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x i32> [[SRC]], <2 x i32> zeroinitializer
; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x i32> [[COMBINED_SEL]] to <8 x i8>
; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 4
; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 5
; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 6
; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 4
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 6
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 7
; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
; CHECK-OPT-NEXT: store i8 [[TMP3]], ptr addrspace(1) [[PTR1]], align 1
@@ -773,13 +770,13 @@ define amdgpu_kernel void @combine_v4i32_to_v8i16(
; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <4 x i32> [[SRC]], <4 x i32> zeroinitializer
; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <4 x i32> [[COMBINED_SEL]] to <8 x i16>
; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 4
; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 5
; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 6
; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 4
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 6
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 7
; CHECK-OPT-NEXT: store i16 [[TMP0]], ptr addrspace(1) [[OUT]], align 2
; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
; CHECK-OPT-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[PTR1]], align 2
@@ -882,21 +879,21 @@ define amdgpu_kernel void @combine_v4f32_to_v16i8(
; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <4 x float> [[SRC]], <4 x float> zeroinitializer
; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <4 x float> [[COMBINED_SEL]] to <16 x i8>
; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 14
+; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
+; CHECK-OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 4
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 11
+; CHECK-OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 6
-; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 13
-; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
; CHECK-OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 8
-; CHECK-OPT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 15
-; CHECK-OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
; CHECK-OPT-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 10
-; CHECK-OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 11
; CHECK-OPT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 12
+; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 13
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 14
+; CHECK-OPT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 15
; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
; CHECK-OPT-NEXT: store i8 [[TMP9]], ptr addrspace(1) [[PTR1]], align 1
@@ -1079,13 +1076,13 @@ define amdgpu_kernel void @combine_v4f32_to_v8i16(
; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <4 x float> [[SRC]], <4 x float> zeroinitializer
; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <4 x float> [[COMBINED_SEL]] to <8 x i16>
; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 4
; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 5
; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 6
; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 4
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 6
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 7
; CHECK-OPT-NEXT: store i16 [[TMP0]], ptr addrspace(1) [[OUT]], align 2
; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
; CHECK-OPT-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[PTR1]], align 2
@@ -1188,21 +1185,21 @@ define amdgpu_kernel void @combine_v2i64_to_v16i8(
; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x i64> [[SRC]], <2 x i64> zeroinitializer
; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x i64> [[COMBINED_SEL]] to <16 x i8>
; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 14
+; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
+; CHECK-OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 4
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 11
+; CHECK-OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 6
-; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 13
-; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
; CHECK-OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 8
-; CHECK-OPT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 15
-; CHECK-OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
; CHECK-OPT-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 10
-; CHECK-OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 11
; CHECK-OPT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 12
+; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 13
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 14
+; CHECK-OPT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 15
; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
; CHECK-OPT-NEXT: store i8 [[TMP9]], ptr addrspace(1) [[PTR1]], align 1
@@ -1385,13 +1382,13 @@ define amdgpu_kernel void @combine_v2i64_to_v8i16(
; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x i64> [[SRC]], <2 x i64> zeroinitializer
; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x i64> [[COMBINED_SEL]] to <8 x i16>
; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 4
; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 5
; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 6
; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 4
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 6
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 7
; CHECK-OPT-NEXT: store i16 [[TMP0]], ptr addrspace(1) [[OUT]], align 2
; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
; CHECK-OPT-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[PTR1]], align 2
@@ -1559,21 +1556,21 @@ define amdgpu_kernel void @combine_v2f64_to_v16i8(
; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x double> [[SRC]], <2 x double> zeroinitializer
; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x double> [[COMBINED_SEL]] to <16 x i8>
; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 14
+; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
+; CHECK-OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 4
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 11
+; CHECK-OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 6
-; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 13
-; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
; CHECK-OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 8
-; CHECK-OPT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 15
-; CHECK-OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
; CHECK-OPT-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 10
-; CHECK-OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 11
; CHECK-OPT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 12
+; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 13
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 14
+; CHECK-OPT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 15
; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
; CHECK-OPT-NEXT: store i8 [[TMP9]], ptr addrspace(1) [[PTR1]], align 1
@@ -1756,13 +1753,13 @@ define amdgpu_kernel void @combine_v2f64_to_v8i16(
; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x double> [[SRC]], <2 x double> zeroinitializer
; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x double> [[COMBINED_SEL]] to <8 x i16>
; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 4
; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 5
; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 6
; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 3
+; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 4
+; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 5
+; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 6
+; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 7
; CHECK-OPT-NEXT: store i16 [[TMP0]], ptr addrspace(1) [[OUT]], align 2
; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
; CHECK-OPT-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[PTR1]], align 2
>From d6ed38c6c0ecf9fd79cf2abba0c048e53cbb85cb Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Fri, 2 Jan 2026 17:19:38 +0530
Subject: [PATCH 07/11] add TTI cost check to determine profitability
---
.../Transforms/Vectorize/VectorCombine.cpp | 38 ++++++++++++++++---
1 file changed, 32 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index f07908fd00e32..fd929cde3f7d3 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1627,12 +1627,38 @@ bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
// If we already have a vector select, this transformation is always
// beneficial - we just extract from it instead of doing a scalar select.
- // If we don't have one yet, we'll create it and subsequent selects will
- // find and reuse it.
- //
- // This is always profitable because on targets like AMDGPU, a vector
- // select with scalar condition produces fewer cndmask instructions than
- // multiple scalar selects (one per 32-bit chunk vs one per scalar element).
+ if (!ExistingVecSel) {
+ // If we need to create a new vector select, check profitability using TTI.
+ // Compare the cost of one scalar select vs amortized cost of vector select.
+ //
+ // The vector select cost is amortized across all elements, so we compare:
+ // ScalarSelCost vs VecSelCost / NumDstElements
+ //
+ // This is profitable when VecSelCost < ScalarSelCost * NumDstElements,
+ // which is equivalent to checking if the vector select is cheaper per
+ // element.
+ auto *CondTy = CmpInst::makeCmpResultType(DstEltTy);
+ auto *VecCondTy = CmpInst::makeCmpResultType(SrcVecTy);
+
+ InstructionCost ScalarSelCost =
+ TTI.getCmpSelInstrCost(Instruction::Select, DstEltTy, CondTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ InstructionCost VecSelCost =
+ TTI.getCmpSelInstrCost(Instruction::Select, SrcVecTy, VecCondTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
+
+ // The transformation creates one vector select that replaces multiple
+ // scalar selects. It's profitable if the vector select cost is less than
+ // the cost of all the scalar selects it replaces.
+ unsigned NumDstElements = DstVecTy->getNumElements();
+ if (VecSelCost >= ScalarSelCost * NumDstElements) {
+ LLVM_DEBUG(dbgs() << "VectorCombine: foldSelectsFromBitcast not "
+ << "profitable (VecCost=" << VecSelCost
+ << ", ScalarCost=" << ScalarSelCost
+ << ", NumElts=" << NumDstElements << ")\n");
+ return false;
+ }
+ }
// Create the transformation.
Builder.SetInsertPoint(&I);
>From 3ae7b94c7d19b908c2273c682ac5ed488b8a1f37 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Mon, 5 Jan 2026 13:39:58 +0530
Subject: [PATCH 08/11] review: split nested loop & enhance pattern match
---
.../Transforms/Vectorize/VectorCombine.cpp | 73 ++++++++++---------
1 file changed, 39 insertions(+), 34 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fd929cde3f7d3..2c09d99780361 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1564,25 +1564,46 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
///
/// This is profitable because vector select on wider types produces fewer
/// select/cndmask instructions than scalar selects on each element.
+
+/// Find a vector select on SrcVec with the given condition in the same block.
+static SelectInst *findCompatibleVecSel(Value *SrcVec, Value *Cond,
+ BasicBlock *BB) {
+ for (User *U : SrcVec->users()) {
+ auto *SI = dyn_cast<SelectInst>(U);
+ if (SI && SI->getParent() == BB && SI->getCondition() == Cond &&
+ SI->getTrueValue() == SrcVec && match(SI->getFalseValue(), m_Zero()))
+ return SI;
+ }
+ return nullptr;
+}
+
+/// Find a bitcast of the select to the destination vector type in the same
+/// block.
+static BitCastInst *findCompatibleBitCast(Value *Sel, BasicBlock *BB,
+ FixedVectorType *DstVecTy) {
+ for (User *U : Sel->users()) {
+ auto *BCI = dyn_cast<BitCastInst>(U);
+ if (BCI && BCI->getParent() == BB && BCI->getDestTy() == DstVecTy)
+ return BCI;
+ }
+ return nullptr;
+}
+
bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
- // Match: select i1 %cond, iN %extractelement, iN 0
- Value *Cond, *Ext;
- if (!match(&I, m_Select(m_Value(Cond), m_Value(Ext), m_Zero())))
+ // Match: select i1 %cond, (extractelement (bitcast <N x T> to <M x iK>),
+ // idx), 0
+ Value *Cond, *Idx;
+ BitCastInst *BC;
+ using MatchBitcast = PatternMatch::bind_ty<BitCastInst>;
+ if (!match(&I,
+ m_Select(m_Value(Cond),
+ m_ExtractElt(MatchBitcast(BC), m_Value(Idx)), m_Zero())))
return false;
// Condition must be scalar i1
if (!Cond->getType()->isIntegerTy(1))
return false;
- // True value must be an extractelement from a bitcast
- auto *ExtInst = dyn_cast<ExtractElementInst>(Ext);
- if (!ExtInst)
- return false;
-
- auto *BC = dyn_cast<BitCastInst>(ExtInst->getVectorOperand());
- if (!BC)
- return false;
-
auto *SrcVecTy = dyn_cast<FixedVectorType>(BC->getSrcTy());
auto *DstVecTy = dyn_cast<FixedVectorType>(BC->getDestTy());
if (!SrcVecTy || !DstVecTy)
@@ -1603,27 +1624,12 @@ bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
// Check if a compatible vector select already exists in this block.
// If so, we can reuse it and only create the extract.
- BasicBlock *BB = BC->getParent();
Value *SrcVec = BC->getOperand(0);
- SelectInst *ExistingVecSel = nullptr;
- BitCastInst *ExistingBC = nullptr;
-
- for (User *U : SrcVec->users()) {
- auto *SI = dyn_cast<SelectInst>(U);
- if (SI && SI->getParent() == BB && SI->getCondition() == Cond &&
- SI->getTrueValue() == SrcVec && match(SI->getFalseValue(), m_Zero())) {
- ExistingVecSel = SI;
- // Also look for an existing bitcast of this select
- for (User *SU : SI->users()) {
- auto *BCI = dyn_cast<BitCastInst>(SU);
- if (BCI && BCI->getParent() == BB && BCI->getDestTy() == DstVecTy) {
- ExistingBC = BCI;
- break;
- }
- }
- break;
- }
- }
+ BasicBlock *BB = BC->getParent();
+ SelectInst *ExistingVecSel = findCompatibleVecSel(SrcVec, Cond, BB);
+ BitCastInst *ExistingBC =
+ ExistingVecSel ? findCompatibleBitCast(ExistingVecSel, BB, DstVecTy)
+ : nullptr;
// If we already have a vector select, this transformation is always
// beneficial - we just extract from it instead of doing a scalar select.
@@ -1682,8 +1688,7 @@ bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
}
// Extract the element from the new bitcast
- Value *NewExt =
- Builder.CreateExtractElement(NewBC, ExtInst->getIndexOperand());
+ Value *NewExt = Builder.CreateExtractElement(NewBC, Idx);
replaceValue(I, *NewExt);
LLVM_DEBUG(dbgs() << "VectorCombine: folded select into vector select\n");
>From 79d4c9b14c782648912088ec5ac97a1ebfa575ab Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Mon, 5 Jan 2026 19:11:19 +0530
Subject: [PATCH 09/11] review: update run check
---
.../AMDGPU/combine-scalar-selects-asm.ll | 6 +-
.../CodeGen/AMDGPU/combine-scalar-selects.ll | 93 +++++++------------
2 files changed, 35 insertions(+), 64 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
index 193efae4973e4..970d933b942fc 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=vector-combine -S < %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 | FileCheck %s --check-prefix=CHECK-OPT
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s --check-prefix=CHECK-NOOPT
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=vector-combine -S < %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 | FileCheck %s --check-prefixes=CHECK,CHECK-OPT
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOOPT
define amdgpu_kernel void @combine_scalar_selects_v16i8(
; CHECK-OPT-LABEL: combine_scalar_selects_v16i8:
@@ -1368,3 +1368,5 @@ entry:
}
declare i32 @llvm.amdgcn.workitem.id.x()
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
index 69d1aed790bc9..d2cc39a7e2d18 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=vector-combine -S %s | FileCheck %s --check-prefix=CHECK-OPT
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -S %s | FileCheck %s --check-prefix=CHECK-NOOPT
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=vector-combine -S %s | FileCheck %s --check-prefixes=CHECK,CHECK-OPT
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -S %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOOPT
; Test that multiple scalar selects from the same vector source are combined
; back into a vector select when the optimization is enabled, and remain as
@@ -460,25 +460,15 @@ entry:
; Negative test: should not combine if false value is not zero
define amdgpu_kernel void @no_combine_non_zero_false(
;
-; CHECK-OPT-LABEL: define amdgpu_kernel void @no_combine_non_zero_false(
-; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[BUFFER_RESOURCE:%.*]], i32 [[OFFSET:%.*]], i1 [[VALID:%.*]]) #[[ATTR0]] {
-; CHECK-OPT-NEXT: [[ENTRY:.*:]]
-; CHECK-OPT-NEXT: [[LOADED:%.*]] = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> [[BUFFER_RESOURCE]], i32 [[OFFSET]], i32 0, i32 0)
-; CHECK-OPT-NEXT: [[BYTES:%.*]] = bitcast <4 x i32> [[LOADED]] to <16 x i8>
-; CHECK-OPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
-; CHECK-OPT-NEXT: [[S0:%.*]] = select i1 [[VALID]], i8 [[E0]], i8 1
-; CHECK-OPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-OPT-NEXT: ret void
-;
-; CHECK-NOOPT-LABEL: define amdgpu_kernel void @no_combine_non_zero_false(
-; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[BUFFER_RESOURCE:%.*]], i32 [[OFFSET:%.*]], i1 [[VALID:%.*]]) #[[ATTR0]] {
-; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
-; CHECK-NOOPT-NEXT: [[LOADED:%.*]] = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> [[BUFFER_RESOURCE]], i32 [[OFFSET]], i32 0, i32 0)
-; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <4 x i32> [[LOADED]] to <16 x i8>
-; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
-; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[VALID]], i8 [[E0]], i8 1
-; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-NOOPT-NEXT: ret void
+; CHECK-LABEL: define amdgpu_kernel void @no_combine_non_zero_false(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[BUFFER_RESOURCE:%.*]], i32 [[OFFSET:%.*]], i1 [[VALID:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[LOADED:%.*]] = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> [[BUFFER_RESOURCE]], i32 [[OFFSET]], i32 0, i32 0)
+; CHECK-NEXT: [[BYTES:%.*]] = bitcast <4 x i32> [[LOADED]] to <16 x i8>
+; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
+; CHECK-NEXT: [[S0:%.*]] = select i1 [[VALID]], i8 [[E0]], i8 1
+; CHECK-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NEXT: ret void
;
ptr addrspace(1) %out,
<4 x i32> %buffer_resource,
@@ -695,47 +685,26 @@ entry:
; Negative test: select with extract as false value (wrong operand position)
define amdgpu_kernel void @no_combine_wrong_operand_order(
;
-; CHECK-OPT-LABEL: define amdgpu_kernel void @no_combine_wrong_operand_order(
-; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-OPT-NEXT: [[ENTRY:.*:]]
-; CHECK-OPT-NEXT: [[BYTES:%.*]] = bitcast <2 x i32> [[SRC]] to <8 x i8>
-; CHECK-OPT-NEXT: [[E0:%.*]] = extractelement <8 x i8> [[BYTES]], i64 0
-; CHECK-OPT-NEXT: [[E1:%.*]] = extractelement <8 x i8> [[BYTES]], i64 1
-; CHECK-OPT-NEXT: [[E2:%.*]] = extractelement <8 x i8> [[BYTES]], i64 2
-; CHECK-OPT-NEXT: [[E3:%.*]] = extractelement <8 x i8> [[BYTES]], i64 3
-; CHECK-OPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 0, i8 [[E0]]
-; CHECK-OPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 0, i8 [[E1]]
-; CHECK-OPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 0, i8 [[E2]]
-; CHECK-OPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 0, i8 [[E3]]
-; CHECK-OPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-OPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-OPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-OPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-OPT-NEXT: ret void
-;
-; CHECK-NOOPT-LABEL: define amdgpu_kernel void @no_combine_wrong_operand_order(
-; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
-; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <2 x i32> [[SRC]] to <8 x i8>
-; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <8 x i8> [[BYTES]], i64 0
-; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <8 x i8> [[BYTES]], i64 1
-; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <8 x i8> [[BYTES]], i64 2
-; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <8 x i8> [[BYTES]], i64 3
-; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 0, i8 [[E0]]
-; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 0, i8 [[E1]]
-; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 0, i8 [[E2]]
-; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 0, i8 [[E3]]
-; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-NOOPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-NOOPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-NOOPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-NOOPT-NEXT: ret void
+; CHECK-LABEL: define amdgpu_kernel void @no_combine_wrong_operand_order(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[BYTES:%.*]] = bitcast <2 x i32> [[SRC]] to <8 x i8>
+; CHECK-NEXT: [[E0:%.*]] = extractelement <8 x i8> [[BYTES]], i64 0
+; CHECK-NEXT: [[E1:%.*]] = extractelement <8 x i8> [[BYTES]], i64 1
+; CHECK-NEXT: [[E2:%.*]] = extractelement <8 x i8> [[BYTES]], i64 2
+; CHECK-NEXT: [[E3:%.*]] = extractelement <8 x i8> [[BYTES]], i64 3
+; CHECK-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 0, i8 [[E0]]
+; CHECK-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 0, i8 [[E1]]
+; CHECK-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 0, i8 [[E2]]
+; CHECK-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 0, i8 [[E3]]
+; CHECK-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-NEXT: ret void
;
ptr addrspace(1) %out,
<2 x i32> %src,
>From adab81d994a059cbbb7b5a39bc899b3de2657684 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Mon, 5 Jan 2026 20:17:57 +0530
Subject: [PATCH 10/11] review: root the combine at the def instead of the user
---
.../Transforms/Vectorize/VectorCombine.cpp | 175 +++++++-----------
.../AMDGPU/combine-scalar-selects-asm.ll | 4 +-
.../CodeGen/AMDGPU/combine-scalar-selects.ll | 62 +++----
3 files changed, 94 insertions(+), 147 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 2c09d99780361..646d386b74aa8 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1547,8 +1547,8 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
return true;
}
-/// Try to fold a scalar select that selects between an extracted element and
-/// zero into extracting from a vector select.
+/// Try to fold scalar selects that select between extracted elements and zero
+/// into extracting from a vector select. This is rooted at the bitcast.
///
/// This pattern arises when a vector is bitcast to a smaller element type,
/// elements are extracted, and then conditionally selected with zero:
@@ -1556,52 +1556,22 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
/// %bc = bitcast <4 x i32> %src to <16 x i8>
/// %e0 = extractelement <16 x i8> %bc, i32 0
/// %s0 = select i1 %cond, i8 %e0, i8 0
+/// %e1 = extractelement <16 x i8> %bc, i32 1
+/// %s1 = select i1 %cond, i8 %e1, i8 0
+/// ...
///
/// Transforms to:
/// %sel = select i1 %cond, <4 x i32> %src, <4 x i32> zeroinitializer
/// %bc = bitcast <4 x i32> %sel to <16 x i8>
/// %e0 = extractelement <16 x i8> %bc, i32 0
+/// %e1 = extractelement <16 x i8> %bc, i32 1
+/// ...
///
/// This is profitable because vector select on wider types produces fewer
/// select/cndmask instructions than scalar selects on each element.
-
-/// Find a vector select on SrcVec with the given condition in the same block.
-static SelectInst *findCompatibleVecSel(Value *SrcVec, Value *Cond,
- BasicBlock *BB) {
- for (User *U : SrcVec->users()) {
- auto *SI = dyn_cast<SelectInst>(U);
- if (SI && SI->getParent() == BB && SI->getCondition() == Cond &&
- SI->getTrueValue() == SrcVec && match(SI->getFalseValue(), m_Zero()))
- return SI;
- }
- return nullptr;
-}
-
-/// Find a bitcast of the select to the destination vector type in the same
-/// block.
-static BitCastInst *findCompatibleBitCast(Value *Sel, BasicBlock *BB,
- FixedVectorType *DstVecTy) {
- for (User *U : Sel->users()) {
- auto *BCI = dyn_cast<BitCastInst>(U);
- if (BCI && BCI->getParent() == BB && BCI->getDestTy() == DstVecTy)
- return BCI;
- }
- return nullptr;
-}
-
bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
- // Match: select i1 %cond, (extractelement (bitcast <N x T> to <M x iK>),
- // idx), 0
- Value *Cond, *Idx;
- BitCastInst *BC;
- using MatchBitcast = PatternMatch::bind_ty<BitCastInst>;
- if (!match(&I,
- m_Select(m_Value(Cond),
- m_ExtractElt(MatchBitcast(BC), m_Value(Idx)), m_Zero())))
- return false;
-
- // Condition must be scalar i1
- if (!Cond->getType()->isIntegerTy(1))
+ auto *BC = dyn_cast<BitCastInst>(&I);
+ if (!BC)
return false;
auto *SrcVecTy = dyn_cast<FixedVectorType>(BC->getSrcTy());
@@ -1622,77 +1592,74 @@ bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
if (!DstEltTy->isIntegerTy() || DstEltBits >= SrcEltBits)
return false;
- // Check if a compatible vector select already exists in this block.
- // If so, we can reuse it and only create the extract.
+ // Collect all select users that match the pattern, grouped by condition.
+ // Pattern: select i1 %cond, (extractelement %bc, idx), 0
+ DenseMap<Value *, SmallVector<SelectInst *, 8>> CondToSelects;
+
+ for (User *U : BC->users()) {
+ auto *Ext = dyn_cast<ExtractElementInst>(U);
+ if (!Ext)
+ continue;
+
+ for (User *ExtUser : Ext->users()) {
+ Value *Cond;
+ // Match: select i1 %cond, %ext, 0
+ if (match(ExtUser, m_Select(m_Value(Cond), m_Specific(Ext), m_Zero())) &&
+ Cond->getType()->isIntegerTy(1))
+ CondToSelects[Cond].push_back(cast<SelectInst>(ExtUser));
+ }
+ }
+
+ if (CondToSelects.empty())
+ return false;
+
+ // Check profitability using TTI.
+ auto *CondTy = CmpInst::makeCmpResultType(DstEltTy);
+ auto *VecCondTy = CmpInst::makeCmpResultType(SrcVecTy);
+
+ InstructionCost ScalarSelCost =
+ TTI.getCmpSelInstrCost(Instruction::Select, DstEltTy, CondTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ InstructionCost VecSelCost =
+ TTI.getCmpSelInstrCost(Instruction::Select, SrcVecTy, VecCondTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
+
+ bool MadeChange = false;
Value *SrcVec = BC->getOperand(0);
- BasicBlock *BB = BC->getParent();
- SelectInst *ExistingVecSel = findCompatibleVecSel(SrcVec, Cond, BB);
- BitCastInst *ExistingBC =
- ExistingVecSel ? findCompatibleBitCast(ExistingVecSel, BB, DstVecTy)
- : nullptr;
-
- // If we already have a vector select, this transformation is always
- // beneficial - we just extract from it instead of doing a scalar select.
- if (!ExistingVecSel) {
- // If we need to create a new vector select, check profitability using TTI.
- // Compare the cost of one scalar select vs amortized cost of vector select.
- //
- // The vector select cost is amortized across all elements, so we compare:
- // ScalarSelCost vs VecSelCost / NumDstElements
- //
- // This is profitable when VecSelCost < ScalarSelCost * NumDstElements,
- // which is equivalent to checking if the vector select is cheaper per
- // element.
- auto *CondTy = CmpInst::makeCmpResultType(DstEltTy);
- auto *VecCondTy = CmpInst::makeCmpResultType(SrcVecTy);
-
- InstructionCost ScalarSelCost =
- TTI.getCmpSelInstrCost(Instruction::Select, DstEltTy, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
- InstructionCost VecSelCost =
- TTI.getCmpSelInstrCost(Instruction::Select, SrcVecTy, VecCondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
-
- // The transformation creates one vector select that replaces multiple
- // scalar selects. It's profitable if the vector select cost is less than
- // the cost of all the scalar selects it replaces.
- unsigned NumDstElements = DstVecTy->getNumElements();
- if (VecSelCost >= ScalarSelCost * NumDstElements) {
+
+ // Process each group of selects with the same condition.
+ for (auto &[Cond, Selects] : CondToSelects) {
+ // Only profitable if vector select cost < total scalar select cost.
+ if (VecSelCost >= ScalarSelCost * Selects.size()) {
LLVM_DEBUG(dbgs() << "VectorCombine: foldSelectsFromBitcast not "
<< "profitable (VecCost=" << VecSelCost
<< ", ScalarCost=" << ScalarSelCost
- << ", NumElts=" << NumDstElements << ")\n");
- return false;
+ << ", NumSelects=" << Selects.size() << ")\n");
+ continue;
}
- }
- // Create the transformation.
- Builder.SetInsertPoint(&I);
+ // Create the vector select and bitcast once for this condition.
+ Builder.SetInsertPoint(BC->getNextNode());
+ Value *VecSel =
+ Builder.CreateSelect(Cond, SrcVec, Constant::getNullValue(SrcVecTy));
+ Value *NewBC = Builder.CreateBitCast(VecSel, DstVecTy);
+
+ // Replace each scalar select with an extract from the new bitcast.
+ for (SelectInst *Sel : Selects) {
+ auto *Ext = cast<ExtractElementInst>(Sel->getTrueValue());
+ Value *Idx = Ext->getIndexOperand();
+
+ Builder.SetInsertPoint(Sel);
+ Value *NewExt = Builder.CreateExtractElement(NewBC, Idx);
+ replaceValue(*Sel, *NewExt);
+ MadeChange = true;
+ }
- Value *VecSel;
- if (ExistingVecSel) {
- // Reuse existing vector select
- VecSel = ExistingVecSel;
- } else {
- // Create vector select: select i1 %cond, <N x T> %src, zeroinitializer
- VecSel = Builder.CreateSelect(Cond, SrcVec,
- Constant::getNullValue(SrcVecTy), "sel.bc");
+ LLVM_DEBUG(dbgs() << "VectorCombine: folded " << Selects.size()
+ << " selects into vector select\n");
}
- // Reuse existing bitcast or create a new one
- Value *NewBC;
- if (ExistingBC) {
- NewBC = ExistingBC;
- } else {
- NewBC = Builder.CreateBitCast(VecSel, DstVecTy, "sel.bc.cast");
- }
-
- // Extract the element from the new bitcast
- Value *NewExt = Builder.CreateExtractElement(NewBC, Idx);
- replaceValue(I, *NewExt);
-
- LLVM_DEBUG(dbgs() << "VectorCombine: folded select into vector select\n");
- return true;
+ return MadeChange;
}
static void analyzeCostOfVecReduction(const IntrinsicInst &II,
@@ -5191,6 +5158,8 @@ bool VectorCombine::run() {
case Instruction::BitCast:
if (foldBitcastShuffle(I))
return true;
+ if (foldSelectsFromBitcast(I))
+ return true;
break;
case Instruction::And:
case Instruction::Or:
@@ -5226,10 +5195,6 @@ bool VectorCombine::run() {
if (foldExtractExtract(I))
return true;
break;
- case Instruction::Select:
- if (foldSelectsFromBitcast(I))
- return true;
- break;
case Instruction::Or:
if (foldConcatOfBoolMasks(I))
return true;
diff --git a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
index 970d933b942fc..7ad2bf9c8f557 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
@@ -282,9 +282,9 @@ define amdgpu_kernel void @combine_with_extract_other_uses_asm(
; CHECK-OPT-NEXT: s_bitcmp1_b32 s0, 0
; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_lshrrev_b32_e32 v5, 8, v0
; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc
-; CHECK-OPT-NEXT: v_add_u16_e32 v1, v0, v5
+; CHECK-OPT-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; CHECK-OPT-NEXT: v_add_u16_e32 v1, v0, v1
; CHECK-OPT-NEXT: v_add_u16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
; CHECK-OPT-NEXT: v_add_u16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
diff --git a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
index d2cc39a7e2d18..8867387e1fad0 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
@@ -487,44 +487,26 @@ entry:
; Negative test: too few selects (only 4 out of 16, less than half)
define amdgpu_kernel void @no_combine_too_few_selects(
;
-; CHECK-OPT-LABEL: define amdgpu_kernel void @no_combine_too_few_selects(
-; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-OPT-NEXT: [[ENTRY:.*:]]
-; CHECK-OPT-NEXT: [[SEL_BC:%.*]] = select i1 [[COND]], <4 x i32> [[SRC]], <4 x i32> zeroinitializer
-; CHECK-OPT-NEXT: [[BYTES:%.*]] = bitcast <4 x i32> [[SEL_BC]] to <16 x i8>
-; CHECK-OPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
-; CHECK-OPT-NEXT: [[S1:%.*]] = extractelement <16 x i8> [[BYTES]], i64 1
-; CHECK-OPT-NEXT: [[S2:%.*]] = extractelement <16 x i8> [[BYTES]], i64 2
-; CHECK-OPT-NEXT: [[S3:%.*]] = extractelement <16 x i8> [[BYTES]], i64 3
-; CHECK-OPT-NEXT: store i8 [[E0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-OPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-OPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-OPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-OPT-NEXT: ret void
-;
-; CHECK-NOOPT-LABEL: define amdgpu_kernel void @no_combine_too_few_selects(
-; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
-; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <4 x i32> [[SRC]] to <16 x i8>
-; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
-; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[BYTES]], i64 1
-; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <16 x i8> [[BYTES]], i64 2
-; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[BYTES]], i64 3
-; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 [[E0]], i8 0
-; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 [[E1]], i8 0
-; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 [[E2]], i8 0
-; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 [[E3]], i8 0
-; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-NOOPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-NOOPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-NOOPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-NOOPT-NEXT: ret void
+; CHECK-LABEL: define amdgpu_kernel void @no_combine_too_few_selects(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[BYTES:%.*]] = bitcast <4 x i32> [[SRC]] to <16 x i8>
+; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
+; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[BYTES]], i64 1
+; CHECK-NEXT: [[E2:%.*]] = extractelement <16 x i8> [[BYTES]], i64 2
+; CHECK-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[BYTES]], i64 3
+; CHECK-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 [[E0]], i8 0
+; CHECK-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 [[E1]], i8 0
+; CHECK-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 [[E2]], i8 0
+; CHECK-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 [[E3]], i8 0
+; CHECK-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
+; CHECK-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
+; CHECK-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
+; CHECK-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
+; CHECK-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
+; CHECK-NEXT: ret void
;
ptr addrspace(1) %out,
<4 x i32> %src,
@@ -558,12 +540,12 @@ define amdgpu_kernel void @combine_with_extract_other_uses(
; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], <2 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
; CHECK-OPT-NEXT: [[ENTRY:.*:]]
; CHECK-OPT-NEXT: [[BYTES:%.*]] = bitcast <2 x i32> [[SRC]] to <8 x i8>
+; CHECK-OPT-NEXT: [[TMP8:%.*]] = select i1 [[COND]], <2 x i32> [[SRC]], <2 x i32> zeroinitializer
+; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
; CHECK-OPT-NEXT: [[E0:%.*]] = extractelement <8 x i8> [[BYTES]], i64 0
; CHECK-OPT-NEXT: [[E1:%.*]] = extractelement <8 x i8> [[BYTES]], i64 1
; CHECK-OPT-NEXT: [[E2:%.*]] = extractelement <8 x i8> [[BYTES]], i64 2
; CHECK-OPT-NEXT: [[E3:%.*]] = extractelement <8 x i8> [[BYTES]], i64 3
-; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x i32> [[SRC]], <2 x i32> zeroinitializer
-; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x i32> [[COMBINED_SEL]] to <8 x i8>
; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 0
; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 1
; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 2
>From 6c2bba7b96271ff43a5dd24629305e1bc1af25b8 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Wed, 7 Jan 2026 18:09:02 +0530
Subject: [PATCH 11/11] review: address review comment post-merge
---
.../Transforms/Vectorize/VectorCombine.cpp | 38 +-
.../AMDGPU/combine-scalar-selects-asm.ll | 1372 -----------------
.../AMDGPU/combine-scalar-selects.ll | 0
3 files changed, 25 insertions(+), 1385 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
rename llvm/test/{CodeGen => Transforms/VectorCombine}/AMDGPU/combine-scalar-selects.ll (100%)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 646d386b74aa8..65c4eff1f229b 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1592,6 +1592,29 @@ bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
if (!DstEltTy->isIntegerTy() || DstEltBits >= SrcEltBits)
return false;
+ // Check profitability using TTI before collecting users.
+ Type *CondTy = CmpInst::makeCmpResultType(DstEltTy);
+ Type *VecCondTy = CmpInst::makeCmpResultType(SrcVecTy);
+
+ InstructionCost ScalarSelCost =
+ TTI.getCmpSelInstrCost(Instruction::Select, DstEltTy, CondTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ InstructionCost VecSelCost =
+ TTI.getCmpSelInstrCost(Instruction::Select, SrcVecTy, VecCondTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
+
+ // We need at least this many selects for vectorization to be profitable.
+ // VecSelCost < ScalarSelCost * NumSelects => NumSelects > VecSelCost /
+ // ScalarSelCost
+ if (!ScalarSelCost.isValid() || ScalarSelCost == 0)
+ return false;
+
+ unsigned MinSelects = (VecSelCost.getValue() / ScalarSelCost.getValue()) + 1;
+
+ // Quick check: if bitcast doesn't have enough users, bail early.
+ if (!BC->hasNUsesOrMore(MinSelects))
+ return false;
+
// Collect all select users that match the pattern, grouped by condition.
// Pattern: select i1 %cond, (extractelement %bc, idx), 0
DenseMap<Value *, SmallVector<SelectInst *, 8>> CondToSelects;
@@ -1613,24 +1636,13 @@ bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
if (CondToSelects.empty())
return false;
- // Check profitability using TTI.
- auto *CondTy = CmpInst::makeCmpResultType(DstEltTy);
- auto *VecCondTy = CmpInst::makeCmpResultType(SrcVecTy);
-
- InstructionCost ScalarSelCost =
- TTI.getCmpSelInstrCost(Instruction::Select, DstEltTy, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
- InstructionCost VecSelCost =
- TTI.getCmpSelInstrCost(Instruction::Select, SrcVecTy, VecCondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
-
bool MadeChange = false;
Value *SrcVec = BC->getOperand(0);
// Process each group of selects with the same condition.
- for (auto &[Cond, Selects] : CondToSelects) {
+ for (auto [Cond, Selects] : CondToSelects) {
// Only profitable if vector select cost < total scalar select cost.
- if (VecSelCost >= ScalarSelCost * Selects.size()) {
+ if (Selects.size() < MinSelects) {
LLVM_DEBUG(dbgs() << "VectorCombine: foldSelectsFromBitcast not "
<< "profitable (VecCost=" << VecSelCost
<< ", ScalarCost=" << ScalarSelCost
diff --git a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
deleted file mode 100644
index 7ad2bf9c8f557..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
+++ /dev/null
@@ -1,1372 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=vector-combine -S < %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 | FileCheck %s --check-prefixes=CHECK,CHECK-OPT
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOOPT
-
-define amdgpu_kernel void @combine_scalar_selects_v16i8(
-; CHECK-OPT-LABEL: combine_scalar_selects_v16i8:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_scalar_selects_v16i8:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v5, 8, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v7, 24, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v8, 8, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v10, 24, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v11, 8, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v13, 24, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v14, 8, v3
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v16, 24, v3
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v15, 16, v3
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v14, 8, v14
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v16, 8, v16
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v11, 8, v11
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v13, 8, v13
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v8, 8, v8
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v10, 8, v10
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v5, 8, v5
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v7, 8, v7
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v14, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v11, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v8, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %valid
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <4 x i32>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <4 x i32>, ptr addrspace(1) %gep, align 16
- %bytes = bitcast <4 x i32> %loaded to <16 x i8>
-
- %e0 = extractelement <16 x i8> %bytes, i64 0
- %e1 = extractelement <16 x i8> %bytes, i64 1
- %e2 = extractelement <16 x i8> %bytes, i64 2
- %e3 = extractelement <16 x i8> %bytes, i64 3
- %e4 = extractelement <16 x i8> %bytes, i64 4
- %e5 = extractelement <16 x i8> %bytes, i64 5
- %e6 = extractelement <16 x i8> %bytes, i64 6
- %e7 = extractelement <16 x i8> %bytes, i64 7
- %e8 = extractelement <16 x i8> %bytes, i64 8
- %e9 = extractelement <16 x i8> %bytes, i64 9
- %e10 = extractelement <16 x i8> %bytes, i64 10
- %e11 = extractelement <16 x i8> %bytes, i64 11
- %e12 = extractelement <16 x i8> %bytes, i64 12
- %e13 = extractelement <16 x i8> %bytes, i64 13
- %e14 = extractelement <16 x i8> %bytes, i64 14
- %e15 = extractelement <16 x i8> %bytes, i64 15
-
- %s0 = select i1 %valid, i8 %e0, i8 0
- %s1 = select i1 %valid, i8 %e1, i8 0
- %s2 = select i1 %valid, i8 %e2, i8 0
- %s3 = select i1 %valid, i8 %e3, i8 0
- %s4 = select i1 %valid, i8 %e4, i8 0
- %s5 = select i1 %valid, i8 %e5, i8 0
- %s6 = select i1 %valid, i8 %e6, i8 0
- %s7 = select i1 %valid, i8 %e7, i8 0
- %s8 = select i1 %valid, i8 %e8, i8 0
- %s9 = select i1 %valid, i8 %e9, i8 0
- %s10 = select i1 %valid, i8 %e10, i8 0
- %s11 = select i1 %valid, i8 %e11, i8 0
- %s12 = select i1 %valid, i8 %e12, i8 0
- %s13 = select i1 %valid, i8 %e13, i8 0
- %s14 = select i1 %valid, i8 %e14, i8 0
- %s15 = select i1 %valid, i8 %e15, i8 0
-
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
- store i8 %s8, ptr addrspace(1) %ptr8, align 1
- %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
- store i8 %s9, ptr addrspace(1) %ptr9, align 1
- %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
- store i8 %s10, ptr addrspace(1) %ptr10, align 1
- %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
- store i8 %s11, ptr addrspace(1) %ptr11, align 1
- %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
- store i8 %s12, ptr addrspace(1) %ptr12, align 1
- %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
- store i8 %s13, ptr addrspace(1) %ptr13, align 1
- %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
- store i8 %s14, ptr addrspace(1) %ptr14, align 1
- %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
- store i8 %s15, ptr addrspace(1) %ptr15, align 1
-
- ret void
-}
-
-; Test with v8i8 from v2i32 (smaller vector)
-define amdgpu_kernel void @combine_scalar_selects_v8i8(
-; CHECK-OPT-LABEL: combine_scalar_selects_v8i8:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_scalar_selects_v8i8:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v3, 8, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v5, 24, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v6, 8, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v8, 24, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v6, 8, v6
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v8, 8, v8
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v5, 8, v5
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v6, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <2 x i32>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <2 x i32>, ptr addrspace(1) %gep, align 8
- %bytes = bitcast <2 x i32> %loaded to <8 x i8>
- %e0 = extractelement <8 x i8> %bytes, i64 0
- %e1 = extractelement <8 x i8> %bytes, i64 1
- %e2 = extractelement <8 x i8> %bytes, i64 2
- %e3 = extractelement <8 x i8> %bytes, i64 3
- %e4 = extractelement <8 x i8> %bytes, i64 4
- %e5 = extractelement <8 x i8> %bytes, i64 5
- %e6 = extractelement <8 x i8> %bytes, i64 6
- %e7 = extractelement <8 x i8> %bytes, i64 7
- %s0 = select i1 %cond, i8 %e0, i8 0
- %s1 = select i1 %cond, i8 %e1, i8 0
- %s2 = select i1 %cond, i8 %e2, i8 0
- %s3 = select i1 %cond, i8 %e3, i8 0
- %s4 = select i1 %cond, i8 %e4, i8 0
- %s5 = select i1 %cond, i8 %e5, i8 0
- %s6 = select i1 %cond, i8 %e6, i8 0
- %s7 = select i1 %cond, i8 %e7, i8 0
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- ret void
-}
-
-; Test: extracts have additional unrelated uses (extracts can't be removed)
-; The transformation should still be profitable as we reduce v_cndmask count
-define amdgpu_kernel void @combine_with_extract_other_uses_asm(
-; CHECK-OPT-LABEL: combine_with_extract_other_uses_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
-; CHECK-OPT-NEXT: s_load_dword s0, s[4:5], 0x18
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s0, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc
-; CHECK-OPT-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; CHECK-OPT-NEXT: v_add_u16_e32 v1, v0, v1
-; CHECK-OPT-NEXT: v_add_u16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
-; CHECK-OPT-NEXT: v_add_u16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; CHECK-OPT-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3]
-; CHECK-OPT-NEXT: global_store_byte v4, v0, s[6:7]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_with_extract_other_uses_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x18
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s0, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v5, 8, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v3, 8, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v8, 24, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v10, 0, v5, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v12, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v11, 0, v6, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
-; CHECK-NOOPT-NEXT: v_add_u16_e32 v0, v0, v5
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v5, 8, v8
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v8, 8, v10
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v10, 8, v12
-; CHECK-NOOPT-NEXT: v_add_u16_e32 v0, v0, v6
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v5, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v6, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_add_u16_e32 v2, v0, v2
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3]
-; CHECK-NOOPT-NEXT: global_store_byte v4, v2, s[6:7]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- ptr addrspace(1) %out2,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <2 x i32>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <2 x i32>, ptr addrspace(1) %gep, align 8
- %bytes = bitcast <2 x i32> %loaded to <8 x i8>
- %e0 = extractelement <8 x i8> %bytes, i64 0
- %e1 = extractelement <8 x i8> %bytes, i64 1
- %e2 = extractelement <8 x i8> %bytes, i64 2
- %e3 = extractelement <8 x i8> %bytes, i64 3
- %e4 = extractelement <8 x i8> %bytes, i64 4
- %e5 = extractelement <8 x i8> %bytes, i64 5
- %e6 = extractelement <8 x i8> %bytes, i64 6
- %e7 = extractelement <8 x i8> %bytes, i64 7
- ; Selects that will be combined
- %s0 = select i1 %cond, i8 %e0, i8 0
- %s1 = select i1 %cond, i8 %e1, i8 0
- %s2 = select i1 %cond, i8 %e2, i8 0
- %s3 = select i1 %cond, i8 %e3, i8 0
- %s4 = select i1 %cond, i8 %e4, i8 0
- %s5 = select i1 %cond, i8 %e5, i8 0
- %s6 = select i1 %cond, i8 %e6, i8 0
- %s7 = select i1 %cond, i8 %e7, i8 0
- ; Store select results
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- ; Additional unrelated uses of the extracts - these prevent extract removal
- %sum = add i8 %e0, %e1
- %sum2 = add i8 %sum, %e2
- %sum3 = add i8 %sum2, %e3
- store i8 %sum3, ptr addrspace(1) %out2, align 1
- ret void
-}
-
-; Test <4 x i32> to <8 x i16> (32-bit elements to 16-bit elements)
-define amdgpu_kernel void @combine_v4i32_to_v8i16_asm(
-; CHECK-OPT-LABEL: combine_v4i32_to_v8i16_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_v4i32_to_v8i16_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_mov_b32 s0, 0x5040100
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v0, v4, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v1, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v3, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_perm_b32 v1, v1, v6, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v2, v2, v7, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v3, v3, v8, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v0, v0, v5, s0
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <4 x i32>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <4 x i32>, ptr addrspace(1) %gep, align 16
- %halves = bitcast <4 x i32> %loaded to <8 x i16>
- %e0 = extractelement <8 x i16> %halves, i64 0
- %e1 = extractelement <8 x i16> %halves, i64 1
- %e2 = extractelement <8 x i16> %halves, i64 2
- %e3 = extractelement <8 x i16> %halves, i64 3
- %e4 = extractelement <8 x i16> %halves, i64 4
- %e5 = extractelement <8 x i16> %halves, i64 5
- %e6 = extractelement <8 x i16> %halves, i64 6
- %e7 = extractelement <8 x i16> %halves, i64 7
- %s0 = select i1 %cond, i16 %e0, i16 0
- %s1 = select i1 %cond, i16 %e1, i16 0
- %s2 = select i1 %cond, i16 %e2, i16 0
- %s3 = select i1 %cond, i16 %e3, i16 0
- %s4 = select i1 %cond, i16 %e4, i16 0
- %s5 = select i1 %cond, i16 %e5, i16 0
- %s6 = select i1 %cond, i16 %e6, i16 0
- %s7 = select i1 %cond, i16 %e7, i16 0
- store i16 %s0, ptr addrspace(1) %out, align 2
- %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
- store i16 %s1, ptr addrspace(1) %ptr1, align 2
- %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
- store i16 %s2, ptr addrspace(1) %ptr2, align 2
- %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
- store i16 %s3, ptr addrspace(1) %ptr3, align 2
- %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
- store i16 %s4, ptr addrspace(1) %ptr4, align 2
- %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
- store i16 %s5, ptr addrspace(1) %ptr5, align 2
- %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
- store i16 %s6, ptr addrspace(1) %ptr6, align 2
- %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
- store i16 %s7, ptr addrspace(1) %ptr7, align 2
- ret void
-}
-
-; Test <4 x float> to <16 x i8> (float elements to byte elements)
-define amdgpu_kernel void @combine_v4f32_to_v16i8_asm(
-; CHECK-OPT-LABEL: combine_v4f32_to_v16i8_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_v4f32_to_v16i8_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v5, 8, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v7, 24, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v8, 8, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v10, 24, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v11, 8, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v13, 24, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v14, 8, v3
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v16, 24, v3
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v15, 16, v3
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v14, 8, v14
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v16, 8, v16
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v11, 8, v11
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v13, 8, v13
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v8, 8, v8
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v10, 8, v10
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v5, 8, v5
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v7, 8, v7
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v14, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v11, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v8, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <4 x float>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <4 x float>, ptr addrspace(1) %gep, align 16
- %bytes = bitcast <4 x float> %loaded to <16 x i8>
- %e0 = extractelement <16 x i8> %bytes, i64 0
- %e1 = extractelement <16 x i8> %bytes, i64 1
- %e2 = extractelement <16 x i8> %bytes, i64 2
- %e3 = extractelement <16 x i8> %bytes, i64 3
- %e4 = extractelement <16 x i8> %bytes, i64 4
- %e5 = extractelement <16 x i8> %bytes, i64 5
- %e6 = extractelement <16 x i8> %bytes, i64 6
- %e7 = extractelement <16 x i8> %bytes, i64 7
- %e8 = extractelement <16 x i8> %bytes, i64 8
- %e9 = extractelement <16 x i8> %bytes, i64 9
- %e10 = extractelement <16 x i8> %bytes, i64 10
- %e11 = extractelement <16 x i8> %bytes, i64 11
- %e12 = extractelement <16 x i8> %bytes, i64 12
- %e13 = extractelement <16 x i8> %bytes, i64 13
- %e14 = extractelement <16 x i8> %bytes, i64 14
- %e15 = extractelement <16 x i8> %bytes, i64 15
- %s0 = select i1 %cond, i8 %e0, i8 0
- %s1 = select i1 %cond, i8 %e1, i8 0
- %s2 = select i1 %cond, i8 %e2, i8 0
- %s3 = select i1 %cond, i8 %e3, i8 0
- %s4 = select i1 %cond, i8 %e4, i8 0
- %s5 = select i1 %cond, i8 %e5, i8 0
- %s6 = select i1 %cond, i8 %e6, i8 0
- %s7 = select i1 %cond, i8 %e7, i8 0
- %s8 = select i1 %cond, i8 %e8, i8 0
- %s9 = select i1 %cond, i8 %e9, i8 0
- %s10 = select i1 %cond, i8 %e10, i8 0
- %s11 = select i1 %cond, i8 %e11, i8 0
- %s12 = select i1 %cond, i8 %e12, i8 0
- %s13 = select i1 %cond, i8 %e13, i8 0
- %s14 = select i1 %cond, i8 %e14, i8 0
- %s15 = select i1 %cond, i8 %e15, i8 0
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
- store i8 %s8, ptr addrspace(1) %ptr8, align 1
- %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
- store i8 %s9, ptr addrspace(1) %ptr9, align 1
- %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
- store i8 %s10, ptr addrspace(1) %ptr10, align 1
- %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
- store i8 %s11, ptr addrspace(1) %ptr11, align 1
- %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
- store i8 %s12, ptr addrspace(1) %ptr12, align 1
- %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
- store i8 %s13, ptr addrspace(1) %ptr13, align 1
- %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
- store i8 %s14, ptr addrspace(1) %ptr14, align 1
- %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
- store i8 %s15, ptr addrspace(1) %ptr15, align 1
- ret void
-}
-
-; Test <4 x float> to <8 x i16> (float elements to 16-bit elements)
-define amdgpu_kernel void @combine_v4f32_to_v8i16_asm(
-; CHECK-OPT-LABEL: combine_v4f32_to_v8i16_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_v4f32_to_v8i16_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_mov_b32 s0, 0x5040100
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v0, v4, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v1, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v3, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_perm_b32 v1, v1, v6, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v2, v2, v7, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v3, v3, v8, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v0, v0, v5, s0
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <4 x float>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <4 x float>, ptr addrspace(1) %gep, align 16
- %halves = bitcast <4 x float> %loaded to <8 x i16>
- %e0 = extractelement <8 x i16> %halves, i64 0
- %e1 = extractelement <8 x i16> %halves, i64 1
- %e2 = extractelement <8 x i16> %halves, i64 2
- %e3 = extractelement <8 x i16> %halves, i64 3
- %e4 = extractelement <8 x i16> %halves, i64 4
- %e5 = extractelement <8 x i16> %halves, i64 5
- %e6 = extractelement <8 x i16> %halves, i64 6
- %e7 = extractelement <8 x i16> %halves, i64 7
- %s0 = select i1 %cond, i16 %e0, i16 0
- %s1 = select i1 %cond, i16 %e1, i16 0
- %s2 = select i1 %cond, i16 %e2, i16 0
- %s3 = select i1 %cond, i16 %e3, i16 0
- %s4 = select i1 %cond, i16 %e4, i16 0
- %s5 = select i1 %cond, i16 %e5, i16 0
- %s6 = select i1 %cond, i16 %e6, i16 0
- %s7 = select i1 %cond, i16 %e7, i16 0
- store i16 %s0, ptr addrspace(1) %out, align 2
- %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
- store i16 %s1, ptr addrspace(1) %ptr1, align 2
- %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
- store i16 %s2, ptr addrspace(1) %ptr2, align 2
- %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
- store i16 %s3, ptr addrspace(1) %ptr3, align 2
- %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
- store i16 %s4, ptr addrspace(1) %ptr4, align 2
- %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
- store i16 %s5, ptr addrspace(1) %ptr5, align 2
- %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
- store i16 %s6, ptr addrspace(1) %ptr6, align 2
- %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
- store i16 %s7, ptr addrspace(1) %ptr7, align 2
- ret void
-}
-
-; Test <2 x i64> to <16 x i8> (64-bit elements to byte elements)
-define amdgpu_kernel void @combine_v2i64_to_v16i8_asm(
-; CHECK-OPT-LABEL: combine_v2i64_to_v16i8_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_v2i64_to_v16i8_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v5, 8, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v7, 24, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v8, 8, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v10, 24, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v11, 8, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v13, 24, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v14, 8, v3
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v16, 24, v3
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v15, 16, v3
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v14, 8, v14
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v16, 8, v16
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v11, 8, v11
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v13, 8, v13
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v8, 8, v8
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v10, 8, v10
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v5, 8, v5
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v7, 8, v7
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v14, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v11, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v8, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <2 x i64>, ptr addrspace(1) %gep, align 16
- %bytes = bitcast <2 x i64> %loaded to <16 x i8>
- %e0 = extractelement <16 x i8> %bytes, i64 0
- %e1 = extractelement <16 x i8> %bytes, i64 1
- %e2 = extractelement <16 x i8> %bytes, i64 2
- %e3 = extractelement <16 x i8> %bytes, i64 3
- %e4 = extractelement <16 x i8> %bytes, i64 4
- %e5 = extractelement <16 x i8> %bytes, i64 5
- %e6 = extractelement <16 x i8> %bytes, i64 6
- %e7 = extractelement <16 x i8> %bytes, i64 7
- %e8 = extractelement <16 x i8> %bytes, i64 8
- %e9 = extractelement <16 x i8> %bytes, i64 9
- %e10 = extractelement <16 x i8> %bytes, i64 10
- %e11 = extractelement <16 x i8> %bytes, i64 11
- %e12 = extractelement <16 x i8> %bytes, i64 12
- %e13 = extractelement <16 x i8> %bytes, i64 13
- %e14 = extractelement <16 x i8> %bytes, i64 14
- %e15 = extractelement <16 x i8> %bytes, i64 15
- %s0 = select i1 %cond, i8 %e0, i8 0
- %s1 = select i1 %cond, i8 %e1, i8 0
- %s2 = select i1 %cond, i8 %e2, i8 0
- %s3 = select i1 %cond, i8 %e3, i8 0
- %s4 = select i1 %cond, i8 %e4, i8 0
- %s5 = select i1 %cond, i8 %e5, i8 0
- %s6 = select i1 %cond, i8 %e6, i8 0
- %s7 = select i1 %cond, i8 %e7, i8 0
- %s8 = select i1 %cond, i8 %e8, i8 0
- %s9 = select i1 %cond, i8 %e9, i8 0
- %s10 = select i1 %cond, i8 %e10, i8 0
- %s11 = select i1 %cond, i8 %e11, i8 0
- %s12 = select i1 %cond, i8 %e12, i8 0
- %s13 = select i1 %cond, i8 %e13, i8 0
- %s14 = select i1 %cond, i8 %e14, i8 0
- %s15 = select i1 %cond, i8 %e15, i8 0
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
- store i8 %s8, ptr addrspace(1) %ptr8, align 1
- %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
- store i8 %s9, ptr addrspace(1) %ptr9, align 1
- %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
- store i8 %s10, ptr addrspace(1) %ptr10, align 1
- %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
- store i8 %s11, ptr addrspace(1) %ptr11, align 1
- %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
- store i8 %s12, ptr addrspace(1) %ptr12, align 1
- %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
- store i8 %s13, ptr addrspace(1) %ptr13, align 1
- %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
- store i8 %s14, ptr addrspace(1) %ptr14, align 1
- %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
- store i8 %s15, ptr addrspace(1) %ptr15, align 1
- ret void
-}
-
-; Test <2 x i64> to <8 x i16> (64-bit elements to 16-bit elements)
-define amdgpu_kernel void @combine_v2i64_to_v8i16_asm(
-; CHECK-OPT-LABEL: combine_v2i64_to_v8i16_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_v2i64_to_v8i16_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_mov_b32 s0, 0x5040100
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v0, v4, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v1, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v3, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_perm_b32 v1, v1, v6, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v2, v2, v7, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v3, v3, v8, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v0, v0, v5, s0
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <2 x i64>, ptr addrspace(1) %gep, align 16
- %halves = bitcast <2 x i64> %loaded to <8 x i16>
- %e0 = extractelement <8 x i16> %halves, i64 0
- %e1 = extractelement <8 x i16> %halves, i64 1
- %e2 = extractelement <8 x i16> %halves, i64 2
- %e3 = extractelement <8 x i16> %halves, i64 3
- %e4 = extractelement <8 x i16> %halves, i64 4
- %e5 = extractelement <8 x i16> %halves, i64 5
- %e6 = extractelement <8 x i16> %halves, i64 6
- %e7 = extractelement <8 x i16> %halves, i64 7
- %s0 = select i1 %cond, i16 %e0, i16 0
- %s1 = select i1 %cond, i16 %e1, i16 0
- %s2 = select i1 %cond, i16 %e2, i16 0
- %s3 = select i1 %cond, i16 %e3, i16 0
- %s4 = select i1 %cond, i16 %e4, i16 0
- %s5 = select i1 %cond, i16 %e5, i16 0
- %s6 = select i1 %cond, i16 %e6, i16 0
- %s7 = select i1 %cond, i16 %e7, i16 0
- store i16 %s0, ptr addrspace(1) %out, align 2
- %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
- store i16 %s1, ptr addrspace(1) %ptr1, align 2
- %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
- store i16 %s2, ptr addrspace(1) %ptr2, align 2
- %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
- store i16 %s3, ptr addrspace(1) %ptr3, align 2
- %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
- store i16 %s4, ptr addrspace(1) %ptr4, align 2
- %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
- store i16 %s5, ptr addrspace(1) %ptr5, align 2
- %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
- store i16 %s6, ptr addrspace(1) %ptr6, align 2
- %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
- store i16 %s7, ptr addrspace(1) %ptr7, align 2
- ret void
-}
-
-; Test <2 x i64> to <4 x i32> (64-bit elements to 32-bit elements)
-define amdgpu_kernel void @combine_v2i64_to_v4i32_asm(
-; CHECK-OPT-LABEL: combine_v2i64_to_v4i32_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_v2i64_to_v4i32_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <2 x i64>, ptr addrspace(1) %gep, align 16
- %words = bitcast <2 x i64> %loaded to <4 x i32>
- %e0 = extractelement <4 x i32> %words, i64 0
- %e1 = extractelement <4 x i32> %words, i64 1
- %e2 = extractelement <4 x i32> %words, i64 2
- %e3 = extractelement <4 x i32> %words, i64 3
- %s0 = select i1 %cond, i32 %e0, i32 0
- %s1 = select i1 %cond, i32 %e1, i32 0
- %s2 = select i1 %cond, i32 %e2, i32 0
- %s3 = select i1 %cond, i32 %e3, i32 0
- store i32 %s0, ptr addrspace(1) %out, align 4
- %ptr1 = getelementptr i32, ptr addrspace(1) %out, i64 1
- store i32 %s1, ptr addrspace(1) %ptr1, align 4
- %ptr2 = getelementptr i32, ptr addrspace(1) %out, i64 2
- store i32 %s2, ptr addrspace(1) %ptr2, align 4
- %ptr3 = getelementptr i32, ptr addrspace(1) %out, i64 3
- store i32 %s3, ptr addrspace(1) %ptr3, align 4
- ret void
-}
-
-; Test <2 x double> to <16 x i8> (double elements to byte elements)
-define amdgpu_kernel void @combine_v2f64_to_v16i8_asm(
-; CHECK-OPT-LABEL: combine_v2f64_to_v16i8_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_v2f64_to_v16i8_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v5, 8, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v7, 24, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v8, 8, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v10, 24, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v11, 8, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v13, 24, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v14, 8, v3
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v16, 24, v3
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v15, 16, v3
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v14, 8, v14
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v16, 8, v16
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v11, 8, v11
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v13, 8, v13
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v8, 8, v8
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v10, 8, v10
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v5, 8, v5
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v7, 8, v7
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v14, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v11, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v8, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <2 x double>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <2 x double>, ptr addrspace(1) %gep, align 16
- %bytes = bitcast <2 x double> %loaded to <16 x i8>
- %e0 = extractelement <16 x i8> %bytes, i64 0
- %e1 = extractelement <16 x i8> %bytes, i64 1
- %e2 = extractelement <16 x i8> %bytes, i64 2
- %e3 = extractelement <16 x i8> %bytes, i64 3
- %e4 = extractelement <16 x i8> %bytes, i64 4
- %e5 = extractelement <16 x i8> %bytes, i64 5
- %e6 = extractelement <16 x i8> %bytes, i64 6
- %e7 = extractelement <16 x i8> %bytes, i64 7
- %e8 = extractelement <16 x i8> %bytes, i64 8
- %e9 = extractelement <16 x i8> %bytes, i64 9
- %e10 = extractelement <16 x i8> %bytes, i64 10
- %e11 = extractelement <16 x i8> %bytes, i64 11
- %e12 = extractelement <16 x i8> %bytes, i64 12
- %e13 = extractelement <16 x i8> %bytes, i64 13
- %e14 = extractelement <16 x i8> %bytes, i64 14
- %e15 = extractelement <16 x i8> %bytes, i64 15
- %s0 = select i1 %cond, i8 %e0, i8 0
- %s1 = select i1 %cond, i8 %e1, i8 0
- %s2 = select i1 %cond, i8 %e2, i8 0
- %s3 = select i1 %cond, i8 %e3, i8 0
- %s4 = select i1 %cond, i8 %e4, i8 0
- %s5 = select i1 %cond, i8 %e5, i8 0
- %s6 = select i1 %cond, i8 %e6, i8 0
- %s7 = select i1 %cond, i8 %e7, i8 0
- %s8 = select i1 %cond, i8 %e8, i8 0
- %s9 = select i1 %cond, i8 %e9, i8 0
- %s10 = select i1 %cond, i8 %e10, i8 0
- %s11 = select i1 %cond, i8 %e11, i8 0
- %s12 = select i1 %cond, i8 %e12, i8 0
- %s13 = select i1 %cond, i8 %e13, i8 0
- %s14 = select i1 %cond, i8 %e14, i8 0
- %s15 = select i1 %cond, i8 %e15, i8 0
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
- store i8 %s8, ptr addrspace(1) %ptr8, align 1
- %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
- store i8 %s9, ptr addrspace(1) %ptr9, align 1
- %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
- store i8 %s10, ptr addrspace(1) %ptr10, align 1
- %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
- store i8 %s11, ptr addrspace(1) %ptr11, align 1
- %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
- store i8 %s12, ptr addrspace(1) %ptr12, align 1
- %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
- store i8 %s13, ptr addrspace(1) %ptr13, align 1
- %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
- store i8 %s14, ptr addrspace(1) %ptr14, align 1
- %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
- store i8 %s15, ptr addrspace(1) %ptr15, align 1
- ret void
-}
-
-; Test <2 x double> to <8 x i16> (double elements to 16-bit elements)
-define amdgpu_kernel void @combine_v2f64_to_v8i16_asm(
-; CHECK-OPT-LABEL: combine_v2f64_to_v8i16_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_v2f64_to_v8i16_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_mov_b32 s0, 0x5040100
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v0, v4, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v1, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v3, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_perm_b32 v1, v1, v6, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v2, v2, v7, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v3, v3, v8, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v0, v0, v5, s0
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <2 x double>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <2 x double>, ptr addrspace(1) %gep, align 16
- %halves = bitcast <2 x double> %loaded to <8 x i16>
- %e0 = extractelement <8 x i16> %halves, i64 0
- %e1 = extractelement <8 x i16> %halves, i64 1
- %e2 = extractelement <8 x i16> %halves, i64 2
- %e3 = extractelement <8 x i16> %halves, i64 3
- %e4 = extractelement <8 x i16> %halves, i64 4
- %e5 = extractelement <8 x i16> %halves, i64 5
- %e6 = extractelement <8 x i16> %halves, i64 6
- %e7 = extractelement <8 x i16> %halves, i64 7
- %s0 = select i1 %cond, i16 %e0, i16 0
- %s1 = select i1 %cond, i16 %e1, i16 0
- %s2 = select i1 %cond, i16 %e2, i16 0
- %s3 = select i1 %cond, i16 %e3, i16 0
- %s4 = select i1 %cond, i16 %e4, i16 0
- %s5 = select i1 %cond, i16 %e5, i16 0
- %s6 = select i1 %cond, i16 %e6, i16 0
- %s7 = select i1 %cond, i16 %e7, i16 0
- store i16 %s0, ptr addrspace(1) %out, align 2
- %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
- store i16 %s1, ptr addrspace(1) %ptr1, align 2
- %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
- store i16 %s2, ptr addrspace(1) %ptr2, align 2
- %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
- store i16 %s3, ptr addrspace(1) %ptr3, align 2
- %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
- store i16 %s4, ptr addrspace(1) %ptr4, align 2
- %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
- store i16 %s5, ptr addrspace(1) %ptr5, align 2
- %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
- store i16 %s6, ptr addrspace(1) %ptr6, align 2
- %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
- store i16 %s7, ptr addrspace(1) %ptr7, align 2
- ret void
-}
-
-; Test <2 x double> to <4 x i32> (double elements to 32-bit elements)
-define amdgpu_kernel void @combine_v2f64_to_v4i32_asm(
-; CHECK-OPT-LABEL: combine_v2f64_to_v4i32_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_v2f64_to_v4i32_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <2 x double>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <2 x double>, ptr addrspace(1) %gep, align 16
- %words = bitcast <2 x double> %loaded to <4 x i32>
- %e0 = extractelement <4 x i32> %words, i64 0
- %e1 = extractelement <4 x i32> %words, i64 1
- %e2 = extractelement <4 x i32> %words, i64 2
- %e3 = extractelement <4 x i32> %words, i64 3
- %s0 = select i1 %cond, i32 %e0, i32 0
- %s1 = select i1 %cond, i32 %e1, i32 0
- %s2 = select i1 %cond, i32 %e2, i32 0
- %s3 = select i1 %cond, i32 %e3, i32 0
- store i32 %s0, ptr addrspace(1) %out, align 4
- %ptr1 = getelementptr i32, ptr addrspace(1) %out, i64 1
- store i32 %s1, ptr addrspace(1) %ptr1, align 4
- %ptr2 = getelementptr i32, ptr addrspace(1) %out, i64 2
- store i32 %s2, ptr addrspace(1) %ptr2, align 4
- %ptr3 = getelementptr i32, ptr addrspace(1) %out, i64 3
- store i32 %s3, ptr addrspace(1) %ptr3, align 4
- ret void
-}
-
-declare i32 @llvm.amdgcn.workitem.id.x()
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/combine-scalar-selects.ll
similarity index 100%
rename from llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
rename to llvm/test/Transforms/VectorCombine/AMDGPU/combine-scalar-selects.ll
More information about the llvm-commits
mailing list