[llvm-branch-commits] [llvm] da5e36b - Revert "[VectorCombine] Fold scalar selects from bitcast into vector select (…"
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jan 7 04:14:55 PST 2026
Author: Pankaj Dwivedi
Date: 2026-01-07T17:44:51+05:30
New Revision: da5e36bb6ca90dfe99c0456fa5827d584605ace6
URL: https://github.com/llvm/llvm-project/commit/da5e36bb6ca90dfe99c0456fa5827d584605ace6
DIFF: https://github.com/llvm/llvm-project/commit/da5e36bb6ca90dfe99c0456fa5827d584605ace6.diff
LOG: Revert "[VectorCombine] Fold scalar selects from bitcast into vector select (…"
This reverts commit 72f18a05d6dcf96d0f5722ff425a6c7388933ab1.
Added:
Modified:
llvm/lib/Transforms/Vectorize/VectorCombine.cpp
Removed:
llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 6fc5ca155d57a..3e06f74fa5c65 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -127,7 +127,6 @@ class VectorCombine {
bool scalarizeOpOrCmp(Instruction &I);
bool scalarizeVPIntrinsic(Instruction &I);
bool foldExtractedCmps(Instruction &I);
- bool foldSelectsFromBitcast(Instruction &I);
bool foldBinopOfReductions(Instruction &I);
bool foldSingleElementStore(Instruction &I);
bool scalarizeLoad(Instruction &I);
@@ -1547,121 +1546,6 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
return true;
}
-/// Try to fold scalar selects that select between extracted elements and zero
-/// into extracting from a vector select. This is rooted at the bitcast.
-///
-/// This pattern arises when a vector is bitcast to a smaller element type,
-/// elements are extracted, and then conditionally selected with zero:
-///
-/// %bc = bitcast <4 x i32> %src to <16 x i8>
-/// %e0 = extractelement <16 x i8> %bc, i32 0
-/// %s0 = select i1 %cond, i8 %e0, i8 0
-/// %e1 = extractelement <16 x i8> %bc, i32 1
-/// %s1 = select i1 %cond, i8 %e1, i8 0
-/// ...
-///
-/// Transforms to:
-/// %sel = select i1 %cond, <4 x i32> %src, <4 x i32> zeroinitializer
-/// %bc = bitcast <4 x i32> %sel to <16 x i8>
-/// %e0 = extractelement <16 x i8> %bc, i32 0
-/// %e1 = extractelement <16 x i8> %bc, i32 1
-/// ...
-///
-/// This is profitable because vector select on wider types produces fewer
-/// select/cndmask instructions than scalar selects on each element.
-bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
- auto *BC = dyn_cast<BitCastInst>(&I);
- if (!BC)
- return false;
-
- auto *SrcVecTy = dyn_cast<FixedVectorType>(BC->getSrcTy());
- auto *DstVecTy = dyn_cast<FixedVectorType>(BC->getDestTy());
- if (!SrcVecTy || !DstVecTy)
- return false;
-
- // Source must be 32-bit or 64-bit elements, destination must be smaller
- // integer elements. Zero in all these types is all-bits-zero.
- Type *SrcEltTy = SrcVecTy->getElementType();
- Type *DstEltTy = DstVecTy->getElementType();
- unsigned SrcEltBits = SrcEltTy->getPrimitiveSizeInBits();
- unsigned DstEltBits = DstEltTy->getPrimitiveSizeInBits();
-
- if (SrcEltBits != 32 && SrcEltBits != 64)
- return false;
-
- if (!DstEltTy->isIntegerTy() || DstEltBits >= SrcEltBits)
- return false;
-
- // Collect all select users that match the pattern, grouped by condition.
- // Pattern: select i1 %cond, (extractelement %bc, idx), 0
- DenseMap<Value *, SmallVector<SelectInst *, 8>> CondToSelects;
-
- for (User *U : BC->users()) {
- auto *Ext = dyn_cast<ExtractElementInst>(U);
- if (!Ext)
- continue;
-
- for (User *ExtUser : Ext->users()) {
- Value *Cond;
- // Match: select i1 %cond, %ext, 0
- if (match(ExtUser, m_Select(m_Value(Cond), m_Specific(Ext), m_Zero())) &&
- Cond->getType()->isIntegerTy(1))
- CondToSelects[Cond].push_back(cast<SelectInst>(ExtUser));
- }
- }
-
- if (CondToSelects.empty())
- return false;
-
- // Check profitability using TTI.
- auto *CondTy = CmpInst::makeCmpResultType(DstEltTy);
- auto *VecCondTy = CmpInst::makeCmpResultType(SrcVecTy);
-
- InstructionCost ScalarSelCost =
- TTI.getCmpSelInstrCost(Instruction::Select, DstEltTy, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
- InstructionCost VecSelCost =
- TTI.getCmpSelInstrCost(Instruction::Select, SrcVecTy, VecCondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
-
- bool MadeChange = false;
- Value *SrcVec = BC->getOperand(0);
-
- // Process each group of selects with the same condition.
- for (auto &[Cond, Selects] : CondToSelects) {
- // Only profitable if vector select cost < total scalar select cost.
- if (VecSelCost >= ScalarSelCost * Selects.size()) {
- LLVM_DEBUG(dbgs() << "VectorCombine: foldSelectsFromBitcast not "
- << "profitable (VecCost=" << VecSelCost
- << ", ScalarCost=" << ScalarSelCost
- << ", NumSelects=" << Selects.size() << ")\n");
- continue;
- }
-
- // Create the vector select and bitcast once for this condition.
- Builder.SetInsertPoint(BC->getNextNode());
- Value *VecSel =
- Builder.CreateSelect(Cond, SrcVec, Constant::getNullValue(SrcVecTy));
- Value *NewBC = Builder.CreateBitCast(VecSel, DstVecTy);
-
- // Replace each scalar select with an extract from the new bitcast.
- for (SelectInst *Sel : Selects) {
- auto *Ext = cast<ExtractElementInst>(Sel->getTrueValue());
- Value *Idx = Ext->getIndexOperand();
-
- Builder.SetInsertPoint(Sel);
- Value *NewExt = Builder.CreateExtractElement(NewBC, Idx);
- replaceValue(*Sel, *NewExt);
- MadeChange = true;
- }
-
- LLVM_DEBUG(dbgs() << "VectorCombine: folded " << Selects.size()
- << " selects into vector select\n");
- }
-
- return MadeChange;
-}
-
static void analyzeCostOfVecReduction(const IntrinsicInst &II,
TTI::TargetCostKind CostKind,
const TargetTransformInfo &TTI,
@@ -5158,8 +5042,6 @@ bool VectorCombine::run() {
case Instruction::BitCast:
if (foldBitcastShuffle(I))
return true;
- if (foldSelectsFromBitcast(I))
- return true;
break;
case Instruction::And:
case Instruction::Or:
diff --git a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
deleted file mode 100644
index 7ad2bf9c8f557..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects-asm.ll
+++ /dev/null
@@ -1,1372 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=vector-combine -S < %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 | FileCheck %s --check-prefixes=CHECK,CHECK-OPT
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOOPT
-
-define amdgpu_kernel void @combine_scalar_selects_v16i8(
-; CHECK-OPT-LABEL: combine_scalar_selects_v16i8:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_scalar_selects_v16i8:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v5, 8, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v7, 24, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v8, 8, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v10, 24, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v11, 8, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v13, 24, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v14, 8, v3
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v16, 24, v3
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v15, 16, v3
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v14, 8, v14
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v16, 8, v16
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v11, 8, v11
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v13, 8, v13
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v8, 8, v8
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v10, 8, v10
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v5, 8, v5
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v7, 8, v7
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v14, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v11, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v8, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %valid
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <4 x i32>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <4 x i32>, ptr addrspace(1) %gep, align 16
- %bytes = bitcast <4 x i32> %loaded to <16 x i8>
-
- %e0 = extractelement <16 x i8> %bytes, i64 0
- %e1 = extractelement <16 x i8> %bytes, i64 1
- %e2 = extractelement <16 x i8> %bytes, i64 2
- %e3 = extractelement <16 x i8> %bytes, i64 3
- %e4 = extractelement <16 x i8> %bytes, i64 4
- %e5 = extractelement <16 x i8> %bytes, i64 5
- %e6 = extractelement <16 x i8> %bytes, i64 6
- %e7 = extractelement <16 x i8> %bytes, i64 7
- %e8 = extractelement <16 x i8> %bytes, i64 8
- %e9 = extractelement <16 x i8> %bytes, i64 9
- %e10 = extractelement <16 x i8> %bytes, i64 10
- %e11 = extractelement <16 x i8> %bytes, i64 11
- %e12 = extractelement <16 x i8> %bytes, i64 12
- %e13 = extractelement <16 x i8> %bytes, i64 13
- %e14 = extractelement <16 x i8> %bytes, i64 14
- %e15 = extractelement <16 x i8> %bytes, i64 15
-
- %s0 = select i1 %valid, i8 %e0, i8 0
- %s1 = select i1 %valid, i8 %e1, i8 0
- %s2 = select i1 %valid, i8 %e2, i8 0
- %s3 = select i1 %valid, i8 %e3, i8 0
- %s4 = select i1 %valid, i8 %e4, i8 0
- %s5 = select i1 %valid, i8 %e5, i8 0
- %s6 = select i1 %valid, i8 %e6, i8 0
- %s7 = select i1 %valid, i8 %e7, i8 0
- %s8 = select i1 %valid, i8 %e8, i8 0
- %s9 = select i1 %valid, i8 %e9, i8 0
- %s10 = select i1 %valid, i8 %e10, i8 0
- %s11 = select i1 %valid, i8 %e11, i8 0
- %s12 = select i1 %valid, i8 %e12, i8 0
- %s13 = select i1 %valid, i8 %e13, i8 0
- %s14 = select i1 %valid, i8 %e14, i8 0
- %s15 = select i1 %valid, i8 %e15, i8 0
-
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
- store i8 %s8, ptr addrspace(1) %ptr8, align 1
- %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
- store i8 %s9, ptr addrspace(1) %ptr9, align 1
- %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
- store i8 %s10, ptr addrspace(1) %ptr10, align 1
- %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
- store i8 %s11, ptr addrspace(1) %ptr11, align 1
- %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
- store i8 %s12, ptr addrspace(1) %ptr12, align 1
- %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
- store i8 %s13, ptr addrspace(1) %ptr13, align 1
- %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
- store i8 %s14, ptr addrspace(1) %ptr14, align 1
- %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
- store i8 %s15, ptr addrspace(1) %ptr15, align 1
-
- ret void
-}
-
-; Test with v8i8 from v2i32 (smaller vector)
-define amdgpu_kernel void @combine_scalar_selects_v8i8(
-; CHECK-OPT-LABEL: combine_scalar_selects_v8i8:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_scalar_selects_v8i8:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v3, 8, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v5, 24, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v6, 8, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v8, 24, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v6, 8, v6
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v8, 8, v8
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v5, 8, v5
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v6, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <2 x i32>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <2 x i32>, ptr addrspace(1) %gep, align 8
- %bytes = bitcast <2 x i32> %loaded to <8 x i8>
- %e0 = extractelement <8 x i8> %bytes, i64 0
- %e1 = extractelement <8 x i8> %bytes, i64 1
- %e2 = extractelement <8 x i8> %bytes, i64 2
- %e3 = extractelement <8 x i8> %bytes, i64 3
- %e4 = extractelement <8 x i8> %bytes, i64 4
- %e5 = extractelement <8 x i8> %bytes, i64 5
- %e6 = extractelement <8 x i8> %bytes, i64 6
- %e7 = extractelement <8 x i8> %bytes, i64 7
- %s0 = select i1 %cond, i8 %e0, i8 0
- %s1 = select i1 %cond, i8 %e1, i8 0
- %s2 = select i1 %cond, i8 %e2, i8 0
- %s3 = select i1 %cond, i8 %e3, i8 0
- %s4 = select i1 %cond, i8 %e4, i8 0
- %s5 = select i1 %cond, i8 %e5, i8 0
- %s6 = select i1 %cond, i8 %e6, i8 0
- %s7 = select i1 %cond, i8 %e7, i8 0
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- ret void
-}
-
-; Test: extracts have additional unrelated uses (extracts can't be removed)
-; The transformation should still be profitable as we reduce v_cndmask count
-define amdgpu_kernel void @combine_with_extract_other_uses_asm(
-; CHECK-OPT-LABEL: combine_with_extract_other_uses_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
-; CHECK-OPT-NEXT: s_load_dword s0, s[4:5], 0x18
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s0, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc
-; CHECK-OPT-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; CHECK-OPT-NEXT: v_add_u16_e32 v1, v0, v1
-; CHECK-OPT-NEXT: v_add_u16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
-; CHECK-OPT-NEXT: v_add_u16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; CHECK-OPT-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3]
-; CHECK-OPT-NEXT: global_store_byte v4, v0, s[6:7]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_with_extract_other_uses_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x18
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s0, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v5, 8, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v3, 8, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v8, 24, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v10, 0, v5, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v12, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v11, 0, v6, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
-; CHECK-NOOPT-NEXT: v_add_u16_e32 v0, v0, v5
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v5, 8, v8
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v8, 8, v10
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v10, 8, v12
-; CHECK-NOOPT-NEXT: v_add_u16_e32 v0, v0, v6
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v5, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v6, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_add_u16_e32 v2, v0, v2
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3]
-; CHECK-NOOPT-NEXT: global_store_byte v4, v2, s[6:7]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- ptr addrspace(1) %out2,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <2 x i32>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <2 x i32>, ptr addrspace(1) %gep, align 8
- %bytes = bitcast <2 x i32> %loaded to <8 x i8>
- %e0 = extractelement <8 x i8> %bytes, i64 0
- %e1 = extractelement <8 x i8> %bytes, i64 1
- %e2 = extractelement <8 x i8> %bytes, i64 2
- %e3 = extractelement <8 x i8> %bytes, i64 3
- %e4 = extractelement <8 x i8> %bytes, i64 4
- %e5 = extractelement <8 x i8> %bytes, i64 5
- %e6 = extractelement <8 x i8> %bytes, i64 6
- %e7 = extractelement <8 x i8> %bytes, i64 7
- ; Selects that will be combined
- %s0 = select i1 %cond, i8 %e0, i8 0
- %s1 = select i1 %cond, i8 %e1, i8 0
- %s2 = select i1 %cond, i8 %e2, i8 0
- %s3 = select i1 %cond, i8 %e3, i8 0
- %s4 = select i1 %cond, i8 %e4, i8 0
- %s5 = select i1 %cond, i8 %e5, i8 0
- %s6 = select i1 %cond, i8 %e6, i8 0
- %s7 = select i1 %cond, i8 %e7, i8 0
- ; Store select results
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- ; Additional unrelated uses of the extracts - these prevent extract removal
- %sum = add i8 %e0, %e1
- %sum2 = add i8 %sum, %e2
- %sum3 = add i8 %sum2, %e3
- store i8 %sum3, ptr addrspace(1) %out2, align 1
- ret void
-}
-
-; Test <4 x i32> to <8 x i16> (32-bit elements to 16-bit elements)
-define amdgpu_kernel void @combine_v4i32_to_v8i16_asm(
-; CHECK-OPT-LABEL: combine_v4i32_to_v8i16_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_v4i32_to_v8i16_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_mov_b32 s0, 0x5040100
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v0, v4, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v1, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v3, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_perm_b32 v1, v1, v6, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v2, v2, v7, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v3, v3, v8, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v0, v0, v5, s0
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <4 x i32>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <4 x i32>, ptr addrspace(1) %gep, align 16
- %halves = bitcast <4 x i32> %loaded to <8 x i16>
- %e0 = extractelement <8 x i16> %halves, i64 0
- %e1 = extractelement <8 x i16> %halves, i64 1
- %e2 = extractelement <8 x i16> %halves, i64 2
- %e3 = extractelement <8 x i16> %halves, i64 3
- %e4 = extractelement <8 x i16> %halves, i64 4
- %e5 = extractelement <8 x i16> %halves, i64 5
- %e6 = extractelement <8 x i16> %halves, i64 6
- %e7 = extractelement <8 x i16> %halves, i64 7
- %s0 = select i1 %cond, i16 %e0, i16 0
- %s1 = select i1 %cond, i16 %e1, i16 0
- %s2 = select i1 %cond, i16 %e2, i16 0
- %s3 = select i1 %cond, i16 %e3, i16 0
- %s4 = select i1 %cond, i16 %e4, i16 0
- %s5 = select i1 %cond, i16 %e5, i16 0
- %s6 = select i1 %cond, i16 %e6, i16 0
- %s7 = select i1 %cond, i16 %e7, i16 0
- store i16 %s0, ptr addrspace(1) %out, align 2
- %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
- store i16 %s1, ptr addrspace(1) %ptr1, align 2
- %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
- store i16 %s2, ptr addrspace(1) %ptr2, align 2
- %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
- store i16 %s3, ptr addrspace(1) %ptr3, align 2
- %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
- store i16 %s4, ptr addrspace(1) %ptr4, align 2
- %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
- store i16 %s5, ptr addrspace(1) %ptr5, align 2
- %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
- store i16 %s6, ptr addrspace(1) %ptr6, align 2
- %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
- store i16 %s7, ptr addrspace(1) %ptr7, align 2
- ret void
-}
-
-; Test <4 x float> to <16 x i8> (float elements to byte elements)
-define amdgpu_kernel void @combine_v4f32_to_v16i8_asm(
-; CHECK-OPT-LABEL: combine_v4f32_to_v16i8_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_v4f32_to_v16i8_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v5, 8, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v7, 24, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v8, 8, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v10, 24, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v11, 8, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v13, 24, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v14, 8, v3
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v16, 24, v3
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v15, 16, v3
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v14, 8, v14
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v16, 8, v16
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v11, 8, v11
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v13, 8, v13
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v8, 8, v8
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v10, 8, v10
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v5, 8, v5
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v7, 8, v7
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v14, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v11, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v8, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <4 x float>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <4 x float>, ptr addrspace(1) %gep, align 16
- %bytes = bitcast <4 x float> %loaded to <16 x i8>
- %e0 = extractelement <16 x i8> %bytes, i64 0
- %e1 = extractelement <16 x i8> %bytes, i64 1
- %e2 = extractelement <16 x i8> %bytes, i64 2
- %e3 = extractelement <16 x i8> %bytes, i64 3
- %e4 = extractelement <16 x i8> %bytes, i64 4
- %e5 = extractelement <16 x i8> %bytes, i64 5
- %e6 = extractelement <16 x i8> %bytes, i64 6
- %e7 = extractelement <16 x i8> %bytes, i64 7
- %e8 = extractelement <16 x i8> %bytes, i64 8
- %e9 = extractelement <16 x i8> %bytes, i64 9
- %e10 = extractelement <16 x i8> %bytes, i64 10
- %e11 = extractelement <16 x i8> %bytes, i64 11
- %e12 = extractelement <16 x i8> %bytes, i64 12
- %e13 = extractelement <16 x i8> %bytes, i64 13
- %e14 = extractelement <16 x i8> %bytes, i64 14
- %e15 = extractelement <16 x i8> %bytes, i64 15
- %s0 = select i1 %cond, i8 %e0, i8 0
- %s1 = select i1 %cond, i8 %e1, i8 0
- %s2 = select i1 %cond, i8 %e2, i8 0
- %s3 = select i1 %cond, i8 %e3, i8 0
- %s4 = select i1 %cond, i8 %e4, i8 0
- %s5 = select i1 %cond, i8 %e5, i8 0
- %s6 = select i1 %cond, i8 %e6, i8 0
- %s7 = select i1 %cond, i8 %e7, i8 0
- %s8 = select i1 %cond, i8 %e8, i8 0
- %s9 = select i1 %cond, i8 %e9, i8 0
- %s10 = select i1 %cond, i8 %e10, i8 0
- %s11 = select i1 %cond, i8 %e11, i8 0
- %s12 = select i1 %cond, i8 %e12, i8 0
- %s13 = select i1 %cond, i8 %e13, i8 0
- %s14 = select i1 %cond, i8 %e14, i8 0
- %s15 = select i1 %cond, i8 %e15, i8 0
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
- store i8 %s8, ptr addrspace(1) %ptr8, align 1
- %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
- store i8 %s9, ptr addrspace(1) %ptr9, align 1
- %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
- store i8 %s10, ptr addrspace(1) %ptr10, align 1
- %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
- store i8 %s11, ptr addrspace(1) %ptr11, align 1
- %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
- store i8 %s12, ptr addrspace(1) %ptr12, align 1
- %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
- store i8 %s13, ptr addrspace(1) %ptr13, align 1
- %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
- store i8 %s14, ptr addrspace(1) %ptr14, align 1
- %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
- store i8 %s15, ptr addrspace(1) %ptr15, align 1
- ret void
-}
-
-; Test <4 x float> to <8 x i16> (float elements to 16-bit elements)
-define amdgpu_kernel void @combine_v4f32_to_v8i16_asm(
-; CHECK-OPT-LABEL: combine_v4f32_to_v8i16_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_v4f32_to_v8i16_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_mov_b32 s0, 0x5040100
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v0, v4, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v1, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v3, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_perm_b32 v1, v1, v6, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v2, v2, v7, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v3, v3, v8, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v0, v0, v5, s0
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <4 x float>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <4 x float>, ptr addrspace(1) %gep, align 16
- %halves = bitcast <4 x float> %loaded to <8 x i16>
- %e0 = extractelement <8 x i16> %halves, i64 0
- %e1 = extractelement <8 x i16> %halves, i64 1
- %e2 = extractelement <8 x i16> %halves, i64 2
- %e3 = extractelement <8 x i16> %halves, i64 3
- %e4 = extractelement <8 x i16> %halves, i64 4
- %e5 = extractelement <8 x i16> %halves, i64 5
- %e6 = extractelement <8 x i16> %halves, i64 6
- %e7 = extractelement <8 x i16> %halves, i64 7
- %s0 = select i1 %cond, i16 %e0, i16 0
- %s1 = select i1 %cond, i16 %e1, i16 0
- %s2 = select i1 %cond, i16 %e2, i16 0
- %s3 = select i1 %cond, i16 %e3, i16 0
- %s4 = select i1 %cond, i16 %e4, i16 0
- %s5 = select i1 %cond, i16 %e5, i16 0
- %s6 = select i1 %cond, i16 %e6, i16 0
- %s7 = select i1 %cond, i16 %e7, i16 0
- store i16 %s0, ptr addrspace(1) %out, align 2
- %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
- store i16 %s1, ptr addrspace(1) %ptr1, align 2
- %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
- store i16 %s2, ptr addrspace(1) %ptr2, align 2
- %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
- store i16 %s3, ptr addrspace(1) %ptr3, align 2
- %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
- store i16 %s4, ptr addrspace(1) %ptr4, align 2
- %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
- store i16 %s5, ptr addrspace(1) %ptr5, align 2
- %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
- store i16 %s6, ptr addrspace(1) %ptr6, align 2
- %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
- store i16 %s7, ptr addrspace(1) %ptr7, align 2
- ret void
-}
-
-; Test <2 x i64> to <16 x i8> (64-bit elements to byte elements)
-define amdgpu_kernel void @combine_v2i64_to_v16i8_asm(
-; CHECK-OPT-LABEL: combine_v2i64_to_v16i8_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_v2i64_to_v16i8_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v5, 8, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v7, 24, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v8, 8, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v10, 24, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v11, 8, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v13, 24, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v14, 8, v3
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v16, 24, v3
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v15, 16, v3
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v14, 8, v14
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v16, 8, v16
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v11, 8, v11
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v13, 8, v13
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v8, 8, v8
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v10, 8, v10
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v5, 8, v5
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v7, 8, v7
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v14, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v11, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v8, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <2 x i64>, ptr addrspace(1) %gep, align 16
- %bytes = bitcast <2 x i64> %loaded to <16 x i8>
- %e0 = extractelement <16 x i8> %bytes, i64 0
- %e1 = extractelement <16 x i8> %bytes, i64 1
- %e2 = extractelement <16 x i8> %bytes, i64 2
- %e3 = extractelement <16 x i8> %bytes, i64 3
- %e4 = extractelement <16 x i8> %bytes, i64 4
- %e5 = extractelement <16 x i8> %bytes, i64 5
- %e6 = extractelement <16 x i8> %bytes, i64 6
- %e7 = extractelement <16 x i8> %bytes, i64 7
- %e8 = extractelement <16 x i8> %bytes, i64 8
- %e9 = extractelement <16 x i8> %bytes, i64 9
- %e10 = extractelement <16 x i8> %bytes, i64 10
- %e11 = extractelement <16 x i8> %bytes, i64 11
- %e12 = extractelement <16 x i8> %bytes, i64 12
- %e13 = extractelement <16 x i8> %bytes, i64 13
- %e14 = extractelement <16 x i8> %bytes, i64 14
- %e15 = extractelement <16 x i8> %bytes, i64 15
- %s0 = select i1 %cond, i8 %e0, i8 0
- %s1 = select i1 %cond, i8 %e1, i8 0
- %s2 = select i1 %cond, i8 %e2, i8 0
- %s3 = select i1 %cond, i8 %e3, i8 0
- %s4 = select i1 %cond, i8 %e4, i8 0
- %s5 = select i1 %cond, i8 %e5, i8 0
- %s6 = select i1 %cond, i8 %e6, i8 0
- %s7 = select i1 %cond, i8 %e7, i8 0
- %s8 = select i1 %cond, i8 %e8, i8 0
- %s9 = select i1 %cond, i8 %e9, i8 0
- %s10 = select i1 %cond, i8 %e10, i8 0
- %s11 = select i1 %cond, i8 %e11, i8 0
- %s12 = select i1 %cond, i8 %e12, i8 0
- %s13 = select i1 %cond, i8 %e13, i8 0
- %s14 = select i1 %cond, i8 %e14, i8 0
- %s15 = select i1 %cond, i8 %e15, i8 0
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
- store i8 %s8, ptr addrspace(1) %ptr8, align 1
- %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
- store i8 %s9, ptr addrspace(1) %ptr9, align 1
- %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
- store i8 %s10, ptr addrspace(1) %ptr10, align 1
- %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
- store i8 %s11, ptr addrspace(1) %ptr11, align 1
- %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
- store i8 %s12, ptr addrspace(1) %ptr12, align 1
- %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
- store i8 %s13, ptr addrspace(1) %ptr13, align 1
- %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
- store i8 %s14, ptr addrspace(1) %ptr14, align 1
- %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
- store i8 %s15, ptr addrspace(1) %ptr15, align 1
- ret void
-}
-
-; Test <2 x i64> to <8 x i16> (64-bit elements to 16-bit elements)
-define amdgpu_kernel void @combine_v2i64_to_v8i16_asm(
-; CHECK-OPT-LABEL: combine_v2i64_to_v8i16_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_v2i64_to_v8i16_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_mov_b32 s0, 0x5040100
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v0, v4, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v1, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v3, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_perm_b32 v1, v1, v6, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v2, v2, v7, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v3, v3, v8, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v0, v0, v5, s0
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <2 x i64>, ptr addrspace(1) %gep, align 16
- %halves = bitcast <2 x i64> %loaded to <8 x i16>
- %e0 = extractelement <8 x i16> %halves, i64 0
- %e1 = extractelement <8 x i16> %halves, i64 1
- %e2 = extractelement <8 x i16> %halves, i64 2
- %e3 = extractelement <8 x i16> %halves, i64 3
- %e4 = extractelement <8 x i16> %halves, i64 4
- %e5 = extractelement <8 x i16> %halves, i64 5
- %e6 = extractelement <8 x i16> %halves, i64 6
- %e7 = extractelement <8 x i16> %halves, i64 7
- %s0 = select i1 %cond, i16 %e0, i16 0
- %s1 = select i1 %cond, i16 %e1, i16 0
- %s2 = select i1 %cond, i16 %e2, i16 0
- %s3 = select i1 %cond, i16 %e3, i16 0
- %s4 = select i1 %cond, i16 %e4, i16 0
- %s5 = select i1 %cond, i16 %e5, i16 0
- %s6 = select i1 %cond, i16 %e6, i16 0
- %s7 = select i1 %cond, i16 %e7, i16 0
- store i16 %s0, ptr addrspace(1) %out, align 2
- %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
- store i16 %s1, ptr addrspace(1) %ptr1, align 2
- %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
- store i16 %s2, ptr addrspace(1) %ptr2, align 2
- %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
- store i16 %s3, ptr addrspace(1) %ptr3, align 2
- %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
- store i16 %s4, ptr addrspace(1) %ptr4, align 2
- %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
- store i16 %s5, ptr addrspace(1) %ptr5, align 2
- %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
- store i16 %s6, ptr addrspace(1) %ptr6, align 2
- %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
- store i16 %s7, ptr addrspace(1) %ptr7, align 2
- ret void
-}
-
-; Test <2 x i64> to <4 x i32> (64-bit elements to 32-bit elements)
-define amdgpu_kernel void @combine_v2i64_to_v4i32_asm(
-; CHECK-OPT-LABEL: combine_v2i64_to_v4i32_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_v2i64_to_v4i32_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <2 x i64>, ptr addrspace(1) %gep, align 16
- %words = bitcast <2 x i64> %loaded to <4 x i32>
- %e0 = extractelement <4 x i32> %words, i64 0
- %e1 = extractelement <4 x i32> %words, i64 1
- %e2 = extractelement <4 x i32> %words, i64 2
- %e3 = extractelement <4 x i32> %words, i64 3
- %s0 = select i1 %cond, i32 %e0, i32 0
- %s1 = select i1 %cond, i32 %e1, i32 0
- %s2 = select i1 %cond, i32 %e2, i32 0
- %s3 = select i1 %cond, i32 %e3, i32 0
- store i32 %s0, ptr addrspace(1) %out, align 4
- %ptr1 = getelementptr i32, ptr addrspace(1) %out, i64 1
- store i32 %s1, ptr addrspace(1) %ptr1, align 4
- %ptr2 = getelementptr i32, ptr addrspace(1) %out, i64 2
- store i32 %s2, ptr addrspace(1) %ptr2, align 4
- %ptr3 = getelementptr i32, ptr addrspace(1) %out, i64 3
- store i32 %s3, ptr addrspace(1) %ptr3, align 4
- ret void
-}
-
-; Test <2 x double> to <16 x i8> (double elements to byte elements)
-define amdgpu_kernel void @combine_v2f64_to_v16i8_asm(
-; CHECK-OPT-LABEL: combine_v2f64_to_v16i8_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_v2f64_to_v16i8_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v5, 8, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v7, 24, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v8, 8, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v10, 24, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v11, 8, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v13, 24, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v14, 8, v3
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v16, 24, v3
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; CHECK-NOOPT-NEXT: v_lshrrev_b32_e32 v15, 16, v3
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v14, 8, v14
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v16, 8, v16
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v11, 8, v11
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v13, 8, v13
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v8, 8, v8
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v10, 8, v10
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v5, 8, v5
-; CHECK-NOOPT-NEXT: v_lshlrev_b16_e32 v7, 8, v7
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v14, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v11, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v8, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <2 x double>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <2 x double>, ptr addrspace(1) %gep, align 16
- %bytes = bitcast <2 x double> %loaded to <16 x i8>
- %e0 = extractelement <16 x i8> %bytes, i64 0
- %e1 = extractelement <16 x i8> %bytes, i64 1
- %e2 = extractelement <16 x i8> %bytes, i64 2
- %e3 = extractelement <16 x i8> %bytes, i64 3
- %e4 = extractelement <16 x i8> %bytes, i64 4
- %e5 = extractelement <16 x i8> %bytes, i64 5
- %e6 = extractelement <16 x i8> %bytes, i64 6
- %e7 = extractelement <16 x i8> %bytes, i64 7
- %e8 = extractelement <16 x i8> %bytes, i64 8
- %e9 = extractelement <16 x i8> %bytes, i64 9
- %e10 = extractelement <16 x i8> %bytes, i64 10
- %e11 = extractelement <16 x i8> %bytes, i64 11
- %e12 = extractelement <16 x i8> %bytes, i64 12
- %e13 = extractelement <16 x i8> %bytes, i64 13
- %e14 = extractelement <16 x i8> %bytes, i64 14
- %e15 = extractelement <16 x i8> %bytes, i64 15
- %s0 = select i1 %cond, i8 %e0, i8 0
- %s1 = select i1 %cond, i8 %e1, i8 0
- %s2 = select i1 %cond, i8 %e2, i8 0
- %s3 = select i1 %cond, i8 %e3, i8 0
- %s4 = select i1 %cond, i8 %e4, i8 0
- %s5 = select i1 %cond, i8 %e5, i8 0
- %s6 = select i1 %cond, i8 %e6, i8 0
- %s7 = select i1 %cond, i8 %e7, i8 0
- %s8 = select i1 %cond, i8 %e8, i8 0
- %s9 = select i1 %cond, i8 %e9, i8 0
- %s10 = select i1 %cond, i8 %e10, i8 0
- %s11 = select i1 %cond, i8 %e11, i8 0
- %s12 = select i1 %cond, i8 %e12, i8 0
- %s13 = select i1 %cond, i8 %e13, i8 0
- %s14 = select i1 %cond, i8 %e14, i8 0
- %s15 = select i1 %cond, i8 %e15, i8 0
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
- store i8 %s8, ptr addrspace(1) %ptr8, align 1
- %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
- store i8 %s9, ptr addrspace(1) %ptr9, align 1
- %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
- store i8 %s10, ptr addrspace(1) %ptr10, align 1
- %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
- store i8 %s11, ptr addrspace(1) %ptr11, align 1
- %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
- store i8 %s12, ptr addrspace(1) %ptr12, align 1
- %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
- store i8 %s13, ptr addrspace(1) %ptr13, align 1
- %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
- store i8 %s14, ptr addrspace(1) %ptr14, align 1
- %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
- store i8 %s15, ptr addrspace(1) %ptr15, align 1
- ret void
-}
-
-; Test <2 x double> to <8 x i16> (double elements to 16-bit elements)
-define amdgpu_kernel void @combine_v2f64_to_v8i16_asm(
-; CHECK-OPT-LABEL: combine_v2f64_to_v8i16_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_v2f64_to_v8i16_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_mov_b32 s0, 0x5040100
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v7, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v0, v4, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v1, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_cndmask_b32_sdwa v3, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; CHECK-NOOPT-NEXT: v_perm_b32 v1, v1, v6, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v2, v2, v7, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v3, v3, v8, s0
-; CHECK-NOOPT-NEXT: v_perm_b32 v0, v0, v5, s0
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <2 x double>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <2 x double>, ptr addrspace(1) %gep, align 16
- %halves = bitcast <2 x double> %loaded to <8 x i16>
- %e0 = extractelement <8 x i16> %halves, i64 0
- %e1 = extractelement <8 x i16> %halves, i64 1
- %e2 = extractelement <8 x i16> %halves, i64 2
- %e3 = extractelement <8 x i16> %halves, i64 3
- %e4 = extractelement <8 x i16> %halves, i64 4
- %e5 = extractelement <8 x i16> %halves, i64 5
- %e6 = extractelement <8 x i16> %halves, i64 6
- %e7 = extractelement <8 x i16> %halves, i64 7
- %s0 = select i1 %cond, i16 %e0, i16 0
- %s1 = select i1 %cond, i16 %e1, i16 0
- %s2 = select i1 %cond, i16 %e2, i16 0
- %s3 = select i1 %cond, i16 %e3, i16 0
- %s4 = select i1 %cond, i16 %e4, i16 0
- %s5 = select i1 %cond, i16 %e5, i16 0
- %s6 = select i1 %cond, i16 %e6, i16 0
- %s7 = select i1 %cond, i16 %e7, i16 0
- store i16 %s0, ptr addrspace(1) %out, align 2
- %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
- store i16 %s1, ptr addrspace(1) %ptr1, align 2
- %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
- store i16 %s2, ptr addrspace(1) %ptr2, align 2
- %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
- store i16 %s3, ptr addrspace(1) %ptr3, align 2
- %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
- store i16 %s4, ptr addrspace(1) %ptr4, align 2
- %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
- store i16 %s5, ptr addrspace(1) %ptr5, align 2
- %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
- store i16 %s6, ptr addrspace(1) %ptr6, align 2
- %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
- store i16 %s7, ptr addrspace(1) %ptr7, align 2
- ret void
-}
-
-; Test <2 x double> to <4 x i32> (double elements to 32-bit elements)
-define amdgpu_kernel void @combine_v2f64_to_v4i32_asm(
-; CHECK-OPT-LABEL: combine_v2f64_to_v4i32_asm:
-; CHECK-OPT: ; %bb.0: ; %entry
-; CHECK-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-OPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-OPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-OPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-OPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-OPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-OPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-OPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-OPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-OPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-OPT-NEXT: s_endpgm
-;
-; CHECK-NOOPT-LABEL: combine_v2f64_to_v4i32_asm:
-; CHECK-NOOPT: ; %bb.0: ; %entry
-; CHECK-NOOPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NOOPT-NEXT: s_load_dword s6, s[4:5], 0x10
-; CHECK-NOOPT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NOOPT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-NOOPT-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NOOPT-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
-; CHECK-NOOPT-NEXT: s_bitcmp1_b32 s6, 0
-; CHECK-NOOPT-NEXT: s_cselect_b64 vcc, -1, 0
-; CHECK-NOOPT-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-NOOPT-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NOOPT-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; CHECK-NOOPT-NEXT: s_endpgm
- ptr addrspace(1) %in,
- ptr addrspace(1) %out,
- i1 %cond
-) {
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = zext i32 %tid to i64
- %gep = getelementptr <2 x double>, ptr addrspace(1) %in, i64 %tid.ext
- %loaded = load <2 x double>, ptr addrspace(1) %gep, align 16
- %words = bitcast <2 x double> %loaded to <4 x i32>
- %e0 = extractelement <4 x i32> %words, i64 0
- %e1 = extractelement <4 x i32> %words, i64 1
- %e2 = extractelement <4 x i32> %words, i64 2
- %e3 = extractelement <4 x i32> %words, i64 3
- %s0 = select i1 %cond, i32 %e0, i32 0
- %s1 = select i1 %cond, i32 %e1, i32 0
- %s2 = select i1 %cond, i32 %e2, i32 0
- %s3 = select i1 %cond, i32 %e3, i32 0
- store i32 %s0, ptr addrspace(1) %out, align 4
- %ptr1 = getelementptr i32, ptr addrspace(1) %out, i64 1
- store i32 %s1, ptr addrspace(1) %ptr1, align 4
- %ptr2 = getelementptr i32, ptr addrspace(1) %out, i64 2
- store i32 %s2, ptr addrspace(1) %ptr2, align 4
- %ptr3 = getelementptr i32, ptr addrspace(1) %out, i64 3
- store i32 %s3, ptr addrspace(1) %ptr3, align 4
- ret void
-}
-
-declare i32 @llvm.amdgcn.workitem.id.x()
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll b/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
deleted file mode 100644
index 8867387e1fad0..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/combine-scalar-selects.ll
+++ /dev/null
@@ -1,1874 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=vector-combine -S %s | FileCheck %s --check-prefixes=CHECK,CHECK-OPT
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -S %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOOPT
-
-; Test that multiple scalar selects from the same vector source are combined
-; back into a vector select when the optimization is enabled, and remain as
-; individual scalar selects when disabled.
-
-; This pattern occurs when buffer_load_dwordx4 results are bitcast to v16i8,
-; then each byte is extracted and conditionally selected with zero.
-
-define amdgpu_kernel void @combine_scalar_selects_v16i8(
-;
-; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_scalar_selects_v16i8(
-; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[BUFFER_RESOURCE:%.*]], i32 [[OFFSET:%.*]], i1 [[VALID:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-OPT-NEXT: [[ENTRY:.*:]]
-; CHECK-OPT-NEXT: [[LOADED:%.*]] = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> [[BUFFER_RESOURCE]], i32 [[OFFSET]], i32 0, i32 0)
-; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[VALID]], <4 x i32> [[LOADED]], <4 x i32> zeroinitializer
-; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <4 x i32> [[COMBINED_SEL]] to <16 x i8>
-; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
-; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 4
-; CHECK-OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
-; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 6
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 8
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
-; CHECK-OPT-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 10
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 11
-; CHECK-OPT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 12
-; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 13
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 14
-; CHECK-OPT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 15
-; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-OPT-NEXT: store i8 [[TMP9]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-OPT-NEXT: store i8 [[TMP3]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-OPT-NEXT: store i8 [[TMP5]], ptr addrspace(1) [[PTR4]], align 1
-; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-OPT-NEXT: store i8 [[TMP14]], ptr addrspace(1) [[PTR5]], align 1
-; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-OPT-NEXT: store i8 [[TMP7]], ptr addrspace(1) [[PTR6]], align 1
-; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-OPT-NEXT: store i8 [[TMP1]], ptr addrspace(1) [[PTR7]], align 1
-; CHECK-OPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
-; CHECK-OPT-NEXT: store i8 [[TMP10]], ptr addrspace(1) [[PTR8]], align 1
-; CHECK-OPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
-; CHECK-OPT-NEXT: store i8 [[TMP4]], ptr addrspace(1) [[PTR9]], align 1
-; CHECK-OPT-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 10
-; CHECK-OPT-NEXT: store i8 [[TMP13]], ptr addrspace(1) [[PTR10]], align 1
-; CHECK-OPT-NEXT: [[PTR11:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 11
-; CHECK-OPT-NEXT: store i8 [[TMP6]], ptr addrspace(1) [[PTR11]], align 1
-; CHECK-OPT-NEXT: [[PTR12:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 12
-; CHECK-OPT-NEXT: store i8 [[TMP15]], ptr addrspace(1) [[PTR12]], align 1
-; CHECK-OPT-NEXT: [[PTR13:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 13
-; CHECK-OPT-NEXT: store i8 [[TMP8]], ptr addrspace(1) [[PTR13]], align 1
-; CHECK-OPT-NEXT: [[PTR14:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 14
-; CHECK-OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[PTR14]], align 1
-; CHECK-OPT-NEXT: [[PTR15:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 15
-; CHECK-OPT-NEXT: store i8 [[TMP11]], ptr addrspace(1) [[PTR15]], align 1
-; CHECK-OPT-NEXT: ret void
-;
-; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_scalar_selects_v16i8(
-; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[BUFFER_RESOURCE:%.*]], i32 [[OFFSET:%.*]], i1 [[VALID:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
-; CHECK-NOOPT-NEXT: [[LOADED:%.*]] = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> [[BUFFER_RESOURCE]], i32 [[OFFSET]], i32 0, i32 0)
-; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <4 x i32> [[LOADED]] to <16 x i8>
-; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
-; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[BYTES]], i64 1
-; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <16 x i8> [[BYTES]], i64 2
-; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[BYTES]], i64 3
-; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <16 x i8> [[BYTES]], i64 4
-; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <16 x i8> [[BYTES]], i64 5
-; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <16 x i8> [[BYTES]], i64 6
-; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <16 x i8> [[BYTES]], i64 7
-; CHECK-NOOPT-NEXT: [[E8:%.*]] = extractelement <16 x i8> [[BYTES]], i64 8
-; CHECK-NOOPT-NEXT: [[E9:%.*]] = extractelement <16 x i8> [[BYTES]], i64 9
-; CHECK-NOOPT-NEXT: [[E10:%.*]] = extractelement <16 x i8> [[BYTES]], i64 10
-; CHECK-NOOPT-NEXT: [[E11:%.*]] = extractelement <16 x i8> [[BYTES]], i64 11
-; CHECK-NOOPT-NEXT: [[E12:%.*]] = extractelement <16 x i8> [[BYTES]], i64 12
-; CHECK-NOOPT-NEXT: [[E13:%.*]] = extractelement <16 x i8> [[BYTES]], i64 13
-; CHECK-NOOPT-NEXT: [[E14:%.*]] = extractelement <16 x i8> [[BYTES]], i64 14
-; CHECK-NOOPT-NEXT: [[E15:%.*]] = extractelement <16 x i8> [[BYTES]], i64 15
-; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[VALID]], i8 [[E0]], i8 0
-; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[VALID]], i8 [[E1]], i8 0
-; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[VALID]], i8 [[E2]], i8 0
-; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[VALID]], i8 [[E3]], i8 0
-; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[VALID]], i8 [[E4]], i8 0
-; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[VALID]], i8 [[E5]], i8 0
-; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[VALID]], i8 [[E6]], i8 0
-; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[VALID]], i8 [[E7]], i8 0
-; CHECK-NOOPT-NEXT: [[S8:%.*]] = select i1 [[VALID]], i8 [[E8]], i8 0
-; CHECK-NOOPT-NEXT: [[S9:%.*]] = select i1 [[VALID]], i8 [[E9]], i8 0
-; CHECK-NOOPT-NEXT: [[S10:%.*]] = select i1 [[VALID]], i8 [[E10]], i8 0
-; CHECK-NOOPT-NEXT: [[S11:%.*]] = select i1 [[VALID]], i8 [[E11]], i8 0
-; CHECK-NOOPT-NEXT: [[S12:%.*]] = select i1 [[VALID]], i8 [[E12]], i8 0
-; CHECK-NOOPT-NEXT: [[S13:%.*]] = select i1 [[VALID]], i8 [[E13]], i8 0
-; CHECK-NOOPT-NEXT: [[S14:%.*]] = select i1 [[VALID]], i8 [[E14]], i8 0
-; CHECK-NOOPT-NEXT: [[S15:%.*]] = select i1 [[VALID]], i8 [[E15]], i8 0
-; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-NOOPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-NOOPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-NOOPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-NOOPT-NEXT: store i8 [[S4]], ptr addrspace(1) [[PTR4]], align 1
-; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-NOOPT-NEXT: store i8 [[S5]], ptr addrspace(1) [[PTR5]], align 1
-; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-NOOPT-NEXT: store i8 [[S6]], ptr addrspace(1) [[PTR6]], align 1
-; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-NOOPT-NEXT: store i8 [[S7]], ptr addrspace(1) [[PTR7]], align 1
-; CHECK-NOOPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
-; CHECK-NOOPT-NEXT: store i8 [[S8]], ptr addrspace(1) [[PTR8]], align 1
-; CHECK-NOOPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
-; CHECK-NOOPT-NEXT: store i8 [[S9]], ptr addrspace(1) [[PTR9]], align 1
-; CHECK-NOOPT-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 10
-; CHECK-NOOPT-NEXT: store i8 [[S10]], ptr addrspace(1) [[PTR10]], align 1
-; CHECK-NOOPT-NEXT: [[PTR11:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 11
-; CHECK-NOOPT-NEXT: store i8 [[S11]], ptr addrspace(1) [[PTR11]], align 1
-; CHECK-NOOPT-NEXT: [[PTR12:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 12
-; CHECK-NOOPT-NEXT: store i8 [[S12]], ptr addrspace(1) [[PTR12]], align 1
-; CHECK-NOOPT-NEXT: [[PTR13:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 13
-; CHECK-NOOPT-NEXT: store i8 [[S13]], ptr addrspace(1) [[PTR13]], align 1
-; CHECK-NOOPT-NEXT: [[PTR14:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 14
-; CHECK-NOOPT-NEXT: store i8 [[S14]], ptr addrspace(1) [[PTR14]], align 1
-; CHECK-NOOPT-NEXT: [[PTR15:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 15
-; CHECK-NOOPT-NEXT: store i8 [[S15]], ptr addrspace(1) [[PTR15]], align 1
-; CHECK-NOOPT-NEXT: ret void
-;
- ptr addrspace(1) %out,
- <4 x i32> %buffer_resource,
- i32 %offset,
- i1 %valid
-) {
-entry:
- %loaded = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %buffer_resource, i32 %offset, i32 0, i32 0)
- %bytes = bitcast <4 x i32> %loaded to <16 x i8>
-
- %e0 = extractelement <16 x i8> %bytes, i64 0
- %e1 = extractelement <16 x i8> %bytes, i64 1
- %e2 = extractelement <16 x i8> %bytes, i64 2
- %e3 = extractelement <16 x i8> %bytes, i64 3
- %e4 = extractelement <16 x i8> %bytes, i64 4
- %e5 = extractelement <16 x i8> %bytes, i64 5
- %e6 = extractelement <16 x i8> %bytes, i64 6
- %e7 = extractelement <16 x i8> %bytes, i64 7
- %e8 = extractelement <16 x i8> %bytes, i64 8
- %e9 = extractelement <16 x i8> %bytes, i64 9
- %e10 = extractelement <16 x i8> %bytes, i64 10
- %e11 = extractelement <16 x i8> %bytes, i64 11
- %e12 = extractelement <16 x i8> %bytes, i64 12
- %e13 = extractelement <16 x i8> %bytes, i64 13
- %e14 = extractelement <16 x i8> %bytes, i64 14
- %e15 = extractelement <16 x i8> %bytes, i64 15
-
- %s0 = select i1 %valid, i8 %e0, i8 0
- %s1 = select i1 %valid, i8 %e1, i8 0
- %s2 = select i1 %valid, i8 %e2, i8 0
- %s3 = select i1 %valid, i8 %e3, i8 0
- %s4 = select i1 %valid, i8 %e4, i8 0
- %s5 = select i1 %valid, i8 %e5, i8 0
- %s6 = select i1 %valid, i8 %e6, i8 0
- %s7 = select i1 %valid, i8 %e7, i8 0
- %s8 = select i1 %valid, i8 %e8, i8 0
- %s9 = select i1 %valid, i8 %e9, i8 0
- %s10 = select i1 %valid, i8 %e10, i8 0
- %s11 = select i1 %valid, i8 %e11, i8 0
- %s12 = select i1 %valid, i8 %e12, i8 0
- %s13 = select i1 %valid, i8 %e13, i8 0
- %s14 = select i1 %valid, i8 %e14, i8 0
- %s15 = select i1 %valid, i8 %e15, i8 0
-
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
- store i8 %s8, ptr addrspace(1) %ptr8, align 1
- %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
- store i8 %s9, ptr addrspace(1) %ptr9, align 1
- %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
- store i8 %s10, ptr addrspace(1) %ptr10, align 1
- %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
- store i8 %s11, ptr addrspace(1) %ptr11, align 1
- %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
- store i8 %s12, ptr addrspace(1) %ptr12, align 1
- %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
- store i8 %s13, ptr addrspace(1) %ptr13, align 1
- %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
- store i8 %s14, ptr addrspace(1) %ptr14, align 1
- %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
- store i8 %s15, ptr addrspace(1) %ptr15, align 1
-
- ret void
-}
-
-; Test with v8i8 from v2i32 (smaller vector)
-define amdgpu_kernel void @combine_scalar_selects_v8i8(
-;
-; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_scalar_selects_v8i8(
-; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-OPT-NEXT: [[ENTRY:.*:]]
-; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x i32> [[SRC]], <2 x i32> zeroinitializer
-; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x i32> [[COMBINED_SEL]] to <8 x i8>
-; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 3
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 4
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 5
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 6
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-OPT-NEXT: store i8 [[TMP3]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-OPT-NEXT: store i8 [[TMP5]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-OPT-NEXT: store i8 [[TMP7]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[PTR4]], align 1
-; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-OPT-NEXT: store i8 [[TMP4]], ptr addrspace(1) [[PTR5]], align 1
-; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-OPT-NEXT: store i8 [[TMP6]], ptr addrspace(1) [[PTR6]], align 1
-; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-OPT-NEXT: store i8 [[TMP1]], ptr addrspace(1) [[PTR7]], align 1
-; CHECK-OPT-NEXT: ret void
-;
-; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_scalar_selects_v8i8(
-; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
-; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <2 x i32> [[SRC]] to <8 x i8>
-; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <8 x i8> [[BYTES]], i64 0
-; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <8 x i8> [[BYTES]], i64 1
-; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <8 x i8> [[BYTES]], i64 2
-; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <8 x i8> [[BYTES]], i64 3
-; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <8 x i8> [[BYTES]], i64 4
-; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <8 x i8> [[BYTES]], i64 5
-; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <8 x i8> [[BYTES]], i64 6
-; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <8 x i8> [[BYTES]], i64 7
-; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 [[E0]], i8 0
-; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 [[E1]], i8 0
-; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 [[E2]], i8 0
-; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 [[E3]], i8 0
-; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i8 [[E4]], i8 0
-; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i8 [[E5]], i8 0
-; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i8 [[E6]], i8 0
-; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i8 [[E7]], i8 0
-; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-NOOPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-NOOPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-NOOPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-NOOPT-NEXT: store i8 [[S4]], ptr addrspace(1) [[PTR4]], align 1
-; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-NOOPT-NEXT: store i8 [[S5]], ptr addrspace(1) [[PTR5]], align 1
-; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-NOOPT-NEXT: store i8 [[S6]], ptr addrspace(1) [[PTR6]], align 1
-; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-NOOPT-NEXT: store i8 [[S7]], ptr addrspace(1) [[PTR7]], align 1
-; CHECK-NOOPT-NEXT: ret void
-;
- ptr addrspace(1) %out,
- <2 x i32> %src,
- i1 %cond
-) {
-entry:
- %bytes = bitcast <2 x i32> %src to <8 x i8>
- %e0 = extractelement <8 x i8> %bytes, i64 0
- %e1 = extractelement <8 x i8> %bytes, i64 1
- %e2 = extractelement <8 x i8> %bytes, i64 2
- %e3 = extractelement <8 x i8> %bytes, i64 3
- %e4 = extractelement <8 x i8> %bytes, i64 4
- %e5 = extractelement <8 x i8> %bytes, i64 5
- %e6 = extractelement <8 x i8> %bytes, i64 6
- %e7 = extractelement <8 x i8> %bytes, i64 7
- %s0 = select i1 %cond, i8 %e0, i8 0
- %s1 = select i1 %cond, i8 %e1, i8 0
- %s2 = select i1 %cond, i8 %e2, i8 0
- %s3 = select i1 %cond, i8 %e3, i8 0
- %s4 = select i1 %cond, i8 %e4, i8 0
- %s5 = select i1 %cond, i8 %e5, i8 0
- %s6 = select i1 %cond, i8 %e6, i8 0
- %s7 = select i1 %cond, i8 %e7, i8 0
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- ret void
-}
-
-; Test partial coverage: 10 out of 16 elements (should still combine, >= half)
-define amdgpu_kernel void @combine_partial_selects(
-;
-; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_partial_selects(
-; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-OPT-NEXT: [[ENTRY:.*:]]
-; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <4 x i32> [[SRC]], <4 x i32> zeroinitializer
-; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <4 x i32> [[COMBINED_SEL]] to <16 x i8>
-; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 4
-; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
-; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 6
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 8
-; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
-; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-OPT-NEXT: store i8 [[TMP3]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-OPT-NEXT: store i8 [[TMP6]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-OPT-NEXT: store i8 [[TMP9]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[PTR4]], align 1
-; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-OPT-NEXT: store i8 [[TMP5]], ptr addrspace(1) [[PTR5]], align 1
-; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-OPT-NEXT: store i8 [[TMP8]], ptr addrspace(1) [[PTR6]], align 1
-; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-OPT-NEXT: store i8 [[TMP1]], ptr addrspace(1) [[PTR7]], align 1
-; CHECK-OPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
-; CHECK-OPT-NEXT: store i8 [[TMP4]], ptr addrspace(1) [[PTR8]], align 1
-; CHECK-OPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
-; CHECK-OPT-NEXT: store i8 [[TMP7]], ptr addrspace(1) [[PTR9]], align 1
-; CHECK-OPT-NEXT: ret void
-;
-; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_partial_selects(
-; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
-; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <4 x i32> [[SRC]] to <16 x i8>
-; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
-; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[BYTES]], i64 1
-; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <16 x i8> [[BYTES]], i64 2
-; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[BYTES]], i64 3
-; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <16 x i8> [[BYTES]], i64 4
-; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <16 x i8> [[BYTES]], i64 5
-; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <16 x i8> [[BYTES]], i64 6
-; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <16 x i8> [[BYTES]], i64 7
-; CHECK-NOOPT-NEXT: [[E8:%.*]] = extractelement <16 x i8> [[BYTES]], i64 8
-; CHECK-NOOPT-NEXT: [[E9:%.*]] = extractelement <16 x i8> [[BYTES]], i64 9
-; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 [[E0]], i8 0
-; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 [[E1]], i8 0
-; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 [[E2]], i8 0
-; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 [[E3]], i8 0
-; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i8 [[E4]], i8 0
-; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i8 [[E5]], i8 0
-; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i8 [[E6]], i8 0
-; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i8 [[E7]], i8 0
-; CHECK-NOOPT-NEXT: [[S8:%.*]] = select i1 [[COND]], i8 [[E8]], i8 0
-; CHECK-NOOPT-NEXT: [[S9:%.*]] = select i1 [[COND]], i8 [[E9]], i8 0
-; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-NOOPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-NOOPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-NOOPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-NOOPT-NEXT: store i8 [[S4]], ptr addrspace(1) [[PTR4]], align 1
-; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-NOOPT-NEXT: store i8 [[S5]], ptr addrspace(1) [[PTR5]], align 1
-; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-NOOPT-NEXT: store i8 [[S6]], ptr addrspace(1) [[PTR6]], align 1
-; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-NOOPT-NEXT: store i8 [[S7]], ptr addrspace(1) [[PTR7]], align 1
-; CHECK-NOOPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
-; CHECK-NOOPT-NEXT: store i8 [[S8]], ptr addrspace(1) [[PTR8]], align 1
-; CHECK-NOOPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
-; CHECK-NOOPT-NEXT: store i8 [[S9]], ptr addrspace(1) [[PTR9]], align 1
-; CHECK-NOOPT-NEXT: ret void
-;
- ptr addrspace(1) %out,
- <4 x i32> %src,
- i1 %cond
-) {
-entry:
- %bytes = bitcast <4 x i32> %src to <16 x i8>
- ; Only extract and select 10 elements (indices 0-9)
- %e0 = extractelement <16 x i8> %bytes, i64 0
- %e1 = extractelement <16 x i8> %bytes, i64 1
- %e2 = extractelement <16 x i8> %bytes, i64 2
- %e3 = extractelement <16 x i8> %bytes, i64 3
- %e4 = extractelement <16 x i8> %bytes, i64 4
- %e5 = extractelement <16 x i8> %bytes, i64 5
- %e6 = extractelement <16 x i8> %bytes, i64 6
- %e7 = extractelement <16 x i8> %bytes, i64 7
- %e8 = extractelement <16 x i8> %bytes, i64 8
- %e9 = extractelement <16 x i8> %bytes, i64 9
- %s0 = select i1 %cond, i8 %e0, i8 0
- %s1 = select i1 %cond, i8 %e1, i8 0
- %s2 = select i1 %cond, i8 %e2, i8 0
- %s3 = select i1 %cond, i8 %e3, i8 0
- %s4 = select i1 %cond, i8 %e4, i8 0
- %s5 = select i1 %cond, i8 %e5, i8 0
- %s6 = select i1 %cond, i8 %e6, i8 0
- %s7 = select i1 %cond, i8 %e7, i8 0
- %s8 = select i1 %cond, i8 %e8, i8 0
- %s9 = select i1 %cond, i8 %e9, i8 0
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
- store i8 %s8, ptr addrspace(1) %ptr8, align 1
- %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
- store i8 %s9, ptr addrspace(1) %ptr9, align 1
- ret void
-}
-
-; Negative test: should not combine if false value is not zero
-define amdgpu_kernel void @no_combine_non_zero_false(
-;
-; CHECK-LABEL: define amdgpu_kernel void @no_combine_non_zero_false(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[BUFFER_RESOURCE:%.*]], i32 [[OFFSET:%.*]], i1 [[VALID:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[LOADED:%.*]] = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> [[BUFFER_RESOURCE]], i32 [[OFFSET]], i32 0, i32 0)
-; CHECK-NEXT: [[BYTES:%.*]] = bitcast <4 x i32> [[LOADED]] to <16 x i8>
-; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
-; CHECK-NEXT: [[S0:%.*]] = select i1 [[VALID]], i8 [[E0]], i8 1
-; CHECK-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-NEXT: ret void
-;
- ptr addrspace(1) %out,
- <4 x i32> %buffer_resource,
- i32 %offset,
- i1 %valid
-) {
-entry:
- %loaded = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %buffer_resource, i32 %offset, i32 0, i32 0)
- %bytes = bitcast <4 x i32> %loaded to <16 x i8>
- %e0 = extractelement <16 x i8> %bytes, i64 0
- %s0 = select i1 %valid, i8 %e0, i8 1 ; false value is 1, not 0
- store i8 %s0, ptr addrspace(1) %out, align 1
- ret void
-}
-
-; Negative test: too few selects (only 4 out of 16, less than half)
-define amdgpu_kernel void @no_combine_too_few_selects(
-;
-; CHECK-LABEL: define amdgpu_kernel void @no_combine_too_few_selects(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[BYTES:%.*]] = bitcast <4 x i32> [[SRC]] to <16 x i8>
-; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
-; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[BYTES]], i64 1
-; CHECK-NEXT: [[E2:%.*]] = extractelement <16 x i8> [[BYTES]], i64 2
-; CHECK-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[BYTES]], i64 3
-; CHECK-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 [[E0]], i8 0
-; CHECK-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 [[E1]], i8 0
-; CHECK-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 [[E2]], i8 0
-; CHECK-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 [[E3]], i8 0
-; CHECK-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-NEXT: ret void
-;
- ptr addrspace(1) %out,
- <4 x i32> %src,
- i1 %cond
-) {
-entry:
- %bytes = bitcast <4 x i32> %src to <16 x i8>
- ; Only 4 selects - less than half of 16
- %e0 = extractelement <16 x i8> %bytes, i64 0
- %e1 = extractelement <16 x i8> %bytes, i64 1
- %e2 = extractelement <16 x i8> %bytes, i64 2
- %e3 = extractelement <16 x i8> %bytes, i64 3
- %s0 = select i1 %cond, i8 %e0, i8 0
- %s1 = select i1 %cond, i8 %e1, i8 0
- %s2 = select i1 %cond, i8 %e2, i8 0
- %s3 = select i1 %cond, i8 %e3, i8 0
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- ret void
-}
-
-; Test: extracts have additional unrelated uses (extracts can't be removed)
-; The transformation should still be profitable as we reduce v_cndmask count
-define amdgpu_kernel void @combine_with_extract_other_uses(
-; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_with_extract_other_uses(
-; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], <2 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-OPT-NEXT: [[ENTRY:.*:]]
-; CHECK-OPT-NEXT: [[BYTES:%.*]] = bitcast <2 x i32> [[SRC]] to <8 x i8>
-; CHECK-OPT-NEXT: [[TMP8:%.*]] = select i1 [[COND]], <2 x i32> [[SRC]], <2 x i32> zeroinitializer
-; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-; CHECK-OPT-NEXT: [[E0:%.*]] = extractelement <8 x i8> [[BYTES]], i64 0
-; CHECK-OPT-NEXT: [[E1:%.*]] = extractelement <8 x i8> [[BYTES]], i64 1
-; CHECK-OPT-NEXT: [[E2:%.*]] = extractelement <8 x i8> [[BYTES]], i64 2
-; CHECK-OPT-NEXT: [[E3:%.*]] = extractelement <8 x i8> [[BYTES]], i64 3
-; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 3
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 4
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 5
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 6
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-OPT-NEXT: store i8 [[TMP3]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-OPT-NEXT: store i8 [[TMP5]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-OPT-NEXT: store i8 [[TMP7]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[PTR4]], align 1
-; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-OPT-NEXT: store i8 [[TMP4]], ptr addrspace(1) [[PTR5]], align 1
-; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-OPT-NEXT: store i8 [[TMP6]], ptr addrspace(1) [[PTR6]], align 1
-; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-OPT-NEXT: store i8 [[TMP1]], ptr addrspace(1) [[PTR7]], align 1
-; CHECK-OPT-NEXT: [[SUM:%.*]] = add i8 [[E0]], [[E1]]
-; CHECK-OPT-NEXT: [[SUM2:%.*]] = add i8 [[SUM]], [[E2]]
-; CHECK-OPT-NEXT: [[SUM3:%.*]] = add i8 [[SUM2]], [[E3]]
-; CHECK-OPT-NEXT: store i8 [[SUM3]], ptr addrspace(1) [[OUT2]], align 1
-; CHECK-OPT-NEXT: ret void
-;
-; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_with_extract_other_uses(
-; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], <2 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
-; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <2 x i32> [[SRC]] to <8 x i8>
-; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <8 x i8> [[BYTES]], i64 0
-; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <8 x i8> [[BYTES]], i64 1
-; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <8 x i8> [[BYTES]], i64 2
-; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <8 x i8> [[BYTES]], i64 3
-; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <8 x i8> [[BYTES]], i64 4
-; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <8 x i8> [[BYTES]], i64 5
-; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <8 x i8> [[BYTES]], i64 6
-; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <8 x i8> [[BYTES]], i64 7
-; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 [[E0]], i8 0
-; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 [[E1]], i8 0
-; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 [[E2]], i8 0
-; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 [[E3]], i8 0
-; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i8 [[E4]], i8 0
-; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i8 [[E5]], i8 0
-; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i8 [[E6]], i8 0
-; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i8 [[E7]], i8 0
-; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-NOOPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-NOOPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-NOOPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-NOOPT-NEXT: store i8 [[S4]], ptr addrspace(1) [[PTR4]], align 1
-; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-NOOPT-NEXT: store i8 [[S5]], ptr addrspace(1) [[PTR5]], align 1
-; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-NOOPT-NEXT: store i8 [[S6]], ptr addrspace(1) [[PTR6]], align 1
-; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-NOOPT-NEXT: store i8 [[S7]], ptr addrspace(1) [[PTR7]], align 1
-; CHECK-NOOPT-NEXT: [[SUM:%.*]] = add i8 [[E0]], [[E1]]
-; CHECK-NOOPT-NEXT: [[SUM2:%.*]] = add i8 [[SUM]], [[E2]]
-; CHECK-NOOPT-NEXT: [[SUM3:%.*]] = add i8 [[SUM2]], [[E3]]
-; CHECK-NOOPT-NEXT: store i8 [[SUM3]], ptr addrspace(1) [[OUT2]], align 1
-; CHECK-NOOPT-NEXT: ret void
-;
- ptr addrspace(1) %out,
- ptr addrspace(1) %out2,
- <2 x i32> %src,
- i1 %cond
-) {
-entry:
- %bytes = bitcast <2 x i32> %src to <8 x i8>
- %e0 = extractelement <8 x i8> %bytes, i64 0
- %e1 = extractelement <8 x i8> %bytes, i64 1
- %e2 = extractelement <8 x i8> %bytes, i64 2
- %e3 = extractelement <8 x i8> %bytes, i64 3
- %e4 = extractelement <8 x i8> %bytes, i64 4
- %e5 = extractelement <8 x i8> %bytes, i64 5
- %e6 = extractelement <8 x i8> %bytes, i64 6
- %e7 = extractelement <8 x i8> %bytes, i64 7
- ; Selects that will be combined
- %s0 = select i1 %cond, i8 %e0, i8 0
- %s1 = select i1 %cond, i8 %e1, i8 0
- %s2 = select i1 %cond, i8 %e2, i8 0
- %s3 = select i1 %cond, i8 %e3, i8 0
- %s4 = select i1 %cond, i8 %e4, i8 0
- %s5 = select i1 %cond, i8 %e5, i8 0
- %s6 = select i1 %cond, i8 %e6, i8 0
- %s7 = select i1 %cond, i8 %e7, i8 0
- ; Store select results
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- ; Additional unrelated uses of the extracts - these prevent extract removal
- %sum = add i8 %e0, %e1
- %sum2 = add i8 %sum, %e2
- %sum3 = add i8 %sum2, %e3
- store i8 %sum3, ptr addrspace(1) %out2, align 1
- ret void
-}
-
-; Negative test: select with extract as false value (wrong operand position)
-define amdgpu_kernel void @no_combine_wrong_operand_order(
-;
-; CHECK-LABEL: define amdgpu_kernel void @no_combine_wrong_operand_order(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[BYTES:%.*]] = bitcast <2 x i32> [[SRC]] to <8 x i8>
-; CHECK-NEXT: [[E0:%.*]] = extractelement <8 x i8> [[BYTES]], i64 0
-; CHECK-NEXT: [[E1:%.*]] = extractelement <8 x i8> [[BYTES]], i64 1
-; CHECK-NEXT: [[E2:%.*]] = extractelement <8 x i8> [[BYTES]], i64 2
-; CHECK-NEXT: [[E3:%.*]] = extractelement <8 x i8> [[BYTES]], i64 3
-; CHECK-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 0, i8 [[E0]]
-; CHECK-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 0, i8 [[E1]]
-; CHECK-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 0, i8 [[E2]]
-; CHECK-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 0, i8 [[E3]]
-; CHECK-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-NEXT: ret void
-;
- ptr addrspace(1) %out,
- <2 x i32> %src,
- i1 %cond
-) {
-entry:
- %bytes = bitcast <2 x i32> %src to <8 x i8>
- %e0 = extractelement <8 x i8> %bytes, i64 0
- %e1 = extractelement <8 x i8> %bytes, i64 1
- %e2 = extractelement <8 x i8> %bytes, i64 2
- %e3 = extractelement <8 x i8> %bytes, i64 3
- ; Extract is false value, 0 is true value - should not combine
- %s0 = select i1 %cond, i8 0, i8 %e0
- %s1 = select i1 %cond, i8 0, i8 %e1
- %s2 = select i1 %cond, i8 0, i8 %e2
- %s3 = select i1 %cond, i8 0, i8 %e3
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- ret void
-}
-
-; Test <4 x i32> to <8 x i16> (32-bit elements to 16-bit elements)
-define amdgpu_kernel void @combine_v4i32_to_v8i16(
-; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_v4i32_to_v8i16(
-; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-OPT-NEXT: [[ENTRY:.*:]]
-; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <4 x i32> [[SRC]], <4 x i32> zeroinitializer
-; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <4 x i32> [[COMBINED_SEL]] to <8 x i16>
-; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 3
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 4
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 5
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 6
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: store i16 [[TMP0]], ptr addrspace(1) [[OUT]], align 2
-; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-OPT-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[PTR1]], align 2
-; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-OPT-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[PTR2]], align 2
-; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-OPT-NEXT: store i16 [[TMP7]], ptr addrspace(1) [[PTR3]], align 2
-; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[PTR4]], align 2
-; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-OPT-NEXT: store i16 [[TMP4]], ptr addrspace(1) [[PTR5]], align 2
-; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-OPT-NEXT: store i16 [[TMP6]], ptr addrspace(1) [[PTR6]], align 2
-; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-OPT-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[PTR7]], align 2
-; CHECK-OPT-NEXT: ret void
-;
-; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_v4i32_to_v8i16(
-; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x i32> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
-; CHECK-NOOPT-NEXT: [[HALVES:%.*]] = bitcast <4 x i32> [[SRC]] to <8 x i16>
-; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <8 x i16> [[HALVES]], i64 0
-; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <8 x i16> [[HALVES]], i64 1
-; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <8 x i16> [[HALVES]], i64 2
-; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <8 x i16> [[HALVES]], i64 3
-; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <8 x i16> [[HALVES]], i64 4
-; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <8 x i16> [[HALVES]], i64 5
-; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <8 x i16> [[HALVES]], i64 6
-; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <8 x i16> [[HALVES]], i64 7
-; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i16 [[E0]], i16 0
-; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i16 [[E1]], i16 0
-; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i16 [[E2]], i16 0
-; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i16 [[E3]], i16 0
-; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i16 [[E4]], i16 0
-; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i16 [[E5]], i16 0
-; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i16 [[E6]], i16 0
-; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i16 [[E7]], i16 0
-; CHECK-NOOPT-NEXT: store i16 [[S0]], ptr addrspace(1) [[OUT]], align 2
-; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-NOOPT-NEXT: store i16 [[S1]], ptr addrspace(1) [[PTR1]], align 2
-; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-NOOPT-NEXT: store i16 [[S2]], ptr addrspace(1) [[PTR2]], align 2
-; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-NOOPT-NEXT: store i16 [[S3]], ptr addrspace(1) [[PTR3]], align 2
-; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-NOOPT-NEXT: store i16 [[S4]], ptr addrspace(1) [[PTR4]], align 2
-; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-NOOPT-NEXT: store i16 [[S5]], ptr addrspace(1) [[PTR5]], align 2
-; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-NOOPT-NEXT: store i16 [[S6]], ptr addrspace(1) [[PTR6]], align 2
-; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-NOOPT-NEXT: store i16 [[S7]], ptr addrspace(1) [[PTR7]], align 2
-; CHECK-NOOPT-NEXT: ret void
-;
- ptr addrspace(1) %out,
- <4 x i32> %src,
- i1 %cond
-) {
-entry:
- %halves = bitcast <4 x i32> %src to <8 x i16>
- %e0 = extractelement <8 x i16> %halves, i64 0
- %e1 = extractelement <8 x i16> %halves, i64 1
- %e2 = extractelement <8 x i16> %halves, i64 2
- %e3 = extractelement <8 x i16> %halves, i64 3
- %e4 = extractelement <8 x i16> %halves, i64 4
- %e5 = extractelement <8 x i16> %halves, i64 5
- %e6 = extractelement <8 x i16> %halves, i64 6
- %e7 = extractelement <8 x i16> %halves, i64 7
- %s0 = select i1 %cond, i16 %e0, i16 0
- %s1 = select i1 %cond, i16 %e1, i16 0
- %s2 = select i1 %cond, i16 %e2, i16 0
- %s3 = select i1 %cond, i16 %e3, i16 0
- %s4 = select i1 %cond, i16 %e4, i16 0
- %s5 = select i1 %cond, i16 %e5, i16 0
- %s6 = select i1 %cond, i16 %e6, i16 0
- %s7 = select i1 %cond, i16 %e7, i16 0
- store i16 %s0, ptr addrspace(1) %out, align 2
- %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
- store i16 %s1, ptr addrspace(1) %ptr1, align 2
- %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
- store i16 %s2, ptr addrspace(1) %ptr2, align 2
- %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
- store i16 %s3, ptr addrspace(1) %ptr3, align 2
- %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
- store i16 %s4, ptr addrspace(1) %ptr4, align 2
- %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
- store i16 %s5, ptr addrspace(1) %ptr5, align 2
- %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
- store i16 %s6, ptr addrspace(1) %ptr6, align 2
- %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
- store i16 %s7, ptr addrspace(1) %ptr7, align 2
- ret void
-}
-
-; Test <4 x float> to <16 x i8> (float elements to byte elements)
-define amdgpu_kernel void @combine_v4f32_to_v16i8(
-; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_v4f32_to_v16i8(
-; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x float> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-OPT-NEXT: [[ENTRY:.*:]]
-; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <4 x float> [[SRC]], <4 x float> zeroinitializer
-; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <4 x float> [[COMBINED_SEL]] to <16 x i8>
-; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
-; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 4
-; CHECK-OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
-; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 6
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 8
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
-; CHECK-OPT-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 10
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 11
-; CHECK-OPT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 12
-; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 13
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 14
-; CHECK-OPT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 15
-; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-OPT-NEXT: store i8 [[TMP9]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-OPT-NEXT: store i8 [[TMP3]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-OPT-NEXT: store i8 [[TMP5]], ptr addrspace(1) [[PTR4]], align 1
-; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-OPT-NEXT: store i8 [[TMP14]], ptr addrspace(1) [[PTR5]], align 1
-; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-OPT-NEXT: store i8 [[TMP7]], ptr addrspace(1) [[PTR6]], align 1
-; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-OPT-NEXT: store i8 [[TMP1]], ptr addrspace(1) [[PTR7]], align 1
-; CHECK-OPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
-; CHECK-OPT-NEXT: store i8 [[TMP10]], ptr addrspace(1) [[PTR8]], align 1
-; CHECK-OPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
-; CHECK-OPT-NEXT: store i8 [[TMP4]], ptr addrspace(1) [[PTR9]], align 1
-; CHECK-OPT-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 10
-; CHECK-OPT-NEXT: store i8 [[TMP13]], ptr addrspace(1) [[PTR10]], align 1
-; CHECK-OPT-NEXT: [[PTR11:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 11
-; CHECK-OPT-NEXT: store i8 [[TMP6]], ptr addrspace(1) [[PTR11]], align 1
-; CHECK-OPT-NEXT: [[PTR12:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 12
-; CHECK-OPT-NEXT: store i8 [[TMP15]], ptr addrspace(1) [[PTR12]], align 1
-; CHECK-OPT-NEXT: [[PTR13:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 13
-; CHECK-OPT-NEXT: store i8 [[TMP8]], ptr addrspace(1) [[PTR13]], align 1
-; CHECK-OPT-NEXT: [[PTR14:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 14
-; CHECK-OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[PTR14]], align 1
-; CHECK-OPT-NEXT: [[PTR15:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 15
-; CHECK-OPT-NEXT: store i8 [[TMP11]], ptr addrspace(1) [[PTR15]], align 1
-; CHECK-OPT-NEXT: ret void
-;
-; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_v4f32_to_v16i8(
-; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x float> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
-; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <4 x float> [[SRC]] to <16 x i8>
-; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
-; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[BYTES]], i64 1
-; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <16 x i8> [[BYTES]], i64 2
-; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[BYTES]], i64 3
-; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <16 x i8> [[BYTES]], i64 4
-; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <16 x i8> [[BYTES]], i64 5
-; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <16 x i8> [[BYTES]], i64 6
-; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <16 x i8> [[BYTES]], i64 7
-; CHECK-NOOPT-NEXT: [[E8:%.*]] = extractelement <16 x i8> [[BYTES]], i64 8
-; CHECK-NOOPT-NEXT: [[E9:%.*]] = extractelement <16 x i8> [[BYTES]], i64 9
-; CHECK-NOOPT-NEXT: [[E10:%.*]] = extractelement <16 x i8> [[BYTES]], i64 10
-; CHECK-NOOPT-NEXT: [[E11:%.*]] = extractelement <16 x i8> [[BYTES]], i64 11
-; CHECK-NOOPT-NEXT: [[E12:%.*]] = extractelement <16 x i8> [[BYTES]], i64 12
-; CHECK-NOOPT-NEXT: [[E13:%.*]] = extractelement <16 x i8> [[BYTES]], i64 13
-; CHECK-NOOPT-NEXT: [[E14:%.*]] = extractelement <16 x i8> [[BYTES]], i64 14
-; CHECK-NOOPT-NEXT: [[E15:%.*]] = extractelement <16 x i8> [[BYTES]], i64 15
-; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 [[E0]], i8 0
-; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 [[E1]], i8 0
-; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 [[E2]], i8 0
-; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 [[E3]], i8 0
-; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i8 [[E4]], i8 0
-; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i8 [[E5]], i8 0
-; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i8 [[E6]], i8 0
-; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i8 [[E7]], i8 0
-; CHECK-NOOPT-NEXT: [[S8:%.*]] = select i1 [[COND]], i8 [[E8]], i8 0
-; CHECK-NOOPT-NEXT: [[S9:%.*]] = select i1 [[COND]], i8 [[E9]], i8 0
-; CHECK-NOOPT-NEXT: [[S10:%.*]] = select i1 [[COND]], i8 [[E10]], i8 0
-; CHECK-NOOPT-NEXT: [[S11:%.*]] = select i1 [[COND]], i8 [[E11]], i8 0
-; CHECK-NOOPT-NEXT: [[S12:%.*]] = select i1 [[COND]], i8 [[E12]], i8 0
-; CHECK-NOOPT-NEXT: [[S13:%.*]] = select i1 [[COND]], i8 [[E13]], i8 0
-; CHECK-NOOPT-NEXT: [[S14:%.*]] = select i1 [[COND]], i8 [[E14]], i8 0
-; CHECK-NOOPT-NEXT: [[S15:%.*]] = select i1 [[COND]], i8 [[E15]], i8 0
-; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-NOOPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-NOOPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-NOOPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-NOOPT-NEXT: store i8 [[S4]], ptr addrspace(1) [[PTR4]], align 1
-; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-NOOPT-NEXT: store i8 [[S5]], ptr addrspace(1) [[PTR5]], align 1
-; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-NOOPT-NEXT: store i8 [[S6]], ptr addrspace(1) [[PTR6]], align 1
-; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-NOOPT-NEXT: store i8 [[S7]], ptr addrspace(1) [[PTR7]], align 1
-; CHECK-NOOPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
-; CHECK-NOOPT-NEXT: store i8 [[S8]], ptr addrspace(1) [[PTR8]], align 1
-; CHECK-NOOPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
-; CHECK-NOOPT-NEXT: store i8 [[S9]], ptr addrspace(1) [[PTR9]], align 1
-; CHECK-NOOPT-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 10
-; CHECK-NOOPT-NEXT: store i8 [[S10]], ptr addrspace(1) [[PTR10]], align 1
-; CHECK-NOOPT-NEXT: [[PTR11:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 11
-; CHECK-NOOPT-NEXT: store i8 [[S11]], ptr addrspace(1) [[PTR11]], align 1
-; CHECK-NOOPT-NEXT: [[PTR12:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 12
-; CHECK-NOOPT-NEXT: store i8 [[S12]], ptr addrspace(1) [[PTR12]], align 1
-; CHECK-NOOPT-NEXT: [[PTR13:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 13
-; CHECK-NOOPT-NEXT: store i8 [[S13]], ptr addrspace(1) [[PTR13]], align 1
-; CHECK-NOOPT-NEXT: [[PTR14:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 14
-; CHECK-NOOPT-NEXT: store i8 [[S14]], ptr addrspace(1) [[PTR14]], align 1
-; CHECK-NOOPT-NEXT: [[PTR15:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 15
-; CHECK-NOOPT-NEXT: store i8 [[S15]], ptr addrspace(1) [[PTR15]], align 1
-; CHECK-NOOPT-NEXT: ret void
-;
- ptr addrspace(1) %out,
- <4 x float> %src,
- i1 %cond
-) {
-entry:
- %bytes = bitcast <4 x float> %src to <16 x i8>
- %e0 = extractelement <16 x i8> %bytes, i64 0
- %e1 = extractelement <16 x i8> %bytes, i64 1
- %e2 = extractelement <16 x i8> %bytes, i64 2
- %e3 = extractelement <16 x i8> %bytes, i64 3
- %e4 = extractelement <16 x i8> %bytes, i64 4
- %e5 = extractelement <16 x i8> %bytes, i64 5
- %e6 = extractelement <16 x i8> %bytes, i64 6
- %e7 = extractelement <16 x i8> %bytes, i64 7
- %e8 = extractelement <16 x i8> %bytes, i64 8
- %e9 = extractelement <16 x i8> %bytes, i64 9
- %e10 = extractelement <16 x i8> %bytes, i64 10
- %e11 = extractelement <16 x i8> %bytes, i64 11
- %e12 = extractelement <16 x i8> %bytes, i64 12
- %e13 = extractelement <16 x i8> %bytes, i64 13
- %e14 = extractelement <16 x i8> %bytes, i64 14
- %e15 = extractelement <16 x i8> %bytes, i64 15
- %s0 = select i1 %cond, i8 %e0, i8 0
- %s1 = select i1 %cond, i8 %e1, i8 0
- %s2 = select i1 %cond, i8 %e2, i8 0
- %s3 = select i1 %cond, i8 %e3, i8 0
- %s4 = select i1 %cond, i8 %e4, i8 0
- %s5 = select i1 %cond, i8 %e5, i8 0
- %s6 = select i1 %cond, i8 %e6, i8 0
- %s7 = select i1 %cond, i8 %e7, i8 0
- %s8 = select i1 %cond, i8 %e8, i8 0
- %s9 = select i1 %cond, i8 %e9, i8 0
- %s10 = select i1 %cond, i8 %e10, i8 0
- %s11 = select i1 %cond, i8 %e11, i8 0
- %s12 = select i1 %cond, i8 %e12, i8 0
- %s13 = select i1 %cond, i8 %e13, i8 0
- %s14 = select i1 %cond, i8 %e14, i8 0
- %s15 = select i1 %cond, i8 %e15, i8 0
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
- store i8 %s8, ptr addrspace(1) %ptr8, align 1
- %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
- store i8 %s9, ptr addrspace(1) %ptr9, align 1
- %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
- store i8 %s10, ptr addrspace(1) %ptr10, align 1
- %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
- store i8 %s11, ptr addrspace(1) %ptr11, align 1
- %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
- store i8 %s12, ptr addrspace(1) %ptr12, align 1
- %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
- store i8 %s13, ptr addrspace(1) %ptr13, align 1
- %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
- store i8 %s14, ptr addrspace(1) %ptr14, align 1
- %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
- store i8 %s15, ptr addrspace(1) %ptr15, align 1
- ret void
-}
-
-; Test <4 x float> to <8 x i16> (float elements to 16-bit elements)
-define amdgpu_kernel void @combine_v4f32_to_v8i16(
-; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_v4f32_to_v8i16(
-; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x float> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-OPT-NEXT: [[ENTRY:.*:]]
-; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <4 x float> [[SRC]], <4 x float> zeroinitializer
-; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <4 x float> [[COMBINED_SEL]] to <8 x i16>
-; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 3
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 4
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 5
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 6
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: store i16 [[TMP0]], ptr addrspace(1) [[OUT]], align 2
-; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-OPT-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[PTR1]], align 2
-; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-OPT-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[PTR2]], align 2
-; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-OPT-NEXT: store i16 [[TMP7]], ptr addrspace(1) [[PTR3]], align 2
-; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[PTR4]], align 2
-; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-OPT-NEXT: store i16 [[TMP4]], ptr addrspace(1) [[PTR5]], align 2
-; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-OPT-NEXT: store i16 [[TMP6]], ptr addrspace(1) [[PTR6]], align 2
-; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-OPT-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[PTR7]], align 2
-; CHECK-OPT-NEXT: ret void
-;
-; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_v4f32_to_v8i16(
-; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <4 x float> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
-; CHECK-NOOPT-NEXT: [[HALVES:%.*]] = bitcast <4 x float> [[SRC]] to <8 x i16>
-; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <8 x i16> [[HALVES]], i64 0
-; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <8 x i16> [[HALVES]], i64 1
-; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <8 x i16> [[HALVES]], i64 2
-; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <8 x i16> [[HALVES]], i64 3
-; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <8 x i16> [[HALVES]], i64 4
-; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <8 x i16> [[HALVES]], i64 5
-; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <8 x i16> [[HALVES]], i64 6
-; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <8 x i16> [[HALVES]], i64 7
-; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i16 [[E0]], i16 0
-; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i16 [[E1]], i16 0
-; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i16 [[E2]], i16 0
-; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i16 [[E3]], i16 0
-; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i16 [[E4]], i16 0
-; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i16 [[E5]], i16 0
-; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i16 [[E6]], i16 0
-; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i16 [[E7]], i16 0
-; CHECK-NOOPT-NEXT: store i16 [[S0]], ptr addrspace(1) [[OUT]], align 2
-; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-NOOPT-NEXT: store i16 [[S1]], ptr addrspace(1) [[PTR1]], align 2
-; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-NOOPT-NEXT: store i16 [[S2]], ptr addrspace(1) [[PTR2]], align 2
-; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-NOOPT-NEXT: store i16 [[S3]], ptr addrspace(1) [[PTR3]], align 2
-; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-NOOPT-NEXT: store i16 [[S4]], ptr addrspace(1) [[PTR4]], align 2
-; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-NOOPT-NEXT: store i16 [[S5]], ptr addrspace(1) [[PTR5]], align 2
-; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-NOOPT-NEXT: store i16 [[S6]], ptr addrspace(1) [[PTR6]], align 2
-; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-NOOPT-NEXT: store i16 [[S7]], ptr addrspace(1) [[PTR7]], align 2
-; CHECK-NOOPT-NEXT: ret void
-;
- ptr addrspace(1) %out,
- <4 x float> %src,
- i1 %cond
-) {
-entry:
- %halves = bitcast <4 x float> %src to <8 x i16>
- %e0 = extractelement <8 x i16> %halves, i64 0
- %e1 = extractelement <8 x i16> %halves, i64 1
- %e2 = extractelement <8 x i16> %halves, i64 2
- %e3 = extractelement <8 x i16> %halves, i64 3
- %e4 = extractelement <8 x i16> %halves, i64 4
- %e5 = extractelement <8 x i16> %halves, i64 5
- %e6 = extractelement <8 x i16> %halves, i64 6
- %e7 = extractelement <8 x i16> %halves, i64 7
- %s0 = select i1 %cond, i16 %e0, i16 0
- %s1 = select i1 %cond, i16 %e1, i16 0
- %s2 = select i1 %cond, i16 %e2, i16 0
- %s3 = select i1 %cond, i16 %e3, i16 0
- %s4 = select i1 %cond, i16 %e4, i16 0
- %s5 = select i1 %cond, i16 %e5, i16 0
- %s6 = select i1 %cond, i16 %e6, i16 0
- %s7 = select i1 %cond, i16 %e7, i16 0
- store i16 %s0, ptr addrspace(1) %out, align 2
- %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
- store i16 %s1, ptr addrspace(1) %ptr1, align 2
- %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
- store i16 %s2, ptr addrspace(1) %ptr2, align 2
- %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
- store i16 %s3, ptr addrspace(1) %ptr3, align 2
- %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
- store i16 %s4, ptr addrspace(1) %ptr4, align 2
- %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
- store i16 %s5, ptr addrspace(1) %ptr5, align 2
- %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
- store i16 %s6, ptr addrspace(1) %ptr6, align 2
- %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
- store i16 %s7, ptr addrspace(1) %ptr7, align 2
- ret void
-}
-
-; Test <2 x i64> to <16 x i8> (64-bit elements to byte elements)
-define amdgpu_kernel void @combine_v2i64_to_v16i8(
-; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_v2i64_to_v16i8(
-; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i64> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-OPT-NEXT: [[ENTRY:.*:]]
-; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x i64> [[SRC]], <2 x i64> zeroinitializer
-; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x i64> [[COMBINED_SEL]] to <16 x i8>
-; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
-; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 4
-; CHECK-OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
-; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 6
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 8
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
-; CHECK-OPT-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 10
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 11
-; CHECK-OPT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 12
-; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 13
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 14
-; CHECK-OPT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 15
-; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-OPT-NEXT: store i8 [[TMP9]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-OPT-NEXT: store i8 [[TMP3]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-OPT-NEXT: store i8 [[TMP5]], ptr addrspace(1) [[PTR4]], align 1
-; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-OPT-NEXT: store i8 [[TMP14]], ptr addrspace(1) [[PTR5]], align 1
-; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-OPT-NEXT: store i8 [[TMP7]], ptr addrspace(1) [[PTR6]], align 1
-; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-OPT-NEXT: store i8 [[TMP1]], ptr addrspace(1) [[PTR7]], align 1
-; CHECK-OPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
-; CHECK-OPT-NEXT: store i8 [[TMP10]], ptr addrspace(1) [[PTR8]], align 1
-; CHECK-OPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
-; CHECK-OPT-NEXT: store i8 [[TMP4]], ptr addrspace(1) [[PTR9]], align 1
-; CHECK-OPT-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 10
-; CHECK-OPT-NEXT: store i8 [[TMP13]], ptr addrspace(1) [[PTR10]], align 1
-; CHECK-OPT-NEXT: [[PTR11:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 11
-; CHECK-OPT-NEXT: store i8 [[TMP6]], ptr addrspace(1) [[PTR11]], align 1
-; CHECK-OPT-NEXT: [[PTR12:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 12
-; CHECK-OPT-NEXT: store i8 [[TMP15]], ptr addrspace(1) [[PTR12]], align 1
-; CHECK-OPT-NEXT: [[PTR13:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 13
-; CHECK-OPT-NEXT: store i8 [[TMP8]], ptr addrspace(1) [[PTR13]], align 1
-; CHECK-OPT-NEXT: [[PTR14:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 14
-; CHECK-OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[PTR14]], align 1
-; CHECK-OPT-NEXT: [[PTR15:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 15
-; CHECK-OPT-NEXT: store i8 [[TMP11]], ptr addrspace(1) [[PTR15]], align 1
-; CHECK-OPT-NEXT: ret void
-;
-; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_v2i64_to_v16i8(
-; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i64> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
-; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <2 x i64> [[SRC]] to <16 x i8>
-; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
-; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[BYTES]], i64 1
-; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <16 x i8> [[BYTES]], i64 2
-; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[BYTES]], i64 3
-; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <16 x i8> [[BYTES]], i64 4
-; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <16 x i8> [[BYTES]], i64 5
-; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <16 x i8> [[BYTES]], i64 6
-; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <16 x i8> [[BYTES]], i64 7
-; CHECK-NOOPT-NEXT: [[E8:%.*]] = extractelement <16 x i8> [[BYTES]], i64 8
-; CHECK-NOOPT-NEXT: [[E9:%.*]] = extractelement <16 x i8> [[BYTES]], i64 9
-; CHECK-NOOPT-NEXT: [[E10:%.*]] = extractelement <16 x i8> [[BYTES]], i64 10
-; CHECK-NOOPT-NEXT: [[E11:%.*]] = extractelement <16 x i8> [[BYTES]], i64 11
-; CHECK-NOOPT-NEXT: [[E12:%.*]] = extractelement <16 x i8> [[BYTES]], i64 12
-; CHECK-NOOPT-NEXT: [[E13:%.*]] = extractelement <16 x i8> [[BYTES]], i64 13
-; CHECK-NOOPT-NEXT: [[E14:%.*]] = extractelement <16 x i8> [[BYTES]], i64 14
-; CHECK-NOOPT-NEXT: [[E15:%.*]] = extractelement <16 x i8> [[BYTES]], i64 15
-; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 [[E0]], i8 0
-; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 [[E1]], i8 0
-; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 [[E2]], i8 0
-; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 [[E3]], i8 0
-; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i8 [[E4]], i8 0
-; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i8 [[E5]], i8 0
-; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i8 [[E6]], i8 0
-; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i8 [[E7]], i8 0
-; CHECK-NOOPT-NEXT: [[S8:%.*]] = select i1 [[COND]], i8 [[E8]], i8 0
-; CHECK-NOOPT-NEXT: [[S9:%.*]] = select i1 [[COND]], i8 [[E9]], i8 0
-; CHECK-NOOPT-NEXT: [[S10:%.*]] = select i1 [[COND]], i8 [[E10]], i8 0
-; CHECK-NOOPT-NEXT: [[S11:%.*]] = select i1 [[COND]], i8 [[E11]], i8 0
-; CHECK-NOOPT-NEXT: [[S12:%.*]] = select i1 [[COND]], i8 [[E12]], i8 0
-; CHECK-NOOPT-NEXT: [[S13:%.*]] = select i1 [[COND]], i8 [[E13]], i8 0
-; CHECK-NOOPT-NEXT: [[S14:%.*]] = select i1 [[COND]], i8 [[E14]], i8 0
-; CHECK-NOOPT-NEXT: [[S15:%.*]] = select i1 [[COND]], i8 [[E15]], i8 0
-; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-NOOPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-NOOPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-NOOPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-NOOPT-NEXT: store i8 [[S4]], ptr addrspace(1) [[PTR4]], align 1
-; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-NOOPT-NEXT: store i8 [[S5]], ptr addrspace(1) [[PTR5]], align 1
-; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-NOOPT-NEXT: store i8 [[S6]], ptr addrspace(1) [[PTR6]], align 1
-; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-NOOPT-NEXT: store i8 [[S7]], ptr addrspace(1) [[PTR7]], align 1
-; CHECK-NOOPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
-; CHECK-NOOPT-NEXT: store i8 [[S8]], ptr addrspace(1) [[PTR8]], align 1
-; CHECK-NOOPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
-; CHECK-NOOPT-NEXT: store i8 [[S9]], ptr addrspace(1) [[PTR9]], align 1
-; CHECK-NOOPT-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 10
-; CHECK-NOOPT-NEXT: store i8 [[S10]], ptr addrspace(1) [[PTR10]], align 1
-; CHECK-NOOPT-NEXT: [[PTR11:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 11
-; CHECK-NOOPT-NEXT: store i8 [[S11]], ptr addrspace(1) [[PTR11]], align 1
-; CHECK-NOOPT-NEXT: [[PTR12:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 12
-; CHECK-NOOPT-NEXT: store i8 [[S12]], ptr addrspace(1) [[PTR12]], align 1
-; CHECK-NOOPT-NEXT: [[PTR13:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 13
-; CHECK-NOOPT-NEXT: store i8 [[S13]], ptr addrspace(1) [[PTR13]], align 1
-; CHECK-NOOPT-NEXT: [[PTR14:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 14
-; CHECK-NOOPT-NEXT: store i8 [[S14]], ptr addrspace(1) [[PTR14]], align 1
-; CHECK-NOOPT-NEXT: [[PTR15:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 15
-; CHECK-NOOPT-NEXT: store i8 [[S15]], ptr addrspace(1) [[PTR15]], align 1
-; CHECK-NOOPT-NEXT: ret void
-;
- ptr addrspace(1) %out,
- <2 x i64> %src,
- i1 %cond
-) {
-entry:
- %bytes = bitcast <2 x i64> %src to <16 x i8>
- %e0 = extractelement <16 x i8> %bytes, i64 0
- %e1 = extractelement <16 x i8> %bytes, i64 1
- %e2 = extractelement <16 x i8> %bytes, i64 2
- %e3 = extractelement <16 x i8> %bytes, i64 3
- %e4 = extractelement <16 x i8> %bytes, i64 4
- %e5 = extractelement <16 x i8> %bytes, i64 5
- %e6 = extractelement <16 x i8> %bytes, i64 6
- %e7 = extractelement <16 x i8> %bytes, i64 7
- %e8 = extractelement <16 x i8> %bytes, i64 8
- %e9 = extractelement <16 x i8> %bytes, i64 9
- %e10 = extractelement <16 x i8> %bytes, i64 10
- %e11 = extractelement <16 x i8> %bytes, i64 11
- %e12 = extractelement <16 x i8> %bytes, i64 12
- %e13 = extractelement <16 x i8> %bytes, i64 13
- %e14 = extractelement <16 x i8> %bytes, i64 14
- %e15 = extractelement <16 x i8> %bytes, i64 15
- %s0 = select i1 %cond, i8 %e0, i8 0
- %s1 = select i1 %cond, i8 %e1, i8 0
- %s2 = select i1 %cond, i8 %e2, i8 0
- %s3 = select i1 %cond, i8 %e3, i8 0
- %s4 = select i1 %cond, i8 %e4, i8 0
- %s5 = select i1 %cond, i8 %e5, i8 0
- %s6 = select i1 %cond, i8 %e6, i8 0
- %s7 = select i1 %cond, i8 %e7, i8 0
- %s8 = select i1 %cond, i8 %e8, i8 0
- %s9 = select i1 %cond, i8 %e9, i8 0
- %s10 = select i1 %cond, i8 %e10, i8 0
- %s11 = select i1 %cond, i8 %e11, i8 0
- %s12 = select i1 %cond, i8 %e12, i8 0
- %s13 = select i1 %cond, i8 %e13, i8 0
- %s14 = select i1 %cond, i8 %e14, i8 0
- %s15 = select i1 %cond, i8 %e15, i8 0
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
- store i8 %s8, ptr addrspace(1) %ptr8, align 1
- %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
- store i8 %s9, ptr addrspace(1) %ptr9, align 1
- %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
- store i8 %s10, ptr addrspace(1) %ptr10, align 1
- %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
- store i8 %s11, ptr addrspace(1) %ptr11, align 1
- %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
- store i8 %s12, ptr addrspace(1) %ptr12, align 1
- %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
- store i8 %s13, ptr addrspace(1) %ptr13, align 1
- %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
- store i8 %s14, ptr addrspace(1) %ptr14, align 1
- %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
- store i8 %s15, ptr addrspace(1) %ptr15, align 1
- ret void
-}
-
-; Test <2 x i64> to <8 x i16> (64-bit elements to 16-bit elements)
-define amdgpu_kernel void @combine_v2i64_to_v8i16(
-; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_v2i64_to_v8i16(
-; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i64> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-OPT-NEXT: [[ENTRY:.*:]]
-; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x i64> [[SRC]], <2 x i64> zeroinitializer
-; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x i64> [[COMBINED_SEL]] to <8 x i16>
-; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 3
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 4
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 5
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 6
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: store i16 [[TMP0]], ptr addrspace(1) [[OUT]], align 2
-; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-OPT-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[PTR1]], align 2
-; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-OPT-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[PTR2]], align 2
-; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-OPT-NEXT: store i16 [[TMP7]], ptr addrspace(1) [[PTR3]], align 2
-; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[PTR4]], align 2
-; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-OPT-NEXT: store i16 [[TMP4]], ptr addrspace(1) [[PTR5]], align 2
-; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-OPT-NEXT: store i16 [[TMP6]], ptr addrspace(1) [[PTR6]], align 2
-; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-OPT-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[PTR7]], align 2
-; CHECK-OPT-NEXT: ret void
-;
-; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_v2i64_to_v8i16(
-; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i64> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
-; CHECK-NOOPT-NEXT: [[HALVES:%.*]] = bitcast <2 x i64> [[SRC]] to <8 x i16>
-; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <8 x i16> [[HALVES]], i64 0
-; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <8 x i16> [[HALVES]], i64 1
-; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <8 x i16> [[HALVES]], i64 2
-; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <8 x i16> [[HALVES]], i64 3
-; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <8 x i16> [[HALVES]], i64 4
-; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <8 x i16> [[HALVES]], i64 5
-; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <8 x i16> [[HALVES]], i64 6
-; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <8 x i16> [[HALVES]], i64 7
-; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i16 [[E0]], i16 0
-; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i16 [[E1]], i16 0
-; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i16 [[E2]], i16 0
-; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i16 [[E3]], i16 0
-; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i16 [[E4]], i16 0
-; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i16 [[E5]], i16 0
-; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i16 [[E6]], i16 0
-; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i16 [[E7]], i16 0
-; CHECK-NOOPT-NEXT: store i16 [[S0]], ptr addrspace(1) [[OUT]], align 2
-; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-NOOPT-NEXT: store i16 [[S1]], ptr addrspace(1) [[PTR1]], align 2
-; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-NOOPT-NEXT: store i16 [[S2]], ptr addrspace(1) [[PTR2]], align 2
-; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-NOOPT-NEXT: store i16 [[S3]], ptr addrspace(1) [[PTR3]], align 2
-; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-NOOPT-NEXT: store i16 [[S4]], ptr addrspace(1) [[PTR4]], align 2
-; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-NOOPT-NEXT: store i16 [[S5]], ptr addrspace(1) [[PTR5]], align 2
-; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-NOOPT-NEXT: store i16 [[S6]], ptr addrspace(1) [[PTR6]], align 2
-; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-NOOPT-NEXT: store i16 [[S7]], ptr addrspace(1) [[PTR7]], align 2
-; CHECK-NOOPT-NEXT: ret void
-;
- ptr addrspace(1) %out,
- <2 x i64> %src,
- i1 %cond
-) {
-entry:
- %halves = bitcast <2 x i64> %src to <8 x i16>
- %e0 = extractelement <8 x i16> %halves, i64 0
- %e1 = extractelement <8 x i16> %halves, i64 1
- %e2 = extractelement <8 x i16> %halves, i64 2
- %e3 = extractelement <8 x i16> %halves, i64 3
- %e4 = extractelement <8 x i16> %halves, i64 4
- %e5 = extractelement <8 x i16> %halves, i64 5
- %e6 = extractelement <8 x i16> %halves, i64 6
- %e7 = extractelement <8 x i16> %halves, i64 7
- %s0 = select i1 %cond, i16 %e0, i16 0
- %s1 = select i1 %cond, i16 %e1, i16 0
- %s2 = select i1 %cond, i16 %e2, i16 0
- %s3 = select i1 %cond, i16 %e3, i16 0
- %s4 = select i1 %cond, i16 %e4, i16 0
- %s5 = select i1 %cond, i16 %e5, i16 0
- %s6 = select i1 %cond, i16 %e6, i16 0
- %s7 = select i1 %cond, i16 %e7, i16 0
- store i16 %s0, ptr addrspace(1) %out, align 2
- %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
- store i16 %s1, ptr addrspace(1) %ptr1, align 2
- %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
- store i16 %s2, ptr addrspace(1) %ptr2, align 2
- %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
- store i16 %s3, ptr addrspace(1) %ptr3, align 2
- %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
- store i16 %s4, ptr addrspace(1) %ptr4, align 2
- %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
- store i16 %s5, ptr addrspace(1) %ptr5, align 2
- %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
- store i16 %s6, ptr addrspace(1) %ptr6, align 2
- %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
- store i16 %s7, ptr addrspace(1) %ptr7, align 2
- ret void
-}
-
-; Test <2 x i64> to <4 x i32> (64-bit elements to 32-bit elements)
-define amdgpu_kernel void @combine_v2i64_to_v4i32(
-; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_v2i64_to_v4i32(
-; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i64> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-OPT-NEXT: [[ENTRY:.*:]]
-; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x i64> [[SRC]], <2 x i64> zeroinitializer
-; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x i64> [[COMBINED_SEL]] to <4 x i32>
-; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[COMBINED_BC]], i64 3
-; CHECK-OPT-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-OPT-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[PTR1]], align 4
-; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[PTR2]], align 4
-; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-OPT-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[PTR3]], align 4
-; CHECK-OPT-NEXT: ret void
-;
-; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_v2i64_to_v4i32(
-; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x i64> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
-; CHECK-NOOPT-NEXT: [[WORDS:%.*]] = bitcast <2 x i64> [[SRC]] to <4 x i32>
-; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[WORDS]], i64 0
-; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[WORDS]], i64 1
-; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[WORDS]], i64 2
-; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[WORDS]], i64 3
-; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i32 [[E0]], i32 0
-; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i32 [[E1]], i32 0
-; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i32 [[E2]], i32 0
-; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i32 [[E3]], i32 0
-; CHECK-NOOPT-NEXT: store i32 [[S0]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-NOOPT-NEXT: store i32 [[S1]], ptr addrspace(1) [[PTR1]], align 4
-; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-NOOPT-NEXT: store i32 [[S2]], ptr addrspace(1) [[PTR2]], align 4
-; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-NOOPT-NEXT: store i32 [[S3]], ptr addrspace(1) [[PTR3]], align 4
-; CHECK-NOOPT-NEXT: ret void
-;
- ptr addrspace(1) %out,
- <2 x i64> %src,
- i1 %cond
-) {
-entry:
- %words = bitcast <2 x i64> %src to <4 x i32>
- %e0 = extractelement <4 x i32> %words, i64 0
- %e1 = extractelement <4 x i32> %words, i64 1
- %e2 = extractelement <4 x i32> %words, i64 2
- %e3 = extractelement <4 x i32> %words, i64 3
- %s0 = select i1 %cond, i32 %e0, i32 0
- %s1 = select i1 %cond, i32 %e1, i32 0
- %s2 = select i1 %cond, i32 %e2, i32 0
- %s3 = select i1 %cond, i32 %e3, i32 0
- store i32 %s0, ptr addrspace(1) %out, align 4
- %ptr1 = getelementptr i32, ptr addrspace(1) %out, i64 1
- store i32 %s1, ptr addrspace(1) %ptr1, align 4
- %ptr2 = getelementptr i32, ptr addrspace(1) %out, i64 2
- store i32 %s2, ptr addrspace(1) %ptr2, align 4
- %ptr3 = getelementptr i32, ptr addrspace(1) %out, i64 3
- store i32 %s3, ptr addrspace(1) %ptr3, align 4
- ret void
-}
-
-; Test <2 x double> to <16 x i8> (double elements to byte elements)
-define amdgpu_kernel void @combine_v2f64_to_v16i8(
-; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_v2f64_to_v16i8(
-; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x double> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-OPT-NEXT: [[ENTRY:.*:]]
-; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x double> [[SRC]], <2 x double> zeroinitializer
-; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x double> [[COMBINED_SEL]] to <16 x i8>
-; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 3
-; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 4
-; CHECK-OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 5
-; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 6
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 8
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 9
-; CHECK-OPT-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 10
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 11
-; CHECK-OPT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 12
-; CHECK-OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 13
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 14
-; CHECK-OPT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[COMBINED_BC]], i64 15
-; CHECK-OPT-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-OPT-NEXT: store i8 [[TMP9]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-OPT-NEXT: store i8 [[TMP3]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-OPT-NEXT: store i8 [[TMP5]], ptr addrspace(1) [[PTR4]], align 1
-; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-OPT-NEXT: store i8 [[TMP14]], ptr addrspace(1) [[PTR5]], align 1
-; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-OPT-NEXT: store i8 [[TMP7]], ptr addrspace(1) [[PTR6]], align 1
-; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-OPT-NEXT: store i8 [[TMP1]], ptr addrspace(1) [[PTR7]], align 1
-; CHECK-OPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
-; CHECK-OPT-NEXT: store i8 [[TMP10]], ptr addrspace(1) [[PTR8]], align 1
-; CHECK-OPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
-; CHECK-OPT-NEXT: store i8 [[TMP4]], ptr addrspace(1) [[PTR9]], align 1
-; CHECK-OPT-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 10
-; CHECK-OPT-NEXT: store i8 [[TMP13]], ptr addrspace(1) [[PTR10]], align 1
-; CHECK-OPT-NEXT: [[PTR11:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 11
-; CHECK-OPT-NEXT: store i8 [[TMP6]], ptr addrspace(1) [[PTR11]], align 1
-; CHECK-OPT-NEXT: [[PTR12:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 12
-; CHECK-OPT-NEXT: store i8 [[TMP15]], ptr addrspace(1) [[PTR12]], align 1
-; CHECK-OPT-NEXT: [[PTR13:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 13
-; CHECK-OPT-NEXT: store i8 [[TMP8]], ptr addrspace(1) [[PTR13]], align 1
-; CHECK-OPT-NEXT: [[PTR14:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 14
-; CHECK-OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[PTR14]], align 1
-; CHECK-OPT-NEXT: [[PTR15:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 15
-; CHECK-OPT-NEXT: store i8 [[TMP11]], ptr addrspace(1) [[PTR15]], align 1
-; CHECK-OPT-NEXT: ret void
-;
-; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_v2f64_to_v16i8(
-; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x double> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
-; CHECK-NOOPT-NEXT: [[BYTES:%.*]] = bitcast <2 x double> [[SRC]] to <16 x i8>
-; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[BYTES]], i64 0
-; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[BYTES]], i64 1
-; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <16 x i8> [[BYTES]], i64 2
-; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[BYTES]], i64 3
-; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <16 x i8> [[BYTES]], i64 4
-; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <16 x i8> [[BYTES]], i64 5
-; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <16 x i8> [[BYTES]], i64 6
-; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <16 x i8> [[BYTES]], i64 7
-; CHECK-NOOPT-NEXT: [[E8:%.*]] = extractelement <16 x i8> [[BYTES]], i64 8
-; CHECK-NOOPT-NEXT: [[E9:%.*]] = extractelement <16 x i8> [[BYTES]], i64 9
-; CHECK-NOOPT-NEXT: [[E10:%.*]] = extractelement <16 x i8> [[BYTES]], i64 10
-; CHECK-NOOPT-NEXT: [[E11:%.*]] = extractelement <16 x i8> [[BYTES]], i64 11
-; CHECK-NOOPT-NEXT: [[E12:%.*]] = extractelement <16 x i8> [[BYTES]], i64 12
-; CHECK-NOOPT-NEXT: [[E13:%.*]] = extractelement <16 x i8> [[BYTES]], i64 13
-; CHECK-NOOPT-NEXT: [[E14:%.*]] = extractelement <16 x i8> [[BYTES]], i64 14
-; CHECK-NOOPT-NEXT: [[E15:%.*]] = extractelement <16 x i8> [[BYTES]], i64 15
-; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i8 [[E0]], i8 0
-; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i8 [[E1]], i8 0
-; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i8 [[E2]], i8 0
-; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i8 [[E3]], i8 0
-; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i8 [[E4]], i8 0
-; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i8 [[E5]], i8 0
-; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i8 [[E6]], i8 0
-; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i8 [[E7]], i8 0
-; CHECK-NOOPT-NEXT: [[S8:%.*]] = select i1 [[COND]], i8 [[E8]], i8 0
-; CHECK-NOOPT-NEXT: [[S9:%.*]] = select i1 [[COND]], i8 [[E9]], i8 0
-; CHECK-NOOPT-NEXT: [[S10:%.*]] = select i1 [[COND]], i8 [[E10]], i8 0
-; CHECK-NOOPT-NEXT: [[S11:%.*]] = select i1 [[COND]], i8 [[E11]], i8 0
-; CHECK-NOOPT-NEXT: [[S12:%.*]] = select i1 [[COND]], i8 [[E12]], i8 0
-; CHECK-NOOPT-NEXT: [[S13:%.*]] = select i1 [[COND]], i8 [[E13]], i8 0
-; CHECK-NOOPT-NEXT: [[S14:%.*]] = select i1 [[COND]], i8 [[E14]], i8 0
-; CHECK-NOOPT-NEXT: [[S15:%.*]] = select i1 [[COND]], i8 [[E15]], i8 0
-; CHECK-NOOPT-NEXT: store i8 [[S0]], ptr addrspace(1) [[OUT]], align 1
-; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-NOOPT-NEXT: store i8 [[S1]], ptr addrspace(1) [[PTR1]], align 1
-; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-NOOPT-NEXT: store i8 [[S2]], ptr addrspace(1) [[PTR2]], align 1
-; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-NOOPT-NEXT: store i8 [[S3]], ptr addrspace(1) [[PTR3]], align 1
-; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-NOOPT-NEXT: store i8 [[S4]], ptr addrspace(1) [[PTR4]], align 1
-; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-NOOPT-NEXT: store i8 [[S5]], ptr addrspace(1) [[PTR5]], align 1
-; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-NOOPT-NEXT: store i8 [[S6]], ptr addrspace(1) [[PTR6]], align 1
-; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-NOOPT-NEXT: store i8 [[S7]], ptr addrspace(1) [[PTR7]], align 1
-; CHECK-NOOPT-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 8
-; CHECK-NOOPT-NEXT: store i8 [[S8]], ptr addrspace(1) [[PTR8]], align 1
-; CHECK-NOOPT-NEXT: [[PTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 9
-; CHECK-NOOPT-NEXT: store i8 [[S9]], ptr addrspace(1) [[PTR9]], align 1
-; CHECK-NOOPT-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 10
-; CHECK-NOOPT-NEXT: store i8 [[S10]], ptr addrspace(1) [[PTR10]], align 1
-; CHECK-NOOPT-NEXT: [[PTR11:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 11
-; CHECK-NOOPT-NEXT: store i8 [[S11]], ptr addrspace(1) [[PTR11]], align 1
-; CHECK-NOOPT-NEXT: [[PTR12:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 12
-; CHECK-NOOPT-NEXT: store i8 [[S12]], ptr addrspace(1) [[PTR12]], align 1
-; CHECK-NOOPT-NEXT: [[PTR13:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 13
-; CHECK-NOOPT-NEXT: store i8 [[S13]], ptr addrspace(1) [[PTR13]], align 1
-; CHECK-NOOPT-NEXT: [[PTR14:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 14
-; CHECK-NOOPT-NEXT: store i8 [[S14]], ptr addrspace(1) [[PTR14]], align 1
-; CHECK-NOOPT-NEXT: [[PTR15:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 15
-; CHECK-NOOPT-NEXT: store i8 [[S15]], ptr addrspace(1) [[PTR15]], align 1
-; CHECK-NOOPT-NEXT: ret void
-;
- ptr addrspace(1) %out,
- <2 x double> %src,
- i1 %cond
-) {
-entry:
- %bytes = bitcast <2 x double> %src to <16 x i8>
- %e0 = extractelement <16 x i8> %bytes, i64 0
- %e1 = extractelement <16 x i8> %bytes, i64 1
- %e2 = extractelement <16 x i8> %bytes, i64 2
- %e3 = extractelement <16 x i8> %bytes, i64 3
- %e4 = extractelement <16 x i8> %bytes, i64 4
- %e5 = extractelement <16 x i8> %bytes, i64 5
- %e6 = extractelement <16 x i8> %bytes, i64 6
- %e7 = extractelement <16 x i8> %bytes, i64 7
- %e8 = extractelement <16 x i8> %bytes, i64 8
- %e9 = extractelement <16 x i8> %bytes, i64 9
- %e10 = extractelement <16 x i8> %bytes, i64 10
- %e11 = extractelement <16 x i8> %bytes, i64 11
- %e12 = extractelement <16 x i8> %bytes, i64 12
- %e13 = extractelement <16 x i8> %bytes, i64 13
- %e14 = extractelement <16 x i8> %bytes, i64 14
- %e15 = extractelement <16 x i8> %bytes, i64 15
- %s0 = select i1 %cond, i8 %e0, i8 0
- %s1 = select i1 %cond, i8 %e1, i8 0
- %s2 = select i1 %cond, i8 %e2, i8 0
- %s3 = select i1 %cond, i8 %e3, i8 0
- %s4 = select i1 %cond, i8 %e4, i8 0
- %s5 = select i1 %cond, i8 %e5, i8 0
- %s6 = select i1 %cond, i8 %e6, i8 0
- %s7 = select i1 %cond, i8 %e7, i8 0
- %s8 = select i1 %cond, i8 %e8, i8 0
- %s9 = select i1 %cond, i8 %e9, i8 0
- %s10 = select i1 %cond, i8 %e10, i8 0
- %s11 = select i1 %cond, i8 %e11, i8 0
- %s12 = select i1 %cond, i8 %e12, i8 0
- %s13 = select i1 %cond, i8 %e13, i8 0
- %s14 = select i1 %cond, i8 %e14, i8 0
- %s15 = select i1 %cond, i8 %e15, i8 0
- store i8 %s0, ptr addrspace(1) %out, align 1
- %ptr1 = getelementptr i8, ptr addrspace(1) %out, i64 1
- store i8 %s1, ptr addrspace(1) %ptr1, align 1
- %ptr2 = getelementptr i8, ptr addrspace(1) %out, i64 2
- store i8 %s2, ptr addrspace(1) %ptr2, align 1
- %ptr3 = getelementptr i8, ptr addrspace(1) %out, i64 3
- store i8 %s3, ptr addrspace(1) %ptr3, align 1
- %ptr4 = getelementptr i8, ptr addrspace(1) %out, i64 4
- store i8 %s4, ptr addrspace(1) %ptr4, align 1
- %ptr5 = getelementptr i8, ptr addrspace(1) %out, i64 5
- store i8 %s5, ptr addrspace(1) %ptr5, align 1
- %ptr6 = getelementptr i8, ptr addrspace(1) %out, i64 6
- store i8 %s6, ptr addrspace(1) %ptr6, align 1
- %ptr7 = getelementptr i8, ptr addrspace(1) %out, i64 7
- store i8 %s7, ptr addrspace(1) %ptr7, align 1
- %ptr8 = getelementptr i8, ptr addrspace(1) %out, i64 8
- store i8 %s8, ptr addrspace(1) %ptr8, align 1
- %ptr9 = getelementptr i8, ptr addrspace(1) %out, i64 9
- store i8 %s9, ptr addrspace(1) %ptr9, align 1
- %ptr10 = getelementptr i8, ptr addrspace(1) %out, i64 10
- store i8 %s10, ptr addrspace(1) %ptr10, align 1
- %ptr11 = getelementptr i8, ptr addrspace(1) %out, i64 11
- store i8 %s11, ptr addrspace(1) %ptr11, align 1
- %ptr12 = getelementptr i8, ptr addrspace(1) %out, i64 12
- store i8 %s12, ptr addrspace(1) %ptr12, align 1
- %ptr13 = getelementptr i8, ptr addrspace(1) %out, i64 13
- store i8 %s13, ptr addrspace(1) %ptr13, align 1
- %ptr14 = getelementptr i8, ptr addrspace(1) %out, i64 14
- store i8 %s14, ptr addrspace(1) %ptr14, align 1
- %ptr15 = getelementptr i8, ptr addrspace(1) %out, i64 15
- store i8 %s15, ptr addrspace(1) %ptr15, align 1
- ret void
-}
-
-; Test <2 x double> to <8 x i16> (double elements to 16-bit elements)
-define amdgpu_kernel void @combine_v2f64_to_v8i16(
-; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_v2f64_to_v8i16(
-; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x double> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-OPT-NEXT: [[ENTRY:.*:]]
-; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x double> [[SRC]], <2 x double> zeroinitializer
-; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x double> [[COMBINED_SEL]] to <8 x i16>
-; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 3
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 4
-; CHECK-OPT-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 5
-; CHECK-OPT-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 6
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[COMBINED_BC]], i64 7
-; CHECK-OPT-NEXT: store i16 [[TMP0]], ptr addrspace(1) [[OUT]], align 2
-; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-OPT-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[PTR1]], align 2
-; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-OPT-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[PTR2]], align 2
-; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-OPT-NEXT: store i16 [[TMP7]], ptr addrspace(1) [[PTR3]], align 2
-; CHECK-OPT-NEXT: [[PTR4:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[PTR4]], align 2
-; CHECK-OPT-NEXT: [[PTR5:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-OPT-NEXT: store i16 [[TMP4]], ptr addrspace(1) [[PTR5]], align 2
-; CHECK-OPT-NEXT: [[PTR6:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-OPT-NEXT: store i16 [[TMP6]], ptr addrspace(1) [[PTR6]], align 2
-; CHECK-OPT-NEXT: [[PTR7:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-OPT-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[PTR7]], align 2
-; CHECK-OPT-NEXT: ret void
-;
-; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_v2f64_to_v8i16(
-; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x double> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
-; CHECK-NOOPT-NEXT: [[HALVES:%.*]] = bitcast <2 x double> [[SRC]] to <8 x i16>
-; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <8 x i16> [[HALVES]], i64 0
-; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <8 x i16> [[HALVES]], i64 1
-; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <8 x i16> [[HALVES]], i64 2
-; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <8 x i16> [[HALVES]], i64 3
-; CHECK-NOOPT-NEXT: [[E4:%.*]] = extractelement <8 x i16> [[HALVES]], i64 4
-; CHECK-NOOPT-NEXT: [[E5:%.*]] = extractelement <8 x i16> [[HALVES]], i64 5
-; CHECK-NOOPT-NEXT: [[E6:%.*]] = extractelement <8 x i16> [[HALVES]], i64 6
-; CHECK-NOOPT-NEXT: [[E7:%.*]] = extractelement <8 x i16> [[HALVES]], i64 7
-; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i16 [[E0]], i16 0
-; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i16 [[E1]], i16 0
-; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i16 [[E2]], i16 0
-; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i16 [[E3]], i16 0
-; CHECK-NOOPT-NEXT: [[S4:%.*]] = select i1 [[COND]], i16 [[E4]], i16 0
-; CHECK-NOOPT-NEXT: [[S5:%.*]] = select i1 [[COND]], i16 [[E5]], i16 0
-; CHECK-NOOPT-NEXT: [[S6:%.*]] = select i1 [[COND]], i16 [[E6]], i16 0
-; CHECK-NOOPT-NEXT: [[S7:%.*]] = select i1 [[COND]], i16 [[E7]], i16 0
-; CHECK-NOOPT-NEXT: store i16 [[S0]], ptr addrspace(1) [[OUT]], align 2
-; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-NOOPT-NEXT: store i16 [[S1]], ptr addrspace(1) [[PTR1]], align 2
-; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-NOOPT-NEXT: store i16 [[S2]], ptr addrspace(1) [[PTR2]], align 2
-; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-NOOPT-NEXT: store i16 [[S3]], ptr addrspace(1) [[PTR3]], align 2
-; CHECK-NOOPT-NEXT: [[PTR4:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 4
-; CHECK-NOOPT-NEXT: store i16 [[S4]], ptr addrspace(1) [[PTR4]], align 2
-; CHECK-NOOPT-NEXT: [[PTR5:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 5
-; CHECK-NOOPT-NEXT: store i16 [[S5]], ptr addrspace(1) [[PTR5]], align 2
-; CHECK-NOOPT-NEXT: [[PTR6:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 6
-; CHECK-NOOPT-NEXT: store i16 [[S6]], ptr addrspace(1) [[PTR6]], align 2
-; CHECK-NOOPT-NEXT: [[PTR7:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT]], i64 7
-; CHECK-NOOPT-NEXT: store i16 [[S7]], ptr addrspace(1) [[PTR7]], align 2
-; CHECK-NOOPT-NEXT: ret void
-;
- ptr addrspace(1) %out,
- <2 x double> %src,
- i1 %cond
-) {
-entry:
- %halves = bitcast <2 x double> %src to <8 x i16>
- %e0 = extractelement <8 x i16> %halves, i64 0
- %e1 = extractelement <8 x i16> %halves, i64 1
- %e2 = extractelement <8 x i16> %halves, i64 2
- %e3 = extractelement <8 x i16> %halves, i64 3
- %e4 = extractelement <8 x i16> %halves, i64 4
- %e5 = extractelement <8 x i16> %halves, i64 5
- %e6 = extractelement <8 x i16> %halves, i64 6
- %e7 = extractelement <8 x i16> %halves, i64 7
- %s0 = select i1 %cond, i16 %e0, i16 0
- %s1 = select i1 %cond, i16 %e1, i16 0
- %s2 = select i1 %cond, i16 %e2, i16 0
- %s3 = select i1 %cond, i16 %e3, i16 0
- %s4 = select i1 %cond, i16 %e4, i16 0
- %s5 = select i1 %cond, i16 %e5, i16 0
- %s6 = select i1 %cond, i16 %e6, i16 0
- %s7 = select i1 %cond, i16 %e7, i16 0
- store i16 %s0, ptr addrspace(1) %out, align 2
- %ptr1 = getelementptr i16, ptr addrspace(1) %out, i64 1
- store i16 %s1, ptr addrspace(1) %ptr1, align 2
- %ptr2 = getelementptr i16, ptr addrspace(1) %out, i64 2
- store i16 %s2, ptr addrspace(1) %ptr2, align 2
- %ptr3 = getelementptr i16, ptr addrspace(1) %out, i64 3
- store i16 %s3, ptr addrspace(1) %ptr3, align 2
- %ptr4 = getelementptr i16, ptr addrspace(1) %out, i64 4
- store i16 %s4, ptr addrspace(1) %ptr4, align 2
- %ptr5 = getelementptr i16, ptr addrspace(1) %out, i64 5
- store i16 %s5, ptr addrspace(1) %ptr5, align 2
- %ptr6 = getelementptr i16, ptr addrspace(1) %out, i64 6
- store i16 %s6, ptr addrspace(1) %ptr6, align 2
- %ptr7 = getelementptr i16, ptr addrspace(1) %out, i64 7
- store i16 %s7, ptr addrspace(1) %ptr7, align 2
- ret void
-}
-
-; Test <2 x double> to <4 x i32> (double elements to 32-bit elements)
-define amdgpu_kernel void @combine_v2f64_to_v4i32(
-; CHECK-OPT-LABEL: define amdgpu_kernel void @combine_v2f64_to_v4i32(
-; CHECK-OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x double> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-OPT-NEXT: [[ENTRY:.*:]]
-; CHECK-OPT-NEXT: [[COMBINED_SEL:%.*]] = select i1 [[COND]], <2 x double> [[SRC]], <2 x double> zeroinitializer
-; CHECK-OPT-NEXT: [[COMBINED_BC:%.*]] = bitcast <2 x double> [[COMBINED_SEL]] to <4 x i32>
-; CHECK-OPT-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[COMBINED_BC]], i64 0
-; CHECK-OPT-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[COMBINED_BC]], i64 1
-; CHECK-OPT-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[COMBINED_BC]], i64 2
-; CHECK-OPT-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[COMBINED_BC]], i64 3
-; CHECK-OPT-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-OPT-NEXT: [[PTR1:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-OPT-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[PTR1]], align 4
-; CHECK-OPT-NEXT: [[PTR2:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[PTR2]], align 4
-; CHECK-OPT-NEXT: [[PTR3:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-OPT-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[PTR3]], align 4
-; CHECK-OPT-NEXT: ret void
-;
-; CHECK-NOOPT-LABEL: define amdgpu_kernel void @combine_v2f64_to_v4i32(
-; CHECK-NOOPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x double> [[SRC:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
-; CHECK-NOOPT-NEXT: [[ENTRY:.*:]]
-; CHECK-NOOPT-NEXT: [[WORDS:%.*]] = bitcast <2 x double> [[SRC]] to <4 x i32>
-; CHECK-NOOPT-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[WORDS]], i64 0
-; CHECK-NOOPT-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[WORDS]], i64 1
-; CHECK-NOOPT-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[WORDS]], i64 2
-; CHECK-NOOPT-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[WORDS]], i64 3
-; CHECK-NOOPT-NEXT: [[S0:%.*]] = select i1 [[COND]], i32 [[E0]], i32 0
-; CHECK-NOOPT-NEXT: [[S1:%.*]] = select i1 [[COND]], i32 [[E1]], i32 0
-; CHECK-NOOPT-NEXT: [[S2:%.*]] = select i1 [[COND]], i32 [[E2]], i32 0
-; CHECK-NOOPT-NEXT: [[S3:%.*]] = select i1 [[COND]], i32 [[E3]], i32 0
-; CHECK-NOOPT-NEXT: store i32 [[S0]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NOOPT-NEXT: [[PTR1:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-NOOPT-NEXT: store i32 [[S1]], ptr addrspace(1) [[PTR1]], align 4
-; CHECK-NOOPT-NEXT: [[PTR2:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 2
-; CHECK-NOOPT-NEXT: store i32 [[S2]], ptr addrspace(1) [[PTR2]], align 4
-; CHECK-NOOPT-NEXT: [[PTR3:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 3
-; CHECK-NOOPT-NEXT: store i32 [[S3]], ptr addrspace(1) [[PTR3]], align 4
-; CHECK-NOOPT-NEXT: ret void
-;
- ptr addrspace(1) %out,
- <2 x double> %src,
- i1 %cond
-) {
-entry:
- %words = bitcast <2 x double> %src to <4 x i32>
- %e0 = extractelement <4 x i32> %words, i64 0
- %e1 = extractelement <4 x i32> %words, i64 1
- %e2 = extractelement <4 x i32> %words, i64 2
- %e3 = extractelement <4 x i32> %words, i64 3
- %s0 = select i1 %cond, i32 %e0, i32 0
- %s1 = select i1 %cond, i32 %e1, i32 0
- %s2 = select i1 %cond, i32 %e2, i32 0
- %s3 = select i1 %cond, i32 %e3, i32 0
- store i32 %s0, ptr addrspace(1) %out, align 4
- %ptr1 = getelementptr i32, ptr addrspace(1) %out, i64 1
- store i32 %s1, ptr addrspace(1) %ptr1, align 4
- %ptr2 = getelementptr i32, ptr addrspace(1) %out, i64 2
- store i32 %s2, ptr addrspace(1) %ptr2, align 4
- %ptr3 = getelementptr i32, ptr addrspace(1) %out, i64 3
- store i32 %s3, ptr addrspace(1) %ptr3, align 4
- ret void
-}
-
-declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg)
-
More information about the llvm-branch-commits
mailing list