[llvm] [AMDGPU] New image intrinsic optimizer pass (PR #67151)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 25 06:40:09 PDT 2023
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/67151
>From 36d958410e55e6450ee8445059fae5fac49f59e0 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 22 Sep 2023 15:12:09 +0100
Subject: [PATCH 1/4] [AMDGPU] New image intrinsic optimizer pass
Implement a new pass to combine multiple image_load_2dmsaa and
2darraymsaa intrinsic calls into a single image_msaa_load if:
- they refer to the same vaddr except for sample_id,
- they use a constant sample_id and they fall into the same group,
- they have the same dmask and the number of instructions and the
number of vaddr/vdata dword transfers are reduced by the combine.
This should be valid on all GFX11 but a hardware bug renders it
unworkable on GFX11.0.* so it is only enabled for GFX11.5.
Based on a patch by Rodrigo Dominguez!
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 13 +
llvm/lib/Target/AMDGPU/AMDGPU.td | 9 +-
.../AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp | 336 ++++++++++
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 13 +
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 +
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 4 +
.../AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll | 606 ++++++++++++++++++
llvm/tools/opt/opt.cpp | 1 +
9 files changed, 985 insertions(+), 1 deletion(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index b7101f401154706..97a413296c55e55 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -49,6 +49,7 @@ FunctionPass *createSIPreAllocateWWMRegsPass();
FunctionPass *createSIFormMemoryClausesPass();
FunctionPass *createSIPostRABundlerPass();
+FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *);
ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *);
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPULateCodeGenPreparePass();
@@ -64,6 +65,15 @@ struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
+// Pass built on PassInfoMixin that exposes the image intrinsic optimizer
+// through run(Function &, FunctionAnalysisManager &). It holds the
+// TargetMachine by reference, so that object must outlive the pass.
+struct AMDGPUImageIntrinsicOptimizerPass
+ : PassInfoMixin<AMDGPUImageIntrinsicOptimizerPass> {
+ AMDGPUImageIntrinsicOptimizerPass(TargetMachine &TM) : TM(TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+ // Target machine supplied at construction; stored as a reference.
+ TargetMachine &TM;
+};
+
struct AMDGPUUseNativeCallsPass : PassInfoMixin<AMDGPUUseNativeCallsPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
@@ -175,6 +185,9 @@ extern char &SIOptimizeExecMaskingID;
void initializeSIPreAllocateWWMRegsPass(PassRegistry &);
extern char &SIPreAllocateWWMRegsID;
+void initializeAMDGPUImageIntrinsicOptimizerPass(PassRegistry &);
+extern char &AMDGPUImageIntrinsicOptimizerID;
+
void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
extern char &AMDGPUPerfHintAnalysisID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index efa1cc0696d2f7c..d5356d1be3d758a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -281,6 +281,12 @@ def FeatureMADIntraFwdBug : SubtargetFeature<"mad-intra-fwd-bug",
"MAD_U64/I64 intra instruction forwarding bug"
>;
+// Subtarget feature "msaa-load-dst-sel-bug": sets HasMSAALoadDstSelBug to true
+// on subtargets where MSAA loads do not honor dst_sel.
+def FeatureMSAALoadDstSelBug : SubtargetFeature<"msaa-load-dst-sel-bug",
+ "HasMSAALoadDstSelBug",
+ "true",
+ "MSAA loads not honoring dst_sel bug"
+>;
+
class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
"ldsbankcount"#Value,
"LDSBankCount",
@@ -1355,7 +1361,8 @@ def FeatureISAVersion11_Common : FeatureSet<
def FeatureISAVersion11_0_Common : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
- [FeatureVALUTransUseHazard])>;
+ [FeatureMSAALoadDstSelBug,
+ FeatureVALUTransUseHazard])>;
def FeatureISAVersion11_0_0 : FeatureSet<
!listconcat(FeatureISAVersion11_0_Common.Features,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
new file mode 100644
index 000000000000000..c392cc4fd1ebebe
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
@@ -0,0 +1,336 @@
+//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
+// or dim=2darraymsaa into a single image_msaa_load intrinsic if:
+//
+// - they refer to the same vaddr except for sample_id,
+// - they use a constant sample_id and they fall into the same group,
+// - they have the same dmask and the number of intrinsics and the number of
+// vaddr/vdata dword transfers is reduced by the combine.
+//
+// Examples for the tradeoff (all are assuming 2DMsaa for vaddr):
+//
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
+// | (dmask) | | | | vdata | | vdata | |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 1 | 0 | 0 | 4 | 12 / 4 | 1 | 3 / 4 | yes |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 1 | 0 | 0 | 2 | 6 / 2 | 1 | 3 / 4 | yes? |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 2 | 0 | 0 | 4 | 12 / 8 | 2 | 6 / 8 | yes |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 2 | 0 | 0 | 2 | 6 / 4 | 2 | 6 / 8 | no |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 1 | 0 | 1 | 2 | 6 / 2 | 1 | 3 / 2 | yes |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+//
+// Some cases are of questionable benefit, like the one marked with "yes?"
+// above: fewer intrinsics and fewer vaddr and fewer total transfers between SP
+// and TX, but higher vdata. We start by erring on the side of converting these
+// to MSAA_LOAD.
+//
+// This pass will combine intrinsics such as (not necessarily consecutive):
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+// ==>
+// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+//
+// Future improvements:
+//
+// - We may occasionally not want to do the combine if it increases the maximum
+// register pressure.
+//
+// - Ensure clausing when multiple MSAA_LOAD are generated.
+//
+// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
+// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
+// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
+// we don't know the format at compile time.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"
+
+namespace {
+// FunctionPass wrapper around the image intrinsic optimizer. The target
+// machine is optional at construction (defaults to nullptr); it is handed to
+// the implementation from runOnFunction().
+class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
+ // Target machine given to the constructor; may be null.
+ const TargetMachine *TM;
+
+public:
+ // Pass identifier; its address is passed to the FunctionPass base class.
+ static char ID;
+
+ AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
+ : FunctionPass(ID), TM(TM) {}
+
+ // Defined out of line further down in this file.
+ bool runOnFunction(Function &F) override;
+
+}; // End of class AMDGPUImageIntrinsicOptimizer
+} // End anonymous namespace
+
+// Pass initialization boilerplate: DEBUG_TYPE ("amdgpu-image-intrinsic-opt")
+// is the pass argument and the string below is its descriptive name.
+INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
+ "AMDGPU Image Intrinsic Optimizer", false, false)
+
+// Definition of the static ID member declared in the class.
+char AMDGPUImageIntrinsicOptimizer::ID = 0;
+
+// Adds II to the first list in MergeableInsts whose front instruction it can
+// be merged with, or starts a new single-element list when there is none.
+//
+// Two calls are mergeable when they have the same intrinsic ID (dim), the same
+// return type (D16) and identical arguments everywhere except the FragId,
+// where both must be constants belonging to the same group of four.
+//
+// Every argument is compared, not only DMask and VAddr: the merged
+// image_msaa_load is built from the front instruction's operands, so loads
+// that differ in any other operand (for example %rsrc or the trailing i32
+// operands shown in the file header example) must not share a list.
+//
+// \param II             image load to classify; its FragId operand must be a
+//                       ConstantInt (the caller checks this).
+// \param MergeableInsts groups collected so far; updated in place.
+// \param ImageDimIntr   dimension info for II's intrinsic, used to locate the
+//                       FragId operand.
+static void addInstToMergeableList(
+    IntrinsicInst *II, std::list<std::list<IntrinsicInst *>> &MergeableInsts,
+    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
+  // The FragId is the last vaddr operand.
+  const unsigned FragIdIndex = ImageDimIntr->VAddrEnd - 1;
+
+  for (std::list<IntrinsicInst *> &IIList : MergeableInsts) {
+    IntrinsicInst *Front = IIList.front();
+
+    // Check Dim.
+    if (Front->getIntrinsicID() != II->getIntrinsicID())
+      continue;
+
+    // Check D16.
+    if (Front->getType() != II->getType())
+      continue;
+
+    // Check all arguments (DMask, VAddr and everything after it).
+    bool AllEqual = true;
+    for (unsigned I = 0, E = II->arg_size(); AllEqual && I != E; ++I) {
+      if (I == FragIdIndex) {
+        // Check FragId group: FragIds may differ within a group of four.
+        auto *FrontFragId = cast<ConstantInt>(Front->getArgOperand(I));
+        auto *FragId = cast<ConstantInt>(II->getArgOperand(I));
+        AllEqual =
+            FrontFragId->getValue().udiv(4) == FragId->getValue().udiv(4);
+      } else {
+        AllEqual = Front->getArgOperand(I) == II->getArgOperand(I);
+      }
+    }
+    if (!AllEqual)
+      continue;
+
+    // Add to the list.
+    IIList.emplace_back(II);
+    return;
+  }
+
+  // Similar instruction not found, so add a new list.
+  MergeableInsts.emplace_back(1, II);
+  LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
+}
+
+// Collects the mergeable image loads found in [I, E) into MergeableInsts,
+// stopping right after the first instruction that may have side effects.
+// Returns an iterator to the instruction following the last one analyzed (E
+// when the whole range was scanned), so the caller can resume from there.
+//
+// Only image_load_2dmsaa / image_load_2darraymsaa calls whose FragId (the last
+// vaddr operand) is a ConstantInt are collected; everything else is skipped.
+//
+// The function is file-local, hence static.
+static BasicBlock::iterator
+collectMergeableInsts(BasicBlock::iterator I, BasicBlock::iterator E,
+                      std::list<std::list<IntrinsicInst *>> &MergeableInsts) {
+  for (; I != E; ++I) {
+    // Don't combine if there is a store in the middle or if there is a memory
+    // barrier. Step past the offending instruction so that the next section
+    // starts after it.
+    if (I->mayHaveSideEffects()) {
+      ++I;
+      break;
+    }
+
+    // Ignore non-intrinsics.
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+    if (!II)
+      continue;
+
+    // Ignore other intrinsics.
+    Intrinsic::ID IntrinID = II->getIntrinsicID();
+    if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
+        IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
+      continue;
+
+    // Check for constant FragId.
+    const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
+    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
+    if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
+      continue;
+
+    LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
+    addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
+  }
+
+  return I;
+}
+
+// Rewrites each group of two or more mergeable image loads in MergeableInsts
+// into image_msaa_load calls and returns true if anything was rewritten.
+//
+// For every group that passes the profitability check, one image_msaa_load is
+// emitted per set DMask bit, inserted before the first load of the group, with
+// the FragId rounded down to a multiple of 4. Each original load is then
+// rebuilt from element (FragId % 4) of those results, has its uses replaced
+// and is erased. MergeableInsts is always cleared before returning.
+static bool
+optimizeSection(std::list<std::list<IntrinsicInst *>> &MergeableInsts) {
+  bool Modified = false;
+
+  SmallVector<Instruction *, 4> InstrsToErase;
+  // Bind by reference: iterating by value would copy every inner list.
+  for (const std::list<IntrinsicInst *> &IIList : MergeableInsts) {
+    if (IIList.size() <= 1)
+      continue;
+
+    IntrinsicInst *Front = IIList.front();
+
+    // Assume the arguments are unchanged and later override them, if needed.
+    SmallVector<Value *, 16> Args(Front->args());
+
+    // Validate function argument and return types, extracting overloaded
+    // types along the way.
+    SmallVector<Type *, 6> OverloadTys;
+    Function *F = Front->getCalledFunction();
+    if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
+      continue;
+
+    Intrinsic::ID IntrinID = Front->getIntrinsicID();
+    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+        AMDGPU::getImageDimIntrinsicInfo(IntrinID);
+
+    // The merged load returns a 4-element vector of the original scalar type.
+    Type *EltTy = Front->getType()->getScalarType();
+    Type *NewTy = FixedVectorType::get(EltTy, 4);
+    OverloadTys[0] = NewTy;
+    bool isD16 = EltTy->isHalfTy();
+
+    ConstantInt *DMask =
+        cast<ConstantInt>(Front->getArgOperand(ImageDimIntr->DMaskIndex));
+    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
+    unsigned NumElts = popcount(DMaskVal);
+
+    // Number of instructions and the number of vaddr/vdata dword transfers
+    // should be reduced.
+    unsigned NumLoads = IIList.size();
+    unsigned NumMsaas = NumElts;
+    unsigned NumVAddrLoads = 3 * NumLoads;
+    unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
+    unsigned NumVAddrMsaas = 3 * NumMsaas;
+    unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;
+
+    if (NumLoads < NumMsaas ||
+        (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
+      continue;
+
+    // Round the FragId down to the first sample of its group of four.
+    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
+    auto *FragId = cast<ConstantInt>(Front->getArgOperand(FragIdIndex));
+    const APInt NewFragIdVal = FragId->getValue().udiv(4) * 4;
+
+    // The replacement intrinsic depends only on the group's dim, so pick it
+    // once instead of on every DMask bit.
+    const Intrinsic::ID NewIntrinID =
+        IntrinID == Intrinsic::amdgcn_image_load_2dmsaa
+            ? Intrinsic::amdgcn_image_msaa_load_2dmsaa
+            : Intrinsic::amdgcn_image_msaa_load_2darraymsaa;
+
+    // Create the new instructions.
+    IRBuilder<> B(Front);
+
+    // Create the new image_msaa_load intrinsic, one call per set DMask bit
+    // (lowest bit first).
+    SmallVector<Instruction *, 4> NewCalls;
+    while (DMaskVal != 0) {
+      unsigned NewMaskVal = 1 << countr_zero(DMaskVal);
+
+      Function *NewIntrin = Intrinsic::getDeclaration(
+          Front->getModule(), NewIntrinID, OverloadTys);
+      Args[ImageDimIntr->DMaskIndex] =
+          ConstantInt::get(DMask->getType(), NewMaskVal);
+      Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
+      CallInst *NewCall = B.CreateCall(NewIntrin, Args);
+      LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");
+
+      NewCalls.push_back(NewCall);
+      DMaskVal -= NewMaskVal;
+    }
+
+    // Create the new extractelement instructions.
+    for (IntrinsicInst *II : IIList) {
+      Value *VecOp = UndefValue::get(II->getType());
+      auto *Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
+      if (NumElts == 1) {
+        VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
+        LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
+      } else {
+        // Channel I of the original result comes from the I-th new call.
+        for (unsigned I = 0; I < NumElts; ++I) {
+          VecOp = B.CreateInsertElement(
+              VecOp,
+              B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
+          LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
+        }
+      }
+
+      // Replace the old instruction.
+      II->replaceAllUsesWith(VecOp);
+      InstrsToErase.push_back(II);
+    }
+
+    Modified = true;
+  }
+
+  // Erase only after all groups are processed, so that no list ever refers to
+  // an already deleted instruction.
+  for (Instruction *I : InstrsToErase)
+    I->eraseFromParent();
+
+  MergeableInsts.clear();
+
+  return Modified;
+}
+
+// Shared implementation behind both pass wrappers. Returns true when F was
+// modified. Does nothing without a TargetMachine, on subtargets that are not
+// GFX11+ or that have the MSAA load dst_sel bug, or when the module has no
+// used image_load_2dmsaa / image_load_2darraymsaa declaration.
+static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
+  if (!TM)
+    return false;
+
+  // This optimization only applies to GFX11 and beyond.
+  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+  if (!AMDGPU::isGFX11Plus(ST))
+    return false;
+  if (ST.hasMSAALoadDstSelBug())
+    return false;
+
+  // Early test to determine if the intrinsics are used.
+  auto IsUsedMSAAImageLoad = [](Function &Decl) {
+    if (Decl.users().empty())
+      return false;
+    return Decl.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
+           Decl.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa;
+  };
+  Module *M = F.getParent();
+  if (std::none_of(M->begin(), M->end(), IsUsedMSAAImageLoad))
+    return false;
+
+  // Walk each block section by section; a section ends right after an
+  // instruction that may have side effects (see collectMergeableInsts).
+  bool Modified = false;
+  for (BasicBlock &BB : F) {
+    BasicBlock::iterator Cursor = BB.begin();
+    BasicBlock::iterator BlockEnd = BB.end();
+    while (Cursor != BlockEnd) {
+      std::list<std::list<IntrinsicInst *>> MergeableInsts;
+
+      Cursor = collectMergeableInsts(Cursor, BlockEnd, MergeableInsts);
+      Modified |= optimizeSection(MergeableInsts);
+    }
+  }
+
+  return Modified;
+}
+
+// FunctionPass entry point: returns false without touching F when
+// skipFunction(F) is true, otherwise forwards to the shared implementation and
+// returns its result.
+bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
+  return !skipFunction(F) && imageIntrinsicOptimizerImpl(F, TM);
+}
+
+// Factory for the FunctionPass wrapper: returns a newly allocated
+// AMDGPUImageIntrinsicOptimizer that keeps the given TargetMachine pointer.
+FunctionPass *
+llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {
+ return new AMDGPUImageIntrinsicOptimizer(TM);
+}
+
+// PassInfoMixin entry point: runs the shared implementation on F and reports
+// all analyses as preserved when nothing changed, none otherwise.
+PreservedAnalyses
+AMDGPUImageIntrinsicOptimizerPass::run(Function &F,
+                                       FunctionAnalysisManager &AM) {
+  if (!imageIntrinsicOptimizerImpl(F, &TM))
+    return PreservedAnalyses::all();
+
+  return PreservedAnalyses::none();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 481fbaf1543a4ea..bcbc03eb2559c4f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -333,6 +333,11 @@ static cl::opt<bool> EnablePromoteKernelArguments(
cl::desc("Enable promotion of flat kernel pointer arguments to global"),
cl::Hidden, cl::init(true));
+// Hidden option (default: true) consulted by AMDGPUPassConfig::addIRPasses()
+// before it adds the image intrinsic optimizer pass.
+static cl::opt<bool> EnableImageIntrinsicOptimizer(
+ "amdgpu-enable-image-intrinsic-optimizer",
+ cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
+ cl::Hidden);
+
static cl::opt<bool> EnableMaxIlpSchedStrategy(
"amdgpu-enable-max-ilp-scheduling-strategy",
cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
@@ -410,6 +415,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
initializeAMDGPUExternalAAWrapperPass(*PR);
+ initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
initializeAMDGPUPrintfRuntimeBindingPass(*PR);
initializeAMDGPUResourceUsageAnalysisPass(*PR);
initializeGCNNSAReassignPass(*PR);
@@ -626,6 +632,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PM.addPass(AMDGPUSimplifyLibCallsPass());
return true;
}
+ if (PassName == "amdgpu-image-intrinsic-opt") {
+ PM.addPass(AMDGPUImageIntrinsicOptimizerPass(*this));
+ return true;
+ }
if (PassName == "amdgpu-usenative") {
PM.addPass(AMDGPUUseNativeCallsPass());
return true;
@@ -980,6 +990,9 @@ void AMDGPUPassConfig::addIRPasses() {
if (LowerCtorDtor)
addPass(createAMDGPUCtorDtorLoweringLegacyPass());
+ if (isPassEnabled(EnableImageIntrinsicOptimizer))
+ addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
+
// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());
addPass(createAlwaysInlinerLegacyPass());
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 0922e8d99deb3aa..8124fdd5ddfefec 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -66,6 +66,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULateCodeGenPrepare.cpp
AMDGPULegalizerInfo.cpp
AMDGPULibCalls.cpp
+ AMDGPUImageIntrinsicOptimizer.cpp
AMDGPULibFunc.cpp
AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 970ce48de9f47c2..744eb50aaebd36a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -206,6 +206,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasFlatSegmentOffsetBug = false;
bool HasImageStoreD16Bug = false;
bool HasImageGather4D16Bug = false;
+ bool HasMSAALoadDstSelBug = false;
bool HasGFX11FullVGPRs = false;
bool HasMADIntraFwdBug = false;
bool HasVOPDInsts = false;
@@ -954,6 +955,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
+ // True when the subtarget has the "MSAA loads not honoring dst_sel" bug.
+ bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
+
bool hasNSAEncoding() const { return HasNSAEncoding; }
bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 84f67b3faac3c07..b939c8d2e339de4 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -739,6 +739,8 @@
; GCN-O2-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O2-NEXT: AMDGPU Printf lowering
; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU
+; GCN-O2-NEXT: FunctionPass Manager
+; GCN-O2-NEXT: AMDGPU Image Intrinsic Optimizer
; GCN-O2-NEXT: AMDGPU Inline All Functions
; GCN-O2-NEXT: Inliner for always_inline functions
; GCN-O2-NEXT: FunctionPass Manager
@@ -1043,6 +1045,8 @@
; GCN-O3-NEXT: AMDGPU Remove Incompatible Functions
; GCN-O3-NEXT: AMDGPU Printf lowering
; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU
+; GCN-O3-NEXT: FunctionPass Manager
+; GCN-O3-NEXT: AMDGPU Image Intrinsic Optimizer
; GCN-O3-NEXT: AMDGPU Inline All Functions
; GCN-O3-NEXT: Inliner for always_inline functions
; GCN-O3-NEXT: FunctionPass Manager
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll
new file mode 100644
index 000000000000000..45afac52a6a5ceb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll
@@ -0,0 +1,606 @@
+; RUN: opt -S -passes=amdgpu-image-intrinsic-opt -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=NO-MSAA %s
+; RUN: opt -S -passes=amdgpu-image-intrinsic-opt -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=NO-MSAA %s
+; RUN: opt -S -passes=amdgpu-image-intrinsic-opt -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1150 < %s | FileCheck -check-prefixes=MSAA %s
+
+; NO-MSAA-NOT: @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32
+; NO-MSAA-NOT: @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32
+
+; MSAA-LABEL: @load_2dmsaa_v4f32_dmask1
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = extractelement <4 x float> %0, i64 0
+; MSAA: %2 = extractelement <4 x float> %0, i64 1
+; MSAA: %3 = extractelement <4 x float> %0, i64 2
+; MSAA: %4 = extractelement <4 x float> %0, i64 3
+; Four scalar loads with dmask 1 and constant FragIds 0, 1, 2, 3, all using the
+; same %s/%t coordinates and %rsrc.
+define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [4 x float] undef, float %i, 0
+ %i5 = insertvalue [4 x float] %i4, float %i1, 1
+ %i6 = insertvalue [4 x float] %i5, float %i2, 2
+ %i7 = insertvalue [4 x float] %i6, float %i3, 3
+ ret [4 x float] %i7
+}
+
+; MSAA-LABEL: @load_2dmsaa_v4f32_dmask2
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = extractelement <4 x float> %0, i64 0
+; MSAA: %2 = extractelement <4 x float> %0, i64 1
+; MSAA: %3 = extractelement <4 x float> %0, i64 2
+; MSAA: %4 = extractelement <4 x float> %0, i64 3
+; Same shape as the dmask 1 case, but every load uses dmask 2.
+define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask2(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 2, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 2, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 2, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [4 x float] undef, float %i, 0
+ %i5 = insertvalue [4 x float] %i4, float %i1, 1
+ %i6 = insertvalue [4 x float] %i5, float %i2, 2
+ %i7 = insertvalue [4 x float] %i6, float %i3, 3
+ ret [4 x float] %i7
+}
+
+; MSAA-LABEL: @load_2dmsaa_v4f32_dmask4
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 4, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = extractelement <4 x float> %0, i64 0
+; MSAA: %2 = extractelement <4 x float> %0, i64 1
+; MSAA: %3 = extractelement <4 x float> %0, i64 2
+; MSAA: %4 = extractelement <4 x float> %0, i64 3
+; Same shape as the dmask 1 case, but every load uses dmask 4.
+define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask4(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 4, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 4, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 4, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 4, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [4 x float] undef, float %i, 0
+ %i5 = insertvalue [4 x float] %i4, float %i1, 1
+ %i6 = insertvalue [4 x float] %i5, float %i2, 2
+ %i7 = insertvalue [4 x float] %i6, float %i3, 3
+ ret [4 x float] %i7
+}
+
+; MSAA-LABEL: @load_2dmsaa_v4f32_dmask8
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 8, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = extractelement <4 x float> %0, i64 0
+; MSAA: %2 = extractelement <4 x float> %0, i64 1
+; MSAA: %3 = extractelement <4 x float> %0, i64 2
+; MSAA: %4 = extractelement <4 x float> %0, i64 3
+; Same shape as the dmask 1 case, but every load uses dmask 8.
+define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask8(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 8, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 8, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 8, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 8, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [4 x float] undef, float %i, 0
+ %i5 = insertvalue [4 x float] %i4, float %i1, 1
+ %i6 = insertvalue [4 x float] %i5, float %i2, 2
+ %i7 = insertvalue [4 x float] %i6, float %i3, 3
+ ret [4 x float] %i7
+}
+
+; MSAA-LABEL: @load_2dmsaa_v4f32_reverse
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = extractelement <4 x float> %0, i64 3
+; MSAA: %2 = extractelement <4 x float> %0, i64 2
+; MSAA: %3 = extractelement <4 x float> %0, i64 1
+; MSAA: %4 = extractelement <4 x float> %0, i64 0
+; FragIds appear in descending order (3, 2, 1, 0); the expected extractelement
+; indices follow the FragId of each original load.
+define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_reverse(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [4 x float] undef, float %i, 0
+ %i5 = insertvalue [4 x float] %i4, float %i1, 1
+ %i6 = insertvalue [4 x float] %i5, float %i2, 2
+ %i7 = insertvalue [4 x float] %i6, float %i3, 3
+ ret [4 x float] %i7
+}
+
+; Don't combine because the vaddr inputs are not identical.
+; MSAA-LABEL: @load_2dmsaa_v4f32_vaddr
+; MSAA-NOT: @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32
+; Each load uses its own coordinate pair (%s0/%t0 .. %s3/%t3), so no two loads
+; share the same vaddr.
+define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_vaddr(<8 x i32> inreg %rsrc, i32 %s0, i32 %t0, i32 %s1, i32 %t1, i32 %s2, i32 %t2, i32 %s3, i32 %t3) {
+main_body:
+ %i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s0, i32 %t0, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s1, i32 %t1, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s2, i32 %t2, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s3, i32 %t3, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [4 x float] undef, float %i, 0
+ %i5 = insertvalue [4 x float] %i4, float %i1, 1
+ %i6 = insertvalue [4 x float] %i5, float %i2, 2
+ %i7 = insertvalue [4 x float] %i6, float %i3, 3
+ ret [4 x float] %i7
+}
+
+; MSAA-LABEL: @load_2dmsaa_v8f32
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = extractelement <4 x float> %0, i64 0
+; MSAA: %2 = extractelement <4 x float> %0, i64 1
+; MSAA: %3 = extractelement <4 x float> %0, i64 2
+; MSAA: %4 = extractelement <4 x float> %0, i64 3
+; Eight loads: FragIds 0..3 are each loaded twice with otherwise identical
+; operands.
+define amdgpu_ps [8 x float] @load_2dmsaa_v8f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i5 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i6 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i7 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i8 = insertvalue [8 x float] undef, float %i, 0
+ %i9 = insertvalue [8 x float] %i8, float %i1, 1
+ %i10 = insertvalue [8 x float] %i9, float %i2, 2
+ %i11 = insertvalue [8 x float] %i10, float %i3, 3
+ %i12 = insertvalue [8 x float] %i11, float %i4, 4
+ %i13 = insertvalue [8 x float] %i12, float %i5, 5
+ %i14 = insertvalue [8 x float] %i13, float %i6, 6
+ %i15 = insertvalue [8 x float] %i14, float %i7, 7
+ ret [8 x float] %i15
+}
+
+; MSAA-LABEL: @load_2dmsaa_v4f32_interleaved
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = extractelement <4 x float> %0, i64 0
+; MSAA: %2 = extractelement <4 x float> %0, i64 1
+; MSAA: %3 = extractelement <4 x float> %0, i64 2
+; MSAA: %4 = extractelement <4 x float> %0, i64 3
+; The four loads are not consecutive: an insertvalue using each result sits
+; between one load and the next.
+define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_interleaved(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = insertvalue [4 x float] undef, float %i, 0
+ %i2 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = insertvalue [4 x float] %i1, float %i2, 1
+ %i4 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i5 = insertvalue [4 x float] %i3, float %i4, 2
+ %i6 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i7 = insertvalue [4 x float] %i5, float %i6, 3
+ ret [4 x float] %i7
+}
+
+; MSAA-LABEL: @load_2dmsaa_v2f32_fragId01
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = extractelement <4 x float> %0, i64 0
+; MSAA: %2 = extractelement <4 x float> %0, i64 1
+; Only two loads, with FragIds 0 and 1.
+define amdgpu_ps [2 x float] @load_2dmsaa_v2f32_fragId01(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = insertvalue [2 x float] undef, float %i, 0
+ %i3 = insertvalue [2 x float] %i2, float %i1, 1
+ ret [2 x float] %i3
+}
+
+; MSAA-LABEL: @load_2dmsaa_v2f32_fragId23
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = extractelement <4 x float> %0, i64 2
+; MSAA: %2 = extractelement <4 x float> %0, i64 3
+define amdgpu_ps [2 x float] @load_2dmsaa_v2f32_fragId23(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = insertvalue [2 x float] undef, float %i, 0
+ %i3 = insertvalue [2 x float] %i2, float %i1, 1
+ ret [2 x float] %i3
+}
+
+; Don't combine because it's not profitable: the resulting msaa loads would
+; have 8 vdata outputs.
+; MSAA-LABEL: @load_2dmsaa_v2v2f32_dmask3
+; MSAA-NOT: @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32
+define amdgpu_ps [2 x <2 x float>] @load_2dmsaa_v2v2f32_dmask3(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [2 x <2 x float>] undef, <2 x float> %i, 0
+ %i5 = insertvalue [2 x <2 x float>] %i4, <2 x float> %i1, 1
+ ret [2 x <2 x float>] %i5
+}
+
+; MSAA-LABEL: @load_2dmsaa_v4v2f32_dmask3
+
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+
+; MSAA: %2 = extractelement <4 x float> %0, i64 0
+; MSAA: %3 = insertelement <2 x float> undef, float %2, i64 0
+; MSAA: %4 = extractelement <4 x float> %1, i64 0
+; MSAA: %5 = insertelement <2 x float> %3, float %4, i64 1
+
+; MSAA: %6 = extractelement <4 x float> %0, i64 1
+; MSAA: %7 = insertelement <2 x float> undef, float %6, i64 0
+; MSAA: %8 = extractelement <4 x float> %1, i64 1
+; MSAA: %9 = insertelement <2 x float> %7, float %8, i64 1
+define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask3(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [4 x <2 x float>] undef, <2 x float> %i, 0
+ %i5 = insertvalue [4 x <2 x float>] %i4, <2 x float> %i1, 1
+ %i6 = insertvalue [4 x <2 x float>] %i5, <2 x float> %i2, 2
+ %i7 = insertvalue [4 x <2 x float>] %i6, <2 x float> %i3, 3
+ ret [4 x <2 x float>] %i7
+}
+
+; MSAA-LABEL: @load_2dmsaa_v4v2f32_dmask5
+
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 4, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+
+; MSAA: %2 = extractelement <4 x float> %0, i64 0
+; MSAA: %3 = insertelement <2 x float> undef, float %2, i64 0
+; MSAA: %4 = extractelement <4 x float> %1, i64 0
+; MSAA: %5 = insertelement <2 x float> %3, float %4, i64 1
+
+; MSAA: %6 = extractelement <4 x float> %0, i64 1
+; MSAA: %7 = insertelement <2 x float> undef, float %6, i64 0
+; MSAA: %8 = extractelement <4 x float> %1, i64 1
+; MSAA: %9 = insertelement <2 x float> %7, float %8, i64 1
+define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask5(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 5, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 5, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 5, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 5, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [4 x <2 x float>] undef, <2 x float> %i, 0
+ %i5 = insertvalue [4 x <2 x float>] %i4, <2 x float> %i1, 1
+ %i6 = insertvalue [4 x <2 x float>] %i5, <2 x float> %i2, 2
+ %i7 = insertvalue [4 x <2 x float>] %i6, <2 x float> %i3, 3
+ ret [4 x <2 x float>] %i7
+}
+
+; MSAA-LABEL: @load_2dmsaa_v4v2f32_dmask6
+
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 4, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+
+; MSAA: %2 = extractelement <4 x float> %0, i64 0
+; MSAA: %3 = insertelement <2 x float> undef, float %2, i64 0
+; MSAA: %4 = extractelement <4 x float> %1, i64 0
+; MSAA: %5 = insertelement <2 x float> %3, float %4, i64 1
+
+; MSAA: %6 = extractelement <4 x float> %0, i64 1
+; MSAA: %7 = insertelement <2 x float> undef, float %6, i64 0
+; MSAA: %8 = extractelement <4 x float> %1, i64 1
+; MSAA: %9 = insertelement <2 x float> %7, float %8, i64 1
+define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask6(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 6, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 6, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 6, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 6, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [4 x <2 x float>] undef, <2 x float> %i, 0
+ %i5 = insertvalue [4 x <2 x float>] %i4, <2 x float> %i1, 1
+ %i6 = insertvalue [4 x <2 x float>] %i5, <2 x float> %i2, 2
+ %i7 = insertvalue [4 x <2 x float>] %i6, <2 x float> %i3, 3
+ ret [4 x <2 x float>] %i7
+}
+
+; MSAA-LABEL: @load_2dmsaa_v4v2f32_dmask9
+
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 8, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+
+; MSAA: %2 = extractelement <4 x float> %0, i64 0
+; MSAA: %3 = insertelement <2 x float> undef, float %2, i64 0
+; MSAA: %4 = extractelement <4 x float> %1, i64 0
+; MSAA: %5 = insertelement <2 x float> %3, float %4, i64 1
+
+; MSAA: %6 = extractelement <4 x float> %0, i64 1
+; MSAA: %7 = insertelement <2 x float> undef, float %6, i64 0
+; MSAA: %8 = extractelement <4 x float> %1, i64 1
+; MSAA: %9 = insertelement <2 x float> %7, float %8, i64 1
+define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask9(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 9, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 9, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 9, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 9, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [4 x <2 x float>] undef, <2 x float> %i, 0
+ %i5 = insertvalue [4 x <2 x float>] %i4, <2 x float> %i1, 1
+ %i6 = insertvalue [4 x <2 x float>] %i5, <2 x float> %i2, 2
+ %i7 = insertvalue [4 x <2 x float>] %i6, <2 x float> %i3, 3
+ ret [4 x <2 x float>] %i7
+}
+
+; MSAA-LABEL: @load_2dmsaa_v4v2f32_dmask10
+
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 8, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+
+; MSAA: %2 = extractelement <4 x float> %0, i64 0
+; MSAA: %3 = insertelement <2 x float> undef, float %2, i64 0
+; MSAA: %4 = extractelement <4 x float> %1, i64 0
+; MSAA: %5 = insertelement <2 x float> %3, float %4, i64 1
+
+; MSAA: %6 = extractelement <4 x float> %0, i64 1
+; MSAA: %7 = insertelement <2 x float> undef, float %6, i64 0
+; MSAA: %8 = extractelement <4 x float> %1, i64 1
+; MSAA: %9 = insertelement <2 x float> %7, float %8, i64 1
+define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask10(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 10, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 10, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 10, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 10, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [4 x <2 x float>] undef, <2 x float> %i, 0
+ %i5 = insertvalue [4 x <2 x float>] %i4, <2 x float> %i1, 1
+ %i6 = insertvalue [4 x <2 x float>] %i5, <2 x float> %i2, 2
+ %i7 = insertvalue [4 x <2 x float>] %i6, <2 x float> %i3, 3
+ ret [4 x <2 x float>] %i7
+}
+
+; MSAA-LABEL: @load_2dmsaa_v4v2f32_dmask12
+
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 4, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 8, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+
+; MSAA: %2 = extractelement <4 x float> %0, i64 0
+; MSAA: %3 = insertelement <2 x float> undef, float %2, i64 0
+; MSAA: %4 = extractelement <4 x float> %1, i64 0
+; MSAA: %5 = insertelement <2 x float> %3, float %4, i64 1
+
+; MSAA: %6 = extractelement <4 x float> %0, i64 1
+; MSAA: %7 = insertelement <2 x float> undef, float %6, i64 0
+; MSAA: %8 = extractelement <4 x float> %1, i64 1
+; MSAA: %9 = insertelement <2 x float> %7, float %8, i64 1
+define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask12(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 12, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 12, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 12, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 12, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [4 x <2 x float>] undef, <2 x float> %i, 0
+ %i5 = insertvalue [4 x <2 x float>] %i4, <2 x float> %i1, 1
+ %i6 = insertvalue [4 x <2 x float>] %i5, <2 x float> %i2, 2
+ %i7 = insertvalue [4 x <2 x float>] %i6, <2 x float> %i3, 3
+ ret [4 x <2 x float>] %i7
+}
+
+; MSAA-LABEL: @load_2dmsaa_v2f16_fragId01
+; MSAA: %0 = call <4 x half> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f16.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = extractelement <4 x half> %0, i64 0
+; MSAA: %2 = extractelement <4 x half> %0, i64 1
+define amdgpu_ps [2 x half] @load_2dmsaa_v2f16_fragId01(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call half @llvm.amdgcn.image.load.2dmsaa.f16.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call half @llvm.amdgcn.image.load.2dmsaa.f16.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = insertvalue [2 x half] undef, half %i, 0
+ %i3 = insertvalue [2 x half] %i2, half %i1, 1
+ ret [2 x half] %i3
+}
+
+; MSAA-LABEL: @load_2darraymsaa_v4f32_dmask1
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = extractelement <4 x float> %0, i64 0
+; MSAA: %2 = extractelement <4 x float> %0, i64 1
+; MSAA: %3 = extractelement <4 x float> %0, i64 2
+; MSAA: %4 = extractelement <4 x float> %0, i64 3
+define amdgpu_ps [4 x float] @load_2darraymsaa_v4f32_dmask1(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+main_body:
+ %i = call float @llvm.amdgcn.image.load.2darraymsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call float @llvm.amdgcn.image.load.2darraymsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call float @llvm.amdgcn.image.load.2darraymsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call float @llvm.amdgcn.image.load.2darraymsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [4 x float] undef, float %i, 0
+ %i5 = insertvalue [4 x float] %i4, float %i1, 1
+ %i6 = insertvalue [4 x float] %i5, float %i2, 2
+ %i7 = insertvalue [4 x float] %i6, float %i3, 3
+ ret [4 x float] %i7
+}
+
+; MSAA-LABEL: @load_2darraymsaa_v4v2f32_dmask3
+
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 %slice, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+
+; MSAA: %2 = extractelement <4 x float> %0, i64 0
+; MSAA: %3 = insertelement <2 x float> undef, float %2, i64 0
+; MSAA: %4 = extractelement <4 x float> %1, i64 0
+; MSAA: %5 = insertelement <2 x float> %3, float %4, i64 1
+
+; MSAA: %6 = extractelement <4 x float> %0, i64 1
+; MSAA: %7 = insertelement <2 x float> undef, float %6, i64 0
+; MSAA: %8 = extractelement <4 x float> %1, i64 1
+; MSAA: %9 = insertelement <2 x float> %7, float %8, i64 1
+define amdgpu_ps [4 x <2 x float>] @load_2darraymsaa_v4v2f32_dmask3(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+main_body:
+ %i = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 %slice, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 %slice, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 %slice, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 %slice, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [4 x <2 x float>] undef, <2 x float> %i, 0
+ %i5 = insertvalue [4 x <2 x float>] %i4, <2 x float> %i1, 1
+ %i6 = insertvalue [4 x <2 x float>] %i5, <2 x float> %i2, 2
+ %i7 = insertvalue [4 x <2 x float>] %i6, <2 x float> %i3, 3
+ ret [4 x <2 x float>] %i7
+}
+
+; MSAA-LABEL: @load_2dmsaa_v4v3f32_dmask7
+
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %2 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 4, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+
+; MSAA: %3 = extractelement <4 x float> %0, i64 0
+; MSAA: %4 = insertelement <3 x float> undef, float %3, i64 0
+; MSAA: %5 = extractelement <4 x float> %1, i64 0
+; MSAA: %6 = insertelement <3 x float> %4, float %5, i64 1
+; MSAA: %7 = extractelement <4 x float> %2, i64 0
+; MSAA: %8 = insertelement <3 x float> %6, float %7, i64 2
+
+; MSAA: %9 = extractelement <4 x float> %0, i64 1
+; MSAA: %10 = insertelement <3 x float> undef, float %9, i64 0
+; MSAA: %11 = extractelement <4 x float> %1, i64 1
+; MSAA: %12 = insertelement <3 x float> %10, float %11, i64 1
+; MSAA: %13 = extractelement <4 x float> %2, i64 1
+; MSAA: %14 = insertelement <3 x float> %12, float %13, i64 2
+
+; MSAA: %15 = extractelement <4 x float> %0, i64 2
+; MSAA: %16 = insertelement <3 x float> undef, float %15, i64 0
+; MSAA: %17 = extractelement <4 x float> %1, i64 2
+; MSAA: %18 = insertelement <3 x float> %16, float %17, i64 1
+; MSAA: %19 = extractelement <4 x float> %2, i64 2
+; MSAA: %20 = insertelement <3 x float> %18, float %19, i64 2
+
+; MSAA: %21 = extractelement <4 x float> %0, i64 3
+; MSAA: %22 = insertelement <3 x float> undef, float %21, i64 0
+; MSAA: %23 = extractelement <4 x float> %1, i64 3
+; MSAA: %24 = insertelement <3 x float> %22, float %23, i64 1
+; MSAA: %25 = extractelement <4 x float> %2, i64 3
+; MSAA: %26 = insertelement <3 x float> %24, float %25, i64 2
+define amdgpu_ps [4 x <3 x float>] @load_2dmsaa_v4v3f32_dmask7(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [4 x <3 x float>] undef, <3 x float> %i, 0
+ %i5 = insertvalue [4 x <3 x float>] %i4, <3 x float> %i1, 1
+ %i6 = insertvalue [4 x <3 x float>] %i5, <3 x float> %i2, 2
+ %i7 = insertvalue [4 x <3 x float>] %i6, <3 x float> %i3, 3
+ ret [4 x <3 x float>] %i7
+}
+
+; MSAA-LABEL: @load_2dmsaa_v4v3f32_dmask7_group1
+
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 4, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 4, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %2 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 4, i32 %s, i32 %t, i32 4, <8 x i32> %rsrc, i32 0, i32 0)
+
+; MSAA: %3 = extractelement <4 x float> %0, i64 0
+; MSAA: %4 = insertelement <3 x float> undef, float %3, i64 0
+; MSAA: %5 = extractelement <4 x float> %1, i64 0
+; MSAA: %6 = insertelement <3 x float> %4, float %5, i64 1
+; MSAA: %7 = extractelement <4 x float> %2, i64 0
+; MSAA: %8 = insertelement <3 x float> %6, float %7, i64 2
+
+; MSAA: %9 = extractelement <4 x float> %0, i64 1
+; MSAA: %10 = insertelement <3 x float> undef, float %9, i64 0
+; MSAA: %11 = extractelement <4 x float> %1, i64 1
+; MSAA: %12 = insertelement <3 x float> %10, float %11, i64 1
+; MSAA: %13 = extractelement <4 x float> %2, i64 1
+; MSAA: %14 = insertelement <3 x float> %12, float %13, i64 2
+
+; MSAA: %15 = extractelement <4 x float> %0, i64 2
+; MSAA: %16 = insertelement <3 x float> undef, float %15, i64 0
+; MSAA: %17 = extractelement <4 x float> %1, i64 2
+; MSAA: %18 = insertelement <3 x float> %16, float %17, i64 1
+; MSAA: %19 = extractelement <4 x float> %2, i64 2
+; MSAA: %20 = insertelement <3 x float> %18, float %19, i64 2
+
+; MSAA: %21 = extractelement <4 x float> %0, i64 3
+; MSAA: %22 = insertelement <3 x float> undef, float %21, i64 0
+; MSAA: %23 = extractelement <4 x float> %1, i64 3
+; MSAA: %24 = insertelement <3 x float> %22, float %23, i64 1
+; MSAA: %25 = extractelement <4 x float> %2, i64 3
+; MSAA: %26 = insertelement <3 x float> %24, float %25, i64 2
+define amdgpu_ps [4 x <3 x float>] @load_2dmsaa_v4v3f32_dmask7_group1(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %i = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 %s, i32 %t, i32 4, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 %s, i32 %t, i32 5, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 %s, i32 %t, i32 6, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 %s, i32 %t, i32 7, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [4 x <3 x float>] undef, <3 x float> %i, 0
+ %i5 = insertvalue [4 x <3 x float>] %i4, <3 x float> %i1, 1
+ %i6 = insertvalue [4 x <3 x float>] %i5, <3 x float> %i2, 2
+ %i7 = insertvalue [4 x <3 x float>] %i6, <3 x float> %i3, 3
+ ret [4 x <3 x float>] %i7
+}
+
+; MSAA-LABEL: @load_2dmsaa_v4f32_sections
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = extractelement <4 x float> %0, i64 0
+; MSAA: %2 = extractelement <4 x float> %0, i64 1
+; MSAA: call void @llvm.amdgcn.image.store.2dmsaa.f32.i32(float %vdata, i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %3 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %4 = extractelement <4 x float> %3, i64 2
+; MSAA: %5 = extractelement <4 x float> %3, i64 3
+define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_sections(<8 x i32> inreg %rsrc, float %vdata, i32 %s, i32 %t, i32 %fragid) {
+main_body:
+ %i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ call void @llvm.amdgcn.image.store.2dmsaa.f32.i32(float %vdata, i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [4 x float] undef, float %i, 0
+ %i5 = insertvalue [4 x float] %i4, float %i1, 1
+ %i6 = insertvalue [4 x float] %i5, float %i2, 2
+ %i7 = insertvalue [4 x float] %i6, float %i3, 3
+ ret [4 x float] %i7
+}
+
+; MSAA-LABEL: @load_2dmsaa_v4f32_blocks
+; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %1 = extractelement <4 x float> %0, i64 0
+; MSAA: %2 = extractelement <4 x float> %0, i64 1
+; MSAA: %3 = extractelement <4 x float> %0, i64 2
+; MSAA: %4 = extractelement <4 x float> %0, i64 3
+; MSAA-LABEL: if_equal:
+; MSAA: %5 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %6 = extractelement <4 x float> %5, i64 0
+; MSAA: %7 = extractelement <4 x float> %5, i64 1
+; MSAA: %8 = extractelement <4 x float> %5, i64 2
+; MSAA: %9 = extractelement <4 x float> %5, i64 3
+; MSAA-LABEL: if_unequal:
+; MSAA: %10 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+; MSAA: %11 = extractelement <4 x float> %10, i64 0
+; MSAA: %12 = extractelement <4 x float> %10, i64 1
+; MSAA: %13 = extractelement <4 x float> %10, i64 2
+; MSAA: %14 = extractelement <4 x float> %10, i64 3
+define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_blocks(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %cond) {
+main_body:
+ %i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i2 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i3 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i4 = insertvalue [4 x float] undef, float %i, 0
+ %i5 = insertvalue [4 x float] %i4, float %i1, 1
+ %i6 = insertvalue [4 x float] %i5, float %i2, 2
+ %i7 = insertvalue [4 x float] %i6, float %i3, 3
+ %i8 = trunc i32 %cond to i1
+ br i1 %i8, label %if_equal, label %if_unequal
+if_equal:
+ %i9 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i10 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i11 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i12 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i13 = insertvalue [4 x float] undef, float %i9, 0
+ %i14 = insertvalue [4 x float] %i13, float %i10, 1
+ %i15 = insertvalue [4 x float] %i14, float %i11, 2
+ %i16 = insertvalue [4 x float] %i15, float %i12, 3
+ br label %merge
+if_unequal:
+ %i17 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ %i18 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+ %i19 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+ %i20 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+ %i21 = insertvalue [4 x float] undef, float %i17, 0
+ %i22 = insertvalue [4 x float] %i21, float %i18, 1
+ %i23 = insertvalue [4 x float] %i22, float %i19, 2
+ %i24 = insertvalue [4 x float] %i23, float %i20, 3
+ br label %merge
+merge:
+ %i25 = phi [4 x float] [%i16, %if_equal], [%i24, %if_unequal]
+ ret [4 x float] %i25
+}
+
+declare float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+
+declare float @llvm.amdgcn.image.load.2darraymsaa.f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+
+declare half @llvm.amdgcn.image.load.2dmsaa.f16.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+
+declare void @llvm.amdgcn.image.store.2dmsaa.f32.i32(float, i32, i32, i32, i32, <8 x i32>, i32, i32)
+
+attributes #0 = { nounwind readonly willreturn }
diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
index 34a0eb7a20d3c39..cf4b629ec01977c 100644
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -331,6 +331,7 @@ static bool shouldPinPassToLegacyPM(StringRef Pass) {
"nvvm-reflect",
"nvvm-intr-range",
"amdgpu-simplifylib",
+ "amdgpu-image-intrinsic-opt",
"amdgpu-usenative",
"amdgpu-promote-alloca",
"amdgpu-promote-alloca-to-vector",
>From 092dc609f2f5ef60908e9e8faa8793344fbbcebf Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 22 Sep 2023 16:32:49 +0100
Subject: [PATCH 2/4] clang-format
---
llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
index c392cc4fd1ebebe..c51bf83ac097315 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
@@ -36,6 +36,8 @@
// and TX, but higher vdata. We start by erring on the side of converting these
// to MSAA_LOAD.
//
+// clang-format off
+//
// This pass will combine intrinsics such as (not necessarily consecutive):
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -44,6 +46,8 @@
// ==>
// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//
+// clang-format on
+//
// Future improvements:
//
// - We may occasionally not want to do the combine if it increases the maximum
@@ -258,8 +262,8 @@ bool optimizeSection(std::list<std::list<IntrinsicInst *>> &MergeableInsts) {
} else {
for (unsigned I = 0; I < NumElts; ++I) {
VecOp = B.CreateInsertElement(
- VecOp, B.CreateExtractElement(
- NewCalls[I], Idx->getValue().urem(4)), I);
+ VecOp,
+ B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
}
}
>From 93cc83ede497438e85d25420d5846f39e6dec75d Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Mon, 25 Sep 2023 11:05:07 +0100
Subject: [PATCH 3/4] Address some review comments
---
.../AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp | 29 ++++++++-----------
1 file changed, 12 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
index c51bf83ac097315..e6164deb977f663 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
@@ -96,9 +96,10 @@ INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
char AMDGPUImageIntrinsicOptimizer::ID = 0;
void addInstToMergeableList(
- IntrinsicInst *II, std::list<std::list<IntrinsicInst *>> &MergeableInsts,
+ IntrinsicInst *II,
+ SmallVector<SmallVector<IntrinsicInst *>> &MergeableInsts,
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
- for (std::list<IntrinsicInst *> &IIList : MergeableInsts) {
+ for (SmallVector<IntrinsicInst *> &IIList : MergeableInsts) {
// Check Dim.
if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
continue;
@@ -143,9 +144,9 @@ void addInstToMergeableList(
// Collect list of all instructions we know how to merge in a subset of the
// block. It returns an iterator to the instruction after the last one analyzed.
-BasicBlock::iterator
-collectMergeableInsts(BasicBlock::iterator I, BasicBlock::iterator E,
- std::list<std::list<IntrinsicInst *>> &MergeableInsts) {
+BasicBlock::iterator collectMergeableInsts(
+ BasicBlock::iterator I, BasicBlock::iterator E,
+ SmallVector<SmallVector<IntrinsicInst *>> &MergeableInsts) {
for (; I != E; ++I) {
// Don't combine if there is a store in the middle or if there is a memory
// barrier.
@@ -177,11 +178,10 @@ collectMergeableInsts(BasicBlock::iterator I, BasicBlock::iterator E,
return I;
}
-bool optimizeSection(std::list<std::list<IntrinsicInst *>> &MergeableInsts) {
+bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *>> MergeableInsts) {
bool Modified = false;
- SmallVector<Instruction *, 4> InstrsToErase;
- for (auto IIList : MergeableInsts) {
+ for (const auto &IIList : MergeableInsts) {
if (IIList.size() <= 1)
continue;
@@ -254,12 +254,13 @@ bool optimizeSection(std::list<std::list<IntrinsicInst *>> &MergeableInsts) {
// Create the new extractelement instructions.
for (auto &II : IIList) {
- Value *VecOp = UndefValue::get(II->getType());
+ Value *VecOp = nullptr;
auto Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
if (NumElts == 1) {
VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
} else {
+ VecOp = UndefValue::get(II->getType());
for (unsigned I = 0; I < NumElts; ++I) {
VecOp = B.CreateInsertElement(
VecOp,
@@ -270,18 +271,12 @@ bool optimizeSection(std::list<std::list<IntrinsicInst *>> &MergeableInsts) {
// Replace the old instruction.
II->replaceAllUsesWith(VecOp);
- InstrsToErase.push_back(II);
+ II->eraseFromParent();
}
Modified = true;
}
- for (auto I : InstrsToErase) {
- I->eraseFromParent();
- }
-
- MergeableInsts.clear();
-
return Modified;
}
@@ -309,7 +304,7 @@ static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
BasicBlock::iterator SectionEnd;
for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
I = SectionEnd) {
- std::list<std::list<IntrinsicInst *>> MergeableInsts;
+ SmallVector<SmallVector<IntrinsicInst *>> MergeableInsts;
SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
Modified |= optimizeSection(MergeableInsts);
>From bc981ba83d1f288f9aca125d77278d438d741e2e Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Mon, 25 Sep 2023 14:39:26 +0100
Subject: [PATCH 4/4] Address more review comments
---
.../AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp | 18 +-
.../AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll | 1065 +++++++++++++----
2 files changed, 841 insertions(+), 242 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
index e6164deb977f663..acfd3407681a7fd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
@@ -97,9 +97,9 @@ char AMDGPUImageIntrinsicOptimizer::ID = 0;
void addInstToMergeableList(
IntrinsicInst *II,
- SmallVector<SmallVector<IntrinsicInst *>> &MergeableInsts,
+ SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
- for (SmallVector<IntrinsicInst *> &IIList : MergeableInsts) {
+ for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
// Check Dim.
if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
continue;
@@ -146,7 +146,7 @@ void addInstToMergeableList(
// block. It returns an iterator to the instruction after the last one analyzed.
BasicBlock::iterator collectMergeableInsts(
BasicBlock::iterator I, BasicBlock::iterator E,
- SmallVector<SmallVector<IntrinsicInst *>> &MergeableInsts) {
+ SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) {
for (; I != E; ++I) {
// Don't combine if there is a store in the middle or if there is a memory
// barrier.
@@ -178,9 +178,10 @@ BasicBlock::iterator collectMergeableInsts(
return I;
}
-bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *>> MergeableInsts) {
+bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
bool Modified = false;
+ SmallVector<Instruction *, 4> InstrsToErase;
for (const auto &IIList : MergeableInsts) {
if (IIList.size() <= 1)
continue;
@@ -256,6 +257,7 @@ bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *>> MergeableInsts) {
for (auto &II : IIList) {
Value *VecOp = nullptr;
auto Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
+ B.SetCurrentDebugLocation(II->getDebugLoc());
if (NumElts == 1) {
VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
@@ -271,12 +273,16 @@ bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *>> MergeableInsts) {
// Replace the old instruction.
II->replaceAllUsesWith(VecOp);
- II->eraseFromParent();
+ VecOp->takeName(II);
+ InstrsToErase.push_back(II);
}
Modified = true;
}
+ for (auto I : InstrsToErase)
+ I->eraseFromParent();
+
return Modified;
}
@@ -304,7 +310,7 @@ static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
BasicBlock::iterator SectionEnd;
for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
I = SectionEnd) {
- SmallVector<SmallVector<IntrinsicInst *>> MergeableInsts;
+ SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts;
SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
Modified |= optimizeSection(MergeableInsts);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll
index 45afac52a6a5ceb..853ca53767be8cc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll
@@ -1,17 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
; RUN: opt -S -passes=amdgpu-image-intrinsic-opt -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=NO-MSAA %s
; RUN: opt -S -passes=amdgpu-image-intrinsic-opt -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=NO-MSAA %s
; RUN: opt -S -passes=amdgpu-image-intrinsic-opt -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1150 < %s | FileCheck -check-prefixes=MSAA %s
-; NO-MSAA-NOT: @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32
-; NO-MSAA-NOT: @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32
-
-; MSAA-LABEL: @load_2dmsaa_v4f32_dmask1
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = extractelement <4 x float> %0, i64 0
-; MSAA: %2 = extractelement <4 x float> %0, i64 1
-; MSAA: %3 = extractelement <4 x float> %0, i64 2
-; MSAA: %4 = extractelement <4 x float> %0, i64 3
define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; NO-MSAA-NEXT: ret [4 x float] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0:[0-9]+]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[I1:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[I2:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[I3:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; MSAA-NEXT: ret [4 x float] [[I7]]
+;
main_body:
%i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -24,13 +43,34 @@ main_body:
ret [4 x float] %i7
}
-; MSAA-LABEL: @load_2dmsaa_v4f32_dmask2
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = extractelement <4 x float> %0, i64 0
-; MSAA: %2 = extractelement <4 x float> %0, i64 1
-; MSAA: %3 = extractelement <4 x float> %0, i64 2
-; MSAA: %4 = extractelement <4 x float> %0, i64 3
define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask2(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask2(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 2, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 2, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 2, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 2, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; NO-MSAA-NEXT: ret [4 x float] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask2(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[I1:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[I2:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[I3:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; MSAA-NEXT: ret [4 x float] [[I7]]
+;
main_body:
%i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 2, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -43,13 +83,34 @@ main_body:
ret [4 x float] %i7
}
-; MSAA-LABEL: @load_2dmsaa_v4f32_dmask4
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 4, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = extractelement <4 x float> %0, i64 0
-; MSAA: %2 = extractelement <4 x float> %0, i64 1
-; MSAA: %3 = extractelement <4 x float> %0, i64 2
-; MSAA: %4 = extractelement <4 x float> %0, i64 3
define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask4(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask4(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 4, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 4, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 4, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 4, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; NO-MSAA-NEXT: ret [4 x float] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask4(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 4, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[I1:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[I2:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[I3:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; MSAA-NEXT: ret [4 x float] [[I7]]
+;
main_body:
%i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 4, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 4, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -62,13 +123,34 @@ main_body:
ret [4 x float] %i7
}
-; MSAA-LABEL: @load_2dmsaa_v4f32_dmask8
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 8, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = extractelement <4 x float> %0, i64 0
-; MSAA: %2 = extractelement <4 x float> %0, i64 1
-; MSAA: %3 = extractelement <4 x float> %0, i64 2
-; MSAA: %4 = extractelement <4 x float> %0, i64 3
define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask8(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask8(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 8, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 8, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 8, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 8, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; NO-MSAA-NEXT: ret [4 x float] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask8(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 8, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[I1:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[I2:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[I3:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; MSAA-NEXT: ret [4 x float] [[I7]]
+;
main_body:
%i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 8, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 8, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -81,13 +163,34 @@ main_body:
ret [4 x float] %i7
}
-; MSAA-LABEL: @load_2dmsaa_v4f32_reverse
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = extractelement <4 x float> %0, i64 3
-; MSAA: %2 = extractelement <4 x float> %0, i64 2
-; MSAA: %3 = extractelement <4 x float> %0, i64 1
-; MSAA: %4 = extractelement <4 x float> %0, i64 0
define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_reverse(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_reverse(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; NO-MSAA-NEXT: ret [4 x float] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_reverse(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[I1:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[I2:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[I3:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; MSAA-NEXT: ret [4 x float] [[I7]]
+;
main_body:
%i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
@@ -101,9 +204,33 @@ main_body:
}
; Don't combine because the vaddr inputs are not identical.
-; MSAA-LABEL: @load_2dmsaa_v4f32_vaddr
-; MSAA-NOT: @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32
define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_vaddr(<8 x i32> inreg %rsrc, i32 %s0, i32 %t0, i32 %s1, i32 %t1, i32 %s2, i32 %t2, i32 %s3, i32 %t3) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_vaddr(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S0:%.*]], i32 [[T0:%.*]], i32 [[S1:%.*]], i32 [[T1:%.*]], i32 [[S2:%.*]], i32 [[T2:%.*]], i32 [[S3:%.*]], i32 [[T3:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S0]], i32 [[T0]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S1]], i32 [[T1]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S2]], i32 [[T2]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S3]], i32 [[T3]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; NO-MSAA-NEXT: ret [4 x float] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_vaddr(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S0:%.*]], i32 [[T0:%.*]], i32 [[S1:%.*]], i32 [[T1:%.*]], i32 [[S2:%.*]], i32 [[T2:%.*]], i32 [[S3:%.*]], i32 [[T3:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[I:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S0]], i32 [[T0]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I1:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S1]], i32 [[T1]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I2:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S2]], i32 [[T2]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I3:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S3]], i32 [[T3]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; MSAA-NEXT: ret [4 x float] [[I7]]
+;
main_body:
%i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s0, i32 %t0, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s1, i32 %t1, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -116,13 +243,50 @@ main_body:
ret [4 x float] %i7
}
-; MSAA-LABEL: @load_2dmsaa_v8f32
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = extractelement <4 x float> %0, i64 0
-; MSAA: %2 = extractelement <4 x float> %0, i64 1
-; MSAA: %3 = extractelement <4 x float> %0, i64 2
-; MSAA: %4 = extractelement <4 x float> %0, i64 3
define amdgpu_ps [8 x float] @load_2dmsaa_v8f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [8 x float] @load_2dmsaa_v8f32(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I5:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I6:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I7:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I8:%.*]] = insertvalue [8 x float] undef, float [[I]], 0
+; NO-MSAA-NEXT: [[I9:%.*]] = insertvalue [8 x float] [[I8]], float [[I1]], 1
+; NO-MSAA-NEXT: [[I10:%.*]] = insertvalue [8 x float] [[I9]], float [[I2]], 2
+; NO-MSAA-NEXT: [[I11:%.*]] = insertvalue [8 x float] [[I10]], float [[I3]], 3
+; NO-MSAA-NEXT: [[I12:%.*]] = insertvalue [8 x float] [[I11]], float [[I4]], 4
+; NO-MSAA-NEXT: [[I13:%.*]] = insertvalue [8 x float] [[I12]], float [[I5]], 5
+; NO-MSAA-NEXT: [[I14:%.*]] = insertvalue [8 x float] [[I13]], float [[I6]], 6
+; NO-MSAA-NEXT: [[I15:%.*]] = insertvalue [8 x float] [[I14]], float [[I7]], 7
+; NO-MSAA-NEXT: ret [8 x float] [[I15]]
+;
+; MSAA-LABEL: define amdgpu_ps [8 x float] @load_2dmsaa_v8f32(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[I1:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[I2:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[I3:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[I4:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[I5:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[I6:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[I7:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[I8:%.*]] = insertvalue [8 x float] undef, float [[I]], 0
+; MSAA-NEXT: [[I9:%.*]] = insertvalue [8 x float] [[I8]], float [[I1]], 1
+; MSAA-NEXT: [[I10:%.*]] = insertvalue [8 x float] [[I9]], float [[I2]], 2
+; MSAA-NEXT: [[I11:%.*]] = insertvalue [8 x float] [[I10]], float [[I3]], 3
+; MSAA-NEXT: [[I12:%.*]] = insertvalue [8 x float] [[I11]], float [[I4]], 4
+; MSAA-NEXT: [[I13:%.*]] = insertvalue [8 x float] [[I12]], float [[I5]], 5
+; MSAA-NEXT: [[I14:%.*]] = insertvalue [8 x float] [[I13]], float [[I6]], 6
+; MSAA-NEXT: [[I15:%.*]] = insertvalue [8 x float] [[I14]], float [[I7]], 7
+; MSAA-NEXT: ret [8 x float] [[I15]]
+;
main_body:
%i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -143,13 +307,34 @@ main_body:
ret [8 x float] %i15
}
-; MSAA-LABEL: @load_2dmsaa_v4f32_interleaved
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = extractelement <4 x float> %0, i64 0
-; MSAA: %2 = extractelement <4 x float> %0, i64 1
-; MSAA: %3 = extractelement <4 x float> %0, i64 2
-; MSAA: %4 = extractelement <4 x float> %0, i64 3
define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_interleaved(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_interleaved(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; NO-MSAA-NEXT: [[I2:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = insertvalue [4 x float] [[I1]], float [[I2]], 1
+; NO-MSAA-NEXT: [[I4:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I3]], float [[I4]], 2
+; NO-MSAA-NEXT: [[I6:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I5]], float [[I6]], 3
+; NO-MSAA-NEXT: ret [4 x float] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_interleaved(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[I2:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[I4:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[I6:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[I1:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; MSAA-NEXT: [[I3:%.*]] = insertvalue [4 x float] [[I1]], float [[I2]], 1
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I3]], float [[I4]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I5]], float [[I6]], 3
+; MSAA-NEXT: ret [4 x float] [[I7]]
+;
main_body:
%i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = insertvalue [4 x float] undef, float %i, 0
@@ -162,11 +347,26 @@ main_body:
ret [4 x float] %i7
}
-; MSAA-LABEL: @load_2dmsaa_v2f32_fragId01
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = extractelement <4 x float> %0, i64 0
-; MSAA: %2 = extractelement <4 x float> %0, i64 1
define amdgpu_ps [2 x float] @load_2dmsaa_v2f32_fragId01(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [2 x float] @load_2dmsaa_v2f32_fragId01(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = insertvalue [2 x float] undef, float [[I]], 0
+; NO-MSAA-NEXT: [[I3:%.*]] = insertvalue [2 x float] [[I2]], float [[I1]], 1
+; NO-MSAA-NEXT: ret [2 x float] [[I3]]
+;
+; MSAA-LABEL: define amdgpu_ps [2 x float] @load_2dmsaa_v2f32_fragId01(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[I1:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[I2:%.*]] = insertvalue [2 x float] undef, float [[I]], 0
+; MSAA-NEXT: [[I3:%.*]] = insertvalue [2 x float] [[I2]], float [[I1]], 1
+; MSAA-NEXT: ret [2 x float] [[I3]]
+;
main_body:
%i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -175,11 +375,26 @@ main_body:
ret [2 x float] %i3
}
-; MSAA-LABEL: @load_2dmsaa_v2f32_fragId23
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = extractelement <4 x float> %0, i64 2
-; MSAA: %2 = extractelement <4 x float> %0, i64 3
define amdgpu_ps [2 x float] @load_2dmsaa_v2f32_fragId23(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [2 x float] @load_2dmsaa_v2f32_fragId23(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = insertvalue [2 x float] undef, float [[I]], 0
+; NO-MSAA-NEXT: [[I3:%.*]] = insertvalue [2 x float] [[I2]], float [[I1]], 1
+; NO-MSAA-NEXT: ret [2 x float] [[I3]]
+;
+; MSAA-LABEL: define amdgpu_ps [2 x float] @load_2dmsaa_v2f32_fragId23(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[I1:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[I2:%.*]] = insertvalue [2 x float] undef, float [[I]], 0
+; MSAA-NEXT: [[I3:%.*]] = insertvalue [2 x float] [[I2]], float [[I1]], 1
+; MSAA-NEXT: ret [2 x float] [[I3]]
+;
main_body:
%i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
@@ -190,9 +405,25 @@ main_body:
; Don't combine because it's not profitable: the resulting msaa loads would
; have 8 vdata outputs.
-; MSAA-LABEL: @load_2dmsaa_v2v2f32_dmask3
-; MSAA-NOT: @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32
define amdgpu_ps [2 x <2 x float>] @load_2dmsaa_v2v2f32_dmask3(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [2 x <2 x float>] @load_2dmsaa_v2v2f32_dmask3(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [2 x <2 x float>] undef, <2 x float> [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [2 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; NO-MSAA-NEXT: ret [2 x <2 x float>] [[I5]]
+;
+; MSAA-LABEL: define amdgpu_ps [2 x <2 x float>] @load_2dmsaa_v2v2f32_dmask3(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[I:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [2 x <2 x float>] undef, <2 x float> [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [2 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; MSAA-NEXT: ret [2 x <2 x float>] [[I5]]
+;
main_body:
%i = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -201,21 +432,50 @@ main_body:
ret [2 x <2 x float>] %i5
}
-; MSAA-LABEL: @load_2dmsaa_v4v2f32_dmask3
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %2 = extractelement <4 x float> %0, i64 0
-; MSAA: %3 = insertelement <2 x float> undef, float %2, i64 0
-; MSAA: %4 = extractelement <4 x float> %1, i64 0
-; MSAA: %5 = insertelement <2 x float> %3, float %4, i64 1
-; MSAA: %6 = extractelement <4 x float> %0, i64 1
-; MSAA: %7 = insertelement <2 x float> undef, float %6, i64 0
-; MSAA: %8 = extractelement <4 x float> %1, i64 1
-; MSAA: %9 = insertelement <2 x float> %7, float %8, i64 1
define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask3(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask3(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
+; NO-MSAA-NEXT: ret [4 x <2 x float>] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask3(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i64 0
+; MSAA-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; MSAA-NEXT: [[I:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i64 1
+; MSAA-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP5]], i64 0
+; MSAA-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i64 1
+; MSAA-NEXT: [[I1:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP7]], i64 1
+; MSAA-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[TMP8]], i64 0
+; MSAA-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; MSAA-NEXT: [[I2:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP10]], i64 1
+; MSAA-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[TMP11]], i64 0
+; MSAA-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
+; MSAA-NEXT: [[I3:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP13]], i64 1
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
+; MSAA-NEXT: ret [4 x <2 x float>] [[I7]]
+;
main_body:
%i = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -228,21 +488,50 @@ main_body:
ret [4 x <2 x float>] %i7
}
-; MSAA-LABEL: @load_2dmsaa_v4v2f32_dmask5
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 4, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %2 = extractelement <4 x float> %0, i64 0
-; MSAA: %3 = insertelement <2 x float> undef, float %2, i64 0
-; MSAA: %4 = extractelement <4 x float> %1, i64 0
-; MSAA: %5 = insertelement <2 x float> %3, float %4, i64 1
-; MSAA: %6 = extractelement <4 x float> %0, i64 1
-; MSAA: %7 = insertelement <2 x float> undef, float %6, i64 0
-; MSAA: %8 = extractelement <4 x float> %1, i64 1
-; MSAA: %9 = insertelement <2 x float> %7, float %8, i64 1
define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask5(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask5(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 5, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 5, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 5, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 5, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
+; NO-MSAA-NEXT: ret [4 x <2 x float>] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask5(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 4, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i64 0
+; MSAA-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; MSAA-NEXT: [[I:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i64 1
+; MSAA-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP5]], i64 0
+; MSAA-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i64 1
+; MSAA-NEXT: [[I1:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP7]], i64 1
+; MSAA-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[TMP8]], i64 0
+; MSAA-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; MSAA-NEXT: [[I2:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP10]], i64 1
+; MSAA-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[TMP11]], i64 0
+; MSAA-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
+; MSAA-NEXT: [[I3:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP13]], i64 1
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
+; MSAA-NEXT: ret [4 x <2 x float>] [[I7]]
+;
main_body:
%i = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 5, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 5, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -255,21 +544,50 @@ main_body:
ret [4 x <2 x float>] %i7
}
-; MSAA-LABEL: @load_2dmsaa_v4v2f32_dmask6
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 4, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %2 = extractelement <4 x float> %0, i64 0
-; MSAA: %3 = insertelement <2 x float> undef, float %2, i64 0
-; MSAA: %4 = extractelement <4 x float> %1, i64 0
-; MSAA: %5 = insertelement <2 x float> %3, float %4, i64 1
-; MSAA: %6 = extractelement <4 x float> %0, i64 1
-; MSAA: %7 = insertelement <2 x float> undef, float %6, i64 0
-; MSAA: %8 = extractelement <4 x float> %1, i64 1
-; MSAA: %9 = insertelement <2 x float> %7, float %8, i64 1
define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask6(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask6(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 6, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 6, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 6, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 6, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
+; NO-MSAA-NEXT: ret [4 x <2 x float>] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask6(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 4, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i64 0
+; MSAA-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; MSAA-NEXT: [[I:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i64 1
+; MSAA-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP5]], i64 0
+; MSAA-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i64 1
+; MSAA-NEXT: [[I1:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP7]], i64 1
+; MSAA-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[TMP8]], i64 0
+; MSAA-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; MSAA-NEXT: [[I2:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP10]], i64 1
+; MSAA-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[TMP11]], i64 0
+; MSAA-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
+; MSAA-NEXT: [[I3:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP13]], i64 1
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
+; MSAA-NEXT: ret [4 x <2 x float>] [[I7]]
+;
main_body:
%i = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 6, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 6, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -282,21 +600,50 @@ main_body:
ret [4 x <2 x float>] %i7
}
-; MSAA-LABEL: @load_2dmsaa_v4v2f32_dmask9
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 8, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %2 = extractelement <4 x float> %0, i64 0
-; MSAA: %3 = insertelement <2 x float> undef, float %2, i64 0
-; MSAA: %4 = extractelement <4 x float> %1, i64 0
-; MSAA: %5 = insertelement <2 x float> %3, float %4, i64 1
-; MSAA: %6 = extractelement <4 x float> %0, i64 1
-; MSAA: %7 = insertelement <2 x float> undef, float %6, i64 0
-; MSAA: %8 = extractelement <4 x float> %1, i64 1
-; MSAA: %9 = insertelement <2 x float> %7, float %8, i64 1
define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask9(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask9(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 9, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 9, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 9, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 9, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
+; NO-MSAA-NEXT: ret [4 x <2 x float>] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask9(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 8, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i64 0
+; MSAA-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; MSAA-NEXT: [[I:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i64 1
+; MSAA-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP5]], i64 0
+; MSAA-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i64 1
+; MSAA-NEXT: [[I1:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP7]], i64 1
+; MSAA-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[TMP8]], i64 0
+; MSAA-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; MSAA-NEXT: [[I2:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP10]], i64 1
+; MSAA-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[TMP11]], i64 0
+; MSAA-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
+; MSAA-NEXT: [[I3:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP13]], i64 1
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
+; MSAA-NEXT: ret [4 x <2 x float>] [[I7]]
+;
main_body:
%i = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 9, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 9, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -309,21 +656,50 @@ main_body:
ret [4 x <2 x float>] %i7
}
-; MSAA-LABEL: @load_2dmsaa_v4v2f32_dmask10
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 8, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %2 = extractelement <4 x float> %0, i64 0
-; MSAA: %3 = insertelement <2 x float> undef, float %2, i64 0
-; MSAA: %4 = extractelement <4 x float> %1, i64 0
-; MSAA: %5 = insertelement <2 x float> %3, float %4, i64 1
-; MSAA: %6 = extractelement <4 x float> %0, i64 1
-; MSAA: %7 = insertelement <2 x float> undef, float %6, i64 0
-; MSAA: %8 = extractelement <4 x float> %1, i64 1
-; MSAA: %9 = insertelement <2 x float> %7, float %8, i64 1
define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask10(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask10(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 10, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 10, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 10, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 10, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
+; NO-MSAA-NEXT: ret [4 x <2 x float>] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask10(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 8, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i64 0
+; MSAA-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; MSAA-NEXT: [[I:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i64 1
+; MSAA-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP5]], i64 0
+; MSAA-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i64 1
+; MSAA-NEXT: [[I1:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP7]], i64 1
+; MSAA-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[TMP8]], i64 0
+; MSAA-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; MSAA-NEXT: [[I2:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP10]], i64 1
+; MSAA-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[TMP11]], i64 0
+; MSAA-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
+; MSAA-NEXT: [[I3:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP13]], i64 1
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
+; MSAA-NEXT: ret [4 x <2 x float>] [[I7]]
+;
main_body:
%i = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 10, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 10, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -336,21 +712,50 @@ main_body:
ret [4 x <2 x float>] %i7
}
-; MSAA-LABEL: @load_2dmsaa_v4v2f32_dmask12
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 4, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 8, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %2 = extractelement <4 x float> %0, i64 0
-; MSAA: %3 = insertelement <2 x float> undef, float %2, i64 0
-; MSAA: %4 = extractelement <4 x float> %1, i64 0
-; MSAA: %5 = insertelement <2 x float> %3, float %4, i64 1
-; MSAA: %6 = extractelement <4 x float> %0, i64 1
-; MSAA: %7 = insertelement <2 x float> undef, float %6, i64 0
-; MSAA: %8 = extractelement <4 x float> %1, i64 1
-; MSAA: %9 = insertelement <2 x float> %7, float %8, i64 1
define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask12(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask12(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 12, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 12, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 12, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 12, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
+; NO-MSAA-NEXT: ret [4 x <2 x float>] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask12(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 4, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 8, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i64 0
+; MSAA-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; MSAA-NEXT: [[I:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i64 1
+; MSAA-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP5]], i64 0
+; MSAA-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i64 1
+; MSAA-NEXT: [[I1:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP7]], i64 1
+; MSAA-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[TMP8]], i64 0
+; MSAA-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; MSAA-NEXT: [[I2:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP10]], i64 1
+; MSAA-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[TMP11]], i64 0
+; MSAA-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
+; MSAA-NEXT: [[I3:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP13]], i64 1
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
+; MSAA-NEXT: ret [4 x <2 x float>] [[I7]]
+;
main_body:
%i = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 12, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 12, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -363,11 +768,26 @@ main_body:
ret [4 x <2 x float>] %i7
}
-; MSAA-LABEL: @load_2dmsaa_v2f16_fragId01
-; MSAA: %0 = call <4 x half> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f16.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = extractelement <4 x half> %0, i64 0
-; MSAA: %2 = extractelement <4 x half> %0, i64 1
define amdgpu_ps [2 x half] @load_2dmsaa_v2f16_fragId01(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [2 x half] @load_2dmsaa_v2f16_fragId01(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call half @llvm.amdgcn.image.load.2dmsaa.f16.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call half @llvm.amdgcn.image.load.2dmsaa.f16.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = insertvalue [2 x half] undef, half [[I]], 0
+; NO-MSAA-NEXT: [[I3:%.*]] = insertvalue [2 x half] [[I2]], half [[I1]], 1
+; NO-MSAA-NEXT: ret [2 x half] [[I3]]
+;
+; MSAA-LABEL: define amdgpu_ps [2 x half] @load_2dmsaa_v2f16_fragId01(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f16.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I:%.*]] = extractelement <4 x half> [[TMP0]], i64 0
+; MSAA-NEXT: [[I1:%.*]] = extractelement <4 x half> [[TMP0]], i64 1
+; MSAA-NEXT: [[I2:%.*]] = insertvalue [2 x half] undef, half [[I]], 0
+; MSAA-NEXT: [[I3:%.*]] = insertvalue [2 x half] [[I2]], half [[I1]], 1
+; MSAA-NEXT: ret [2 x half] [[I3]]
+;
main_body:
%i = call half @llvm.amdgcn.image.load.2dmsaa.f16.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call half @llvm.amdgcn.image.load.2dmsaa.f16.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -376,13 +796,34 @@ main_body:
ret [2 x half] %i3
}
-; MSAA-LABEL: @load_2darraymsaa_v4f32_dmask1
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = extractelement <4 x float> %0, i64 0
-; MSAA: %2 = extractelement <4 x float> %0, i64 1
-; MSAA: %3 = extractelement <4 x float> %0, i64 2
-; MSAA: %4 = extractelement <4 x float> %0, i64 3
define amdgpu_ps [4 x float] @load_2darraymsaa_v4f32_dmask1(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x float] @load_2darraymsaa_v4f32_dmask1(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SLICE:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call float @llvm.amdgcn.image.load.2darraymsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call float @llvm.amdgcn.image.load.2darraymsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call float @llvm.amdgcn.image.load.2darraymsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call float @llvm.amdgcn.image.load.2darraymsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; NO-MSAA-NEXT: ret [4 x float] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x float] @load_2darraymsaa_v4f32_dmask1(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SLICE:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[I1:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[I2:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[I3:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; MSAA-NEXT: ret [4 x float] [[I7]]
+;
main_body:
%i = call float @llvm.amdgcn.image.load.2darraymsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call float @llvm.amdgcn.image.load.2darraymsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -395,21 +836,50 @@ main_body:
ret [4 x float] %i7
}
-; MSAA-LABEL: @load_2darraymsaa_v4v2f32_dmask3
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 %slice, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %2 = extractelement <4 x float> %0, i64 0
-; MSAA: %3 = insertelement <2 x float> undef, float %2, i64 0
-; MSAA: %4 = extractelement <4 x float> %1, i64 0
-; MSAA: %5 = insertelement <2 x float> %3, float %4, i64 1
-; MSAA: %6 = extractelement <4 x float> %0, i64 1
-; MSAA: %7 = insertelement <2 x float> undef, float %6, i64 0
-; MSAA: %8 = extractelement <4 x float> %1, i64 1
-; MSAA: %9 = insertelement <2 x float> %7, float %8, i64 1
define amdgpu_ps [4 x <2 x float>] @load_2darraymsaa_v4v2f32_dmask3(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2darraymsaa_v4v2f32_dmask3(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SLICE:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
+; NO-MSAA-NEXT: ret [4 x <2 x float>] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2darraymsaa_v4v2f32_dmask3(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SLICE:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32 2, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i64 0
+; MSAA-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; MSAA-NEXT: [[I:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i64 1
+; MSAA-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP5]], i64 0
+; MSAA-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i64 1
+; MSAA-NEXT: [[I1:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP7]], i64 1
+; MSAA-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[TMP8]], i64 0
+; MSAA-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; MSAA-NEXT: [[I2:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP10]], i64 1
+; MSAA-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[TMP11]], i64 0
+; MSAA-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
+; MSAA-NEXT: [[I3:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP13]], i64 1
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
+; MSAA-NEXT: ret [4 x <2 x float>] [[I7]]
+;
main_body:
%i = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 %slice, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 %slice, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -422,40 +892,61 @@ main_body:
ret [4 x <2 x float>] %i7
}
-; MSAA-LABEL: @load_2dmsaa_v4v3f32_dmask7
-
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %2 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 4, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-
-; MSAA: %3 = extractelement <4 x float> %0, i64 0
-; MSAA: %4 = insertelement <3 x float> undef, float %3, i64 0
-; MSAA: %5 = extractelement <4 x float> %1, i64 0
-; MSAA: %6 = insertelement <3 x float> %4, float %5, i64 1
-; MSAA: %7 = extractelement <4 x float> %2, i64 0
-; MSAA: %8 = insertelement <3 x float> %6, float %7, i64 2
-
-; MSAA: %9 = extractelement <4 x float> %0, i64 1
-; MSAA: %10 = insertelement <3 x float> undef, float %9, i64 0
-; MSAA: %11 = extractelement <4 x float> %1, i64 1
-; MSAA: %12 = insertelement <3 x float> %10, float %11, i64 1
-; MSAA: %13 = extractelement <4 x float> %2, i64 1
-; MSAA: %14 = insertelement <3 x float> %12, float %13, i64 2
-
-; MSAA: %15 = extractelement <4 x float> %0, i64 2
-; MSAA: %16 = insertelement <3 x float> undef, float %15, i64 0
-; MSAA: %17 = extractelement <4 x float> %1, i64 2
-; MSAA: %18 = insertelement <3 x float> %16, float %17, i64 1
-; MSAA: %19 = extractelement <4 x float> %2, i64 2
-; MSAA: %20 = insertelement <3 x float> %18, float %19, i64 2
-
-; MSAA: %21 = extractelement <4 x float> %0, i64 3
-; MSAA: %22 = insertelement <3 x float> undef, float %21, i64 0
-; MSAA: %23 = extractelement <4 x float> %1, i64 3
-; MSAA: %24 = insertelement <3 x float> %22, float %23, i64 1
-; MSAA: %25 = extractelement <4 x float> %2, i64 3
-; MSAA: %26 = insertelement <3 x float> %24, float %25, i64 2
+
+
+
+
+
define amdgpu_ps [4 x <3 x float>] @load_2dmsaa_v4v3f32_dmask7(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x <3 x float>] @load_2dmsaa_v4v3f32_dmask7(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x <3 x float>] undef, <3 x float> [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x <3 x float>] [[I4]], <3 x float> [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x <3 x float>] [[I5]], <3 x float> [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x <3 x float>] [[I6]], <3 x float> [[I3]], 3
+; NO-MSAA-NEXT: ret [4 x <3 x float>] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x <3 x float>] @load_2dmsaa_v4v3f32_dmask7(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 4, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[TMP4:%.*]] = insertelement <3 x float> undef, float [[TMP3]], i64 0
+; MSAA-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; MSAA-NEXT: [[TMP6:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP5]], i64 1
+; MSAA-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i64 0
+; MSAA-NEXT: [[I:%.*]] = insertelement <3 x float> [[TMP6]], float [[TMP7]], i64 2
+; MSAA-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[TMP9:%.*]] = insertelement <3 x float> undef, float [[TMP8]], i64 0
+; MSAA-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP1]], i64 1
+; MSAA-NEXT: [[TMP11:%.*]] = insertelement <3 x float> [[TMP9]], float [[TMP10]], i64 1
+; MSAA-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP2]], i64 1
+; MSAA-NEXT: [[I1:%.*]] = insertelement <3 x float> [[TMP11]], float [[TMP12]], i64 2
+; MSAA-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[TMP14:%.*]] = insertelement <3 x float> undef, float [[TMP13]], i64 0
+; MSAA-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; MSAA-NEXT: [[TMP16:%.*]] = insertelement <3 x float> [[TMP14]], float [[TMP15]], i64 1
+; MSAA-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP2]], i64 2
+; MSAA-NEXT: [[I2:%.*]] = insertelement <3 x float> [[TMP16]], float [[TMP17]], i64 2
+; MSAA-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[TMP19:%.*]] = insertelement <3 x float> undef, float [[TMP18]], i64 0
+; MSAA-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
+; MSAA-NEXT: [[TMP21:%.*]] = insertelement <3 x float> [[TMP19]], float [[TMP20]], i64 1
+; MSAA-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP2]], i64 3
+; MSAA-NEXT: [[I3:%.*]] = insertelement <3 x float> [[TMP21]], float [[TMP22]], i64 2
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x <3 x float>] undef, <3 x float> [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x <3 x float>] [[I4]], <3 x float> [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x <3 x float>] [[I5]], <3 x float> [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x <3 x float>] [[I6]], <3 x float> [[I3]], 3
+; MSAA-NEXT: ret [4 x <3 x float>] [[I7]]
+;
main_body:
%i = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -468,40 +959,61 @@ main_body:
ret [4 x <3 x float>] %i7
}
-; MSAA-LABEL: @load_2dmsaa_v4v3f32_dmask7_group1
-
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 4, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 4, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %2 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 4, i32 %s, i32 %t, i32 4, <8 x i32> %rsrc, i32 0, i32 0)
-
-; MSAA: %3 = extractelement <4 x float> %0, i64 0
-; MSAA: %4 = insertelement <3 x float> undef, float %3, i64 0
-; MSAA: %5 = extractelement <4 x float> %1, i64 0
-; MSAA: %6 = insertelement <3 x float> %4, float %5, i64 1
-; MSAA: %7 = extractelement <4 x float> %2, i64 0
-; MSAA: %8 = insertelement <3 x float> %6, float %7, i64 2
-
-; MSAA: %9 = extractelement <4 x float> %0, i64 1
-; MSAA: %10 = insertelement <3 x float> undef, float %9, i64 0
-; MSAA: %11 = extractelement <4 x float> %1, i64 1
-; MSAA: %12 = insertelement <3 x float> %10, float %11, i64 1
-; MSAA: %13 = extractelement <4 x float> %2, i64 1
-; MSAA: %14 = insertelement <3 x float> %12, float %13, i64 2
-
-; MSAA: %15 = extractelement <4 x float> %0, i64 2
-; MSAA: %16 = insertelement <3 x float> undef, float %15, i64 0
-; MSAA: %17 = extractelement <4 x float> %1, i64 2
-; MSAA: %18 = insertelement <3 x float> %16, float %17, i64 1
-; MSAA: %19 = extractelement <4 x float> %2, i64 2
-; MSAA: %20 = insertelement <3 x float> %18, float %19, i64 2
-
-; MSAA: %21 = extractelement <4 x float> %0, i64 3
-; MSAA: %22 = insertelement <3 x float> undef, float %21, i64 0
-; MSAA: %23 = extractelement <4 x float> %1, i64 3
-; MSAA: %24 = insertelement <3 x float> %22, float %23, i64 1
-; MSAA: %25 = extractelement <4 x float> %2, i64 3
-; MSAA: %26 = insertelement <3 x float> %24, float %25, i64 2
+
+
+
+
+
define amdgpu_ps [4 x <3 x float>] @load_2dmsaa_v4v3f32_dmask7_group1(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x <3 x float>] @load_2dmsaa_v4v3f32_dmask7_group1(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 [[S]], i32 [[T]], i32 4, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 [[S]], i32 [[T]], i32 5, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 [[S]], i32 [[T]], i32 6, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 [[S]], i32 [[T]], i32 7, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x <3 x float>] undef, <3 x float> [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x <3 x float>] [[I4]], <3 x float> [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x <3 x float>] [[I5]], <3 x float> [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x <3 x float>] [[I6]], <3 x float> [[I3]], 3
+; NO-MSAA-NEXT: ret [4 x <3 x float>] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x <3 x float>] @load_2dmsaa_v4v3f32_dmask7_group1(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 4, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 [[S]], i32 [[T]], i32 4, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 4, i32 [[S]], i32 [[T]], i32 4, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[TMP4:%.*]] = insertelement <3 x float> undef, float [[TMP3]], i64 0
+; MSAA-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; MSAA-NEXT: [[TMP6:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP5]], i64 1
+; MSAA-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i64 0
+; MSAA-NEXT: [[I:%.*]] = insertelement <3 x float> [[TMP6]], float [[TMP7]], i64 2
+; MSAA-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[TMP9:%.*]] = insertelement <3 x float> undef, float [[TMP8]], i64 0
+; MSAA-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP1]], i64 1
+; MSAA-NEXT: [[TMP11:%.*]] = insertelement <3 x float> [[TMP9]], float [[TMP10]], i64 1
+; MSAA-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP2]], i64 1
+; MSAA-NEXT: [[I1:%.*]] = insertelement <3 x float> [[TMP11]], float [[TMP12]], i64 2
+; MSAA-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[TMP14:%.*]] = insertelement <3 x float> undef, float [[TMP13]], i64 0
+; MSAA-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; MSAA-NEXT: [[TMP16:%.*]] = insertelement <3 x float> [[TMP14]], float [[TMP15]], i64 1
+; MSAA-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP2]], i64 2
+; MSAA-NEXT: [[I2:%.*]] = insertelement <3 x float> [[TMP16]], float [[TMP17]], i64 2
+; MSAA-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[TMP19:%.*]] = insertelement <3 x float> undef, float [[TMP18]], i64 0
+; MSAA-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
+; MSAA-NEXT: [[TMP21:%.*]] = insertelement <3 x float> [[TMP19]], float [[TMP20]], i64 1
+; MSAA-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP2]], i64 3
+; MSAA-NEXT: [[I3:%.*]] = insertelement <3 x float> [[TMP21]], float [[TMP22]], i64 2
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x <3 x float>] undef, <3 x float> [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x <3 x float>] [[I4]], <3 x float> [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x <3 x float>] [[I5]], <3 x float> [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x <3 x float>] [[I6]], <3 x float> [[I3]], 3
+; MSAA-NEXT: ret [4 x <3 x float>] [[I7]]
+;
main_body:
%i = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 %s, i32 %t, i32 4, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 7, i32 %s, i32 %t, i32 5, <8 x i32> %rsrc, i32 0, i32 0)
@@ -514,15 +1026,37 @@ main_body:
ret [4 x <3 x float>] %i7
}
-; MSAA-LABEL: @load_2dmsaa_v4f32_sections
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = extractelement <4 x float> %0, i64 0
-; MSAA: %2 = extractelement <4 x float> %0, i64 1
-; MSAA: call void @llvm.amdgcn.image.store.2dmsaa.f32.i32(float %vdata, i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %3 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %4 = extractelement <4 x float> %3, i64 2
-; MSAA: %5 = extractelement <4 x float> %3, i64 3
define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_sections(<8 x i32> inreg %rsrc, float %vdata, i32 %s, i32 %t, i32 %fragid) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_sections(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], float [[VDATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[FRAGID:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: call void @llvm.amdgcn.image.store.2dmsaa.f32.i32(float [[VDATA]], i32 1, i32 [[S]], i32 [[T]], i32 [[FRAGID]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; NO-MSAA-NEXT: ret [4 x float] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_sections(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], float [[VDATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[FRAGID:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[I1:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: call void @llvm.amdgcn.image.store.2dmsaa.f32.i32(float [[VDATA]], i32 1, i32 [[S]], i32 [[T]], i32 [[FRAGID]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I2:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; MSAA-NEXT: [[I3:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; MSAA-NEXT: ret [4 x float] [[I7]]
+;
main_body:
%i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
@@ -536,25 +1070,84 @@ main_body:
ret [4 x float] %i7
}
-; MSAA-LABEL: @load_2dmsaa_v4f32_blocks
-; MSAA: %0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %1 = extractelement <4 x float> %0, i64 0
-; MSAA: %2 = extractelement <4 x float> %0, i64 1
-; MSAA: %3 = extractelement <4 x float> %0, i64 2
-; MSAA: %4 = extractelement <4 x float> %0, i64 3
-; MSAA-LABEL: if_equal:
-; MSAA: %5 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %6 = extractelement <4 x float> %5, i64 0
-; MSAA: %7 = extractelement <4 x float> %5, i64 1
-; MSAA: %8 = extractelement <4 x float> %5, i64 2
-; MSAA: %9 = extractelement <4 x float> %5, i64 3
-; MSAA-LABEL: if_unequal:
-; MSAA: %10 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-; MSAA: %11 = extractelement <4 x float> %10, i64 0
-; MSAA: %12 = extractelement <4 x float> %10, i64 1
-; MSAA: %13 = extractelement <4 x float> %10, i64 2
-; MSAA: %14 = extractelement <4 x float> %10, i64 3
define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_blocks(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %cond) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_blocks(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[COND:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; NO-MSAA-NEXT: [[I8:%.*]] = trunc i32 [[COND]] to i1
+; NO-MSAA-NEXT: br i1 [[I8]], label [[IF_EQUAL:%.*]], label [[IF_UNEQUAL:%.*]]
+; NO-MSAA: if_equal:
+; NO-MSAA-NEXT: [[I9:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I10:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I11:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I12:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I13:%.*]] = insertvalue [4 x float] undef, float [[I9]], 0
+; NO-MSAA-NEXT: [[I14:%.*]] = insertvalue [4 x float] [[I13]], float [[I10]], 1
+; NO-MSAA-NEXT: [[I15:%.*]] = insertvalue [4 x float] [[I14]], float [[I11]], 2
+; NO-MSAA-NEXT: [[I16:%.*]] = insertvalue [4 x float] [[I15]], float [[I12]], 3
+; NO-MSAA-NEXT: br label [[MERGE:%.*]]
+; NO-MSAA: if_unequal:
+; NO-MSAA-NEXT: [[I17:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I18:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I19:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I20:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I21:%.*]] = insertvalue [4 x float] undef, float [[I17]], 0
+; NO-MSAA-NEXT: [[I22:%.*]] = insertvalue [4 x float] [[I21]], float [[I18]], 1
+; NO-MSAA-NEXT: [[I23:%.*]] = insertvalue [4 x float] [[I22]], float [[I19]], 2
+; NO-MSAA-NEXT: [[I24:%.*]] = insertvalue [4 x float] [[I23]], float [[I20]], 3
+; NO-MSAA-NEXT: br label [[MERGE]]
+; NO-MSAA: merge:
+; NO-MSAA-NEXT: [[I25:%.*]] = phi [4 x float] [ [[I16]], [[IF_EQUAL]] ], [ [[I24]], [[IF_UNEQUAL]] ]
+; NO-MSAA-NEXT: ret [4 x float] [[I25]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_blocks(
+; MSAA-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[COND:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[I1:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[I2:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; MSAA-NEXT: [[I3:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; MSAA-NEXT: [[I8:%.*]] = trunc i32 [[COND]] to i1
+; MSAA-NEXT: br i1 [[I8]], label [[IF_EQUAL:%.*]], label [[IF_UNEQUAL:%.*]]
+; MSAA: if_equal:
+; MSAA-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I9:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; MSAA-NEXT: [[I10:%.*]] = extractelement <4 x float> [[TMP1]], i64 1
+; MSAA-NEXT: [[I11:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; MSAA-NEXT: [[I12:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
+; MSAA-NEXT: [[I13:%.*]] = insertvalue [4 x float] undef, float [[I9]], 0
+; MSAA-NEXT: [[I14:%.*]] = insertvalue [4 x float] [[I13]], float [[I10]], 1
+; MSAA-NEXT: [[I15:%.*]] = insertvalue [4 x float] [[I14]], float [[I11]], 2
+; MSAA-NEXT: [[I16:%.*]] = insertvalue [4 x float] [[I15]], float [[I12]], 3
+; MSAA-NEXT: br label [[MERGE:%.*]]
+; MSAA: if_unequal:
+; MSAA-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; MSAA-NEXT: [[I17:%.*]] = extractelement <4 x float> [[TMP2]], i64 0
+; MSAA-NEXT: [[I18:%.*]] = extractelement <4 x float> [[TMP2]], i64 1
+; MSAA-NEXT: [[I19:%.*]] = extractelement <4 x float> [[TMP2]], i64 2
+; MSAA-NEXT: [[I20:%.*]] = extractelement <4 x float> [[TMP2]], i64 3
+; MSAA-NEXT: [[I21:%.*]] = insertvalue [4 x float] undef, float [[I17]], 0
+; MSAA-NEXT: [[I22:%.*]] = insertvalue [4 x float] [[I21]], float [[I18]], 1
+; MSAA-NEXT: [[I23:%.*]] = insertvalue [4 x float] [[I22]], float [[I19]], 2
+; MSAA-NEXT: [[I24:%.*]] = insertvalue [4 x float] [[I23]], float [[I20]], 3
+; MSAA-NEXT: br label [[MERGE]]
+; MSAA: merge:
+; MSAA-NEXT: [[I25:%.*]] = phi [4 x float] [ [[I16]], [[IF_EQUAL]] ], [ [[I24]], [[IF_UNEQUAL]] ]
+; MSAA-NEXT: ret [4 x float] [[I25]]
+;
main_body:
%i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
More information about the llvm-commits mailing list