[llvm] [AMDGPU] High VGPR lowering on gfx1250 (PR #156965)
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 4 14:48:21 PDT 2025
https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/156965
From 12b93d63335b94673a5be994099450bce123d088 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 4 Sep 2025 13:20:48 -0700
Subject: [PATCH] [AMDGPU] High VGPR lowering on gfx1250
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 3 +
.../Target/AMDGPU/AMDGPULowerVGPREncoding.cpp | 354 ++++++++
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 7 +
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 +
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
.../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 28 +-
.../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 4 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 +
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 106 +++
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 19 +
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 5 +
.../CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir | 848 ++++++++++++++++++
.../secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 1 +
13 files changed, 1378 insertions(+), 4 deletions(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index ebe38de1636be..4ca1011ea1312 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -501,6 +501,9 @@ extern char &SIModeRegisterID;
void initializeAMDGPUInsertDelayAluLegacyPass(PassRegistry &);
extern char &AMDGPUInsertDelayAluID;
+void initializeAMDGPULowerVGPREncodingPass(PassRegistry &);
+extern char &AMDGPULowerVGPREncodingID;
+
void initializeSIInsertHardClausesLegacyPass(PassRegistry &);
extern char &SIInsertHardClausesID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
new file mode 100644
index 0000000000000..ca06c316c2bfc
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
@@ -0,0 +1,354 @@
+//===- AMDGPULowerVGPREncoding.cpp - lower VGPRs above v255 ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Lower VGPRs above the first 256 on gfx1250.
+///
+/// The pass scans used VGPRs and inserts S_SET_VGPR_MSB instructions to
+/// switch the VGPR addressing mode. A mode change stays in effect until the
+/// next change. The instruction provides the high bits of a VGPR address for
+/// four of the operands: vdst, src0, src1, and src2, or another four operands
+/// depending on the instruction encoding. If bits are set, they are added as
+/// MSBs to the corresponding operand's VGPR number.
+///
+/// There is no need to replace the actual register operands because high and
+/// low VGPRs share the same encoding: v0 has the encoding 0x100 and so does
+/// v256; v1 has the encoding 0x101 and so does v257. High VGPRs therefore
+/// survive until actual encoding and produce the same encoded bits.
+///
+/// As a result the pass only inserts S_SET_VGPR_MSB to provide the actual
+/// offset to the VGPR addresses of the subsequent instructions. The
+/// InstPrinter takes care of printing a low VGPR instead of a high one. In
+/// principle it would be viable to print the actual high VGPR numbers, but
+/// that would disagree with the disassembler's printing and make the asm
+/// text non-deterministic.
+///
+/// The pass establishes a convention that non-fall-through basic blocks
+/// start with all 4 MSBs zero; otherwise a disassembly would not be
+/// readable. An optimization here is possible but deemed undesirable because
+/// of the readability concerns.
+///
+/// Consequently the ABI is set to expect all 4 MSBs to be zero on entry.
+/// The pass must run very late in the pipeline to make sure no changes to
+/// VGPR operands are made after it.
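+///
+/// As an example of the overall transformation (taken from the MIR test in
+/// this patch):
+///
+///   $vgpr256 = V_MOV_B32_e32 undef $vgpr511, implicit $exec
+///
+/// is emitted as:
+///
+///   s_set_vgpr_msb 0x41 ; msbs: dst=1 src0=1 src1=0 src2=0
+///   v_mov_b32_e32 v0 /*v256*/, v255 /*v511*/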
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/PackedVector.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-lower-vgpr-encoding"
+
+namespace {
+
+class AMDGPULowerVGPREncoding : public MachineFunctionPass {
+ static constexpr unsigned OpNum = 4;
+ static constexpr unsigned BitsPerField = 2;
+ static constexpr unsigned NumFields = 4;
+ static constexpr unsigned FieldMask = (1 << BitsPerField) - 1;
+ using ModeType = PackedVector<unsigned, BitsPerField,
+ std::bitset<BitsPerField * NumFields>>;
+
+ class ModeTy : public ModeType {
+ public:
+ // bitset constructor will set all bits to zero
+ ModeTy() : ModeType(0) {}
+
+ operator int64_t() const { return raw_bits().to_ulong(); }
+
+ static ModeTy fullMask() {
+ ModeTy M;
+ M.raw_bits().flip();
+ return M;
+ }
+ };
+
+public:
+ static char ID;
+
+ AMDGPULowerVGPREncoding() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ const SIInstrInfo *TII;
+ const SIRegisterInfo *TRI;
+
+ /// Most recent s_set_* instruction.
+ MachineInstr *MostRecentModeSet;
+
+ /// Whether the current mode is known.
+ bool CurrentModeKnown;
+
+ /// Current mode bits.
+ ModeTy CurrentMode;
+
+ /// Current mask of mode bits that instructions since MostRecentModeSet care
+ /// about.
+ ModeTy CurrentMask;
+
+  /// Number of instructions in the current hard clause.
+ unsigned ClauseLen;
+
+ /// Number of hard clause instructions remaining.
+ unsigned ClauseRemaining;
+
+ /// Clause group breaks.
+ unsigned ClauseBreaks;
+
+  /// The S_CLAUSE instruction of the current hard clause.
+ MachineInstr *Clause;
+
+ /// Insert mode change before \p I. \returns true if mode was changed.
+ bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I);
+
+ /// Reset mode to default.
+ void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); }
+
+  /// If \p MO references VGPRs, return the MSBs. Otherwise, return
+  /// std::nullopt.
+ std::optional<unsigned> getMSBs(const MachineOperand &MO) const;
+
+  /// Handle a single \p MI. \returns true if changed.
+ bool runOnMachineInstr(MachineInstr &MI);
+
+  /// Compute the mode and mode mask for a single \p MI given the \p Ops
+  /// mapping of MSB fields to operands. Optionally takes a second array
+  /// \p Ops2 for VOPD. If provided and an operand from \p Ops is not a VGPR,
+  /// then \p Ops2 is checked.
+ void computeMode(ModeTy &NewMode, ModeTy &Mask, MachineInstr &MI,
+ const AMDGPU::OpName Ops[OpNum],
+ const AMDGPU::OpName *Ops2 = nullptr);
+
+  /// Check if the instruction \p I is within a clause and return a suitable
+  /// insertion point for a mode change. May also modify the S_CLAUSE
+  /// instruction to extend the clause, or drop the clause if it cannot be
+  /// adjusted.
+ MachineInstr *handleClause(MachineInstr *I);
+};
+
+bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask,
+ MachineInstr *I) {
+ assert((NewMode.raw_bits() & ~Mask.raw_bits()).none());
+
+ if (CurrentModeKnown) {
+ auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits();
+
+ if ((Delta & Mask.raw_bits()).none()) {
+ CurrentMask |= Mask;
+ return false;
+ }
+
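+    // No instruction since the most recent s_set_vgpr_msb cared about the
+    // bits that differ, so update that instruction in place instead of
+    // emitting a new one.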
+ if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) {
+ CurrentMode |= NewMode;
+ CurrentMask |= Mask;
+
+ MostRecentModeSet->getOperand(0).setImm(CurrentMode);
+ return true;
+ }
+ }
+
+ I = handleClause(I);
+ MostRecentModeSet =
+ BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
+ .addImm(NewMode);
+
+ CurrentMode = NewMode;
+ CurrentMask = Mask;
+ CurrentModeKnown = true;
+ return true;
+}
+
+std::optional<unsigned>
+AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const {
+ if (!MO.isReg())
+ return std::nullopt;
+
+ MCRegister Reg = MO.getReg();
+ const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
+ if (!RC || !TRI->isVGPRClass(RC))
+ return std::nullopt;
+
+ unsigned Idx = TRI->getHWRegIndex(Reg);
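+  // For example, v511 has hardware index 511 and thus an MSBs value of 1.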
+ return Idx >> 8;
+}
+
+void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, ModeTy &Mask,
+ MachineInstr &MI,
+ const AMDGPU::OpName Ops[OpNum],
+ const AMDGPU::OpName *Ops2) {
+ NewMode = {};
+ Mask = {};
+
+ for (unsigned I = 0; I < OpNum; ++I) {
+ MachineOperand *Op = TII->getNamedOperand(MI, Ops[I]);
+
+ std::optional<unsigned> MSBits;
+ if (Op)
+ MSBits = getMSBs(*Op);
+
+#if !defined(NDEBUG)
+ if (MSBits.has_value() && Ops2) {
+ auto Op2 = TII->getNamedOperand(MI, Ops2[I]);
+ if (Op2) {
+ std::optional<unsigned> MSBits2;
+ MSBits2 = getMSBs(*Op2);
+ if (MSBits2.has_value() && MSBits != MSBits2)
+ llvm_unreachable("Invalid VOPD pair was created");
+ }
+ }
+#endif
+
+ if (!MSBits.has_value() && Ops2) {
+ Op = TII->getNamedOperand(MI, Ops2[I]);
+ if (Op)
+ MSBits = getMSBs(*Op);
+ }
+
+ if (!MSBits.has_value())
+ continue;
+
+    // Skip tied uses of src2 of VOP2; these are handled along with the defs,
+    // and only the vdst bit affects these operands. We cannot skip tied uses
+    // of VOP3: those uses are real even though they must match the vdst.
+ if (Ops[I] == AMDGPU::OpName::src2 && !Op->isDef() && Op->isTied() &&
+ (SIInstrInfo::isVOP2(MI) ||
+ (SIInstrInfo::isVOP3(MI) &&
+ TII->hasVALU32BitEncoding(MI.getOpcode()))))
+ continue;
+
+ NewMode[I] = MSBits.value();
+ Mask[I] = FieldMask;
+ }
+}
+
+bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
+ auto Ops = AMDGPU::getVGPRLoweringOperandTables(MI.getDesc());
+ if (Ops.first) {
+ ModeTy NewMode, Mask;
+ computeMode(NewMode, Mask, MI, Ops.first, Ops.second);
+ return setMode(NewMode, Mask, &MI);
+ }
+ assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());
+
+ return false;
+}
+
+MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) {
+ if (!ClauseRemaining)
+ return I;
+
+  // A clause cannot start with a special instruction, so place the mode
+  // change right before the clause.
+ if (ClauseRemaining == ClauseLen) {
+ I = Clause->getPrevNode();
+ assert(I->isBundle());
+ return I;
+ }
+
+  // If a clause defines breaks, each group cannot start with a mode change.
+  // Just drop the clause.
+ if (ClauseBreaks) {
+ Clause->eraseFromBundle();
+ ClauseRemaining = 0;
+ return I;
+ }
+
+  // Otherwise adjust the number of instructions in the clause if it fits.
+  // If it does not, the clause will just become shorter. Since the length
+  // recorded in the clause is one less than the instruction count, increment
+  // the length after the update. Note that SIMM16[5:0] must be 1-62, not 0
+  // or 63.
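+  // For example, S_CLAUSE 2 encodes a clause of 3 instructions; inserting a
+  // mode change inside it extends it to S_CLAUSE 3.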
+ if (ClauseLen < 63)
+ Clause->getOperand(0).setImm(ClauseLen | (ClauseBreaks << 8));
+
+ ++ClauseLen;
+
+ return I;
+}
+
+bool AMDGPULowerVGPREncoding::runOnMachineFunction(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.has1024AddressableVGPRs())
+ return false;
+
+ TII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+
+ bool Changed = false;
+ ClauseLen = ClauseRemaining = 0;
+ CurrentMode.reset();
+ CurrentMask.reset();
+ CurrentModeKnown = true;
+ for (auto &MBB : MF) {
+ MostRecentModeSet = nullptr;
+
+ for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) {
+ if (MI.isMetaInstruction())
+ continue;
+
+ if (MI.isTerminator() || MI.isCall()) {
+ if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
+ MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
+ CurrentMode.reset();
+ CurrentModeKnown = true;
+ } else
+ resetMode(&MI);
+ continue;
+ }
+
+ if (MI.isInlineAsm()) {
+ if (TII->hasVGPRUses(MI))
+ resetMode(&MI);
+ continue;
+ }
+
+ if (MI.getOpcode() == AMDGPU::S_CLAUSE) {
+ assert(!ClauseRemaining && "Nested clauses are not supported");
+ ClauseLen = MI.getOperand(0).getImm();
+ ClauseBreaks = (ClauseLen >> 8) & 15;
+ ClauseLen = ClauseRemaining = (ClauseLen & 63) + 1;
+ Clause = &MI;
+ continue;
+ }
+
+ Changed |= runOnMachineInstr(MI);
+
+ if (ClauseRemaining)
+ --ClauseRemaining;
+ }
+
+ // If we're falling through to a block that has at least one other
+ // predecessor, we no longer know the mode.
+ MachineBasicBlock *Next = MBB.getNextNode();
+ if (Next && Next->pred_size() >= 2 &&
+ llvm::is_contained(Next->predecessors(), &MBB)) {
+ if (CurrentMode.raw_bits().any())
+ CurrentModeKnown = false;
+ }
+ }
+
+ return Changed;
+}
+
+} // namespace
+
+char AMDGPULowerVGPREncoding::ID = 0;
+
+char &llvm::AMDGPULowerVGPREncodingID = AMDGPULowerVGPREncoding::ID;
+
+INITIALIZE_PASS(AMDGPULowerVGPREncoding, DEBUG_TYPE,
+ "AMDGPU Lower VGPR Encoding", false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index c84a0f6e31384..6acbf52b97de5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -373,6 +373,13 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
MF->getInfo<SIMachineFunctionInfo>(),
*OutStreamer);
+ if (isVerbose() && MI->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
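+    // The immediate packs four 2-bit MSB fields: [1:0] src0, [3:2] src1,
+    // [5:4] src2, [7:6] dst.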
+ unsigned V = MI->getOperand(0).getImm();
+ OutStreamer->AddComment(
+ " msbs: dst=" + Twine(V >> 6) + " src0=" + Twine(V & 3) +
+ " src1=" + Twine((V >> 2) & 3) + " src2=" + Twine((V >> 4) & 3));
+ }
+
MCInst TmpInst;
MCInstLowering.lower(MI, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 4a2f0a13b1325..072becb9a2ad5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -584,6 +584,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
initializeSIAnnotateControlFlowLegacyPass(*PR);
initializeAMDGPUInsertDelayAluLegacyPass(*PR);
+ initializeAMDGPULowerVGPREncodingPass(*PR);
initializeSIInsertHardClausesLegacyPass(*PR);
initializeSIInsertWaitcntsLegacyPass(*PR);
initializeSIModeRegisterLegacyPass(*PR);
@@ -1799,6 +1800,8 @@ void GCNPassConfig::addPreEmitPass() {
addPass(&AMDGPUWaitSGPRHazardsLegacyID);
+ addPass(&AMDGPULowerVGPREncodingID);
+
if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
addPass(&AMDGPUInsertDelayAluID);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index a915c4076ca2a..aae56eef73edd 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -86,6 +86,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUMCInstLower.cpp
AMDGPUMemoryUtils.cpp
AMDGPUIGroupLP.cpp
+ AMDGPULowerVGPREncoding.cpp
AMDGPUMCResourceInfo.cpp
AMDGPUMarkLastScratchLoad.cpp
AMDGPUMIRFormatter.cpp
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index ad122390e1f03..d1e8b7e4bad0d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -324,6 +324,18 @@ void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI,
}
}
+// \returns the low VGPR (v0..v255) corresponding to a high VGPR \p Reg
+// [v256..v1023], or \p Reg itself otherwise.
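+// For example, v257 maps to v1, while v1 is returned unchanged.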
+static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) {
+ unsigned Enc = MRI.getEncodingValue(Reg);
+ unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
+ if (Idx < 0x100)
+ return Reg;
+
+ const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI);
+ return RC->getRegister(Idx % 0x100);
+}
+
void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O,
const MCRegisterInfo &MRI) {
#if !defined(NDEBUG)
@@ -337,7 +349,17 @@ void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O,
}
#endif
- O << getRegisterName(Reg);
+ unsigned PrintReg = getRegForPrinting(Reg, MRI);
+ O << getRegisterName(PrintReg);
+
+ if (PrintReg != Reg.id())
+ O << " /*" << getRegisterName(Reg) << "*/";
+}
+
+void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, unsigned Opc,
+ unsigned OpNo, raw_ostream &O,
+ const MCRegisterInfo &MRI) {
+ printRegOperand(Reg, O, MRI);
}
void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
@@ -722,7 +744,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isReg()) {
- printRegOperand(Op.getReg(), O, MRI);
+ printRegOperand(Op.getReg(), MI->getOpcode(), OpNo, O, MRI);
// Check if operand register class contains register used.
// Intention: print disassembler message when invalid code is decoded,
@@ -1133,7 +1155,7 @@ void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo,
OpNo = OpNo - N + N / 2;
if (En & (1 << N))
- printRegOperand(MI->getOperand(OpNo).getReg(), O, MRI);
+ printRegOperand(MI->getOperand(OpNo).getReg(), Opc, OpNo, O, MRI);
else
O << "off";
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index a92f99c3c0e4b..21cc2f229de91 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -35,6 +35,8 @@ class AMDGPUInstPrinter : public MCInstPrinter {
const MCSubtargetInfo &STI, raw_ostream &O) override;
static void printRegOperand(MCRegister Reg, raw_ostream &O,
const MCRegisterInfo &MRI);
+ void printRegOperand(MCRegister Reg, unsigned Opc, unsigned OpNo,
+ raw_ostream &O, const MCRegisterInfo &MRI);
private:
void printU16ImmOperand(const MCInst *MI, unsigned OpNo,
@@ -70,7 +72,7 @@ class AMDGPUInstPrinter : public MCInstPrinter {
void printSymbolicFormat(const MCInst *MI,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printRegOperand(unsigned RegNo, raw_ostream &O);
+ void printRegOperand(MCRegister Reg, raw_ostream &O);
void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index fe849cafb65d1..643c664e39f1e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9270,6 +9270,9 @@ Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
AMDGPU::OpName OperandName) const {
+ if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
+ return nullptr;
+
int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
if (Idx == -1)
return nullptr;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index ff5cbd55484cf..6348d3607878e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -3338,6 +3338,112 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
: getGfx9BufferFormatInfo(Format);
}
+const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg,
+ const MCRegisterInfo &MRI) {
+ const unsigned VGPRClasses[] = {
+ AMDGPU::VGPR_16RegClassID, AMDGPU::VGPR_32RegClassID,
+ AMDGPU::VReg_64RegClassID, AMDGPU::VReg_96RegClassID,
+ AMDGPU::VReg_128RegClassID, AMDGPU::VReg_160RegClassID,
+ AMDGPU::VReg_192RegClassID, AMDGPU::VReg_224RegClassID,
+ AMDGPU::VReg_256RegClassID, AMDGPU::VReg_288RegClassID,
+ AMDGPU::VReg_320RegClassID, AMDGPU::VReg_352RegClassID,
+ AMDGPU::VReg_384RegClassID, AMDGPU::VReg_512RegClassID,
+ AMDGPU::VReg_1024RegClassID};
+
+ for (unsigned RCID : VGPRClasses) {
+ const MCRegisterClass &RC = MRI.getRegClass(RCID);
+ if (RC.contains(Reg))
+ return &RC;
+ }
+
+ return nullptr;
+}
+
+unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI) {
+ unsigned Enc = MRI.getEncodingValue(Reg);
+ unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
+ return Idx >> 8;
+}
+
+MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs,
+ const MCRegisterInfo &MRI) {
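+  // For example, v1 with MSBs == 2 yields v513.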
+ unsigned Enc = MRI.getEncodingValue(Reg);
+ unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
+ if (Idx >= 0x100)
+ return AMDGPU::NoRegister;
+
+ const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI);
+ if (!RC)
+ return AMDGPU::NoRegister;
+ return RC->getRegister(Idx | (MSBs << 8));
+}
+
+std::pair<const AMDGPU::OpName *, const AMDGPU::OpName *>
+getVGPRLoweringOperandTables(const MCInstrDesc &Desc) {
+ static const AMDGPU::OpName VOPOps[4] = {
+ AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2,
+ AMDGPU::OpName::vdst};
+ static const AMDGPU::OpName VDSOps[4] = {
+ AMDGPU::OpName::addr, AMDGPU::OpName::data0, AMDGPU::OpName::data1,
+ AMDGPU::OpName::vdst};
+ static const AMDGPU::OpName FLATOps[4] = {
+ AMDGPU::OpName::vaddr, AMDGPU::OpName::vdata,
+ AMDGPU::OpName::NUM_OPERAND_NAMES, AMDGPU::OpName::vdst};
+ static const AMDGPU::OpName BUFOps[4] = {
+ AMDGPU::OpName::vaddr, AMDGPU::OpName::NUM_OPERAND_NAMES,
+ AMDGPU::OpName::NUM_OPERAND_NAMES, AMDGPU::OpName::vdata};
+ static const AMDGPU::OpName VIMGOps[4] = {
+ AMDGPU::OpName::vaddr0, AMDGPU::OpName::vaddr1, AMDGPU::OpName::vaddr2,
+ AMDGPU::OpName::vdata};
+
+  // For VOPD instructions the MSBs of corresponding X and Y component
+  // operands' VGPR addresses are supposed to match; otherwise the VOPD shall
+  // not have been combined.
+ static const AMDGPU::OpName VOPDOpsX[4] = {
+ AMDGPU::OpName::src0X, AMDGPU::OpName::vsrc1X, AMDGPU::OpName::vsrc2X,
+ AMDGPU::OpName::vdstX};
+ static const AMDGPU::OpName VOPDOpsY[4] = {
+ AMDGPU::OpName::src0Y, AMDGPU::OpName::vsrc1Y, AMDGPU::OpName::vsrc2Y,
+ AMDGPU::OpName::vdstY};
+
+ unsigned TSFlags = Desc.TSFlags;
+
+ if (TSFlags &
+ (SIInstrFlags::VOP1 | SIInstrFlags::VOP2 | SIInstrFlags::VOP3 |
+ SIInstrFlags::VOP3P | SIInstrFlags::VOPC | SIInstrFlags::DPP)) {
+ // LD_SCALE operands ignore MSB.
+ if (Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32 ||
+ Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32_gfx1250 ||
+ Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64 ||
+ Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64_gfx1250)
+ return {};
+ return {VOPOps, nullptr};
+ }
+
+ if (TSFlags & SIInstrFlags::DS)
+ return {VDSOps, nullptr};
+
+ if (TSFlags & SIInstrFlags::FLAT)
+ return {FLATOps, nullptr};
+
+ if (TSFlags & (SIInstrFlags::MUBUF | SIInstrFlags::MTBUF))
+ return {BUFOps, nullptr};
+
+ if (TSFlags & SIInstrFlags::VIMAGE)
+ return {VIMGOps, nullptr};
+
+ if (AMDGPU::isVOPD(Desc.getOpcode()))
+ return {VOPDOpsX, VOPDOpsY};
+
+ assert(!(TSFlags & SIInstrFlags::MIMG));
+
+ if (TSFlags & (SIInstrFlags::VSAMPLE | SIInstrFlags::EXP))
+ llvm_unreachable("Sample and export VGPR lowering is not implemented and"
+ " these instructions are not expected on gfx1250");
+
+ return {};
+}
+
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode) {
uint64_t TSFlags = MII.get(Opcode).TSFlags;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 4ab17d8056459..3fcd16f9290b1 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1786,6 +1786,25 @@ bool isIntrinsicSourceOfDivergence(unsigned IntrID);
/// \returns true if the intrinsic is uniform
bool isIntrinsicAlwaysUniform(unsigned IntrID);
+/// \returns a register class for the physical register \p Reg if it is a VGPR
+/// or nullptr otherwise.
+const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg,
+ const MCRegisterInfo &MRI);
+
+/// \returns the MODE bits which have to be set by the S_SET_VGPR_MSB for the
+/// physical register \p Reg.
+unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI);
+
+/// If \p Reg is a low VGPR, return the corresponding high VGPR with \p MSBs
+/// set.
+MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs,
+ const MCRegisterInfo &MRI);
+
+/// Returns a table for the opcode with the given \p Desc mapping each VGPR
+/// MSB field set by S_SET_VGPR_MSB to one of the 4 operands. For VOPD,
+/// returns 2 tables, one for the X and one for the Y component.
+std::pair<const AMDGPU::OpName *, const AMDGPU::OpName *>
+getVGPRLoweringOperandTables(const MCInstrDesc &Desc);
+
/// \returns true if a memory instruction supports scale_offset modifier.
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode);
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 36231abda87db..65d0102a9d0dc 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -141,6 +141,7 @@
; GCN-O0-NEXT: SI Final Branch Preparation
; GCN-O0-NEXT: Post RA hazard recognizer
; GCN-O0-NEXT: AMDGPU Insert waits for SGPR read hazards
+; GCN-O0-NEXT: AMDGPU Lower VGPR Encoding
; GCN-O0-NEXT: Branch relaxation pass
; GCN-O0-NEXT: Register Usage Information Collector Pass
; GCN-O0-NEXT: Remove Loads Into Fake Uses
@@ -426,6 +427,7 @@
; GCN-O1-NEXT: SI peephole optimizations
; GCN-O1-NEXT: Post RA hazard recognizer
; GCN-O1-NEXT: AMDGPU Insert waits for SGPR read hazards
+; GCN-O1-NEXT: AMDGPU Lower VGPR Encoding
; GCN-O1-NEXT: AMDGPU Insert Delay ALU
; GCN-O1-NEXT: Branch relaxation pass
; GCN-O1-NEXT: Register Usage Information Collector Pass
@@ -740,6 +742,7 @@
; GCN-O1-OPTS-NEXT: SI peephole optimizations
; GCN-O1-OPTS-NEXT: Post RA hazard recognizer
; GCN-O1-OPTS-NEXT: AMDGPU Insert waits for SGPR read hazards
+; GCN-O1-OPTS-NEXT: AMDGPU Lower VGPR Encoding
; GCN-O1-OPTS-NEXT: AMDGPU Insert Delay ALU
; GCN-O1-OPTS-NEXT: Branch relaxation pass
; GCN-O1-OPTS-NEXT: Register Usage Information Collector Pass
@@ -1060,6 +1063,7 @@
; GCN-O2-NEXT: SI peephole optimizations
; GCN-O2-NEXT: Post RA hazard recognizer
; GCN-O2-NEXT: AMDGPU Insert waits for SGPR read hazards
+; GCN-O2-NEXT: AMDGPU Lower VGPR Encoding
; GCN-O2-NEXT: AMDGPU Insert Delay ALU
; GCN-O2-NEXT: Branch relaxation pass
; GCN-O2-NEXT: Register Usage Information Collector Pass
@@ -1393,6 +1397,7 @@
; GCN-O3-NEXT: SI peephole optimizations
; GCN-O3-NEXT: Post RA hazard recognizer
; GCN-O3-NEXT: AMDGPU Insert waits for SGPR read hazards
+; GCN-O3-NEXT: AMDGPU Lower VGPR Encoding
; GCN-O3-NEXT: AMDGPU Insert Delay ALU
; GCN-O3-NEXT: Branch relaxation pass
; GCN-O3-NEXT: Register Usage Information Collector Pass
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir
new file mode 100644
index 0000000000000..e7d676c6ba05c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir
@@ -0,0 +1,848 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -start-before=amdgpu-lower-vgpr-encoding -o - %s | FileCheck -check-prefixes=GCN,ASM %s
+
+# ASM-LABEL: {{^}}high_vgprs:
+# DIS-LABEL: <high_vgprs>:
+---
+name: high_vgprs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; ASM: %bb.0:
+
+ ; VOP1
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x41
+ ; ASM-SAME: ; msbs: dst=1 src0=1 src1=0 src2=0
+ ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v255 /*v511*/
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr511, implicit $exec
+
+ ; No mask change
+ ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v254 /*v510*/
+ $vgpr257 = V_MOV_B32_e32 undef $vgpr510, implicit $exec
+
+ ; Single bit change
+ ; GCN-NEXT: s_set_vgpr_msb 1
+ ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0
+ ; GCN-NEXT: v_rcp_f32_e64 v255, v2 /*v258*/
+ $vgpr255 = V_RCP_F32_e64 0, undef $vgpr258, 0, 0, implicit $exec, implicit $mode
+
+ ; Reset
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
+ ; GCN-NEXT: v_rcp_f32_e64 v255, v1
+ $vgpr255 = V_RCP_F32_e64 0, undef $vgpr1, 0, 0, implicit $exec, implicit $mode
+
+ ; VOP2
+
+ ; GCN-NEXT: s_set_vgpr_msb 5
+ ; ASM-SAME: ; msbs: dst=0 src0=1 src1=1 src2=0
+ ; GCN-NEXT: v_add_nc_u32_e32 v0, v253 /*v509*/, v252 /*v508*/
+ $vgpr0 = V_ADD_U32_e32 undef $vgpr509, undef $vgpr508, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x44
+ ; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0
+ ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; GCN-NEXT: v_add_f32_e64 v2 /*v258*/, v0, v251 /*v507*/
+ $vgpr258 = V_ADD_F32_e64 0, $vgpr0, 0, undef $vgpr507, 0, 0, implicit $exec, implicit $mode
+
+ ; VOP3
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x55
+ ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=1
+ ; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/
+ $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode
+
+ ; No change
+ ; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/
+ $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode
+
+ ; Tuple crossing the 256 boundary
+ ; GCN-NEXT: s_set_vgpr_msb 17
+ ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=1
+ ; GCN-NEXT: v_mqsad_u32_u8 v[254:257], v[2:3] /*v[258:259]*/, v0, v[244:247] /*v[500:503]*/
+ $vgpr254_vgpr255_vgpr256_vgpr257 = V_MQSAD_U32_U8_e64 $vgpr258_vgpr259, $vgpr0, undef $vgpr500_vgpr501_vgpr502_vgpr503, 0, implicit $exec
+
+ ; DPP/tied operand
+ ; GCN-NEXT: s_set_vgpr_msb 0x45
+ ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=0
+ ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; GCN-NEXT: v_add_nc_u16_e64_dpp v0 /*v256*/, v1 /*v257*/, v2 /*v258*/ quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+ $vgpr256 = V_ADD_NC_U16_fake16_e64_dpp $vgpr256, 0, $vgpr257, 0, undef $vgpr258, 0, 0, 1, 15, 15, 1, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 17
+ ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=1
+ ; GCN-NEXT: v_add3_u32_e64_dpp v0, v1 /*v257*/, v0, v2 /*v258*/ quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+ $vgpr0 = V_ADD3_U32_e64_dpp $vgpr0, $vgpr257, $vgpr0, undef $vgpr258, 1, 15, 15, 1, implicit $exec
+
+ ; DS (addr, data0, and data1 operands)
+
+ ; GCN-NEXT: s_set_vgpr_msb 20
+ ; ASM-SAME: ; msbs: dst=0 src0=0 src1=1 src2=1
+ ; GCN-NEXT: ds_store_2addr_b32 v0, v248 /*v504*/, v249 /*v505*/ offset1:1
+ DS_WRITE2_B32_gfx9 $vgpr0, undef $vgpr504, undef $vgpr505, 0, 1, 0, implicit $exec
+
+ ; Reset
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
+ ; GCN-NEXT: ds_store_2addr_b32 v0, v248, v249 offset1:1
+ DS_WRITE2_B32_gfx9 $vgpr0, undef $vgpr248, undef $vgpr249, 0, 1, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 1
+ ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0
+ ; GCN-NEXT: ds_load_b32 v0, v255 /*v511*/
+ $vgpr0 = DS_READ_B32_gfx9 $vgpr511, 0, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x44
+ ; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0
+ ; GCN-NEXT: ds_add_rtn_u32 v255 /*v511*/, v0, v248 /*v504*/
+ $vgpr511 = DS_ADD_RTN_U32_gfx9 $vgpr0, undef $vgpr504, 0, 0, implicit $exec
+
+ ; Reset
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
+ ; GCN-NEXT: ds_add_rtn_u32 v0, v0, v0
+ $vgpr0 = DS_ADD_RTN_U32_gfx9 $vgpr0, $vgpr0, 0, 0, implicit $exec
+
+ ; FLAT (vaddr, vdata and vdst operands)
+
+ ; GCN-NEXT: s_set_vgpr_msb 1
+ ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0
+ ; GCN-NEXT: global_load_b32 v2, v[2:3] /*v[258:259]*/, off
+ $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr258_vgpr259, 0, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; ASM-SAME: ; msbs: dst=1 src0=0 src1=0 src2=0
+ ; GCN-NEXT: global_load_b32 v255 /*v511*/, v0, s[0:1]
+ $vgpr511 = GLOBAL_LOAD_DWORD_SADDR undef $sgpr0_sgpr1, $vgpr0, 0, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 1
+ ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0
+ ; GCN-NEXT: scratch_load_u8 v0, v255 /*v511*/, s0
+ $vgpr0 = SCRATCH_LOAD_UBYTE_SVS $vgpr511, undef $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
+
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
+ ; GCN-NEXT: global_store_b32 v[0:1], v2, off
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 5
+ ; ASM-SAME: ; msbs: dst=0 src0=1 src1=1 src2=0
+ ; GCN-NEXT: global_store_b32 v[0:1] /*v[256:257]*/, v255 /*v511*/, off
+ GLOBAL_STORE_DWORD $vgpr256_vgpr257, $vgpr511, 0, 0, implicit $exec
+
+ ; No change
+ ; GCN-NEXT: global_store_b96 v[0:1] /*v[256:257]*/, v[244:246] /*v[500:502]*/, off
+ GLOBAL_STORE_DWORDX3 $vgpr256_vgpr257, $vgpr500_vgpr501_vgpr502, 0, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x44
+ ; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0
+ ; GCN-NEXT: flat_atomic_add_u32 v254 /*v510*/, v[0:1], v255 /*v511*/ th:TH_ATOMIC_RETURN
+ $vgpr510 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr511, 0, 1, implicit $exec, implicit $flat_scr
+
+ ; Reset
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
+ ; GCN-NEXT: flat_atomic_add_u32 v0, v[0:1], v255 th:TH_ATOMIC_RETURN
+ $vgpr0 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr255, 0, 1, implicit $exec, implicit $flat_scr
+
+ ; VBUFFER (vdata, vaddr operands)
+
+ ; GCN-NEXT: buffer_load_b32 v1, v0, s[8:11], s3 offen
+ $vgpr1 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr0, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; ASM-SAME: ; msbs: dst=1 src0=0 src1=0 src2=0
+ ; GCN-NEXT: buffer_load_b32 v1 /*v257*/, v0, s[8:11], s3 offen
+ $vgpr257 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr0, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x41
+ ; ASM-SAME: ; msbs: dst=1 src0=1 src1=0 src2=0
+ ; GCN-NEXT: buffer_load_b32 v1 /*v257*/, v0 /*v256*/, s[8:11], s3 offen
+ $vgpr257 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr256, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
+ ; GCN-NEXT: buffer_store_b32 v0, v1, s[0:3], s3 offen
+ BUFFER_STORE_DWORD_VBUFFER_OFFEN $vgpr0, $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x41
+ ; ASM-SAME: ; msbs: dst=1 src0=1 src1=0 src2=0
+ ; GCN-NEXT: buffer_store_b32 v0 /*v256*/, v1 /*v257*/, s[0:3], s3 offen
+ BUFFER_STORE_DWORD_VBUFFER_OFFEN $vgpr256, $vgpr257, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
+ ; GCN-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s3 offen
+ BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN $vgpr0, $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x41
+ ; ASM-SAME: ; msbs: dst=1 src0=1 src1=0 src2=0
+ ; GCN-NEXT: buffer_atomic_add_f32 v0 /*v256*/, v1 /*v257*/, s[0:3], s3 offen
+ BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN $vgpr256, $vgpr257, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, implicit $exec
+
+ ; VGPRs above 512
+
+ ; GCN-NEXT: s_set_vgpr_msb 0xaa
+ ; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=2
+ ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v514*/, v3 /*v515*/
+ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode
+
+ ; GCN-NEXT: s_set_vgpr_msb 0xab
+ ; ASM-SAME: ; msbs: dst=2 src0=3 src1=2 src2=2
+ ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v0 /*v768*/, v2 /*v514*/, v3 /*v515*/
+ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr768, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode
+
+ ; GCN-NEXT: s_set_vgpr_msb 0xae
+ ; ASM-SAME: ; msbs: dst=2 src0=2 src1=3 src2=2
+ ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v770*/, v3 /*v515*/
+ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr770, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode
+
+ ; GCN-NEXT: s_set_vgpr_msb 0xba
+ ; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=3
+ ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v514*/, v3 /*v771*/
+ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr771, 0, 0, implicit $exec, implicit $mode
+
+ ; GCN-NEXT: s_set_vgpr_msb 0xea
+ ; ASM-SAME: ; msbs: dst=3 src0=2 src1=2 src2=2
+ ; GCN-NEXT: v_fma_f32 v255 /*v1023*/, v1 /*v513*/, v2 /*v514*/, v3 /*v515*/
+ $vgpr1023 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode
+
+ ; GCN-NEXT: s_set_vgpr_msb 0xff
+ ; ASM-SAME: ; msbs: dst=3 src0=3 src1=3 src2=3
+ ; GCN-NEXT: v_fma_f32 v0 /*v768*/, v1 /*v769*/, v2 /*v770*/, v3 /*v771*/
+ $vgpr768 = V_FMA_F32_e64 0, undef $vgpr769, 0, undef $vgpr770, 0, undef $vgpr771, 0, 0, implicit $exec, implicit $mode
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x42
+ ; ASM-SAME: ; msbs: dst=1 src0=2 src1=0 src2=0
+ ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0 /*v512*/
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr512, implicit $exec
+
+ ; Reset
+
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
+ ; GCN-NEXT: v_fma_f32 v0, v1, v2, v3
+ $vgpr0 = V_FMA_F32_e64 0, undef $vgpr1, 0, undef $vgpr2, 0, undef $vgpr3, 0, 0, implicit $exec, implicit $mode
+
+ ; Tuples
+
+ ; GCN-NEXT: s_set_vgpr_msb 10
+ ; ASM-SAME: ; msbs: dst=0 src0=2 src1=2 src2=0
+ ; GCN-NEXT: global_store_b96 v[0:1] /*v[512:513]*/, v[0:2] /*v[512:514]*/, off
+ GLOBAL_STORE_DWORDX3 $vgpr512_vgpr513, $vgpr512_vgpr513_vgpr514, 0, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 11
+ ; ASM-SAME: ; msbs: dst=0 src0=3 src1=2 src2=0
+ ; GCN-NEXT: global_store_b64 v[254:255] /*v[1022:1023]*/, v[254:255] /*v[766:767]*/, off
+ GLOBAL_STORE_DWORDX2 $vgpr1022_vgpr1023, $vgpr766_vgpr767, 0, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x55
+ ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=1
+ ; GCN-NEXT: v_wmma_f32_16x16x32_bf16 v[14:21] /*v[270:277]*/, v[26:33] /*v[282:289]*/, v[34:41] /*v[290:297]*/, v[14:21] /*v[270:277]*/
+ early-clobber $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, undef $vgpr282_vgpr283_vgpr284_vgpr285_vgpr286_vgpr287_vgpr288_vgpr289, 8, undef $vgpr290_vgpr291_vgpr292_vgpr293_vgpr294_vgpr295_vgpr296_vgpr297, 8, killed undef $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277, 0, 0, 0, 0, implicit $exec
+
+ ; ASM: NumVgprs: 1024
+
+...
+
+# ASM-LABEL: {{^}}vopd:
+# DIS-LABEL: <vopd>:
+---
+name: vopd
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; ASM: %bb.0:
+
+ ; GCN-NEXT: v_dual_sub_f32 v255, v1, v1 :: v_dual_mul_f32 v6, v0, v0
+ $vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr1, undef $vgpr0, undef $vgpr0, implicit $mode, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_dual_sub_f32 v244 /*v500*/, v1, v2 :: v_dual_mul_f32 v0 /*v256*/, v3, v4
+ $vgpr500, $vgpr256 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr2, undef $vgpr3, undef $vgpr4, implicit $mode, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x41
+ ; GCN-NEXT: v_dual_sub_f32 v244 /*v500*/, s1, v2 :: v_dual_mul_f32 v0 /*v256*/, v44 /*v300*/, v4
+ $vgpr500, $vgpr256 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $sgpr1, undef $vgpr2, undef $vgpr300, undef $vgpr4, implicit $mode, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 4
+ ; GCN-NEXT: v_dual_sub_f32 v255, v1, v44 /*v300*/ :: v_dual_mul_f32 v6, v0, v1 /*v257*/
+ $vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr300, undef $vgpr0, $vgpr257, implicit $mode, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: v_dual_sub_f32 v255, 0, v1 :: v_dual_mul_f32 v6, v44 /*v300*/, v3
+ $vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 0, undef $vgpr1, undef $vgpr300, undef $vgpr3, implicit $mode, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_dual_fmamk_f32 v243 /*v499*/, v0, 0xa, v3 :: v_dual_fmac_f32 v0 /*v256*/, v1, v1
+ $vgpr499, $vgpr256 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1250 undef $vgpr0, 10, undef $vgpr3, undef $vgpr1, undef $vgpr1, $vgpr256, implicit $mode, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 5
+ ; GCN-NEXT: v_dual_mov_b32 v2, v3 /*v259*/ :: v_dual_add_f32 v3, v1 /*v257*/, v2 /*v258*/
+ $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx1250 undef $vgpr259, undef $vgpr257, undef $vgpr258, implicit $exec, implicit $mode
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x44
+ ; GCN-NEXT: v_dual_fmamk_f32 v244 /*v500*/, v0, 0xa, v44 /*v300*/ :: v_dual_fmac_f32 v3 /*v259*/, v1, v1 /*v257*/
+ $vgpr500, $vgpr259 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1250 undef $vgpr0, 10, undef $vgpr300, undef $vgpr1, undef $vgpr257, $vgpr259, implicit $mode, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 16
+ ; GCN-NEXT: v_dual_fma_f32 v0, v6, v6, v44 /*v300*/ :: v_dual_fma_f32 v1, v4, v5, v45 /*v301*/
+ $vgpr0, $vgpr1 = V_DUAL_FMA_F32_e64_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr6, 0, undef $vgpr300, 0, undef $vgpr4, 0, undef $vgpr5, 0, undef $vgpr301, implicit $mode, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: v_dual_fmac_f32 v2, v6, v6 :: v_dual_fma_f32 v3, v4, v5, v3
+ $vgpr2, $vgpr3 = V_DUAL_FMAC_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr6, undef $vgpr2, 0, undef $vgpr4, 0, undef $vgpr5, 0, $vgpr3, implicit $mode, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_dual_fma_f32 v244 /*v500*/, v6, v7, v8 :: v_dual_add_f32 v3 /*v259*/, v4, v5
+ $vgpr500, $vgpr259 = V_DUAL_FMA_F32_e64_X_ADD_F32_e32_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr7, 0, undef $vgpr8, 0, undef $vgpr4, 0, undef $vgpr5, implicit $mode, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 0xae
+ ; GCN-NEXT: v_dual_fmac_f32 v2 /*v514*/, v6 /*v518*/, v8 /*v776*/ :: v_dual_fma_f32 v3 /*v515*/, v4 /*v516*/, v7 /*v775*/, v3 /*v515*/
+ $vgpr514, $vgpr515 = V_DUAL_FMAC_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr518, 0, undef $vgpr776, undef $vgpr514, 0, undef $vgpr516, 0, undef $vgpr775, 0, $vgpr515, implicit $mode, implicit $exec
+
+ ; ASM: NumVgprs: 777
+
+...
+
+# ASM-LABEL: {{^}}fmaak_fmamk:
+# DIS-LABEL: <fmaak_fmamk>:
+---
+name: fmaak_fmamk
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; ASM: %bb.0:
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x45
+ ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2 /*v258*/, 0x1
+ $vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode
+
+ ; GCN-NEXT: s_set_vgpr_msb 5
+ ; GCN-NEXT: v_fmaak_f32 v0, v1 /*v257*/, v2 /*v258*/, 0x1
+ $vgpr0 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x41
+ ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2, 0x1
+ $vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr2, 1, implicit $exec, implicit $mode
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x44
+ ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1, v2 /*v258*/, 0x1
+ $vgpr256 = V_FMAAK_F32 undef $vgpr1, undef $vgpr258, 1, implicit $exec, implicit $mode
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x45
+ ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2 /*v258*/
+ $vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode
+
+ ; GCN-NEXT: s_set_vgpr_msb 5
+ ; GCN-NEXT: v_fmamk_f32 v0, v1 /*v257*/, 0x1, v2 /*v258*/
+ $vgpr0 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x41
+ ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2
+ $vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr2, implicit $exec, implicit $mode
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x44
+ ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1, 0x1, v2 /*v258*/
+ $vgpr256 = V_FMAMK_F32 undef $vgpr1, 1, undef $vgpr258, implicit $exec, implicit $mode
+
+ ; ASM: NumVgprs: 259
+
+...
+
+# ASM-LABEL: {{^}}fmac:
+# DIS-LABEL: <fmac>:
+---
+name: fmac
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; ASM: %bb.0:
+
+ ; Accumulation instructions apply DST to both the destination and one of the source VGPRs
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_fmac_f32_e64 v0 /*v256*/, |v0|, |v1| clamp mul:4
+ $vgpr256 = V_FMAC_F32_e64 2, undef $vgpr0, 2, undef $vgpr1, 2, undef $vgpr256, 1, 2, implicit $mode, implicit $exec
+
+ ; GCN-NEXT: v_fmac_f32_e32 v1 /*v257*/, v0, v1
+ $vgpr257 = V_FMAC_F32_e32 undef $vgpr0, undef $vgpr1, undef $vgpr257, implicit $mode, implicit $exec
+
+ ; ASM: NumVgprs: 258
+
+...
+
+# ASM-LABEL: {{^}}rev_opcodes:
+# DIS-LABEL: <rev_opcodes>:
+---
+name: rev_opcodes
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; ASM: %bb.0:
+
+ ; V_LSHLREV, V_SUBREV: SRC0 and SRC1 apply to the operands in the order in the ISA (before "reversing")
+ ; e.g. v_lshlrev_b32 v0(vdst), v1(src0), v2(src1) // v0 = v2 << v1
+ ; DST applies to V0, SRC0 applies to V1, and SRC1 applies to V2.
+
+ ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: v_lshlrev_b32_e64 v0, v0 /*v256*/, v2
+ $vgpr0 = V_LSHLREV_B32_e64 undef $vgpr256, undef $vgpr2, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 4
+ ; GCN-NEXT: v_lshlrev_b32_e64 v0, v1, v0 /*v256*/
+ $vgpr0 = V_LSHLREV_B32_e64 undef $vgpr1, undef $vgpr256, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: v_subrev_nc_u32_e32 v0, v0 /*v256*/, v2
+ $vgpr0 = V_SUBREV_U32_e32 undef $vgpr256, undef $vgpr2, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 4
+ ; GCN-NEXT: v_subrev_nc_u32_e32 v0, v1, v0 /*v256*/
+ $vgpr0 = V_SUBREV_U32_e32 undef $vgpr1, undef $vgpr256, implicit $exec
+
+ ; ASM: NumVgprs: 257
+...
+
+# ASM-LABEL: {{^}}minimal_mode_change:
+# DIS-LABEL: <minimal_mode_change>:
+---
+name: minimal_mode_change
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; ASM: %bb.0:
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x55
+ ; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/
+ $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode
+
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v2
+ $vgpr0 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr2, implicit $exec
+
+ ; GCN-NEXT: v_mov_b32_e32 v0, v1
+ $vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_mov_b32_e32 v3 /*v259*/, v1
+ $vgpr259 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+
+ ; GCN-NEXT: v_add_nc_u32_e32 v0 /*v256*/, v1, v2
+ $vgpr256 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr2, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: v_fma_f32 v3, v4, v5, s2
+ $vgpr3 = V_FMA_F32_e64 0, undef $vgpr4, 0, undef $vgpr5, 0, undef $sgpr2, 0, 0, implicit $exec, implicit $mode
+
+ ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: v_fma_f32 v3, v4 /*v260*/, v5, 1
+ $vgpr3 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr5, 0, 1, 0, 0, implicit $exec, implicit $mode
+
+ ; GCN-NEXT: s_set_vgpr_msb 4
+ ; GCN-NEXT: v_mov_b32_e32 v0, v1
+ $vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+
+ ; GCN-NEXT: v_add_nc_u32_e32 v2, v1, v3 /*v259*/
+ $vgpr2 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr259, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: v_mov_b32_e32 v0, v0 /*v256*/
+ ; GCN-NEXT: v_add_nc_u32_e32 v1, v1 /*v257*/, v1
+ ; GCN-NEXT: s_set_vgpr_msb 5
+ ; GCN-NEXT: v_add_nc_u32_e32 v2, v2 /*v258*/, v2 /*v258*/
+ $vgpr0 = V_MOV_B32_e32 undef $vgpr256, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 undef $vgpr257, undef $vgpr1, implicit $exec
+ $vgpr2 = V_ADD_U32_e32 undef $vgpr258, undef $vgpr258, implicit $exec
+
+ ; ASM: NumVgprs: 263
+
+...
+
+# ASM-LABEL: {{^}}terminators:
+# DIS-LABEL: <terminators>:
+---
+name: terminators
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; ASM: %bb.0:
+ ; GCN-NEXT: s_nop 0
+ ; GCN-NEXT: s_branch
+ S_NOP 0
+ S_BRANCH %bb.1
+
+ ; No mode switch if it was zero
+
+ bb.1:
+ ; ASM: .LBB{{.*_1}}:
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+
+ ; No mode switch on fall through
+
+ bb.2:
+ ; ASM-NEXT: %bb.2:
+ ; GCN-NEXT: s_nop 0
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_branch
+ S_NOP 0
+ S_BRANCH %bb.3
+
+ ; Reset mode on terminator
+
+ bb.3:
+ ; ASM: .LBB{{.*_3}}:
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_swap_pc_i64
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $exec = S_SWAPPC_B64 undef $sgpr0_sgpr1
+
+ ; Reset mode before a call
+
+ bb.4:
+ ; ASM-NEXT: %bb.4:
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+ ; GCN-NEXT: s_endpgm
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ S_ENDPGM 0
+
+ ; No mode reset before S_ENDPGM
+
+ bb.5:
+ ; ASM-NEXT: %bb.5:
+ ; GCN-NEXT: v_mov_b32_e32 v0, v1
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_pc_i64
+ $vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ S_SETPC_B64 undef $sgpr0_sgpr1, implicit-def $exec
+
+  ; Assume mode zero at block begin even if we did not reset it before
+ ; Reset mode before branch
+
+ bb.6:
+ ; ASM-NEXT: %bb.6:
+ ; GCN-NEXT: s_set_pc_i64
+ S_SETPC_B64 undef $sgpr0_sgpr1, implicit-def $exec
+
+ ; But do not reset mode before a branch if it was zero
+
+ bb.7:
+ ; ASM-NEXT: %bb.7:
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; ASM-NEXT: ; return to shader part epilog
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ SI_RETURN_TO_EPILOG undef $vgpr0, implicit-def $exec
+
+ ; Reset mode before returning from a call
+
+ bb.8:
+ ; ASM-NEXT: %bb.8:
+ ; ASM-NEXT: ; return to shader part epilog
+ SI_RETURN_TO_EPILOG undef $vgpr0, implicit-def $exec
+
+ ; But do not reset mode before a call return if it was zero
+
+ bb.9:
+ ; ASM-NEXT: %bb.9:
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_pc_i64
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ S_SETPC_B64_return undef $sgpr0_sgpr1, implicit-def $exec
+
+ ; ASM: NumVgprs: 257
+...
+
+# ASM-LABEL: {{^}}control_flow:
+# DIS-LABEL: <control_flow>:
+---
+name: control_flow
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; ASM: %bb.0:
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr0, implicit $exec
+
+ bb.1:
+ ; ASM: .LBB{{[0-9]+}}_1:
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_cbranch_scc0
+ $vgpr257 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ S_CBRANCH_SCC0 %bb.1, undef implicit $scc
+
+ bb.2:
+ ; ASM: %bb.2:
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_mov_b32_e32 v2 /*v258*/, v2
+ ; GCN-NEXT: s_endpgm
+ $vgpr258 = V_MOV_B32_e32 undef $vgpr2, implicit $exec
+ S_ENDPGM 0
+...
+
+# ASM-LABEL: {{^}}inline_asm:
+# DIS-LABEL: <inline_asm>:
+---
+name: inline_asm
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; ASM: %bb.0:
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; ASM: def v0
+ ; GCN-NOT: s_set_vgpr_msb
+ ; ASM: use v0
+ ; GCN-NOT: s_set_vgpr_msb
+ ; ASM: use v1
+ ; GCN: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+ ; GCN-NOT: s_set_vgpr_msb
+ ; ASM: no vgprs, mode preserved
+ ; GCN-NOT: s_set_vgpr_msb
+ ; GCN: v_mov_b32_e32 v0 /*v256*/, v1
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ INLINEASM &"; def v0", 1, 327690, def $vgpr0
+ INLINEASM &"; use v0", 1, 327690, $vgpr0
+ INLINEASM &"; use v1", 1, 327690, undef $vgpr1
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ INLINEASM &"; no vgprs, mode preserved", 1, 327690, undef $sgpr0
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+
+ ; ASM: NumVgprs: 257
+...
+
+# ASM-LABEL: {{^}}bundle:
+# DIS-LABEL: <bundle>:
+---
+name: bundle
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; ASM: %bb.0:
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+ ; GCN-NEXT: s_nop 0
+ ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: v_mov_b32_e32 v1, v0 /*v256*/
+ BUNDLE implicit-def $vgpr256 {
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ }
+ BUNDLE implicit $vgpr256 {
+ S_NOP 0
+ $vgpr1 = V_MOV_B32_e32 $vgpr256, implicit $exec
+ }
+
+ ; ASM: NumVgprs: 257
+...
+
+# ASM-LABEL: {{^}}hard_clauses:
+# DIS-LABEL: <hard_clauses>:
+---
+name: hard_clauses
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; ASM: %bb.0:
+
+  ; s_set_vgpr_msb cannot be the first instruction in a clause and must be placed before it.
+
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: s_clause 0x2
+ ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+ ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1
+ ; GCN-NEXT: v_mov_b32_e32 v2 /*v258*/, v1
+  BUNDLE implicit-def $vgpr256, implicit-def $vgpr257, implicit-def $vgpr258, implicit undef $vgpr1 {
+ S_CLAUSE 2
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr257 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr258 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ }
+
+  ; S_CLAUSE 515 means 4 instructions broken into groups of 2.
+  ; A mode change cannot be the first instruction of a group.
+  ; If we cannot insert a mode change right before the clause, just drop it.
+
+ ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+ ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: v_mov_b32_e32 v2, v1
+ ; GCN-NEXT: v_mov_b32_e32 v3, v1
+ BUNDLE implicit-def $vgpr256, implicit-def $vgpr257, implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr1 {
+ S_CLAUSE 515
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr257 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr3 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ }
+
+ ; Check that we properly update the clause length.
+
+ ; GCN-NEXT: s_clause 0x3
+ ; GCN-NEXT: v_mov_b32_e32 v0, v1
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1
+ ; GCN-NEXT: v_mov_b32_e32 v2 /*v258*/, v1
+  BUNDLE implicit-def $vgpr0, implicit-def $vgpr257, implicit-def $vgpr258, implicit undef $vgpr1 {
+ S_CLAUSE 2
+ $vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr257 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr258 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ }
+
+  ; Check that we do not exceed the limit of 63 instructions or the SIMM16 value of 62.
+
+ ; GCN-NEXT: s_clause 0x3e
+ ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: v_mov_b32_e32 v1, v1
+ ; GCN-NEXT: v_mov_b32_e32 v2, v1
+ ; GCN-COUNT-60: v_mov_b32_e32 v1, v1
+ BUNDLE implicit-def $vgpr256, implicit-def $vgpr1, implicit-def $vgpr2, implicit undef $vgpr1 {
+ S_CLAUSE 62
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+ }
+
+ ; ASM: NumVgprs: 259
+...
+
+# ASM-LABEL: {{^}}pseudo:
+# DIS-LABEL: <pseudo>:
+---
+name: pseudo
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ $sgpr0 = SI_ILLEGAL_COPY killed $vgpr0, implicit-def $exec, implicit-def $vcc, implicit $exec
+ ; Just do not assert here.
+ ; ASM: illegal copy v0 to s0
+ SI_RETURN_TO_EPILOG killed $sgpr0
+ S_ENDPGM 0
+...
+
# LD_SCALE operands ignore MSBs and always use the low 256 VGPRs.
+
+# ASM-LABEL: {{^}}ld_scale:
+# DIS-LABEL: <ld_scale>:
+---
+name: ld_scale
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; ASM: %bb.0:
+
+ ; GCN: s_set_vgpr_msb 5
+ ; GCN-NEXT: v_add_nc_u32_e32 v0, v253 /*v509*/, v252 /*v508*/
+ $vgpr0 = V_ADD_U32_e32 undef $vgpr509, undef $vgpr508, implicit $exec
+
+ ; Do not change mode for LD_SCALE.
+
+ ; GCN-NOT: s_set_vgpr_msb
+ ; GCN-NEXT: v_wmma_ld_scale_paired_b32 v1, v2
+ V_WMMA_LD_SCALE_PAIRED_B32 undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, implicit $exec
+
+ ; GCN-NOT: s_set_vgpr_msb
+ ; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[244:259] /*v[500:515]*/, v[10:17], v1, v2
+ $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[100:115], v[100:115], v[10:17], v1, v2
+ $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[0:15], v[10:17], v1, v2
+ $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+
+ ; GCN-NOT: s_set_vgpr_msb
+ ; GCN-NEXT: v_wmma_ld_scale16_paired_b64 v[0:1], v[2:3]
+ V_WMMA_LD_SCALE16_PAIRED_B64 undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 5
+ ; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[244:259] /*v[500:515]*/, v[10:17], v[0:1], v[2:3]
+ $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[100:115], v[100:115], v[10:17], v[0:1], v[2:3]
+ $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[0:15], v[10:17], v[0:1], v[2:3]
+ $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+...
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index 987fb042cd089..2208ae5622386 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -164,6 +164,7 @@ static_library("LLVMAMDGPUCodeGen") {
"AMDGPULowerKernelArguments.cpp",
"AMDGPULowerKernelAttributes.cpp",
"AMDGPULowerModuleLDSPass.cpp",
+ "AMDGPULowerVGPREncoding.cpp",
"AMDGPUMCInstLower.cpp",
"AMDGPUMCResourceInfo.cpp",
"AMDGPUMIRFormatter.cpp",