[llvm] 28233b1 - [AMDGPU] New AMDGPUInsertSingleUseVDST pass (#72388)

via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 24 02:23:10 PST 2023


Author: Jay Foad
Date: 2023-11-24T10:23:06Z
New Revision: 28233b11ac0efd28ab1f6c83842342d2e7f0a4d3

URL: https://github.com/llvm/llvm-project/commit/28233b11ac0efd28ab1f6c83842342d2e7f0a4d3
DIFF: https://github.com/llvm/llvm-project/commit/28233b11ac0efd28ab1f6c83842342d2e7f0a4d3.diff

LOG: [AMDGPU] New AMDGPUInsertSingleUseVDST pass (#72388)

Add support for emitting GFX11.5 s_singleuse_vdst instructions. This is
a power saving feature whereby the compiler can annotate VALU
instructions whose results are known to have only a single use, so the
hardware can in some cases avoid writing the result back to VGPR RAM.

To begin with the pass is disabled by default because of one missing
feature: we need an exclusion list of opcodes that never qualify as
single-use producers and/or consumers. A future patch will implement
this and enable the pass by default.

---------

Co-authored-by: Scott Egerton <scott.egerton at amd.com>

Added: 
    llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
    llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPU.h
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 403014db56171ac..323560a46f31de2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -335,6 +335,9 @@ extern char &SIModeRegisterID;
 void initializeAMDGPUInsertDelayAluPass(PassRegistry &);
 extern char &AMDGPUInsertDelayAluID;
 
+void initializeAMDGPUInsertSingleUseVDSTPass(PassRegistry &);
+extern char &AMDGPUInsertSingleUseVDSTID;
+
 void initializeSIInsertHardClausesPass(PassRegistry &);
 extern char &SIInsertHardClausesID;
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
new file mode 100644
index 000000000000000..93ed77bb6f7efe8
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
@@ -0,0 +1,122 @@
+//===- AMDGPUInsertSingleUseVDST.cpp - Insert s_singleuse_vdst instructions ==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Insert s_singleuse_vdst instructions on GFX11.5+ to mark regions of VALU
+/// instructions that produce single-use VGPR values. If the value is forwarded
+/// to the consumer instruction prior to VGPR writeback, the hardware can
+/// then skip (kill) the VGPR write.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCRegister.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-insert-single-use-vdst"
+
+namespace {
+class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
+private:
+  const SIInstrInfo *SII;
+
+public:
+  static char ID;
+
+  AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {}
+
+  void emitSingleUseVDST(MachineInstr &MI) const {
+    // Mark the following instruction as a single-use producer:
+    //   s_singleuse_vdst { supr0: 1 }
+    BuildMI(*MI.getParent(), MI, DebugLoc(), SII->get(AMDGPU::S_SINGLEUSE_VDST))
+        .addImm(0x1);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    const auto &ST = MF.getSubtarget<GCNSubtarget>();
+    if (!ST.hasVGPRSingleUseHintInsts())
+      return false;
+
+    SII = ST.getInstrInfo();
+    const auto *TRI = &SII->getRegisterInfo();
+    bool InstructionEmitted = false;
+
+    for (MachineBasicBlock &MBB : MF) {
+      DenseMap<MCPhysReg, unsigned> RegisterUseCount; // TODO: MCRegUnits
+
+      // Handle boundaries at the end of basic block separately to avoid
+      // false positives. If they are live at the end of a basic block then
+      // assume it has more uses later on.
+      for (const auto &Liveouts : MBB.liveouts())
+        RegisterUseCount[Liveouts.PhysReg] = 2;
+
+      for (MachineInstr &MI : reverse(MBB.instrs())) {
+        // All registers in all operands need to be single use for an
+        // instruction to be marked as a single use producer.
+        bool AllProducerOperandsAreSingleUse = true;
+
+        for (const auto &Operand : MI.operands()) {
+          if (!Operand.isReg())
+            continue;
+          const auto Reg = Operand.getReg();
+
+          // Count the number of times each register is read.
+          if (Operand.readsReg())
+            RegisterUseCount[Reg]++;
+
+          // Do not attempt to optimise across exec mask changes.
+          if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
+            for (auto &UsedReg : RegisterUseCount)
+              UsedReg.second = 2;
+          }
+
+          // If we are at the point where the register first became live,
+          // check if the operands are single use.
+          if (!MI.modifiesRegister(Reg, TRI))
+            continue;
+          if (RegisterUseCount[Reg] > 1)
+            AllProducerOperandsAreSingleUse = false;
+          // Reset uses count when a register is no longer live.
+          RegisterUseCount.erase(Reg);
+        }
+        if (AllProducerOperandsAreSingleUse && SIInstrInfo::isVALU(MI)) {
+          // TODO: Replace with candidate logging for instruction grouping
+          // later.
+          emitSingleUseVDST(MI);
+          InstructionEmitted = true;
+        }
+      }
+    }
+    return InstructionEmitted;
+  }
+};
+} // namespace
+
+char AMDGPUInsertSingleUseVDST::ID = 0;
+
+char &llvm::AMDGPUInsertSingleUseVDSTID = AMDGPUInsertSingleUseVDST::ID;
+
+INITIALIZE_PASS(AMDGPUInsertSingleUseVDST, DEBUG_TYPE,
+                "AMDGPU Insert SingleUseVDST", false, false)

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 951ed9420594b19..0c38fa32c6f33a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -286,6 +286,12 @@ static cl::opt<bool> EnableSIModeRegisterPass(
   cl::init(true),
   cl::Hidden);
 
+// Enable GFX11.5+ s_singleuse_vdst insertion
+static cl::opt<bool>
+    EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
+                              cl::desc("Enable s_singleuse_vdst insertion"),
+                              cl::init(false), cl::Hidden);
+
 // Enable GFX11+ s_delay_alu insertion
 static cl::opt<bool>
     EnableInsertDelayAlu("amdgpu-enable-delay-alu",
@@ -404,6 +410,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
+  initializeAMDGPUInsertSingleUseVDSTPass(*PR);
   initializeAMDGPUInsertDelayAluPass(*PR);
   initializeSIInsertHardClausesPass(*PR);
   initializeSIInsertWaitcntsPass(*PR);
@@ -1448,6 +1455,9 @@ void GCNPassConfig::addPreEmitPass() {
   // cases.
   addPass(&PostRAHazardRecognizerID);
 
+  if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less))
+    addPass(&AMDGPUInsertSingleUseVDSTID);
+
   if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
     addPass(&AMDGPUInsertDelayAluID);
 

diff  --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 0c0720890794b66..53a33f8210d2a84 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -77,6 +77,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMacroFusion.cpp
   AMDGPUMCInstLower.cpp
   AMDGPUIGroupLP.cpp
+  AMDGPUInsertSingleUseVDST.cpp
   AMDGPUMIRFormatter.cpp
   AMDGPUOpenCLEnqueuedBlockLowering.cpp
   AMDGPUPerfHintAnalysis.cpp

diff  --git a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
new file mode 100644
index 000000000000000..5b75e42b75d2983
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
@@ -0,0 +1,627 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -march=amdgcn -mcpu=gfx1150 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-single-use-vdst %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx1150 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-single-use-vdst %s -o - | FileCheck %s
+
+# One single-use producer.
+---
+name: one_producer
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: one_producer
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr2
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+  bb.1:
+    liveins: $vgpr0, $vgpr2
+...
+
+# One single-use producer of a 64-bit value.
+---
+name: one_producer_64bit
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: one_producer_64bit
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0_vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr2_vgpr3 = V_LSHLREV_B64_e64 0, $vgpr0_vgpr1, implicit $exec
+  ; CHECK-NEXT:   $vgpr4_vgpr5 = V_LSHLREV_B64_e64 0, $vgpr2_vgpr3, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr4_vgpr5
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    $vgpr2_vgpr3 = V_LSHLREV_B64_e64 0, $vgpr0_vgpr1, implicit $exec
+    $vgpr4_vgpr5 = V_LSHLREV_B64_e64 0, $vgpr2_vgpr3, implicit $exec
+  bb.1:
+    liveins: $vgpr4_vgpr5
+...
+
+# Two consecutive single-use producers.
+---
+name: two_producers
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: two_producers
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+  ; CHECK-NEXT:   $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr3
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+  bb.1:
+    liveins: $vgpr0, $vgpr3
+...
+
+# Redefinitions of v0.
+---
+name: redefinitions
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: redefinitions
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  bb.0:
+    liveins: $vgpr0
+    $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+...
+
+# One producer with no consumers.
+---
+name: no_consumer
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: no_consumer
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+  bb.1:
+...
+
+# One consumer with two uses of the same value.
+---
+name: one_consumer_two_uses
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: one_consumer_two_uses
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr2 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr2
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr2 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+  bb.1:
+    liveins: $vgpr0, $vgpr2
+...
+
+# A longer example.
+---
+name: longer_example
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: longer_example
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr3, $vgpr5, $sgpr0, $sgpr2, $sgpr4, $sgpr5, $sgpr16, $sgpr17, $sgpr18, $sgpr19
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr14 = V_MUL_F32_e32 $sgpr4, $vgpr3, implicit $exec, implicit $mode
+  ; CHECK-NEXT:   $sgpr3 = S_MUL_F16 $sgpr0, $sgpr2, implicit $mode
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr15 = V_MUL_F32_e32 $sgpr5, $vgpr3, implicit $exec, implicit $mode
+  ; CHECK-NEXT:   $vgpr17 = V_FMA_F32_e64 0, $sgpr16, 0, $vgpr5, 0, $vgpr14, 0, 0, implicit $exec, implicit $mode
+  ; CHECK-NEXT:   $sgpr1 = S_ADD_F16 $sgpr0, 15360, implicit $mode
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr15 = V_FMA_F32_e64 0, $sgpr17, 0, $vgpr5, 0, $vgpr15, 0, 0, implicit $exec, implicit $mode
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr14 = V_FMA_F32_e64 0, $sgpr18, 0, $vgpr15, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode
+  ; CHECK-NEXT:   $vgpr15 = V_FMA_F32_e64 0, $sgpr19, 0, $vgpr14, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode
+  ; CHECK-NEXT:   $vgpr16 = V_LOG_F32_e32 $vgpr15, implicit $exec, implicit $mode
+  ; CHECK-NEXT:   $vgpr18 = V_EXP_F32_e32 $vgpr15, implicit $exec, implicit $mode
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr16, $vgpr18
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    liveins: $vgpr3, $vgpr5, $sgpr0, $sgpr2, $sgpr4, $sgpr5, $sgpr16, $sgpr17, $sgpr18, $sgpr19
+    $vgpr14 = V_MUL_F32_e32 $sgpr4, $vgpr3, implicit $exec, implicit $mode
+    $sgpr3 = S_MUL_F16 $sgpr0, $sgpr2, implicit $mode
+    $vgpr15 = V_MUL_F32_e32 $sgpr5, $vgpr3, implicit $exec, implicit $mode
+    $vgpr17 = V_FMA_F32_e64 0, $sgpr16, 0, $vgpr5, 0, $vgpr14, 0, 0, implicit $exec, implicit $mode
+    $sgpr1 = S_ADD_F16 $sgpr0, 15360, implicit $mode
+    $vgpr15 = V_FMA_F32_e64 0, $sgpr17, 0, $vgpr5, 0, $vgpr15, 0, 0, implicit $exec, implicit $mode
+    $vgpr14 = V_FMA_F32_e64 0, $sgpr18, 0, $vgpr15, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode
+    $vgpr15 = V_FMA_F32_e64 0, $sgpr19, 0, $vgpr14, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode
+    $vgpr16 = V_LOG_F32_e32 $vgpr15, implicit $exec, implicit $mode
+    $vgpr18 = V_EXP_F32_e32 $vgpr15, implicit $exec, implicit $mode
+  bb.1:
+    liveins: $vgpr16, $vgpr18
+...
+
+# Multiple uses of v0.
+---
+name: multiple_uses_1
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: multiple_uses_1
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr1, $vgpr2
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    liveins: $vgpr0
+    $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+    liveins: $vgpr1, $vgpr2
+...
+
+# Multiple uses of v0 and redefinitions of v1 and v2.
+---
+name: multiple_uses_2
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: multiple_uses_2
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr1, $vgpr2
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    liveins: $vgpr0
+    $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+    liveins: $vgpr1, $vgpr2
+...
+
+# Multiple uses of all but v1.
+---
+name: multiple_uses_3
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: multiple_uses_3
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr2, $vgpr3
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    liveins: $vgpr0
+    $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec
+  bb.1:
+    liveins: $vgpr2, $vgpr3
+...
+
+# Results are live-in to another basic block.
+---
+name: basic_block_1
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: basic_block_1
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   liveins: $vgpr1, $vgpr2
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    liveins: $vgpr0
+    $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.2:
+    liveins: $vgpr1, $vgpr2
+...
+
+# Result v2 has multiple uses in another basic block.
+---
+name: basic_block_2
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: basic_block_2
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec
+  ; CHECK-NEXT:   $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   liveins: $vgpr3
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec
+  bb.1:
+    liveins: $vgpr2
+    $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec
+  bb.2:
+    liveins: $vgpr3
+...
+
+# Results are redefined in another basic block.
+---
+name: basic_block_3
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: basic_block_3
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+  ; CHECK-NEXT:   $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    liveins: $vgpr0
+    $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+    liveins: $vgpr0, $vgpr1
+    $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+    $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+  bb.2:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+...
+
+# Exec modified between producer and consumer.
+---
+name: exec_mask
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: exec_mask
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   $exec = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $exec = COPY $sgpr0_sgpr1
+    $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+    liveins: $vgpr0
+...
+
+# Exec_lo modified between producer and consumer.
+---
+name: exec_mask_lo
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: exec_mask_lo
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   $exec_lo = COPY $sgpr0
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    liveins: $sgpr0
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $exec_lo = COPY $sgpr0
+    $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+    liveins: $vgpr0
+...
+
+# Exec_hi modified between producer and consumer.
+---
+name: exec_mask_hi
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: exec_mask_hi
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   $exec_hi = COPY $sgpr0
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    liveins: $sgpr0
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $exec_hi = COPY $sgpr0
+    $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+    liveins: $vgpr0
+...
+
+# Write 32-bit vgpr and then read from low 16 bits.
+---
+name: write_full_read_lo
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: write_full_read_lo
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr1_lo16
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec
+  bb.1:
+    liveins: $vgpr1_lo16
+...
+
+# Write 32-bit vgpr and then read from high 16 bits.
+---
+name: write_full_read_hi
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: write_full_read_hi
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr1_hi16
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec
+  bb.1:
+    liveins: $vgpr1_hi16
+...
+
+# Write 32-bit vgpr and then read from both halves.
+---
+name: write_full_read_both
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: write_full_read_both
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec
+    $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec
+  bb.1:
+    liveins: $vgpr1
+...
+
+# Write 32-bit vgpr and then read from both halves in the same instruction.
+---
+name: write_full_read_both_same_instruction
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: write_full_read_both_same_instruction
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   $vgpr1_lo16 = V_ADD_F16_t16_e32 $vgpr0_lo16, $vgpr0_hi16, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr1_lo16
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1_lo16 = V_ADD_F16_t16_e32 $vgpr0_lo16, $vgpr0_hi16, implicit $mode, implicit $exec
+  bb.1:
+    liveins: $vgpr1_lo16
+...
+
+# Write low 16-bits and then read 32-bit vgpr.
+---
+name: write_lo_read_full
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: write_lo_read_full
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    liveins: $vgpr0
+    $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+    liveins: $vgpr1
+...
+
+# Write high 16-bits and then read 32-bit vgpr.
+---
+name: write_hi_read_full
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: write_hi_read_full
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  bb.0:
+    liveins: $vgpr0
+    $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+    liveins: $vgpr1
+...


        


More information about the llvm-commits mailing list