[llvm] r318505 - AMDGPU: Fix breaking SMEM clauses
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 16 20:18:24 PST 2017
Author: arsenm
Date: Thu Nov 16 20:18:24 2017
New Revision: 318505
URL: http://llvm.org/viewvc/llvm-project?rev=318505&view=rev
Log:
AMDGPU: Fix breaking SMEM clauses
The SMEM soft-clause hazard check was completely ignoring subregisters,
so it was not very useful. Also, only break clauses
if xnack is actually enabled.
Added:
llvm/trunk/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir
Modified:
llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h
llvm/trunk/test/CodeGen/AMDGPU/immv216.ll
llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
llvm/trunk/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
Modified: llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp?rev=318505&r1=318504&r2=318505&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp Thu Nov 16 20:18:24 2017
@@ -40,7 +40,10 @@ GCNHazardRecognizer::GCNHazardRecognizer
CurrCycleInstr(nullptr),
MF(MF),
ST(MF.getSubtarget<SISubtarget>()),
- TII(*ST.getInstrInfo()) {
+ TII(*ST.getInstrInfo()),
+ TRI(TII.getRegisterInfo()),
+ ClauseUses(TRI.getNumRegUnits()),
+ ClauseDefs(TRI.getNumRegUnits()) {
MaxLookAhead = 5;
}
@@ -258,19 +261,35 @@ int GCNHazardRecognizer::getWaitStatesSi
// No-op Hazard Detection
//===----------------------------------------------------------------------===//
-static void addRegsToSet(iterator_range<MachineInstr::const_mop_iterator> Ops,
- std::set<unsigned> &Set) {
+static void addRegUnits(const SIRegisterInfo &TRI,
+ BitVector &BV, unsigned Reg) {
+ for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
+ BV.set(*RUI);
+}
+
+static void addRegsToSet(const SIRegisterInfo &TRI,
+ iterator_range<MachineInstr::const_mop_iterator> Ops,
+ BitVector &Set) {
for (const MachineOperand &Op : Ops) {
if (Op.isReg())
- Set.insert(Op.getReg());
+ addRegUnits(TRI, Set, Op.getReg());
}
}
+void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
+ // XXX: Do we need to worry about implicit operands?
+ addRegsToSet(TRI, MI.defs(), ClauseDefs);
+ addRegsToSet(TRI, MI.uses(), ClauseUses);
+}
+
int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) {
- // SMEM soft clause are only present on VI+
- if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ // SMEM soft clauses are only present on VI+, and only matter if xnack is
+ // enabled.
+ if (!ST.isXNACKEnabled())
return 0;
+ resetClause();
+
// A soft-clause is any group of consecutive SMEM instructions. The
// instructions in this group may return out of order and/or may be
// replayed (i.e. the same instruction issued more than once).
@@ -281,21 +300,16 @@ int GCNHazardRecognizer::checkSMEMSoftCl
// (including itself). If we encounter this situaion, we need to break the
// clause by inserting a non SMEM instruction.
- std::set<unsigned> ClauseDefs;
- std::set<unsigned> ClauseUses;
-
for (MachineInstr *MI : EmittedInstrs) {
-
// When we hit a non-SMEM instruction then we have passed the start of the
// clause and we can stop.
if (!MI || !SIInstrInfo::isSMRD(*MI))
break;
- addRegsToSet(MI->defs(), ClauseDefs);
- addRegsToSet(MI->uses(), ClauseUses);
+ addClauseInst(*MI);
}
- if (ClauseDefs.empty())
+ if (ClauseDefs.none())
return 0;
// FIXME: When we support stores, we need to make sure not to put loads and
@@ -304,21 +318,11 @@ int GCNHazardRecognizer::checkSMEMSoftCl
if (SMEM->mayStore())
return 1;
- addRegsToSet(SMEM->defs(), ClauseDefs);
- addRegsToSet(SMEM->uses(), ClauseUses);
-
- std::vector<unsigned> Result(std::max(ClauseDefs.size(), ClauseUses.size()));
- std::vector<unsigned>::iterator End;
-
- End = std::set_intersection(ClauseDefs.begin(), ClauseDefs.end(),
- ClauseUses.begin(), ClauseUses.end(), Result.begin());
+ addClauseInst(*SMEM);
// If the set of defs and uses intersect then we cannot add this instruction
// to the clause, so we have a hazard.
- if (End != Result.begin())
- return 1;
-
- return 0;
+ return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
Modified: llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h?rev=318505&r1=318504&r2=318505&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h Thu Nov 16 20:18:24 2017
@@ -14,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
#define LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include <list>
@@ -24,6 +25,7 @@ class MachineFunction;
class MachineInstr;
class ScheduleDAG;
class SIInstrInfo;
+class SIRegisterInfo;
class SISubtarget;
class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
@@ -35,6 +37,20 @@ class GCNHazardRecognizer final : public
const MachineFunction &MF;
const SISubtarget &ST;
const SIInstrInfo &TII;
+ const SIRegisterInfo &TRI;
+
+ /// RegUnits of uses in the current soft memory clause.
+ BitVector ClauseUses;
+
+ /// RegUnits of defs in the current soft memory clause.
+ BitVector ClauseDefs;
+
+ void resetClause() {
+ ClauseUses.reset();
+ ClauseDefs.reset();
+ }
+
+ void addClauseInst(const MachineInstr &MI);
int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard);
int getWaitStatesSinceDef(unsigned Reg,
Added: llvm/trunk/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir?rev=318505&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir Thu Nov 16 20:18:24 2017
@@ -0,0 +1,351 @@
+# RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s
+# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s
+
+---
+# Trivial clause at beginning of program
+name: trivial_smem_clause_load_smrd4_x1
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: trivial_smem_clause_load_smrd4_x1
+ ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ S_ENDPGM
+...
+---
+# Trivial clause at beginning of program
+name: trivial_smem_clause_load_smrd4_x2
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: trivial_smem_clause_load_smrd4_x2
+ ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr1 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr1 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ S_ENDPGM
+...
+---
+# Trivial clause at beginning of program
+name: trivial_smem_clause_load_smrd4_x3
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: trivial_smem_clause_load_smrd4_x3
+ ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ ; GCN-NEXT: %sgpr1 = S_LOAD_DWORD_IMM %sgpr6_sgpr7, 0, 0
+ ; GCN-NEXT: %sgpr2 = S_LOAD_DWORD_IMM %sgpr14_sgpr15, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ %sgpr0 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ %sgpr1 = S_LOAD_DWORD_IMM %sgpr6_sgpr7, 0, 0
+ %sgpr2 = S_LOAD_DWORD_IMM %sgpr14_sgpr15, 0, 0
+ S_ENDPGM
+...
+---
+# Trivial clause at beginning of program
+name: trivial_smem_clause_load_smrd4_x4
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: trivial_smem_clause_load_smrd4_x4
+ ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ ; GCN-NEXT: %sgpr1 = S_LOAD_DWORD_IMM %sgpr8_sgpr9, 0, 0
+ ; GCN-NEXT: %sgpr2 = S_LOAD_DWORD_IMM %sgpr14_sgpr15, 0, 0
+ ; GCN-NEXT: %sgpr3 = S_LOAD_DWORD_IMM %sgpr16_sgpr17, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ %sgpr0 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ %sgpr1 = S_LOAD_DWORD_IMM %sgpr8_sgpr9, 0, 0
+ %sgpr2 = S_LOAD_DWORD_IMM %sgpr14_sgpr15, 0, 0
+ %sgpr3 = S_LOAD_DWORD_IMM %sgpr16_sgpr17, 0, 0
+ S_ENDPGM
+...
+---
+# Reuse of same input pointer is OK
+name: trivial_smem_clause_load_smrd4_x2_sameptr
+body: |
+ bb.0:
+ ; GCN-LABEL: name: trivial_smem_clause_load_smrd4_x2_sameptr
+ ; GCN: %sgpr12 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr13 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ %sgpr12 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr13 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ S_ENDPGM
+...
+---
+# 32-bit load partially clobbers its own ptr reg
+name: smrd_load4_overwrite_ptr_lo
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: smrd_load4_overwrite_ptr_lo
+ ; GCN: %sgpr10 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ %sgpr10 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ S_ENDPGM
+...
+---
+# 32-bit load partially clobbers its own ptr reg
+name: smrd_load4_overwrite_ptr_hi
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: smrd_load4_overwrite_ptr_hi
+ ; GCN: %sgpr11 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ %sgpr11 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ S_ENDPGM
+...
+---
+# 64-bit load clobbers its own ptr reg
+name: smrd_load8_overwrite_ptr
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: smrd_load8_overwrite_ptr
+ ; GCN: %sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ %sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0
+ S_ENDPGM
+...
+---
+# vmcnt has 4 bits, so maximum 16 outstanding loads. The waitcnt
+# breaks the clause.
+
+name: break_smem_clause_at_max_smem_clause_size_smrd_load4
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_smem_clause_at_max_smem_clause_size_smrd_load4
+ ; GCN: %sgpr13 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr14 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr15 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr16 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr17 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr18 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr19 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr20 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr21 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr22 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr23 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr24 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr25 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr26 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr27 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr28 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr0 = S_LOAD_DWORD_IMM %sgpr30_sgpr31, 0, 0
+ ; GCN-NEXT: %sgpr0 = S_MOV_B32 %sgpr0, implicit %sgpr13, implicit %sgpr14, implicit %sgpr15, implicit %sgpr16, implicit %sgpr17, implicit %sgpr18, implicit %sgpr19, implicit %sgpr20, implicit %sgpr21, implicit %sgpr22, implicit %sgpr23, implicit %sgpr24, implicit %sgpr25, implicit %sgpr26, implicit %sgpr27, implicit %sgpr28
+ ; GCN-NEXT: S_ENDPGM
+ %sgpr13 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr14 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr15 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr16 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+
+ %sgpr17 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr18 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr19 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr20 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+
+ %sgpr21 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr22 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr23 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr24 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+
+ %sgpr25 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr26 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr27 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr28 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+
+ %sgpr0 = S_LOAD_DWORD_IMM %sgpr30_sgpr31, 0, 0
+ %sgpr0 = S_MOV_B32 %sgpr0, implicit %sgpr13, implicit %sgpr14, implicit %sgpr15, implicit %sgpr16, implicit %sgpr17, implicit %sgpr18, implicit %sgpr19, implicit %sgpr20, implicit %sgpr21, implicit %sgpr22, implicit %sgpr23, implicit %sgpr24, implicit %sgpr25, implicit %sgpr26, implicit %sgpr27, implicit %sgpr28
+ S_ENDPGM
+...
+---
+
+name: break_smem_clause_simple_load_smrd4_lo_ptr
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_smem_clause_simple_load_smrd4_lo_ptr
+ ; GCN: %sgpr10 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: %sgpr12 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ %sgpr10 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr12 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ S_ENDPGM
+...
+---
+
+name: break_smem_clause_simple_load_smrd4_hi_ptr
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_smem_clause_simple_load_smrd4_hi_ptr
+ ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr3 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr3 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ S_ENDPGM
+...
+---
+
+name: break_smem_clause_simple_load_smrd8_ptr
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_smem_clause_simple_load_smrd8_ptr
+ ; GCN: %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: %sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM %sgpr12_sgpr13, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM %sgpr12_sgpr13, 0, 0
+ S_ENDPGM
+...
+---
+
+name: break_smem_clause_simple_load_smrd16_ptr
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_smem_clause_simple_load_smrd16_ptr
+ ; GCN: %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX4_IMM %sgpr6_sgpr7, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX4_IMM %sgpr6_sgpr7, 0, 0
+ S_ENDPGM
+...
+---
+
+name: break_smem_clause_block_boundary_load_smrd8_ptr
+
+body: |
+ ; GCN-LABEL: name: break_smem_clause_block_boundary_load_smrd8_ptr
+ ; GCN: bb.0:
+ ; GCN: successors: %bb.1(0x80000000)
+ ; GCN: %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN: bb.1:
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: %sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM %sgpr12_sgpr13, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ bb.0:
+ %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0
+
+ bb.1:
+ %sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM %sgpr12_sgpr13, 0, 0
+ S_ENDPGM
+...
+---
+# The load clobbers the pointer of the store, so it needs to break.
+
+name: break_smem_clause_store_load_into_ptr_smrd4
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_smem_clause_store_load_into_ptr_smrd4
+ ; GCN: S_STORE_DWORD_IMM %sgpr16, %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr12 = S_LOAD_DWORD_IMM %sgpr14_sgpr15, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ S_STORE_DWORD_IMM %sgpr16, %sgpr10_sgpr11, 0, 0
+ %sgpr12 = S_LOAD_DWORD_IMM %sgpr14_sgpr15, 0, 0
+ S_ENDPGM
+...
+---
+# The load clobbers the data of the store, so it needs to break.
+# FIXME: Would it be better to s_nop and wait later?
+
+name: break_smem_clause_store_load_into_data_smrd4
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_smem_clause_store_load_into_data_smrd4
+ ; GCN: S_STORE_DWORD_IMM %sgpr8, %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr8 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ S_STORE_DWORD_IMM %sgpr8, %sgpr10_sgpr11, 0, 0
+ %sgpr8 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ S_ENDPGM
+...
+---
+# Regular VALU instruction breaks clause, no nop needed
+name: valu_inst_breaks_smem_clause
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: valu_inst_breaks_smem_clause
+ ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %vgpr8 = V_MOV_B32_e32 0, implicit %exec
+ ; GCN-NEXT: %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %vgpr8 = V_MOV_B32_e32 0, implicit %exec
+ %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ S_ENDPGM
+...
+---
+# Regular SALU instruction breaks clause, no nop needed
+name: salu_inst_breaks_smem_clause
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: salu_inst_breaks_smem_clause
+ ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %sgpr8 = S_MOV_B32 0
+ ; GCN-NEXT: %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %sgpr8 = S_MOV_B32 0
+ %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ S_ENDPGM
+...
+---
+name: ds_inst_breaks_smem_clause
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: ds_inst_breaks_smem_clause
+ ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %vgpr8 = DS_READ_B32 %vgpr9, 0, 0, implicit %m0, implicit %exec
+ ; GCN-NEXT: %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %vgpr8 = DS_READ_B32 %vgpr9, 0, 0, implicit %m0, implicit %exec
+ %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ S_ENDPGM
+...
+---
+
+name: flat_inst_breaks_smem_clause
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: flat_inst_breaks_smem_clause
+ ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ ; GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+ S_ENDPGM
+...
+---
+# FIXME: Should this be handled?
+name: implicit_use_breaks_smem_clause
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: implicit_use_breaks_smem_clause
+ ; GCN: %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0, implicit %sgpr12_sgpr13
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: %sgpr12_sgpr13 = S_LOAD_DWORDX2_IMM %sgpr6_sgpr7, 0, 0
+ ; GCN-NEXT: S_ENDPGM
+ %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0, implicit %sgpr12_sgpr13
+ %sgpr12_sgpr13 = S_LOAD_DWORDX2_IMM %sgpr6_sgpr7, 0, 0
+ S_ENDPGM
+...
Modified: llvm/trunk/test/CodeGen/AMDGPU/immv216.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/immv216.ll?rev=318505&r1=318504&r2=318505&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/immv216.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/immv216.ll Thu Nov 16 20:18:24 2017
@@ -282,9 +282,9 @@ define amdgpu_kernel void @add_inline_im
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
; GFX9: buffer_store_dword [[REG]]
-; VI: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
; VI: buffer_load_dword
; VI-NOT: and
+; VI: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
; VI: v_or_b32
Modified: llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll?rev=318505&r1=318504&r2=318505&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll Thu Nov 16 20:18:24 2017
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx901 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s
@@ -428,10 +428,12 @@ define amdgpu_kernel void @v_inserteleme
; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
+; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
+
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
+; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
@@ -455,11 +457,12 @@ define amdgpu_kernel void @v_inserteleme
; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
+; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
+; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll?rev=318505&r1=318504&r2=318505&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll Thu Nov 16 20:18:24 2017
@@ -81,8 +81,8 @@ entry:
}
; GCN-LABEL: {{^}}div_fixup_f16_imm_a_imm_b
-; VI: v_mov_b32_e32 v[[AB_F16:[0-9]+]], 0x4200{{$}}
-; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
+; VI-DAG: v_mov_b32_e32 v[[AB_F16:[0-9]+]], 0x4200{{$}}
+; GCN-DAG: buffer_load_ushort v[[C_F16:[0-9]+]]
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AB_F16]], v[[AB_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
@@ -97,8 +97,8 @@ entry:
}
; GCN-LABEL: {{^}}div_fixup_f16_imm_b_imm_c
-; VI: v_mov_b32_e32 v[[BC_F16:[0-9]+]], 0x4200{{$}}
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
+; VI-DAG: v_mov_b32_e32 v[[BC_F16:[0-9]+]], 0x4200{{$}}
+; GCN-DAG: buffer_load_ushort v[[A_F16:[0-9]+]]
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[BC_F16]], v[[BC_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
@@ -113,8 +113,8 @@ entry:
}
; GCN-LABEL: {{^}}div_fixup_f16_imm_a_imm_c
-; VI: v_mov_b32_e32 v[[AC_F16:[0-9]+]], 0x4200{{$}}
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
+; VI-DAG: v_mov_b32_e32 v[[AC_F16:[0-9]+]], 0x4200{{$}}
+; GCN-DAG: buffer_load_ushort v[[B_F16:[0-9]+]]
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AC_F16]], v[[B_F16]], v[[AC_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
Modified: llvm/trunk/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll?rev=318505&r1=318504&r2=318505&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll Thu Nov 16 20:18:24 2017
@@ -45,14 +45,16 @@ define amdgpu_kernel void @test_sgpr_use
; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
; SI: buffer_load_dword [[VA0:v[0-9]+]]
-; SI: buffer_load_dword [[VA1:v[0-9]+]]
+; SI-NEXT: buffer_load_dword [[VA1:v[0-9]+]]
; GCN-NOT: v_mov_b32
-; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
-; GCN-NOT: v_mov_b32
; VI: buffer_load_dword [[VA0:v[0-9]+]]
-; VI: buffer_load_dword [[VA1:v[0-9]+]]
+; VI-NEXT: buffer_load_dword [[VA1:v[0-9]+]]
+
+; GCN-NOT: v_mov_b32
+; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; GCN-NOT: v_mov_b32
; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SA]], [[VA0]], [[VB]]
; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SA]], [[VA1]], [[VB]]
More information about the llvm-commits
mailing list