[llvm] r303347 - [AMDGPU] SDWA operands should not intersect with potential MIs
Sam Kolton via llvm-commits
llvm-commits at lists.llvm.org
Thu May 18 05:12:04 PDT 2017
Author: skolton
Date: Thu May 18 07:12:03 2017
New Revision: 303347
URL: http://llvm.org/viewvc/llvm-project?rev=303347&view=rev
Log:
[AMDGPU] SDWA operands should not intersect with potential MIs
Summary:
There should be no intesection between SDWA operands and potential MIs. E.g.:
```
v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
v_add_u32 v3, v4, v2
```
In that example it is possible that we would fold 2nd instruction into 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was already destroyed). So if SDWAOperand is also a potential MI then do not apply it.
Reviewers: vpykhtin, arsenm
Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye
Differential Revision: https://reviews.llvm.org/D32804
Modified:
llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole.ll
Modified: llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp?rev=303347&r1=303346&r2=303347&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp Thu May 18 07:12:03 2017
@@ -30,6 +30,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include <unordered_map>
+#include <unordered_set>
using namespace llvm;
@@ -44,26 +45,29 @@ namespace {
class SDWAOperand;
class SIPeepholeSDWA : public MachineFunctionPass {
+public:
+ typedef SmallVector<SDWAOperand *, 4> SDWAOperandsVector;
+
private:
MachineRegisterInfo *MRI;
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
+ std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
Optional<int64_t> foldToImm(const MachineOperand &Op) const;
public:
static char ID;
- typedef SmallVector<std::unique_ptr<SDWAOperand>, 4> SDWAOperandsVector;
-
SIPeepholeSDWA() : MachineFunctionPass(ID) {
initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
}
bool runOnMachineFunction(MachineFunction &MF) override;
void matchSDWAOperands(MachineFunction &MF);
+ bool isConvertibleToSDWA(const MachineInstr &MI) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
StringRef getPassName() const override { return "SI Peephole SDWA"; }
@@ -468,7 +472,7 @@ void SIPeepholeSDWA::matchSDWAOperands(M
if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
auto SDWADst =
- make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
+ make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
SDWAOperands[&MI] = std::move(SDWADst);
++NumSDWAPatternsFound;
@@ -575,8 +579,7 @@ void SIPeepholeSDWA::matchSDWAOperands(M
}
}
-bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
- const SDWAOperandsVector &SDWAOperands) {
+bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const {
// Check if this instruction can be converted to SDWA:
// 1. Does this opcode support SDWA
if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
@@ -588,6 +591,11 @@ bool SIPeepholeSDWA::convertToSDWA(Machi
return false;
}
+ return true;
+}
+
+bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
+ const SDWAOperandsVector &SDWAOperands) {
// Convert to sdwa
int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
assert(SDWAOpcode != -1);
@@ -664,7 +672,18 @@ bool SIPeepholeSDWA::convertToSDWA(Machi
// Apply all sdwa operand pattenrs
bool Converted = false;
for (auto &Operand : SDWAOperands) {
- Converted |= Operand->convertToSDWA(*SDWAInst, TII);
+ // There should be no intesection between SDWA operands and potential MIs
+ // e.g.:
+ // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
+ // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
+ // v_add_u32 v3, v4, v2
+ //
+ // In that example it is possible that we would fold 2nd instruction into 3rd
+ // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
+ // already destroyed). So if SDWAOperand is also a potential MI then do not
+ // apply it.
+ if (PotentialMatches.count(Operand->getParentInst()) == 0)
+ Converted |= Operand->convertToSDWA(*SDWAInst, TII);
}
if (!Converted) {
SDWAInst->eraseFromParent();
@@ -690,16 +709,15 @@ bool SIPeepholeSDWA::runOnMachineFunctio
MRI = &MF.getRegInfo();
TRI = ST.getRegisterInfo();
TII = ST.getInstrInfo();
-
- std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
-
+
+ // Find all SDWA operands in MF.
matchSDWAOperands(MF);
- for (auto &OperandPair : SDWAOperands) {
- auto &Operand = OperandPair.second;
+ for (const auto &OperandPair : SDWAOperands) {
+ const auto &Operand = OperandPair.second;
MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
- if (PotentialMI) {
- PotentialMatches[PotentialMI].push_back(std::move(Operand));
+ if (PotentialMI && isConvertibleToSDWA(*PotentialMI)) {
+ PotentialMatches[PotentialMI].push_back(Operand.get());
}
}
@@ -708,6 +726,7 @@ bool SIPeepholeSDWA::runOnMachineFunctio
convertToSDWA(PotentialMI, PotentialPair.second);
}
+ PotentialMatches.clear();
SDWAOperands.clear();
return false;
}
Modified: llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole.ll?rev=303347&r1=303346&r2=303347&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole.ll Thu May 18 07:12:03 2017
@@ -393,3 +393,53 @@ store_label:
store <2 x i16> %add, <2 x i16> addrspace(1)* %out, align 4
ret void
}
+
+
+; Check that "pulling out" SDWA operands works correctly.
+; GCN-LABEL: {{^}}pulled_out_test:
+; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_and_b32_sdwa
+; NOSDWA-NOT: v_or_b32_sdwa
+
+; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+
+define amdgpu_kernel void @pulled_out_test(<8 x i8> addrspace(1)* %sourceA, <8 x i8> addrspace(1)* %destValues) {
+entry:
+ %idxprom = ashr exact i64 15, 32
+ %arrayidx = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %sourceA, i64 %idxprom
+ %tmp = load <8 x i8>, <8 x i8> addrspace(1)* %arrayidx, align 8
+
+ %tmp1 = extractelement <8 x i8> %tmp, i32 0
+ %tmp2 = extractelement <8 x i8> %tmp, i32 1
+ %tmp3 = extractelement <8 x i8> %tmp, i32 2
+ %tmp4 = extractelement <8 x i8> %tmp, i32 3
+ %tmp5 = extractelement <8 x i8> %tmp, i32 4
+ %tmp6 = extractelement <8 x i8> %tmp, i32 5
+ %tmp7 = extractelement <8 x i8> %tmp, i32 6
+ %tmp8 = extractelement <8 x i8> %tmp, i32 7
+
+ %tmp9 = insertelement <2 x i8> undef, i8 %tmp1, i32 0
+ %tmp10 = insertelement <2 x i8> %tmp9, i8 %tmp2, i32 1
+ %tmp11 = insertelement <2 x i8> undef, i8 %tmp3, i32 0
+ %tmp12 = insertelement <2 x i8> %tmp11, i8 %tmp4, i32 1
+ %tmp13 = insertelement <2 x i8> undef, i8 %tmp5, i32 0
+ %tmp14 = insertelement <2 x i8> %tmp13, i8 %tmp6, i32 1
+ %tmp15 = insertelement <2 x i8> undef, i8 %tmp7, i32 0
+ %tmp16 = insertelement <2 x i8> %tmp15, i8 %tmp8, i32 1
+
+ %tmp17 = shufflevector <2 x i8> %tmp10, <2 x i8> %tmp12, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp18 = shufflevector <2 x i8> %tmp14, <2 x i8> %tmp16, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp19 = shufflevector <4 x i8> %tmp17, <4 x i8> %tmp18, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+ %arrayidx5 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %destValues, i64 %idxprom
+ store <8 x i8> %tmp19, <8 x i8> addrspace(1)* %arrayidx5, align 8
+ ret void
+}
More information about the llvm-commits
mailing list