[llvm] r369745 - [AMDGPU] gfx10 atomic optimizer changes.
Jay Foad via llvm-commits
llvm-commits@lists.llvm.org
Fri Aug 23 03:07:43 PDT 2019
Author: foad
Date: Fri Aug 23 03:07:43 2019
New Revision: 369745
URL: http://llvm.org/viewvc/llvm-project?rev=369745&view=rev
Log:
[AMDGPU] gfx10 atomic optimizer changes.
Summary:
Add support for gfx10, where all DPP operations are confined to a
single row of 16 lanes, and for wave32.
Reviewers: arsenm, sheredom, critson, rampitec
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, t-tye, hiraditya, jfb, dstuttard, tpr, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65644
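
For context, the gfx10 scan can be modelled on a plain array. Below is a
minimal C++ sketch (illustrative only, not part of this commit; the
function name is hypothetical) of the new path: four row-confined row_shr
steps, a permlanex16-style combine of lane 15 into lanes 16..31 of each
32-lane half, and, on wave64, a readlane-style combine of lane 31 into
lanes 32..63. Addition stands in for the general binary op; lanes with no
in-row source combine with the identity, which for add is a no-op.

  #include <cstdint>
  #include <vector>

  // Simulated inclusive add-scan across WaveSize lanes (32 or 64), with
  // shifts confined to 16-lane rows as on gfx10. Iterating lanes from
  // high to low models the simultaneous read-then-write of a DPP step.
  std::vector<uint32_t> inclusiveScan(std::vector<uint32_t> V,
                                      unsigned WaveSize) {
    // Row-local Hillis-Steele scan: row_shr:1,2,4,8.
    for (int Shift = 1; Shift <= 8; Shift <<= 1)
      for (int L = (int)WaveSize - 1; L >= 0; --L)
        if (L % 16 >= Shift) // source lane must be in the same row
          V[L] += V[L - Shift];
    // permlanex16: lane 15 into lanes 16..31 of each 32-lane half
    // (row_mask 0xa selects rows 1 and 3).
    for (unsigned Base = 0; Base < WaveSize; Base += 32)
      for (unsigned L = Base + 16; L < Base + 32; ++L)
        V[L] += V[Base + 15];
    // readlane: on wave64, lane 31 into lanes 32..63 (row_mask 0xc).
    if (WaveSize == 64)
      for (unsigned L = 32; L < 64; ++L)
        V[L] += V[31];
    return V;
  }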
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
llvm/trunk/lib/Target/AMDGPU/SIDefines.h
llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp?rev=369745&r1=369744&r2=369745&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp Fri Aug 23 03:07:43 2019
@@ -15,6 +15,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
@@ -24,20 +25,10 @@
#define DEBUG_TYPE "amdgpu-atomic-optimizer"
using namespace llvm;
+using namespace llvm::AMDGPU;
namespace {
-enum DPP_CTRL {
- DPP_ROW_SR1 = 0x111,
- DPP_ROW_SR2 = 0x112,
- DPP_ROW_SR3 = 0x113,
- DPP_ROW_SR4 = 0x114,
- DPP_ROW_SR8 = 0x118,
- DPP_WF_SR1 = 0x138,
- DPP_ROW_BCAST15 = 0x142,
- DPP_ROW_BCAST31 = 0x143
-};
-
struct ReplacementInfo {
Instruction *I;
AtomicRMWInst::BinOp Op;
@@ -52,9 +43,12 @@ private:
const LegacyDivergenceAnalysis *DA;
const DataLayout *DL;
DominatorTree *DT;
- bool HasDPP;
+ const GCNSubtarget *ST;
bool IsPixelShader;
+ Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
+ Value *const Identity) const;
+ Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
bool ValDivergent) const;
@@ -93,8 +87,7 @@ bool AMDGPUAtomicOptimizer::runOnFunctio
DT = DTW ? &DTW->getDomTree() : nullptr;
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
- const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- HasDPP = ST.hasDPP();
+ ST = &TM.getSubtarget<GCNSubtarget>(F);
IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
visit(F);
@@ -152,7 +145,8 @@ void AMDGPUAtomicOptimizer::visitAtomicR
// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget, and the atomic operation is 32
// bits.
- if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
+ if (ValDivergent &&
+ (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
return;
}
@@ -225,7 +219,8 @@ void AMDGPUAtomicOptimizer::visitIntrins
// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget, and the atomic operation is 32
// bits.
- if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
+ if (ValDivergent &&
+ (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
return;
}
@@ -282,6 +277,111 @@ static Value *buildNonAtomicBinOp(IRBuil
return B.CreateSelect(Cond, LHS, RHS);
}
+// Use the builder to create an inclusive scan of V across the wavefront, with
+// all lanes active.
+Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
+ Value *V, Value *const Identity) const {
+ Type *const Ty = V->getType();
+ Module *M = B.GetInsertBlock()->getModule();
+ Function *UpdateDPP =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+ Function *PermLaneX16 =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {});
+ Function *ReadLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+
+ for (unsigned Idx = 0; Idx < 4; Idx++) {
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
+ B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
+ }
+ if (ST->hasDPPBroadcasts()) {
+ // GFX9 has DPP row broadcast operations.
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
+ B.getInt32(0xf), B.getFalse()}));
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
+ B.getInt32(0xf), B.getFalse()}));
+ } else {
+ // On GFX10 all DPP operations are confined to a single row. To get cross-
+ // row operations we have to use permlane or readlane.
+
+ // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
+ // 48..63).
+ Value *const PermX =
+ B.CreateCall(PermLaneX16, {V, V, B.getInt32(-1), B.getInt32(-1),
+ B.getFalse(), B.getFalse()});
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
+ B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
+ if (!ST->isWave32()) {
+ // Combine lane 31 into lanes 32..63.
+ Value *const Lane31 = B.CreateCall(ReadLane, {V, B.getInt32(31)});
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
+ B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
+ }
+ }
+ return V;
+}
+
+// Use the builder to create a shift right of V across the wavefront, with all
+// lanes active, to turn an inclusive scan into an exclusive scan.
+Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
+ Value *const Identity) const {
+ Type *const Ty = V->getType();
+ Module *M = B.GetInsertBlock()->getModule();
+ Function *UpdateDPP =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+ Function *ReadLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+ Function *WriteLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+
+ if (ST->hasDPPWavefrontShifts()) {
+ // GFX9 has DPP wavefront shift operations.
+ V = B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
+ B.getInt32(0xf), B.getFalse()});
+ } else {
+ // On GFX10 all DPP operations are confined to a single row. To get cross-
+ // row operations we have to use permlane or readlane.
+ Value *Old = V;
+ V = B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
+ B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
+
+ // Copy the old lane 15 to the new lane 16.
+ V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
+ B.getInt32(16), V});
+
+ if (!ST->isWave32()) {
+ // Copy the old lane 31 to the new lane 32.
+ V = B.CreateCall(
+ WriteLane,
+ {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});
+
+ // Copy the old lane 47 to the new lane 48.
+ V = B.CreateCall(
+ WriteLane,
+ {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
+ }
+ }
+
+ return V;
+}
+
static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
unsigned BitWidth) {
switch (Op) {
@@ -345,23 +445,29 @@ void AMDGPUAtomicOptimizer::optimizeAtom
// We need to know how many lanes are active within the wavefront, and we do
// this by doing a ballot of active lanes.
+ Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
CallInst *const Ballot = B.CreateIntrinsic(
- Intrinsic::amdgcn_icmp, {B.getInt64Ty(), B.getInt32Ty()},
+ Intrinsic::amdgcn_icmp, {WaveTy, B.getInt32Ty()},
{B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)});
// We need to know how many lanes are active within the wavefront that are
// below us. If we counted each lane linearly starting from 0, a lane is
// below us only if its associated index was less than ours. We do this by
// using the mbcnt intrinsic.
- Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
- Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
- Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
- CallInst *const PartialMbcnt = B.CreateIntrinsic(
- Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
- Value *const Mbcnt =
- B.CreateIntCast(B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
- {ExtractHi, PartialMbcnt}),
- Ty, false);
+ Value *Mbcnt;
+ if (ST->isWave32()) {
+ Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
+ {Ballot, B.getInt32(0)});
+ } else {
+ Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
+ Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
+ Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
+ Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
+ {ExtractLo, B.getInt32(0)});
+ Mbcnt =
+ B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
+ }
+ Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);
Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
@@ -373,45 +479,25 @@ void AMDGPUAtomicOptimizer::optimizeAtom
if (ValDivergent) {
// First we need to set all inactive invocations to the identity value, so
// that they can correctly contribute to the final result.
- CallInst *const SetInactive =
- B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
-
- ExclScan =
- B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,
- {Identity, SetInactive, B.getInt32(DPP_WF_SR1),
- B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
-
- const unsigned Iters = 6;
- const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1, DPP_ROW_SR2,
- DPP_ROW_SR4, DPP_ROW_SR8,
- DPP_ROW_BCAST15, DPP_ROW_BCAST31};
- const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
- const unsigned BankMask[Iters] = {0xf, 0xf, 0xe, 0xc, 0xf, 0xf};
-
- // This loop performs an exclusive scan across the wavefront, with all lanes
- // active (by using the WWM intrinsic).
- for (unsigned Idx = 0; Idx < Iters; Idx++) {
- CallInst *const DPP = B.CreateIntrinsic(
- Intrinsic::amdgcn_update_dpp, Ty,
- {Identity, ExclScan, B.getInt32(DPPCtrl[Idx]),
- B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});
-
- ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP);
- }
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
- NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan);
+ const AtomicRMWInst::BinOp ScanOp =
+ Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
+ NewV = buildScan(B, ScanOp, NewV, Identity);
+ ExclScan = buildShiftRight(B, NewV, Identity);
// Read the value from the last lane, which has accumulated the values of
// each active lane in the wavefront. This will be our new value which we
// will provide to the atomic operation.
+ Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
if (TyBitWidth == 64) {
Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
Value *const ExtractHi =
- B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
+ B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty());
CallInst *const ReadLaneLo = B.CreateIntrinsic(
- Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
+ Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx});
CallInst *const ReadLaneHi = B.CreateIntrinsic(
- Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
+ Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx});
Value *const PartialInsert = B.CreateInsertElement(
UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
Value *const Insert =
@@ -419,7 +505,7 @@ void AMDGPUAtomicOptimizer::optimizeAtom
NewV = B.CreateBitCast(Insert, Ty);
} else if (TyBitWidth == 32) {
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
- {NewV, B.getInt32(63)});
+ {NewV, LastLaneIdx});
} else {
llvm_unreachable("Unhandled atomic bit width");
}
@@ -506,7 +592,7 @@ void AMDGPUAtomicOptimizer::optimizeAtom
if (TyBitWidth == 64) {
Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
Value *const ExtractHi =
- B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
+ B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
CallInst *const ReadFirstLaneLo =
B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
CallInst *const ReadFirstLaneHi =
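
The exclusive-scan conversion admits the same kind of model. Here is a
minimal sketch of the gfx10 buildShiftRight path (illustrative C++ only;
the function name is hypothetical): one row-confined row_shr:1, whose
lane 0 of each row picks up the identity, followed by readlane/writelane
patches at the row boundaries. Note also that the pass now scans
subtraction as addition (the ScanOp above) and leaves the subtract to the
final atomic.

  #include <cstdint>
  #include <vector>

  // Simulated whole-wave shift right by one lane, as buildShiftRight
  // composes it on gfx10. Old is the inclusive scan; the result is the
  // exclusive scan.
  std::vector<uint32_t> shiftRight(const std::vector<uint32_t> &Old,
                                   uint32_t Identity, unsigned WaveSize) {
    std::vector<uint32_t> V(WaveSize);
    // row_shr:1 with full row/bank masks: lane L reads lane L-1 within
    // its row; lane 0 of each row takes the identity operand instead.
    for (unsigned L = 0; L < WaveSize; ++L)
      V[L] = (L % 16 == 0) ? Identity : Old[L - 1];
    // writelane(readlane(Old, 15), 16), etc.: patch the row boundaries.
    V[16] = Old[15];
    if (WaveSize == 64) {
      V[32] = Old[31];
      V[48] = Old[47];
    }
    return V;
  }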
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h?rev=369745&r1=369744&r2=369745&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h Fri Aug 23 03:07:43 2019
@@ -949,6 +949,14 @@ public:
return HasDPP;
}
+ bool hasDPPBroadcasts() const {
+ return HasDPP && getGeneration() < GFX10;
+ }
+
+ bool hasDPPWavefrontShifts() const {
+ return HasDPP && getGeneration() < GFX10;
+ }
+
bool hasDPP8() const {
return HasDPP8;
}
Modified: llvm/trunk/lib/Target/AMDGPU/SIDefines.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIDefines.h?rev=369745&r1=369744&r2=369745&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIDefines.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIDefines.h Fri Aug 23 03:07:43 2019
@@ -444,6 +444,7 @@ namespace DPP {
enum DppCtrl : unsigned {
QUAD_PERM_FIRST = 0,
+ QUAD_PERM_ID = 0xE4, // identity permutation
QUAD_PERM_LAST = 0xFF,
DPP_UNUSED1 = 0x100,
ROW_SHL0 = 0x100,
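
The new QUAD_PERM_ID value follows directly from the quad_perm encoding:
the low byte of the DPP control packs four 2-bit lane selectors, slot 0
in the low bits, so the identity permutation [0,1,2,3] is 0xE4. A
one-line check (illustrative C++):

  // quad_perm:[S0,S1,S2,S3] -> (S3 << 6) | (S2 << 4) | (S1 << 2) | S0
  constexpr unsigned quadPerm(unsigned S0, unsigned S1, unsigned S2,
                              unsigned S3) {
    return (S3 << 6) | (S2 << 4) | (S1 << 2) | S0;
  }
  static_assert(quadPerm(0, 1, 2, 3) == 0xE4, "identity permutation");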
Modified: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll?rev=369745&r1=369744&r2=369745&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll Fri Aug 23 03:07:43 2019
@@ -1,6 +1,8 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1)
@@ -9,11 +11,13 @@ declare i32 @llvm.amdgcn.buffer.atomic.s
; Show what the atomic optimization pass will do for raw buffers.
; GCN-LABEL: add_i32_constant:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
@@ -24,11 +28,13 @@ entry:
}
; GCN-LABEL: add_i32_uniform:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: buffer_atomic_add v[[value]]
@@ -44,14 +50,15 @@ entry:
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
-; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:4 row_mask:0xf bank_mask:0xe
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xc
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
+; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
@@ -76,11 +83,13 @@ entry:
}
; GCN-LABEL: sub_i32_constant:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
@@ -91,11 +100,13 @@ entry:
}
; GCN-LABEL: sub_i32_uniform:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: buffer_atomic_sub v[[value]]
@@ -111,14 +122,15 @@ entry:
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
-; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:4 row_mask:0xf bank_mask:0xe
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xc
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
+; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
Modified: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll?rev=369745&r1=369744&r2=369745&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll Fri Aug 23 03:07:43 2019
@@ -1,17 +1,21 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s
declare i32 @llvm.amdgcn.workitem.id.x()
; Show what the atomic optimization pass will do for global pointers.
; GCN-LABEL: add_i32_constant:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
@@ -22,11 +26,13 @@ entry:
}
; GCN-LABEL: add_i32_uniform:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: {{flat|buffer|global}}_atomic_add v[[value]]
@@ -42,9 +48,10 @@ entry:
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
-; GFX8MORE: v_add_u32_dpp
-; GFX8MORE: v_add_u32_dpp
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; DPPCOMB: v_add_u32_dpp
+; DPPCOMB: v_add_u32_dpp
+; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
+; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
@@ -56,11 +63,13 @@ entry:
}
; GCN-LABEL: add_i64_constant:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
@@ -72,11 +81,13 @@ entry:
}
; GCN-LABEL: add_i64_uniform:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s{{[0-9]+}}, s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) {
entry:
@@ -100,11 +111,13 @@ entry:
}
; GCN-LABEL: sub_i32_constant:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
@@ -115,11 +128,13 @@ entry:
}
; GCN-LABEL: sub_i32_uniform:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: {{flat|buffer|global}}_atomic_sub v[[value]]
@@ -135,9 +150,10 @@ entry:
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
-; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
-; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; DPPCOMB: v_add_u32_dpp
+; DPPCOMB: v_add_u32_dpp
+; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
+; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
@@ -149,11 +165,13 @@ entry:
}
; GCN-LABEL: sub_i64_constant:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
@@ -165,11 +183,13 @@ entry:
}
; GCN-LABEL: sub_i64_uniform:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s{{[0-9]+}}, s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) {
entry:
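
The GCN32/GCN64 split in these checks mirrors the new mbcnt lowering:
wave32 counts the active lanes below the current lane with a single
v_mbcnt_lo over a 32-bit ballot, wave64 chains v_mbcnt_lo and v_mbcnt_hi
over the two halves, and the whole-wave popcount uses s_bcnt1_i32_b32
versus s_bcnt1_i32_b64. A scalar sketch of what both paths compute
(illustrative C++ only; the names are hypothetical):

  #include <cstdint>

  // Active lanes strictly below MyLane. A wave32 ballot only ever has
  // the low 32 bits set, so the same expression covers both wave sizes.
  unsigned activeLanesBelow(uint64_t Ballot, unsigned MyLane) {
    return __builtin_popcountll(Ballot & ((1ull << MyLane) - 1));
  }

  // Whole-wave active-lane count (s_bcnt1_i32_b32 / s_bcnt1_i32_b64).
  unsigned activeLanes(uint64_t Ballot) {
    return __builtin_popcountll(Ballot);
  }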
Modified: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll?rev=369745&r1=369744&r2=369745&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll Fri Aug 23 03:07:43 2019
@@ -1,6 +1,9 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX1064 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32,GFX1032 %s
declare i32 @llvm.amdgcn.workitem.id.x()
@@ -10,11 +13,13 @@ declare i32 @llvm.amdgcn.workitem.id.x()
; Show what the atomic optimization pass will do for local pointers.
; GCN-LABEL: add_i32_constant:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
@@ -25,11 +30,13 @@ entry:
}
; GCN-LABEL: add_i32_uniform:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
@@ -45,9 +52,10 @@ entry:
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GFX8MORE: v_add_u32_dpp
-; GFX8MORE: v_add_u32_dpp
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; DPPCOMB: v_add_u32_dpp
+; DPPCOMB: v_add_u32_dpp
+; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
+; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
@@ -58,12 +66,156 @@ entry:
ret void
}
+define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
+; GFX1032-LABEL: add_i32_varying_gfx1032:
+; GFX1032: v_mov_b32_e32 v2, v0
+; GFX1032: s_or_saveexec_b32 s2, -1
+; GFX1032: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032: v_mov_b32_e32 v1, 0
+; GFX1032: s_mov_b32 exec_lo, s2
+; GFX1032: v_cmp_ne_u32_e64 s2, 1, 0
+; GFX1032: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX1032: s_not_b32 exec_lo, exec_lo
+; GFX1032: v_mov_b32_e32 v2, 0
+; GFX1032: s_not_b32 exec_lo, exec_lo
+; GFX1032: s_or_saveexec_b32 s4, -1
+; GFX1032: v_mov_b32_e32 v3, v1
+; GFX1032: v_mov_b32_e32 v4, v1
+; GFX1032: s_mov_b32 s2, -1
+; GFX1032: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032: v_add_nc_u32_e32 v2, v2, v3
+; GFX1032: v_mov_b32_e32 v3, v1
+; GFX1032: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032: v_add_nc_u32_e32 v2, v2, v3
+; GFX1032: v_mov_b32_e32 v3, v1
+; GFX1032: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032: v_add_nc_u32_e32 v2, v2, v3
+; GFX1032: v_mov_b32_e32 v3, v1
+; GFX1032: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032: v_add_nc_u32_e32 v2, v2, v3
+; GFX1032: v_mov_b32_e32 v3, v2
+; GFX1032: v_permlanex16_b32 v3, v3, -1, -1
+; GFX1032: v_mov_b32_dpp v4, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032: v_add_nc_u32_e32 v2, v2, v4
+; GFX1032: v_readlane_b32 s3, v2, 31
+; GFX1032: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032: v_readlane_b32 s5, v2, 15
+; GFX1032: v_writelane_b32 v1, s5, 16
+; GFX1032: s_mov_b32 exec_lo, s4
+; GFX1032: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032: s_cbranch_execz BB3_2
+; GFX1032: BB3_1:
+; GFX1032: v_mov_b32_e32 v0, local_var32@abs32@lo
+; GFX1032: v_mov_b32_e32 v5, s3
+; GFX1032: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1032: s_waitcnt_vscnt null, 0x0
+; GFX1032: ds_add_rtn_u32 v0, v0, v5
+; GFX1032: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1032: buffer_gl0_inv
+; GFX1032: buffer_gl1_inv
+; GFX1032: BB3_2:
+; GFX1032: v_nop
+; GFX1032: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032: v_readfirstlane_b32 s3, v0
+; GFX1032: v_mov_b32_e32 v0, v1
+; GFX1032: v_add_nc_u32_e32 v0, s3, v0
+; GFX1032: s_mov_b32 s3, 0x31016000
+; GFX1032: s_nop 1
+; GFX1032: s_waitcnt lgkmcnt(0)
+; GFX1032: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032: s_endpgm
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
+ store i32 %old, i32 addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
+; GFX1064-LABEL: add_i32_varying_gfx1064:
+; GFX1064: v_mov_b32_e32 v2, v0
+; GFX1064: s_or_saveexec_b64 s[2:3], -1
+; GFX1064: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064: v_mov_b32_e32 v1, 0
+; GFX1064: s_mov_b64 exec, s[2:3]
+; GFX1064: v_cmp_ne_u32_e64 s[2:3], 1, 0
+; GFX1064: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX1064: v_mbcnt_hi_u32_b32_e64 v0, s3, v0
+; GFX1064: s_not_b64 exec, exec
+; GFX1064: v_mov_b32_e32 v2, 0
+; GFX1064: s_not_b64 exec, exec
+; GFX1064: s_or_saveexec_b64 s[4:5], -1
+; GFX1064: v_mov_b32_e32 v3, v1
+; GFX1064: v_mov_b32_e32 v4, v1
+; GFX1064: s_mov_b32 s2, -1
+; GFX1064: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064: v_add_nc_u32_e32 v2, v2, v3
+; GFX1064: v_mov_b32_e32 v3, v1
+; GFX1064: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064: v_add_nc_u32_e32 v2, v2, v3
+; GFX1064: v_mov_b32_e32 v3, v1
+; GFX1064: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064: v_add_nc_u32_e32 v2, v2, v3
+; GFX1064: v_mov_b32_e32 v3, v1
+; GFX1064: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064: v_add_nc_u32_e32 v2, v2, v3
+; GFX1064: v_mov_b32_e32 v3, v2
+; GFX1064: v_permlanex16_b32 v3, v3, -1, -1
+; GFX1064: v_mov_b32_dpp v4, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064: v_add_nc_u32_e32 v2, v2, v4
+; GFX1064: v_mov_b32_e32 v4, v1
+; GFX1064: v_readlane_b32 s3, v2, 31
+; GFX1064: v_mov_b32_e32 v3, s3
+; GFX1064: v_mov_b32_dpp v4, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064: v_add_nc_u32_e32 v2, v2, v4
+; GFX1064: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064: v_readlane_b32 s3, v2, 15
+; GFX1064: v_readlane_b32 s6, v2, 31
+; GFX1064: v_writelane_b32 v1, s3, 16
+; GFX1064: v_readlane_b32 s3, v2, 63
+; GFX1064: v_writelane_b32 v1, s6, 32
+; GFX1064: v_readlane_b32 s6, v2, 47
+; GFX1064: v_writelane_b32 v1, s6, 48
+; GFX1064: s_mov_b64 exec, s[4:5]
+; GFX1064: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064: s_cbranch_execz BB4_2
+; GFX1064: BB4_1:
+; GFX1064: v_mov_b32_e32 v0, local_var32@abs32@lo
+; GFX1064: v_mov_b32_e32 v5, s3
+; GFX1064: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1064: s_waitcnt_vscnt null, 0x0
+; GFX1064: ds_add_rtn_u32 v0, v0, v5
+; GFX1064: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1064: buffer_gl0_inv
+; GFX1064: buffer_gl1_inv
+; GFX1064: BB4_2:
+; GFX1064: v_nop
+; GFX1064: s_or_b64 exec, exec, s[4:5]
+; GFX1064: v_readfirstlane_b32 s3, v0
+; GFX1064: v_mov_b32_e32 v0, v1
+; GFX1064: v_add_nc_u32_e32 v0, s3, v0
+; GFX1064: s_mov_b32 s3, 0x31016000
+; GFX1064: s_nop 1
+; GFX1064: s_waitcnt lgkmcnt(0)
+; GFX1064: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064: s_endpgm
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
+ store i32 %old, i32 addrspace(1)* %out
+ ret void
+}
+
; GCN-LABEL: add_i64_constant:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
@@ -75,11 +227,13 @@ entry:
}
; GCN-LABEL: add_i64_uniform:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s{{[0-9]+}}, s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
entry:
@@ -103,11 +257,13 @@ entry:
}
; GCN-LABEL: sub_i32_constant:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
@@ -118,11 +274,13 @@ entry:
}
; GCN-LABEL: sub_i32_uniform:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
@@ -138,9 +296,10 @@ entry:
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
-; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; DPPCOMB: v_add_u32_dpp
+; DPPCOMB: v_add_u32_dpp
+; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
+; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
@@ -152,11 +311,13 @@ entry:
}
; GCN-LABEL: sub_i64_constant:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
@@ -168,11 +329,13 @@ entry:
}
; GCN-LABEL: sub_i64_uniform:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s{{[0-9]+}}, s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
entry:
@@ -196,7 +359,8 @@ entry:
}
; GCN-LABEL: and_i32_varying:
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
+; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
@@ -208,7 +372,8 @@ entry:
}
; GCN-LABEL: or_i32_varying:
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
+; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
@@ -220,7 +385,8 @@ entry:
}
; GCN-LABEL: xor_i32_varying:
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
+; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
@@ -232,7 +398,8 @@ entry:
}
; GCN-LABEL: max_i32_varying:
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
+; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
@@ -244,10 +411,11 @@ entry:
}
; GCN-LABEL: max_i64_constant:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
; GCN: ds_max_rtn_i64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
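Note that the constant max/min tests scale nothing by the lane count: max and min are idempotent, so any number of lanes applying the constant 5 is equivalent to one lane applying it once, and the checks only materialize the 64-bit constant as a lo/hi pair. A one-loop illustration of that equivalence:

#include <algorithm>
#include <cstdint>
#include <cassert>

int main() {
  int64_t mem = -7;
  for (int lane = 0; lane < 64; ++lane) // 64 lanes each doing max(mem, 5)
    mem = std::max<int64_t>(mem, 5);
  int64_t once = std::max<int64_t>(-7, 5); // one lane doing it once
  assert(mem == once); // idempotent, so no popcount scaling is needed
  return 0;
}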
@@ -259,7 +427,8 @@ entry:
}
; GCN-LABEL: min_i32_varying:
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
+; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
@@ -271,10 +440,11 @@ entry:
}
; GCN-LABEL: min_i64_constant:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
; GCN: ds_min_rtn_i64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
@@ -286,7 +456,8 @@ entry:
}
; GCN-LABEL: umax_i32_varying:
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
+; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
@@ -298,10 +469,11 @@ entry:
}
; GCN-LABEL: umax_i64_constant:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
; GCN: ds_max_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
@@ -313,7 +485,8 @@ entry:
}
; GCN-LABEL: umin_i32_varying:
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
+; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
@@ -325,10 +498,11 @@ entry:
}
; GCN-LABEL: umin_i64_constant:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
; GCN: ds_min_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
Modified: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll?rev=369745&r1=369744&r2=369745&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll Fri Aug 23 03:07:43 2019
@@ -1,6 +1,8 @@
-; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s
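The two new gfx1010 RUN lines force the wave64 and wave32 feature sets in turn, and the GCN32/GCN64 and GFX8MORE32/GFX8MORE64 prefix sets split the checks by wave size. Roughly, the wave-size-dependent decisions in the pass reduce to the subtarget's wavefront size; an illustrative sketch with a made-up SubtargetInfo type (not the pass's actual API):

#include <cassert>

struct SubtargetInfo {
  unsigned WavefrontSize; // 32 for gfx1010 wave32, otherwise 64
};

// The lane holding the scan total, i.e. the v_readlane_b32 source index.
static unsigned lastLane(const SubtargetInfo &ST) {
  return ST.WavefrontSize - 1;
}

int main() {
  assert(lastLane({32}) == 31); // GFX8MORE32 checks
  assert(lastLane({64}) == 63); // GFX8MORE64 checks
  return 0;
}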
declare i1 @llvm.amdgcn.wqm.vote(i1)
declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1)
@@ -10,11 +12,13 @@ declare void @llvm.amdgcn.buffer.store.f
; GCN-LABEL: add_i32_constant:
; GCN-LABEL: BB0_1:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: buffer_atomic_add v[[value]]
; GCN: v_readfirstlane_b32 s{{[0-9]+}}, v[[value]]
@@ -36,13 +40,15 @@ else:
; GCN-LABEL: add_i32_varying:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
-; GFX8MORE: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GFX8MORE: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GFX8MORE: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GFX8MORE: v_add_u32_dpp
-; GFX8MORE: v_add_u32_dpp
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
-; GFX8MORE: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GFX8MORE32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GFX8MORE64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GFX8MORE: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GFX8MORE64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; DPPCOMB: v_add_u32_dpp
+; DPPCOMB: v_add_u32_dpp
+; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
+; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_add v[[value]]
; GFX8MORE: v_readfirstlane_b32 s{{[0-9]+}}, v[[value]]
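After the single buffer atomic, v_readfirstlane_b32 broadcasts the returned old value to every lane, and each lane recombines it with its own mbcnt-derived prefix to recover the value a serial per-lane atomic would have returned. A sketch of that recombination for the constant-5 case, checked against a serial emulation (the exec mask is a made-up example):

#include <cstdint>
#include <cassert>

int main() {
  const uint64_t exec = 0xF0F0F0F0F0F0F0F0ull; // hypothetical active lanes
  uint32_t mem = 100;
  for (unsigned lane = 0; lane < 64; ++lane) {
    if (!((exec >> lane) & 1))
      continue;
    // What a serial per-lane atomic add of 5 would have returned here.
    uint32_t serialOld = mem;
    mem += 5;
    // What the optimized form reconstructs: broadcast old + mbcnt * 5,
    // where mbcnt counts the active lanes strictly below this one.
    uint32_t mbcnt = __builtin_popcountll(exec & ((1ull << lane) - 1));
    assert(100 + mbcnt * 5 == serialOld);
  }
  return 0;
}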
Modified: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll?rev=369745&r1=369744&r2=369745&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll Fri Aug 23 03:07:43 2019
@@ -1,6 +1,8 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32)
@@ -9,11 +11,13 @@ declare i32 @llvm.amdgcn.raw.buffer.atom
; Show what the atomic optimization pass will do for raw buffers.
; GCN-LABEL: add_i32_constant:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
@@ -24,11 +28,13 @@ entry:
}
; GCN-LABEL: add_i32_uniform:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: buffer_atomic_add v[[value]]
@@ -44,9 +50,10 @@ entry:
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
-; GFX8MORE: v_add_u32_dpp
-; GFX8MORE: v_add_u32_dpp
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; DPPCOMB: v_add_u32_dpp
+; DPPCOMB: v_add_u32_dpp
+; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
+; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
@@ -71,11 +78,13 @@ entry:
}
; GCN-LABEL: sub_i32_constant:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
@@ -86,11 +95,13 @@ entry:
}
; GCN-LABEL: sub_i32_uniform:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: buffer_atomic_sub v[[value]]
@@ -106,9 +117,10 @@ entry:
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
-; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
-; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; DPPCOMB: v_add_u32_dpp
+; DPPCOMB: v_add_u32_dpp
+; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
+; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
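The DPPCOMB lines now expect v_add_u32_dpp even in the subtract tests: the scan only needs the running sum of the lane values, and the subtraction is applied once by the final buffer_atomic_sub. A small equivalence check:

#include <cstdint>
#include <cassert>

int main() {
  uint32_t lanes[4] = {3, 1, 4, 1};
  uint32_t mem = 100, memRef = 100;

  uint32_t sum = 0;
  for (uint32_t v : lanes) sum += v; // built with adds in the DPP scan
  mem -= sum;                        // one atomic sub of the wave total

  for (uint32_t v : lanes) memRef -= v; // lane-by-lane subtraction
  assert(mem == memRef);
  return 0;
}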
Modified: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll?rev=369745&r1=369744&r2=369745&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll Fri Aug 23 03:07:43 2019
@@ -1,6 +1,8 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32)
@@ -9,11 +11,13 @@ declare i32 @llvm.amdgcn.struct.buffer.a
; Show what the atomic optimization pass will do for struct buffers.
; GCN-LABEL: add_i32_constant:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
@@ -24,11 +28,13 @@ entry:
}
; GCN-LABEL: add_i32_uniform:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: buffer_atomic_add v[[value]]
@@ -44,9 +50,10 @@ entry:
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
-; GFX8MORE: v_add_u32_dpp
-; GFX8MORE: v_add_u32_dpp
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; DPPCOMB: v_add_u32_dpp
+; DPPCOMB: v_add_u32_dpp
+; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
+; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
@@ -84,11 +91,13 @@ entry:
}
; GCN-LABEL: sub_i32_constant:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
@@ -99,11 +108,13 @@ entry:
}
; GCN-LABEL: sub_i32_uniform:
-; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
-; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
-; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
-; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
-; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
+; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
+; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
+; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
+; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: buffer_atomic_sub v[[value]]
@@ -119,9 +130,10 @@ entry:
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
-; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
-; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; DPPCOMB: v_add_u32_dpp
+; DPPCOMB: v_add_u32_dpp
+; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
+; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {