[llvm] r351379 - AMDGPU: Adjust the chain for loads writing to the HI part of a register.
Changpeng Fang via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 16 13:32:54 PST 2019
Author: chfang
Date: Wed Jan 16 13:32:53 2019
New Revision: 351379
URL: http://llvm.org/viewvc/llvm-project?rev=351379&view=rev
Log:
AMDGPU: Adjust the chain for loads writing to the HI part of a register.
Summary:
For these loads that write to the HI part of a register, we should chain them to the op that writes to the LO part
of the register to maintain the appropriate order.
Reviewers:
rampitec, arsenm
Differential Revision:
https://reviews.llvm.org/D56454
Added:
llvm/trunk/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
Modified:
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=351379&r1=351378&r2=351379&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Wed Jan 16 13:32:53 2019
@@ -9299,6 +9299,51 @@ SDNode *SITargetLowering::PostISelFoldin
Ops.push_back(ImpDef.getValue(1));
return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
}
+ case AMDGPU::FLAT_LOAD_UBYTE_D16_HI:
+ case AMDGPU::FLAT_LOAD_SBYTE_D16_HI:
+ case AMDGPU::FLAT_LOAD_SHORT_D16_HI:
+ case AMDGPU::GLOBAL_LOAD_UBYTE_D16_HI:
+ case AMDGPU::GLOBAL_LOAD_SBYTE_D16_HI:
+ case AMDGPU::GLOBAL_LOAD_SHORT_D16_HI:
+ case AMDGPU::DS_READ_U16_D16_HI:
+ case AMDGPU::DS_READ_I8_D16_HI:
+ case AMDGPU::DS_READ_U8_D16_HI:
+ case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
+ case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
+ case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
+ case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
+ case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
+ case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: {
+ // For these loads that write to the HI part of a register,
+ // we should chain them to the op that writes to the LO part
+ // of the register to maintain the order.
+ unsigned NumOps = Node->getNumOperands();
+ SDValue OldChain = Node->getOperand(NumOps-1);
+
+ if (OldChain.getValueType() != MVT::Other)
+ break;
+
+ // Look for the chain to replace it with.
+ SDValue Lo = Node->getOperand(NumOps-2);
+ SDNode *LoNode = Lo.getNode();
+ if (LoNode->getNumValues() == 1 ||
+ LoNode->getValueType(LoNode->getNumValues() - 1) != MVT::Other)
+ break;
+
+ SDValue NewChain = Lo.getValue(LoNode->getNumValues() - 1);
+ if (NewChain == OldChain) // Already replaced.
+ break;
+
+ SmallVector<SDValue, 16> Ops;
+ for (unsigned I = 0; I < NumOps-1; ++I)
+ Ops.push_back(Node->getOperand(I));
+ // Replace the Chain.
+ Ops.push_back(NewChain);
+ MachineSDNode *NewNode = DAG.getMachineNode(Opcode, SDLoc(Node),
+ Node->getVTList(), Ops);
+ DAG.setNodeMemRefs(NewNode, Node->memoperands());
+ return NewNode;
+ }
default:
break;
}
Added: llvm/trunk/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/chain-hi-to-lo.ll?rev=351379&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/chain-hi-to-lo.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/chain-hi-to-lo.ll Wed Jan 16 13:32:53 2019
@@ -0,0 +1,141 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}chain_hi_to_lo_private:
+; GCN: buffer_load_ushort [[DST:v[0-9]+]], off, [[RSRC:s\[[0-9]+:[0-9]+\]]], [[SOFF:s[0-9]+]] offset:2
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_short_d16_hi [[DST]], off, [[RSRC]], [[SOFF]]
+define <2 x half> @chain_hi_to_lo_private() {
+bb:
+ %gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1
+ %load_lo = load half, half addrspace(5)* %gep_lo
+ %gep_hi = getelementptr inbounds half, half addrspace(5)* null, i64 0
+ %load_hi = load half, half addrspace(5)* %gep_hi
+
+ %temp = insertelement <2 x half> undef, half %load_lo, i32 0
+ %result = insertelement <2 x half> %temp, half %load_hi, i32 1
+
+ ret <2 x half> %result
+}
+
+; GCN-LABEL: {{^}}chain_hi_to_lo_private_different_bases:
+; GCN: buffer_load_ushort [[DST:v[0-9]+]], v{{[0-9]+}}, [[RSRC:s\[[0-9]+:[0-9]+\]]], [[SOFF:s[0-9]+]] offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_short_d16_hi [[DST]], v{{[0-9]+}}, [[RSRC]], [[SOFF]] offen
+define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %base_lo, half addrspace(5)* %base_hi) {
+bb:
+ %load_lo = load half, half addrspace(5)* %base_lo
+ %load_hi = load half, half addrspace(5)* %base_hi
+
+ %temp = insertelement <2 x half> undef, half %load_lo, i32 0
+ %result = insertelement <2 x half> %temp, half %load_hi, i32 1
+
+ ret <2 x half> %result
+}
+
+; GCN-LABEL: {{^}}chain_hi_to_lo_arithmatic:
+; GCN: v_add_f16_e32 [[DST:v[0-9]+]], 1.0, v{{[0-9]+}}
+; GCN-NEXT: buffer_load_short_d16_hi [[DST]], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+define <2 x half> @chain_hi_to_lo_arithmatic(half addrspace(5)* %base, half %in) {
+bb:
+ %arith_lo = fadd half %in, 1.0
+ %load_hi = load half, half addrspace(5)* %base
+
+ %temp = insertelement <2 x half> undef, half %arith_lo, i32 0
+ %result = insertelement <2 x half> %temp, half %load_hi, i32 1
+
+ ret <2 x half> %result
+}
+
+; GCN-LABEL: {{^}}chain_hi_to_lo_group:
+; GCN: ds_read_u16 [[DST:v[0-9]+]], [[ADDR:v[0-9]+]] offset:2
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ds_read_u16_d16_hi [[DST]], [[ADDR]]
+define <2 x half> @chain_hi_to_lo_group() {
+bb:
+ %gep_lo = getelementptr inbounds half, half addrspace(3)* null, i64 1
+ %load_lo = load half, half addrspace(3)* %gep_lo
+ %gep_hi = getelementptr inbounds half, half addrspace(3)* null, i64 0
+ %load_hi = load half, half addrspace(3)* %gep_hi
+
+ %temp = insertelement <2 x half> undef, half %load_lo, i32 0
+ %result = insertelement <2 x half> %temp, half %load_hi, i32 1
+
+ ret <2 x half> %result
+}
+
+; GCN-LABEL: {{^}}chain_hi_to_lo_group_different_bases:
+; GCN: ds_read_u16 [[DST:v[0-9]+]], v{{[0-9]+}}
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ds_read_u16_d16_hi [[DST]], v{{[0-9]+}}
+define <2 x half> @chain_hi_to_lo_group_different_bases(half addrspace(3)* %base_lo, half addrspace(3)* %base_hi) {
+bb:
+ %load_lo = load half, half addrspace(3)* %base_lo
+ %load_hi = load half, half addrspace(3)* %base_hi
+
+ %temp = insertelement <2 x half> undef, half %load_lo, i32 0
+ %result = insertelement <2 x half> %temp, half %load_hi, i32 1
+
+ ret <2 x half> %result
+}
+
+; GCN-LABEL: {{^}}chain_hi_to_lo_global:
+; GCN: global_load_ushort [[DST:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, off
+; GCN: global_load_short_d16_hi [[DST]], v{{\[[0-9]+:[0-9]+\]}}, off
+define <2 x half> @chain_hi_to_lo_global() {
+bb:
+ %gep_lo = getelementptr inbounds half, half addrspace(1)* null, i64 1
+ %load_lo = load half, half addrspace(1)* %gep_lo
+ %gep_hi = getelementptr inbounds half, half addrspace(1)* null, i64 0
+ %load_hi = load half, half addrspace(1)* %gep_hi
+
+ %temp = insertelement <2 x half> undef, half %load_lo, i32 0
+ %result = insertelement <2 x half> %temp, half %load_hi, i32 1
+
+ ret <2 x half> %result
+}
+
+; GCN-LABEL: {{^}}chain_hi_to_lo_global_different_bases:
+; GCN: global_load_ushort [[DST:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_load_short_d16_hi [[DST]], v{{\[[0-9]+:[0-9]+\]}}, off
+define <2 x half> @chain_hi_to_lo_global_different_bases(half addrspace(1)* %base_lo, half addrspace(1)* %base_hi) {
+bb:
+ %load_lo = load half, half addrspace(1)* %base_lo
+ %load_hi = load half, half addrspace(1)* %base_hi
+
+ %temp = insertelement <2 x half> undef, half %load_lo, i32 0
+ %result = insertelement <2 x half> %temp, half %load_hi, i32 1
+
+ ret <2 x half> %result
+}
+
+; GCN-LABEL: {{^}}chain_hi_to_lo_flat:
+; GCN: flat_load_ushort [[DST:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}
+; GCN: flat_load_short_d16_hi [[DST]], v{{\[[0-9]+:[0-9]+\]}}
+define <2 x half> @chain_hi_to_lo_flat() {
+bb:
+ %gep_lo = getelementptr inbounds half, half* null, i64 1
+ %load_lo = load half, half* %gep_lo
+ %gep_hi = getelementptr inbounds half, half* null, i64 0
+ %load_hi = load half, half* %gep_hi
+
+ %temp = insertelement <2 x half> undef, half %load_lo, i32 0
+ %result = insertelement <2 x half> %temp, half %load_hi, i32 1
+
+ ret <2 x half> %result
+}
+
+; GCN-LABEL: {{^}}chain_hi_to_lo_flat_different_bases:
+; GCN: flat_load_ushort [[DST:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: flat_load_short_d16_hi [[DST]], v{{\[[0-9]+:[0-9]+\]}}
+define <2 x half> @chain_hi_to_lo_flat_different_bases(half* %base_lo, half* %base_hi) {
+bb:
+ %load_lo = load half, half* %base_lo
+ %load_hi = load half, half* %base_hi
+
+ %temp = insertelement <2 x half> undef, half %load_lo, i32 0
+ %result = insertelement <2 x half> %temp, half %load_hi, i32 1
+
+ ret <2 x half> %result
+}
More information about the llvm-commits
mailing list