[llvm] r256072 - AMDGPU: fix overlapping copies in copyPhysReg
Nicolai Haehnle via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 18 17:16:07 PST 2015
Author: nha
Date: Fri Dec 18 19:16:06 2015
New Revision: 256072
URL: http://llvm.org/viewvc/llvm-project?rev=256072&view=rev
Log:
AMDGPU: fix overlapping copies in copyPhysReg
Summary:
When copying aggregate registers within the same register class, there may
be an overlap between source and destination that forces us to do the copy
backwards.
Do the simplest possible thing that guarantees the correct order of moves
when there are overlaps, and does whatever when there is no overlap. (The
last part forces some trivial adjustments to test cases.)
Together with r255906, this fixes a VM fault in Unreal Elemental Demo.
While at it, change the generation of kill and def flags to something that
looks more reasonable. This method is used very late during compilation, so
it probably doesn't matter in practice, and to be honest, I don't know if
this change is actually correct because the semantics in connection with
aggregate registers vs. sub-registers are not clear to me.
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93264
Reviewers: arsenm, tstellarAMD
Subscribers: arsenm, llvm-commits
Differential Revision: http://reviews.llvm.org/D15622
Modified:
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll
llvm/trunk/test/CodeGen/AMDGPU/flat-address-space.ll
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=256072&r1=256071&r2=256072&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Fri Dec 18 19:16:06 2015
@@ -323,28 +323,29 @@ SIInstrInfo::copyPhysReg(MachineBasicBlo
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
- AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0
+ AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
};
static const int16_t Sub0_7[] = {
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
- AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0
+ AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
};
static const int16_t Sub0_3[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
};
static const int16_t Sub0_2[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
};
static const int16_t Sub0_1[] = {
- AMDGPU::sub0, AMDGPU::sub1, 0
+ AMDGPU::sub0, AMDGPU::sub1,
};
unsigned Opcode;
- const int16_t *SubIndices;
+ ArrayRef<int16_t> SubIndices;
+ bool Forward;
if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
@@ -428,13 +429,27 @@ SIInstrInfo::copyPhysReg(MachineBasicBlo
llvm_unreachable("Can't copy register!");
}
- while (unsigned SubIdx = *SubIndices++) {
+ if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg))
+ Forward = true;
+ else
+ Forward = false;
+
+ for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
+ unsigned SubIdx;
+ if (Forward)
+ SubIdx = SubIndices[Idx];
+ else
+ SubIdx = SubIndices[SubIndices.size() - Idx - 1];
+
MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
get(Opcode), RI.getSubReg(DestReg, SubIdx));
- Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc));
+ Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
+
+ if (Idx == SubIndices.size() - 1)
+ Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit);
- if (*SubIndices)
+ if (Idx == 0)
Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
}
}
Modified: llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll?rev=256072&r1=256071&r2=256072&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll Fri Dec 18 19:16:06 2015
@@ -117,8 +117,8 @@ define void @v_ctpop_v4i64(<4 x i32> add
; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd
; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34
; GCN: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
-; GCN: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]]
-; GCN: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]]
+; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]]
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]]
; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}}
; GCN: s_endpgm
define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
Modified: llvm/trunk/test/CodeGen/AMDGPU/flat-address-space.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/flat-address-space.ll?rev=256072&r1=256071&r2=256072&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/flat-address-space.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/flat-address-space.ll Fri Dec 18 19:16:06 2015
@@ -11,9 +11,12 @@
; remove generic pointers.
; CHECK-LABEL: {{^}}store_flat_i32:
-; CHECK: v_mov_b32_e32 v[[DATA:[0-9]+]], {{s[0-9]+}}
-; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}}
-; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}}
+; CHECK-DAG: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]],
+; CHECK-DAG: s_load_dword s[[SDATA:[0-9]+]],
+; CHECK: s_waitcnt lgkmcnt(0)
+; CHECK-DAG: v_mov_b32_e32 v[[DATA:[0-9]+]], s[[SDATA]]
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
; CHECK: flat_store_dword v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
%fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
More information about the llvm-commits
mailing list