[PATCH] D58210: [SelectionDAGLegalize] Improve promotion of CTLZ
Jonas Paulsson via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 13 14:08:00 PST 2019
jonpa created this revision.
jonpa added reviewers: spatel, uweigand, arsenm.
Herald added subscribers: tpr, nhaehnle, wdng, jvesely.
When CTLZ is first promoted from i16 to i32 by type-legalization, and then again from i32 to i64 during Legalization, two unfolded subtractions of constants will remain in output.
This was discussed on llvm-dev, see http://lists.llvm.org/pipermail/llvm-dev/2019-February/thread.html#130115. Sanjay pointed out that the reason for this missed optimization is that DAGCombiner::visitTRUNCATE() is restricted to handle this pre-legalize only.
As an alternative to enabling the DAGCombiner post-legalize also, this is a patch that addresses this issue directly by detecting this case during promotion of CTLZ and propagating the constant to the present add (of negative constant) instead of always creating a new sub.
Does this look reasonable? This is NFC on SystemZ/SPEC, but improves tests both on SystemZ and AMDGPU.
https://reviews.llvm.org/D58210
Files:
lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
test/CodeGen/AMDGPU/ctlz.ll
test/CodeGen/SystemZ/scalar-ctlz.ll
Index: test/CodeGen/SystemZ/scalar-ctlz.ll
===================================================================
--- test/CodeGen/SystemZ/scalar-ctlz.ll
+++ test/CodeGen/SystemZ/scalar-ctlz.ll
@@ -1,6 +1,4 @@
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
-;
-; FIXME: two consecutive immediate adds not fused in i16/i8 functions.
declare i64 @llvm.ctlz.i64(i64, i1)
declare i32 @llvm.ctlz.i32(i32, i1)
@@ -56,8 +54,7 @@
; CHECK-NEXT: # kill
; CHECK-NEXT: llghr %r0, %r2
; CHECK-NEXT: flogr %r2, %r0
-; CHECK-NEXT: aghi %r2, -32
-; CHECK-NEXT: ahi %r2, -16
+; CHECK-NEXT: ahi %r2, -48
; CHECK-NEXT: # kill
; CHECK-NEXT: br %r14
%1 = tail call i16 @llvm.ctlz.i16(i16 %arg, i1 false)
@@ -70,8 +67,7 @@
; CHECK-NEXT: # kill
; CHECK-NEXT: llghr %r0, %r2
; CHECK-NEXT: flogr %r2, %r0
-; CHECK-NEXT: aghi %r2, -32
-; CHECK-NEXT: ahi %r2, -16
+; CHECK-NEXT: ahi %r2, -48
; CHECK-NEXT: # kill
; CHECK-NEXT: br %r14
%1 = tail call i16 @llvm.ctlz.i16(i16 %arg, i1 true)
@@ -84,8 +80,7 @@
; CHECK-NEXT: # kill
; CHECK-NEXT: llgcr %r0, %r2
; CHECK-NEXT: flogr %r2, %r0
-; CHECK-NEXT: aghi %r2, -32
-; CHECK-NEXT: ahi %r2, -24
+; CHECK-NEXT: ahi %r2, -56
; CHECK-NEXT: # kill
; CHECK-NEXT: br %r14
%1 = tail call i8 @llvm.ctlz.i8(i8 %arg, i1 false)
@@ -98,8 +93,7 @@
; CHECK-NEXT: # kill
; CHECK-NEXT: llgcr %r0, %r2
; CHECK-NEXT: flogr %r2, %r0
-; CHECK-NEXT: aghi %r2, -32
-; CHECK-NEXT: ahi %r2, -24
+; CHECK-NEXT: ahi %r2, -56
; CHECK-NEXT: # kill
; CHECK-NEXT: br %r14
%1 = tail call i8 @llvm.ctlz.i8(i8 %arg, i1 true)
Index: test/CodeGen/AMDGPU/ctlz.ll
===================================================================
--- test/CodeGen/AMDGPU/ctlz.ll
+++ test/CodeGen/AMDGPU/ctlz.ll
@@ -112,7 +112,7 @@
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 32, [[FFBH]], vcc
; SI: v_subrev_i32_e32 [[RESULT:v[0-9]+]], vcc, 24, [[SELECT]]
-; VI: v_add_u32_e32 [[RESULT:v[0-9]+]], vcc, -16, [[SELECT]]
+; VI: v_u16_e32 [[RESULT:v[0-9]+]], 24, [[SELECT]]
; GCN: buffer_store_byte [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4142,10 +4142,24 @@
Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
if (Node->getOpcode() == ISD::CTLZ ||
Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
- // Tmp1 = Tmp1 - (sizeinbits(NVT) - sizeinbits(Old VT))
- Tmp1 = DAG.getNode(ISD::SUB, dl, NVT, Tmp1,
- DAG.getConstant(NVT.getSizeInBits() -
- OVT.getSizeInBits(), dl, NVT));
+ unsigned Bits = NVT.getSizeInBits() - OVT.getSizeInBits();
+ // If type-legalization has already promoted the CTLZ once, replace the
+ // add with an updated constant to avoid the extra unfolded add. This
+ // is needed as long as DAGCombiner::visitTRUNCATE() will not help this
+ // case post-legalize.
+ SDValue NewAdd;
+ if (Node->hasOneUse() && Node->use_begin()->getOpcode() == ISD::ADD) {
+ SDNode *Add = *Node->use_begin();
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Add->getOperand(1))) {
+ NewAdd = DAG.getNode(ISD::ADD, dl, OVT, Add->getOperand(0),
+ DAG.getConstant(C->getSExtValue() - Bits, dl, OVT));
+ ReplaceNode(SDValue(Add, 0), NewAdd);
+ }
+ }
+ if (!NewAdd.getNode())
+ // Tmp1 = Tmp1 - (sizeinbits(NVT) - sizeinbits(Old VT))
+ Tmp1 = DAG.getNode(ISD::SUB, dl, NVT, Tmp1,
+ DAG.getConstant(Bits, dl, NVT));
}
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
break;
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D58210.186738.patch
Type: text/x-patch
Size: 3897 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20190213/8fd5ba5a/attachment.bin>
More information about the llvm-commits
mailing list