[llvm] r292299 - [NVPTX] Improve lowering of llvm.ctlz.
Justin Lebar via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 17 16:07:35 PST 2017
Author: jlebar
Date: Tue Jan 17 18:07:35 2017
New Revision: 292299
URL: http://llvm.org/viewvc/llvm-project?rev=292299&view=rev
Log:
[NVPTX] Improve lowering of llvm.ctlz.
Summary:
* Disable "ctlz speculation", which inserts a branch on every ctlz(x) which
has defined behavior on x == 0 to check whether x is, in fact zero.
* Add DAG patterns that avoid re-truncating or re-expanding the result
of the 16- and 64-bit ctlz instructions.
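
As a rough illustration of the first point (a sketch, not part of the diff
below; the PTX noted in the comments is the expected shape, not verbatim llc
output), an llvm.ctlz call with a "false" is-zero-undef flag can now lower to
a bare clz with no guarding branch:

; ctlz(0, false) must return 32; ptx's clz.b32 already returns 32 for a zero
; input, so no zero-check is needed once ctlz is declared cheap to speculate.
define i32 @count_leading_zeros(i32 %x) {
  %n = call i32 @llvm.ctlz.i32(i32 %x, i1 false)   ; -> single clz.b32
  ret i32 %n
}
declare i32 @llvm.ctlz.i32(i32, i1)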
Reviewers: tra
Subscribers: llvm-commits, jholewinski
Differential Revision: https://reviews.llvm.org/D28719
Modified:
llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.h
llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td
llvm/trunk/test/CodeGen/NVPTX/ctlz.ll
Modified: llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.h?rev=292299&r1=292298&r2=292299&view=diff
==============================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.h (original)
+++ llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.h Tue Jan 17 18:07:35 2017
@@ -517,6 +517,12 @@ public:
bool enableAggressiveFMAFusion(EVT VT) const override { return true; }
+ // The default is to transform llvm.ctlz(x, false) (where false indicates that
+ // x == 0 is not undefined behavior) into a branch that checks whether x is 0
+ // and avoids calling ctlz in that case. We have a dedicated ctlz
+ // instruction, so we say that ctlz is cheap to speculate.
+ bool isCheapToSpeculateCtlz() const override { return true; }
+
private:
const NVPTXSubtarget &STI; // cache the subtarget here
SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
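
For reference, a rough sketch (not from this commit) of the guarded IR that
CodeGenPrepare's despeculation would otherwise produce for ctlz(x, false);
returning true from isCheapToSpeculateCtlz() keeps the straight-line form
instead:

define i32 @guarded_ctlz(i32 %x) {
entry:
  %is.zero = icmp eq i32 %x, 0
  br i1 %is.zero, label %done, label %do.clz

do.clz:
  %n = call i32 @llvm.ctlz.i32(i32 %x, i1 true)    ; zero is treated as undef here
  br label %done

done:
  %res = phi i32 [ 32, %entry ], [ %n, %do.clz ]   ; 32 == bit width, for x == 0
  ret i32 %res
}
declare i32 @llvm.ctlz.i32(i32, i1)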
Modified: llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td?rev=292299&r1=292298&r2=292299&view=diff
==============================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td (original)
+++ llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td Tue Jan 17 18:07:35 2017
@@ -2774,18 +2774,32 @@ let hasSideEffects = 0 in {
// 32-bit has a direct PTX instruction
def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
-// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
-// to 64-bit to match the LLVM semantics
+// The return type of the ctlz ISD node is the same as its input, but the PTX
+// clz instruction always returns a 32-bit value. For ctlz.i64, convert the
+// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
+// truncating back down to 32 bits.
def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
-// For 16-bit, we zero-extend to 32-bit, then trunc the result back
-// to 16-bits (ctlz of a 16-bit value is guaranteed to require less
-// than 16 bits to store). We also need to subtract 16 because the
-// high-order 16 zeros were counted.
+// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
+// result back to 16-bits if necessary. We also need to subtract 16 because
+// the high-order 16 zeros were counted.
+//
+// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
+// use to save one SASS instruction (on sm_35 anyway):
+//
+// mov.b32 $tmp, {0xffff, $a}
+// ctlz.b32 $result, $tmp
+//
+// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
+// and then ctlz that value. This way we don't have to subtract 16 from the
+// result. Unfortunately today we don't have a way to generate
+// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
def : Pat<(ctlz Int16Regs:$a),
- (SUBi16ri (CVT_u16_u32 (CLZr32
- (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
- CvtNONE), 16)>;
+ (SUBi16ri (CVT_u16_u32
+ (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
+def : Pat<(i32 (zext (ctlz Int16Regs:$a))),
+ (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
// Population count
let hasSideEffects = 0 in {
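
Two hedged IR examples (mine, not part of the patch; the expected PTX shape is
sketched in the comments, and exact registers and parameter handling will
differ) showing the shapes the two new patterns are meant to match:

define i32 @clz64_as_32(i64 %x) {
  ; The i64 result is immediately truncated to clz's natural 32-bit width, so
  ; the new trunc pattern should select a bare clz.b64 with no cvt.u64.u32.
  %n = call i64 @llvm.ctlz.i64(i64 %x, i1 false)
  %t = trunc i64 %n to i32
  ret i32 %t
}

define i32 @clz16_as_32(i16 %x) {
  ; The i16 result is only used as an i32, so the new zext pattern should
  ; select cvt.u32.u16 + clz.b32 + a subtract of 16, with no conversion back
  ; down to 16 bits.
  %n = call i16 @llvm.ctlz.i16(i16 %x, i1 false)
  %z = zext i16 %n to i32
  ret i32 %z
}

declare i64 @llvm.ctlz.i64(i64, i1)
declare i16 @llvm.ctlz.i16(i16, i1)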
Modified: llvm/trunk/test/CodeGen/NVPTX/ctlz.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/NVPTX/ctlz.ll?rev=292299&r1=292298&r2=292299&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/NVPTX/ctlz.ll (original)
+++ llvm/trunk/test/CodeGen/NVPTX/ctlz.ll Tue Jan 17 18:07:35 2017
@@ -6,39 +6,127 @@ declare i16 @llvm.ctlz.i16(i16, i1) read
declare i32 @llvm.ctlz.i32(i32, i1) readnone
declare i64 @llvm.ctlz.i64(i64, i1) readnone
+; There should be no difference between llvm.ctlz.i32(%a, true) and
+; llvm.ctlz.i32(%a, false): ptx's clz has defined behavior for a zero input
+; (clz.b32(0) returns 32), which already matches LLVM's ctlz(x, false) semantics.
+
+; CHECK-LABEL: myctpop(
define i32 @myctpop(i32 %a) {
-; CHECK: clz.b32
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
%val = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
ret i32 %val
}
-
-define i16 @myctpop16(i16 %a) {
-; CHECK: clz.b32
- %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
- ret i16 %val
+; CHECK-LABEL: myctpop_2(
+define i32 @myctpop_2(i32 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+ %val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
+ ret i32 %val
}
+; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
+; value, so here we have to zero-extend it.
+; CHECK-LABEL: myctpop64(
define i64 @myctpop64(i64 %a) {
-; CHECK: clz.b64
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: cvt.u64.u32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
ret i64 %val
}
+; CHECK-LABEL: myctpop64_2(
+define i64 @myctpop64_2(i64 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: cvt.u64.u32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+ %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
+ ret i64 %val
+}
-
-define i32 @myctpop_2(i32 %a) {
-; CHECK: clz.b32
- %val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
- ret i32 %val
+; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
+; natural return width of ptx's clz.b64 instruction. No conversions should be
+; necessary in the PTX.
+; CHECK-LABEL: myctpop64_as_32(
+define i32 @myctpop64_as_32(i64 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+ %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
+ %trunc = trunc i64 %val to i32
+ ret i32 %trunc
+}
+; CHECK-LABEL: myctpop64_as_32_2(
+define i32 @myctpop64_as_32_2(i64 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+ %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
+ %trunc = trunc i64 %val to i32
+ ret i32 %trunc
}
-define i16 @myctpop16_2(i16 %a) {
-; CHECK: clz.b32
+; ctlz.i16 is implemented by extending the input to i32, computing the result,
+; and then truncating the result back down to i16. But the NVPTX ABI
+; zero-extends i16 return values to i32, so the final truncation doesn't appear
+; in this function.
+; CHECK-LABEL: myctpop_ret16(
+define i16 @myctpop_ret16(i16 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: sub.
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
+ ret i16 %val
+}
+; CHECK-LABEL: myctpop_ret16_2(
+define i16 @myctpop_ret16_2(i16 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: sub.
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
ret i16 %val
}
-define i64 @myctpop64_2(i64 %a) {
-; CHECK: clz.b64
- %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
- ret i64 %val
+; Here we store the result of ctlz.16 into an i16 pointer, so the trunc should
+; remain.
+; CHECK-LABEL: myctpop_store16(
+define void @myctpop_store16(i16 %a, i16* %b) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NEXT: clz.b32
+; CHECK-DAG: cvt.u16.u32
+; CHECK-DAG: sub.
+; CHECK: st.{{[a-z]}}16
+; CHECK: ret;
+ %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
+ store i16 %val, i16* %b
+ ret void
+}
+; CHECK-LABEL: myctpop_store16_2(
+define void @myctpop_store16_2(i16 %a, i16* %b) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NEXT: clz.b32
+; CHECK-DAG: cvt.u16.u32
+; CHECK-DAG: sub.
+; CHECK: st.{{[a-z]}}16
+; CHECK: ret;
+ %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
+ store i16 %val, i16* %b
+ ret void
}