[PATCH] D11926: [NVPTX] Use 32-bit divides instead of 64-bit divides where possible
Mark Heffernan via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 11 12:17:46 PDT 2015
meheff updated this revision to Diff 31848.
meheff added a comment.
Test added. Unfortunately, the stand-alone Eigen3 benchmarks don't show much improvement with this patch because, I believe, they use 32-bit indices throughout. Where we see the huge speedup is in the larger-scale benchmarks using Eigen with 64-bit indices.
http://reviews.llvm.org/D11926
Files:
lib/Target/NVPTX/NVPTXISelLowering.cpp
test/CodeGen/NVPTX/bypass-div.ll
Index: test/CodeGen/NVPTX/bypass-div.ll
===================================================================
--- test/CodeGen/NVPTX/bypass-div.ll
+++ test/CodeGen/NVPTX/bypass-div.ll
@@ -0,0 +1,80 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s
+
+; 64-bit divides and rems should be split into a fast and slow path where
+; the fast path uses a 32-bit operation.
+
+define void @sdiv64(i64 %a, i64 %b, i64* %retptr) {
+; CHECK-LABEL: sdiv64(
+; CHECK: div.s64
+; CHECK: div.u32
+; CHECK: ret
+ %d = sdiv i64 %a, %b
+ store i64 %d, i64* %retptr
+ ret void
+}
+
+define void @udiv64(i64 %a, i64 %b, i64* %retptr) {
+; CHECK-LABEL: udiv64(
+; CHECK: div.u64
+; CHECK: div.u32
+; CHECK: ret
+ %d = udiv i64 %a, %b
+ store i64 %d, i64* %retptr
+ ret void
+}
+
+define void @srem64(i64 %a, i64 %b, i64* %retptr) {
+; CHECK-LABEL: srem64(
+; CHECK: rem.s64
+; CHECK: rem.u32
+; CHECK: ret
+ %d = srem i64 %a, %b
+ store i64 %d, i64* %retptr
+ ret void
+}
+
+define void @urem64(i64 %a, i64 %b, i64* %retptr) {
+; CHECK-LABEL: urem64(
+; CHECK: rem.u64
+; CHECK: rem.u32
+; CHECK: ret
+ %d = urem i64 %a, %b
+ store i64 %d, i64* %retptr
+ ret void
+}
+
+define void @sdiv32(i32 %a, i32 %b, i32* %retptr) {
+; CHECK-LABEL: sdiv32(
+; CHECK: div.s32
+; CHECK-NOT: div.
+ %d = sdiv i32 %a, %b
+ store i32 %d, i32* %retptr
+ ret void
+}
+
+define void @udiv32(i32 %a, i32 %b, i32* %retptr) {
+; CHECK-LABEL: udiv32(
+; CHECK: div.u32
+; CHECK-NOT: div.
+ %d = udiv i32 %a, %b
+ store i32 %d, i32* %retptr
+ ret void
+}
+
+define void @srem32(i32 %a, i32 %b, i32* %retptr) {
+; CHECK-LABEL: srem32(
+; CHECK: rem.s32
+; CHECK-NOT: rem.
+ %d = srem i32 %a, %b
+ store i32 %d, i32* %retptr
+ ret void
+}
+
+define void @urem32(i32 %a, i32 %b, i32* %retptr) {
+; CHECK-LABEL: urem32(
+; CHECK: rem.u32
+; CHECK-NOT: rem.
+ %d = urem i32 %a, %b
+ store i32 %d, i32* %retptr
+ ret void
+}
Index: lib/Target/NVPTX/NVPTXISelLowering.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -124,6 +124,10 @@
// condition branches.
setJumpIsExpensive(true);
+ // Wide divides are _very_ slow. Try to reduce the width of the divide if
+ // possible.
+ addBypassSlowDiv(64, 32);
+
// By default, use the Source scheduling
if (sched4reg)
setSchedulingPreference(Sched::RegPressure);
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D11926.31848.patch
Type: text/x-patch
Size: 2505 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20150811/b86f9ce0/attachment.bin>
More information about the llvm-commits
mailing list