[llvm] 920befb - [FastISel] Reduce spills around mem-intrinsic calls

Paul Robinson via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 9 09:49:55 PST 2020


Author: Paul Robinson
Date: 2020-11-09T09:45:14-08:00
New Revision: 920befb337ae950009d691aed31ab97089009db2

URL: https://github.com/llvm/llvm-project/commit/920befb337ae950009d691aed31ab97089009db2
DIFF: https://github.com/llvm/llvm-project/commit/920befb337ae950009d691aed31ab97089009db2.diff

LOG: [FastISel] Reduce spills around mem-intrinsic calls

FastISel generates instructions to materialize "local values" at the
top of a block, in the hope that they can be reused later in the
block.  To reduce spills and reloads, FastISel treats calls as
sub-block boundaries, flushing the "local value map" at each call.

This patch treats the mem* intrinsics as if they were calls, because
at O0 they generally are lowered to library calls.  Eliminating these
spills/reloads is better for debugging (especially a "continue at
this line" command), code size, and stack frame size, and may even
help performance.
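
As a minimal sketch of the kind of block this affects (not part of
this change; @sketch and @use are illustrative, loosely mirroring the
new X86 test below):

  define void @sketch() {
  entry:
    %buf = alloca [16 x i8], align 1
    %p0 = bitcast [16 x i8]* %buf to i8*
    ; At O0 this memset is emitted as a call to the memset library
    ; function; its constant arguments and the address of %buf are
    ; "local values" materialized at the top of the block.
    call void @llvm.memset.p0i8.i64(i8* align 1 %p0, i8 0, i64 16, i1 false)
    %p1 = bitcast [16 x i8]* %buf to i8*
    ; Without the flush, the address needed here is also materialized
    ; at the top of the block and spilled/reloaded around the memset
    ; call; with the flush, it is materialized after the call instead.
    call void @use(i8* %p1)
    ret void
  }

  declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg)
  declare void @use(i8*)

The new X86 test below checks this effect by verifying that the leaq
feeding the follow-on call is emitted after the call to memset.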

Differential Revision: https://reviews.llvm.org/D90877

Added: 
    llvm/test/CodeGen/X86/fastisel-memset-flush.ll

Modified: 
    llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
    llvm/test/CodeGen/AArch64/arm64-abi_align.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index 3a5a007371d0..6cf0be5c6533 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1515,6 +1515,20 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
     return selectXRayCustomEvent(II);
   case Intrinsic::xray_typedevent:
     return selectXRayTypedEvent(II);
+
+  case Intrinsic::memcpy:
+  case Intrinsic::memcpy_element_unordered_atomic:
+  case Intrinsic::memcpy_inline:
+  case Intrinsic::memmove:
+  case Intrinsic::memmove_element_unordered_atomic:
+  case Intrinsic::memset:
+  case Intrinsic::memset_element_unordered_atomic:
+    // Flush the local value map just like we do for regular calls,
+    // to avoid excessive spills and reloads.
+    // These intrinsics mostly turn into library calls at O0; and
+    // even memcpy_inline should be treated like one for this purpose.
+    flushLocalValueMap();
+    break;
   }
 
   return fastLowerIntrinsicCall(II);

diff --git a/llvm/test/CodeGen/AArch64/arm64-abi_align.ll b/llvm/test/CodeGen/AArch64/arm64-abi_align.ll
index b76d453c630a..ba2995affe80 100644
--- a/llvm/test/CodeGen/AArch64/arm64-abi_align.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-abi_align.ll
@@ -290,13 +290,13 @@ entry:
 ; Space for s2 is allocated at sp
 
 ; FAST-LABEL: caller42
-; FAST: sub sp, sp, #96
-; Space for s1 is allocated at fp-24 = sp+56
-; FAST: sub x[[A:[0-9]+]], x29, #24
+; FAST: sub sp, sp, #64
+; Space for s1 is allocated at fp-24 = sp+24
+; FAST: add x[[A:[0-9]+]], sp, #24
 ; Call memcpy with size = 24 (0x18)
 ; FAST: mov {{x[0-9]+}}, #24
-; Space for s2 is allocated at sp+32
-; FAST: add x[[A:[0-9]+]], sp, #32
+; Space for s2 is allocated at sp
+; FAST: mov x[[A:[0-9]+]], sp
 ; FAST: bl _memcpy
   %tmp = alloca %struct.s42, align 4
   %tmp1 = alloca %struct.s42, align 4
@@ -339,8 +339,8 @@ entry:
 ; Call memcpy with size = 24 (0x18)
 ; FAST: mov {{x[0-9]+}}, #24
 ; FAST: bl _memcpy
-; Space for s2 is allocated at fp-48
-; FAST: sub x[[B:[0-9]+]], x29, #48
+; Space for s2 is allocated at sp+32
+; FAST: add x[[B:[0-9]+]], sp, #32
 ; Call memcpy again
 ; FAST: bl _memcpy
 ; Address of s1 is passed on stack at sp+8

diff --git a/llvm/test/CodeGen/X86/fastisel-memset-flush.ll b/llvm/test/CodeGen/X86/fastisel-memset-flush.ll
new file mode 100644
index 000000000000..5d63f5574393
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fastisel-memset-flush.ll
@@ -0,0 +1,40 @@
+; RUN: %llc_dwarf -O0 < %s | FileCheck %s
+
+define dso_local void @foo() !dbg !7 {
+entry:
+  %a = alloca i32, align 4
+  store i32 0, i32* %a, align 4, !dbg !9
+  %0 = bitcast i32* %a to i8*, !dbg !10
+  call void @llvm.memset.p0i8.i64(i8* align 4 %0, i8 -86, i64 4, i1 false), !dbg !10
+  %1 = bitcast i32* %a to i8*, !dbg !11
+  call void @other(i8* %1), !dbg !12
+  ret void, !dbg !13
+}
+; CHECK:      callq memset
+; CHECK-NEXT: .loc 1 9 9
+; CHECK-NEXT: leaq
+; CHECK-NEXT: .loc 1 9 3
+; CHECK-NEXT: callq other
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg)
+
+declare dso_local void @other(i8*)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0 (https://github.com/llvm/llvm-project eaae6fdf67e1f61599331d69a41a7dafe6199667)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "memset-test.c", directory: "/home/probinson/projects/scratch")
+!2 = !{}
+!3 = !{i32 7, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 12.0.0 (https://github.com/llvm/llvm-project eaae6fdf67e1f61599331d69a41a7dafe6199667)"}
+!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 4, type: !8, scopeLine: 5, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 6, column: 7, scope: !7)
+!10 = !DILocation(line: 8, column: 3, scope: !7)
+!11 = !DILocation(line: 9, column: 9, scope: !7)
+!12 = !DILocation(line: 9, column: 3, scope: !7)
+!13 = !DILocation(line: 10, column: 1, scope: !7)


        

