[llvm] CodeGenPrepare: Remove threadlocal_address intrinsic when cheap to recompute. (PR #87844)

Matthias Braun via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 8 10:39:50 PDT 2024


https://github.com/MatzeB updated https://github.com/llvm/llvm-project/pull/87844

>From 1e058c9f1544fdb30dc67b3c5dccf1d78cfca492 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Fri, 5 Apr 2024 14:28:21 -0700
Subject: [PATCH 1/2] Add test for TLS handling change

---
 llvm/test/CodeGen/X86/tls-multi-use.ll | 175 +++++++++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/tls-multi-use.ll

diff --git a/llvm/test/CodeGen/X86/tls-multi-use.ll b/llvm/test/CodeGen/X86/tls-multi-use.ll
new file mode 100644
index 00000000000000..74f18560db5249
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tls-multi-use.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -o - %s | FileCheck %s
+; RUN: llc -o - -relocation-model=pic %s | FileCheck --check-prefix=CHECK_PIC %s
+target triple = "x86_64--linux-gnu"
+
+@foo = dso_local thread_local global i32 0, align 4
+
+declare i32 @rand()
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
+
+define i32 @tls_multi_use(i32 %arg) {
+; CHECK-LABEL: tls_multi_use:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset %rbx, -24
+; CHECK-NEXT:    .cfi_offset %r14, -16
+; CHECK-NEXT:    movl %fs:foo at TPOFF, %ebx
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    movl %ebx, %eax
+; CHECK-NEXT:    jne .LBB0_2
+; CHECK-NEXT:  # %bb.1: # %if.then
+; CHECK-NEXT:    movq %fs:0, %rax
+; CHECK-NEXT:    leaq foo at TPOFF(%rax), %r14
+; CHECK-NEXT:    callq rand at PLT
+; CHECK-NEXT:    movl (%r14), %eax
+; CHECK-NEXT:  .LBB0_2: # %if.end
+; CHECK-NEXT:    addl %eax, %ebx
+; CHECK-NEXT:    movl %ebx, %eax
+; CHECK-NEXT:    addq $8, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+;
+; CHECK_PIC-LABEL: tls_multi_use:
+; CHECK_PIC:       # %bb.0: # %entry
+; CHECK_PIC-NEXT:    pushq %rbp
+; CHECK_PIC-NEXT:    .cfi_def_cfa_offset 16
+; CHECK_PIC-NEXT:    pushq %r14
+; CHECK_PIC-NEXT:    .cfi_def_cfa_offset 24
+; CHECK_PIC-NEXT:    pushq %rbx
+; CHECK_PIC-NEXT:    .cfi_def_cfa_offset 32
+; CHECK_PIC-NEXT:    .cfi_offset %rbx, -32
+; CHECK_PIC-NEXT:    .cfi_offset %r14, -24
+; CHECK_PIC-NEXT:    .cfi_offset %rbp, -16
+; CHECK_PIC-NEXT:    movl %edi, %ebp
+; CHECK_PIC-NEXT:    leaq .Lfoo$local@TLSLD(%rip), %rdi
+; CHECK_PIC-NEXT:    callq __tls_get_addr@PLT
+; CHECK_PIC-NEXT:    movl .Lfoo$local@DTPOFF(%rax), %ebx
+; CHECK_PIC-NEXT:    testl %ebp, %ebp
+; CHECK_PIC-NEXT:    movl %ebx, %ecx
+; CHECK_PIC-NEXT:    jne .LBB0_2
+; CHECK_PIC-NEXT:  # %bb.1: # %if.then
+; CHECK_PIC-NEXT:    leaq .Lfoo$local@DTPOFF(%rax), %r14
+; CHECK_PIC-NEXT:    callq rand@PLT
+; CHECK_PIC-NEXT:    movl (%r14), %ecx
+; CHECK_PIC-NEXT:  .LBB0_2: # %if.end
+; CHECK_PIC-NEXT:    addl %ecx, %ebx
+; CHECK_PIC-NEXT:    movl %ebx, %eax
+; CHECK_PIC-NEXT:    popq %rbx
+; CHECK_PIC-NEXT:    .cfi_def_cfa_offset 24
+; CHECK_PIC-NEXT:    popq %r14
+; CHECK_PIC-NEXT:    .cfi_def_cfa_offset 16
+; CHECK_PIC-NEXT:    popq %rbp
+; CHECK_PIC-NEXT:    .cfi_def_cfa_offset 8
+; CHECK_PIC-NEXT:    retq
+entry:
+  %addr = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @foo)
+  %load0 = load i32, ptr %addr, align 4
+  %cond = icmp eq i32 %arg, 0
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  tail call i32 @rand()
+  %load1 = load i32, ptr %addr, align 4
+  br label %if.end
+
+if.end:
+  %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
+  %add = add nsw i32 %load0, %phi
+  ret i32 %add
+}
+
+@foo_nonlocal = thread_local global i32 0, align 4
+
+define i32 @tls_multi_use_nonlocal(i32 %arg) {
+; CHECK-LABEL: tls_multi_use_nonlocal:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset %rbx, -24
+; CHECK-NEXT:    .cfi_offset %r14, -16
+; CHECK-NEXT:    movq foo_nonlocal@GOTTPOFF(%rip), %r14
+; CHECK-NEXT:    movl %fs:(%r14), %ebx
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    movl %ebx, %eax
+; CHECK-NEXT:    jne .LBB1_2
+; CHECK-NEXT:  # %bb.1: # %if.then
+; CHECK-NEXT:    addq %fs:0, %r14
+; CHECK-NEXT:    callq rand@PLT
+; CHECK-NEXT:    movl (%r14), %eax
+; CHECK-NEXT:  .LBB1_2: # %if.end
+; CHECK-NEXT:    addl %eax, %ebx
+; CHECK-NEXT:    movl %ebx, %eax
+; CHECK-NEXT:    addq $8, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+;
+; CHECK_PIC-LABEL: tls_multi_use_nonlocal:
+; CHECK_PIC:       # %bb.0: # %entry
+; CHECK_PIC-NEXT:    pushq %rbp
+; CHECK_PIC-NEXT:    .cfi_def_cfa_offset 16
+; CHECK_PIC-NEXT:    pushq %r14
+; CHECK_PIC-NEXT:    .cfi_def_cfa_offset 24
+; CHECK_PIC-NEXT:    pushq %rbx
+; CHECK_PIC-NEXT:    .cfi_def_cfa_offset 32
+; CHECK_PIC-NEXT:    .cfi_offset %rbx, -32
+; CHECK_PIC-NEXT:    .cfi_offset %r14, -24
+; CHECK_PIC-NEXT:    .cfi_offset %rbp, -16
+; CHECK_PIC-NEXT:    movl %edi, %ebp
+; CHECK_PIC-NEXT:    data16
+; CHECK_PIC-NEXT:    leaq foo_nonlocal@TLSGD(%rip), %rdi
+; CHECK_PIC-NEXT:    data16
+; CHECK_PIC-NEXT:    data16
+; CHECK_PIC-NEXT:    rex64
+; CHECK_PIC-NEXT:    callq __tls_get_addr@PLT
+; CHECK_PIC-NEXT:    movq %rax, %r14
+; CHECK_PIC-NEXT:    movl (%rax), %ebx
+; CHECK_PIC-NEXT:    testl %ebp, %ebp
+; CHECK_PIC-NEXT:    movl %ebx, %eax
+; CHECK_PIC-NEXT:    jne .LBB1_2
+; CHECK_PIC-NEXT:  # %bb.1: # %if.then
+; CHECK_PIC-NEXT:    callq rand@PLT
+; CHECK_PIC-NEXT:    movl (%r14), %eax
+; CHECK_PIC-NEXT:  .LBB1_2: # %if.end
+; CHECK_PIC-NEXT:    addl %eax, %ebx
+; CHECK_PIC-NEXT:    movl %ebx, %eax
+; CHECK_PIC-NEXT:    popq %rbx
+; CHECK_PIC-NEXT:    .cfi_def_cfa_offset 24
+; CHECK_PIC-NEXT:    popq %r14
+; CHECK_PIC-NEXT:    .cfi_def_cfa_offset 16
+; CHECK_PIC-NEXT:    popq %rbp
+; CHECK_PIC-NEXT:    .cfi_def_cfa_offset 8
+; CHECK_PIC-NEXT:    retq
+entry:
+  %addr = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @foo_nonlocal)
+  %load0 = load i32, ptr %addr, align 4
+  %cond = icmp eq i32 %arg, 0
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  tail call i32 @rand()
+  %load1 = load i32, ptr %addr, align 4
+  br label %if.end
+
+if.end:
+  %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
+  %add = add nsw i32 %load0, %phi
+  ret i32 %add
+}

>From c25fd0befa635cde1578f4af9190ffce2ef8f41e Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Fri, 5 Apr 2024 11:59:23 -0700
Subject: [PATCH 2/2] CodeGenPrepare: Remove threadlocal_address intrinsic when
 cheap to recompute.

The `threadlocal_address` intrinsic is currently ignored/removed for
instruction selection by the `SelectionDAGBuilder` (see also
https://reviews.llvm.org/D125291 ).

However, being an Instruction means `SelectionDAG` will assign a register
to it and share the value across basic blocks. This sharing is
suboptimal in the "LocalExec" TLS model on x86 where it is cheaper to
just recompute the address. We saw a 0.5% regression in a codebase with
a lot of TLS usage (HHVM).

This introduces a new `cheapToRecomputeTLSAddress` target lowering
callback and removes the `threadlocal_address` intrinsic in
`CodeGenPrepare` to restore the efficient behavior from before the
introduction of the `threadlocal_address` intrinsic.

This fixes #87437
---
 llvm/include/llvm/CodeGen/TargetLowering.h |  7 +++++++
 llvm/lib/CodeGen/CodeGenPrepare.cpp        | 12 ++++++++++++
 llvm/lib/Target/X86/X86ISelLowering.cpp    | 10 ++++++++++
 llvm/lib/Target/X86/X86ISelLowering.h      |  2 ++
 llvm/test/CodeGen/X86/tls-multi-use.ll     | 17 +++--------------
 5 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index a4dc097446186a..0152e8506cdb75 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3393,6 +3393,13 @@ class TargetLoweringBase {
     return nullptr;
   }
 
+  /// Returns true if thread local storage (TLS) addresses are so cheap to
+  /// re-compute that it is not worth keeping them in a register between basic
+  /// blocks.
+  virtual bool cheapToRecomputeTLSAddress(const GlobalVariable &) const {
+    return false;
+  }
+
   //===--------------------------------------------------------------------===//
   // Runtime Library hooks
   //
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index e657872c382848..32caf402844213 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2528,6 +2528,18 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
       return optimizeGatherScatterInst(II, II->getArgOperand(0));
     case Intrinsic::masked_scatter:
       return optimizeGatherScatterInst(II, II->getArgOperand(1));
+    case Intrinsic::threadlocal_address:
+      // SelectionDAGBuilder currently skips this intrinsic anyway; but removing
+      // it earlier means the addresses will not be kept in registers across
+      // basic blocks but recomputed. This is preferable on architectures where
+      // TLS is part of normal addressing modes.
+      GlobalVariable &GV = cast<GlobalVariable>(*II->getArgOperand(0));
+      if (TLI->cheapToRecomputeTLSAddress(GV)) {
+        replaceAllUsesWith(II, &GV, FreshBBs, IsHugeFunc);
+        II->eraseFromParent();
+        return true;
+      }
+      break;
     }
 
     SmallVector<Value *, 2> PtrOps;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6f65344215c020..ba99abcb9899b5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -18913,6 +18913,16 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
   llvm_unreachable("TLS not implemented for this target.");
 }
 
+bool X86TargetLowering::cheapToRecomputeTLSAddress(
+    const GlobalVariable &GV) const {
+  if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
+    const TargetMachine &TM = getTargetMachine();
+    TLSModel::Model Model = TM.getTLSModel(&GV);
+    return Model == TLSModel::LocalExec;
+  }
+  return false;
+}
+
 /// Lower SRA_PARTS and friends, which return two i32 values
 /// and take a 2 x i32 value to shift plus a shift amount.
 /// TODO: Can this be moved to general expansion code?
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 0a1e8ca4427314..e3480a84bd9581 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1661,6 +1661,8 @@ namespace llvm {
     SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
 
+    bool cheapToRecomputeTLSAddress(const GlobalVariable &GV) const override;
+
     /// Creates target global address or external symbol nodes for calls or
     /// other uses.
     SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
diff --git a/llvm/test/CodeGen/X86/tls-multi-use.ll b/llvm/test/CodeGen/X86/tls-multi-use.ll
index 74f18560db5249..6d5e00afef62e3 100644
--- a/llvm/test/CodeGen/X86/tls-multi-use.ll
+++ b/llvm/test/CodeGen/X86/tls-multi-use.ll
@@ -11,31 +11,20 @@ declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
 define i32 @tls_multi_use(i32 %arg) {
 ; CHECK-LABEL: tls_multi_use:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %r14
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 24
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset %rbx, -24
-; CHECK-NEXT:    .cfi_offset %r14, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbx, -16
 ; CHECK-NEXT:    movl %fs:foo at TPOFF, %ebx
 ; CHECK-NEXT:    testl %edi, %edi
 ; CHECK-NEXT:    movl %ebx, %eax
 ; CHECK-NEXT:    jne .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %if.then
-; CHECK-NEXT:    movq %fs:0, %rax
-; CHECK-NEXT:    leaq foo at TPOFF(%rax), %r14
 ; CHECK-NEXT:    callq rand at PLT
-; CHECK-NEXT:    movl (%r14), %eax
+; CHECK-NEXT:    movl %fs:foo@TPOFF, %eax
 ; CHECK-NEXT:  .LBB0_2: # %if.end
 ; CHECK-NEXT:    addl %eax, %ebx
 ; CHECK-NEXT:    movl %ebx, %eax
-; CHECK-NEXT:    addq $8, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 24
 ; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    popq %r14
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 ;



More information about the llvm-commits mailing list