[llvm] CodeGenPrepare: Remove threadlocal_address intrinsic when cheap to recompute. (PR #87844)

Matthias Braun via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 10 18:04:24 PDT 2024


https://github.com/MatzeB updated https://github.com/llvm/llvm-project/pull/87844

>From ac60224687a45aef2b8a3dd3e4e12eef10f42de3 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Fri, 5 Apr 2024 14:28:21 -0700
Subject: [PATCH 1/2] Add test for TLS handling change

---
 .../X86/codegen-prepare-addrmode-tls.ll       | 235 ++++++++++++++++++
 1 file changed, 235 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll

diff --git a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll
new file mode 100644
index 00000000000000..882111bf4f9bc8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll
@@ -0,0 +1,235 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -o - %s | FileCheck %s --check-prefix=NOPIC
+; RUN: llc -o - %s -relocation-model=pic | FileCheck %s --check-prefix=PIC
+; RUN: llc -o - %s -relocation-model=pic -enable-tlsdesc | FileCheck %s --check-prefix=TLSDESC
+
+target triple = "x86_64--linux-gnu"
+
+declare void @effect()
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
+
+@foo_local = dso_local thread_local(localexec) global i32 0, align 4
+
+define i32 @func_local_tls(i32 %arg0, i32 %arg1) {
+; NOPIC-LABEL: func_local_tls:
+; NOPIC:       # %bb.0: # %entry
+; NOPIC-NEXT:    pushq %r14
+; NOPIC-NEXT:    .cfi_def_cfa_offset 16
+; NOPIC-NEXT:    pushq %rbx
+; NOPIC-NEXT:    .cfi_def_cfa_offset 24
+; NOPIC-NEXT:    pushq %rax
+; NOPIC-NEXT:    .cfi_def_cfa_offset 32
+; NOPIC-NEXT:    .cfi_offset %rbx, -24
+; NOPIC-NEXT:    .cfi_offset %r14, -16
+; NOPIC-NEXT:    movl %fs:foo_local@TPOFF, %ebx
+; NOPIC-NEXT:    testl %edi, %edi
+; NOPIC-NEXT:    movl %ebx, %eax
+; NOPIC-NEXT:    jne .LBB0_2
+; NOPIC-NEXT:  # %bb.1: # %if.then
+; NOPIC-NEXT:    movq %fs:0, %rax
+; NOPIC-NEXT:    leaq foo_local@TPOFF(%rax), %r14
+; NOPIC-NEXT:    callq effect@PLT
+; NOPIC-NEXT:    movl (%r14), %eax
+; NOPIC-NEXT:  .LBB0_2: # %if.end
+; NOPIC-NEXT:    addl %ebx, %eax
+; NOPIC-NEXT:    addq $8, %rsp
+; NOPIC-NEXT:    .cfi_def_cfa_offset 24
+; NOPIC-NEXT:    popq %rbx
+; NOPIC-NEXT:    .cfi_def_cfa_offset 16
+; NOPIC-NEXT:    popq %r14
+; NOPIC-NEXT:    .cfi_def_cfa_offset 8
+; NOPIC-NEXT:    retq
+;
+; PIC-LABEL: func_local_tls:
+; PIC:       # %bb.0: # %entry
+; PIC-NEXT:    pushq %r14
+; PIC-NEXT:    .cfi_def_cfa_offset 16
+; PIC-NEXT:    pushq %rbx
+; PIC-NEXT:    .cfi_def_cfa_offset 24
+; PIC-NEXT:    pushq %rax
+; PIC-NEXT:    .cfi_def_cfa_offset 32
+; PIC-NEXT:    .cfi_offset %rbx, -24
+; PIC-NEXT:    .cfi_offset %r14, -16
+; PIC-NEXT:    movl %fs:.Lfoo_local$local@TPOFF, %ebx
+; PIC-NEXT:    testl %edi, %edi
+; PIC-NEXT:    movl %ebx, %eax
+; PIC-NEXT:    jne .LBB0_2
+; PIC-NEXT:  # %bb.1: # %if.then
+; PIC-NEXT:    movq %fs:0, %rax
+; PIC-NEXT:    leaq .Lfoo_local$local@TPOFF(%rax), %r14
+; PIC-NEXT:    callq effect@PLT
+; PIC-NEXT:    movl (%r14), %eax
+; PIC-NEXT:  .LBB0_2: # %if.end
+; PIC-NEXT:    addl %ebx, %eax
+; PIC-NEXT:    addq $8, %rsp
+; PIC-NEXT:    .cfi_def_cfa_offset 24
+; PIC-NEXT:    popq %rbx
+; PIC-NEXT:    .cfi_def_cfa_offset 16
+; PIC-NEXT:    popq %r14
+; PIC-NEXT:    .cfi_def_cfa_offset 8
+; PIC-NEXT:    retq
+;
+; TLSDESC-LABEL: func_local_tls:
+; TLSDESC:       # %bb.0: # %entry
+; TLSDESC-NEXT:    pushq %r14
+; TLSDESC-NEXT:    .cfi_def_cfa_offset 16
+; TLSDESC-NEXT:    pushq %rbx
+; TLSDESC-NEXT:    .cfi_def_cfa_offset 24
+; TLSDESC-NEXT:    pushq %rax
+; TLSDESC-NEXT:    .cfi_def_cfa_offset 32
+; TLSDESC-NEXT:    .cfi_offset %rbx, -24
+; TLSDESC-NEXT:    .cfi_offset %r14, -16
+; TLSDESC-NEXT:    movl %fs:.Lfoo_local$local@TPOFF, %ebx
+; TLSDESC-NEXT:    testl %edi, %edi
+; TLSDESC-NEXT:    movl %ebx, %eax
+; TLSDESC-NEXT:    jne .LBB0_2
+; TLSDESC-NEXT:  # %bb.1: # %if.then
+; TLSDESC-NEXT:    movq %fs:0, %rax
+; TLSDESC-NEXT:    leaq .Lfoo_local$local@TPOFF(%rax), %r14
+; TLSDESC-NEXT:    callq effect@PLT
+; TLSDESC-NEXT:    movl (%r14), %eax
+; TLSDESC-NEXT:  .LBB0_2: # %if.end
+; TLSDESC-NEXT:    addl %ebx, %eax
+; TLSDESC-NEXT:    addq $8, %rsp
+; TLSDESC-NEXT:    .cfi_def_cfa_offset 24
+; TLSDESC-NEXT:    popq %rbx
+; TLSDESC-NEXT:    .cfi_def_cfa_offset 16
+; TLSDESC-NEXT:    popq %r14
+; TLSDESC-NEXT:    .cfi_def_cfa_offset 8
+; TLSDESC-NEXT:    retq
+entry:
+  %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo_local)
+  %load0 = load i32, ptr %addr, align 4
+  %cond = icmp eq i32 %arg0, 0
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  tail call void @effect()
+  %x = add i32 %arg1, 42
+  %addr1 = getelementptr inbounds i32, ptr %addr, i32 %x
+  %load1 = load i32, ptr %addr, align 4
+  br label %if.end
+
+if.end:
+  %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
+  %ret = add i32 %phi, %load0
+  ret i32 %ret
+}
+
+@foo_nonlocal = thread_local global i32 0, align 4
+
+define i32 @func_nonlocal_tls(i32 %arg0, i32 %arg1) {
+; NOPIC-LABEL: func_nonlocal_tls:
+; NOPIC:       # %bb.0: # %entry
+; NOPIC-NEXT:    pushq %r14
+; NOPIC-NEXT:    .cfi_def_cfa_offset 16
+; NOPIC-NEXT:    pushq %rbx
+; NOPIC-NEXT:    .cfi_def_cfa_offset 24
+; NOPIC-NEXT:    pushq %rax
+; NOPIC-NEXT:    .cfi_def_cfa_offset 32
+; NOPIC-NEXT:    .cfi_offset %rbx, -24
+; NOPIC-NEXT:    .cfi_offset %r14, -16
+; NOPIC-NEXT:    movq foo_nonlocal@GOTTPOFF(%rip), %r14
+; NOPIC-NEXT:    movl %fs:(%r14), %ebx
+; NOPIC-NEXT:    testl %edi, %edi
+; NOPIC-NEXT:    movl %ebx, %eax
+; NOPIC-NEXT:    jne .LBB1_2
+; NOPIC-NEXT:  # %bb.1: # %if.then
+; NOPIC-NEXT:    addq %fs:0, %r14
+; NOPIC-NEXT:    callq effect@PLT
+; NOPIC-NEXT:    movl (%r14), %eax
+; NOPIC-NEXT:  .LBB1_2: # %if.end
+; NOPIC-NEXT:    addl %ebx, %eax
+; NOPIC-NEXT:    addq $8, %rsp
+; NOPIC-NEXT:    .cfi_def_cfa_offset 24
+; NOPIC-NEXT:    popq %rbx
+; NOPIC-NEXT:    .cfi_def_cfa_offset 16
+; NOPIC-NEXT:    popq %r14
+; NOPIC-NEXT:    .cfi_def_cfa_offset 8
+; NOPIC-NEXT:    retq
+;
+; PIC-LABEL: func_nonlocal_tls:
+; PIC:       # %bb.0: # %entry
+; PIC-NEXT:    pushq %rbp
+; PIC-NEXT:    .cfi_def_cfa_offset 16
+; PIC-NEXT:    pushq %r14
+; PIC-NEXT:    .cfi_def_cfa_offset 24
+; PIC-NEXT:    pushq %rbx
+; PIC-NEXT:    .cfi_def_cfa_offset 32
+; PIC-NEXT:    .cfi_offset %rbx, -32
+; PIC-NEXT:    .cfi_offset %r14, -24
+; PIC-NEXT:    .cfi_offset %rbp, -16
+; PIC-NEXT:    movl %edi, %ebp
+; PIC-NEXT:    data16
+; PIC-NEXT:    leaq foo_nonlocal@TLSGD(%rip), %rdi
+; PIC-NEXT:    data16
+; PIC-NEXT:    data16
+; PIC-NEXT:    rex64
+; PIC-NEXT:    callq __tls_get_addr@PLT
+; PIC-NEXT:    movq %rax, %rbx
+; PIC-NEXT:    movl (%rax), %r14d
+; PIC-NEXT:    testl %ebp, %ebp
+; PIC-NEXT:    movl %r14d, %eax
+; PIC-NEXT:    jne .LBB1_2
+; PIC-NEXT:  # %bb.1: # %if.then
+; PIC-NEXT:    callq effect@PLT
+; PIC-NEXT:    movl (%rbx), %eax
+; PIC-NEXT:  .LBB1_2: # %if.end
+; PIC-NEXT:    addl %r14d, %eax
+; PIC-NEXT:    popq %rbx
+; PIC-NEXT:    .cfi_def_cfa_offset 24
+; PIC-NEXT:    popq %r14
+; PIC-NEXT:    .cfi_def_cfa_offset 16
+; PIC-NEXT:    popq %rbp
+; PIC-NEXT:    .cfi_def_cfa_offset 8
+; PIC-NEXT:    retq
+;
+; TLSDESC-LABEL: func_nonlocal_tls:
+; TLSDESC:       # %bb.0: # %entry
+; TLSDESC-NEXT:    pushq %r14
+; TLSDESC-NEXT:    .cfi_def_cfa_offset 16
+; TLSDESC-NEXT:    pushq %rbx
+; TLSDESC-NEXT:    .cfi_def_cfa_offset 24
+; TLSDESC-NEXT:    pushq %rax
+; TLSDESC-NEXT:    .cfi_def_cfa_offset 32
+; TLSDESC-NEXT:    .cfi_offset %rbx, -24
+; TLSDESC-NEXT:    .cfi_offset %r14, -16
+; TLSDESC-NEXT:    leaq foo_nonlocal@tlsdesc(%rip), %rax
+; TLSDESC-NEXT:    callq *foo_nonlocal@tlscall(%rax)
+; TLSDESC-NEXT:    movl %fs:(%rax), %ebx
+; TLSDESC-NEXT:    testl %edi, %edi
+; TLSDESC-NEXT:    movl %ebx, %ecx
+; TLSDESC-NEXT:    jne .LBB1_2
+; TLSDESC-NEXT:  # %bb.1: # %if.then
+; TLSDESC-NEXT:    addq %fs:0, %rax
+; TLSDESC-NEXT:    movq %rax, %r14
+; TLSDESC-NEXT:    callq effect@PLT
+; TLSDESC-NEXT:    movl (%r14), %ecx
+; TLSDESC-NEXT:  .LBB1_2: # %if.end
+; TLSDESC-NEXT:    addl %ebx, %ecx
+; TLSDESC-NEXT:    movl %ecx, %eax
+; TLSDESC-NEXT:    addq $8, %rsp
+; TLSDESC-NEXT:    .cfi_def_cfa_offset 24
+; TLSDESC-NEXT:    popq %rbx
+; TLSDESC-NEXT:    .cfi_def_cfa_offset 16
+; TLSDESC-NEXT:    popq %r14
+; TLSDESC-NEXT:    .cfi_def_cfa_offset 8
+; TLSDESC-NEXT:    retq
+entry:
+  %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo_nonlocal)
+  %load0 = load i32, ptr %addr, align 4
+  %cond = icmp eq i32 %arg0, 0
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  tail call void @effect()
+  %x = add i32 %arg1, 42
+  %addr1 = getelementptr inbounds i32, ptr %addr, i32 %x
+  %load1 = load i32, ptr %addr, align 4
+  br label %if.end
+
+if.end:
+  %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
+  %ret = add i32 %phi, %load0
+  ret i32 %ret
+}

>From 9852236e9efba4faec9687cb4a9cc925c14b5695 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Fri, 5 Apr 2024 11:59:23 -0700
Subject: [PATCH 2/2] CodeGenPrepare: Remove threadlocal_address intrinsic when
 cheap to recompute.

The `threadlocal_address` intrinsic is currently ignored/removed for
instruction selection by the `SelectionDAGBuilder` (see also
https://reviews.llvm.org/D125291 ).

However, being an Instruction means `SelectionDAG` will assign a register
to it and share the value across basic blocks. This sharing is
suboptimal in the "LocalExec" TLS model on x86 where it is cheaper to
just recompute the address. We saw a 0.5% regression in a codebase with
a lot of TLS usage (HHVM).

This introduces a new `addressingModeSupportsTLS` target lowering
callback and removes the `threadlocal_address` intrinsic in
`CodeGenPrepare` to restore the efficient behavior from before the
introduction of the `threadlocal_address` intrinsic.

This fixes #87437
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  6 ++
 llvm/lib/CodeGen/CodeGenPrepare.cpp           | 19 ++++-
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 24 ++++++
 llvm/lib/Target/X86/X86ISelLowering.h         |  2 +
 .../X86/codegen-prepare-addrmode-tls.ll       | 54 +++---------
 .../CodeGenPrepare/X86/sink-addrmode-tls.ll   | 83 +++++++++++++++++++
 6 files changed, 142 insertions(+), 46 deletions(-)
 create mode 100644 llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index a4dc097446186a..05b22289d6e247 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2798,6 +2798,12 @@ class TargetLoweringBase {
                                      Type *Ty, unsigned AddrSpace,
                                      Instruction *I = nullptr) const;
 
+  /// Returns true if the target's addressing mode can target thread-local
+  /// storage (TLS).
+  virtual bool addressingModeSupportsTLS(const GlobalValue &) const {
+    return false;
+  }
+
   /// Return the prefered common base offset.
   virtual int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset,
                                                  int64_t MaxOffset) const {
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index e657872c382848..56bf84d8b62c4f 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -5082,6 +5082,16 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
     }
     return true;
   }
+  case Instruction::Call:
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(AddrInst)) {
+      if (II->getIntrinsicID() == Intrinsic::threadlocal_address) {
+        GlobalValue &GV = cast<GlobalValue>(*II->getArgOperand(0));
+        if (TLI.addressingModeSupportsTLS(GV)) {
+          return matchAddr(AddrInst->getOperand(0), Depth);
+        }
+      }
+    }
+    break;
   }
   return false;
 }
@@ -5620,11 +5630,16 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
         return Modified;
     }
 
-    if (AddrMode.BaseGV) {
+    GlobalValue *BaseGV = AddrMode.BaseGV;
+    if (BaseGV != nullptr) {
       if (ResultPtr)
         return Modified;
 
-      ResultPtr = AddrMode.BaseGV;
+      if (BaseGV->isThreadLocal()) {
+        ResultPtr = Builder.CreateThreadLocalAddress(BaseGV);
+      } else {
+        ResultPtr = BaseGV;
+      }
     }
 
     // If the real base value actually came from an inttoptr, then the matcher
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 52be35aafb0f57..5f04ca635a334d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -18928,6 +18928,30 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
   llvm_unreachable("TLS not implemented for this target.");
 }
 
+bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
+  if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
+    const TargetMachine &TM = getTargetMachine();
+    TLSModel::Model Model = TM.getTLSModel(&GV);
+    switch (Model) {
+    case TLSModel::LocalExec:
+    case TLSModel::InitialExec:
+      // We can include the %fs segment register in addressing modes.
+      return true;
+    case TLSModel::LocalDynamic:
+    case TLSModel::GeneralDynamic:
+      // These models do not result in %fs relative addresses unless
+      // TLS descriptors are used.
+      //
+      // Even in the case of TLS descriptors we currently have no way to model
+      // the difference between %fs access and the computations needed for the
+      // offset and returning `true` for TLS-desc currently duplicates both
+      // which is detrimental :-/
+      return false;
+    }
+  }
+  return false;
+}
+
 /// Lower SRA_PARTS and friends, which return two i32 values
 /// and take a 2 x i32 value to shift plus a shift amount.
 /// TODO: Can this be moved to general expansion code?
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 0a1e8ca4427314..e348ba6e8ac085 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1323,6 +1323,8 @@ namespace llvm {
                                Type *Ty, unsigned AS,
                                Instruction *I = nullptr) const override;
 
+    bool addressingModeSupportsTLS(const GlobalValue &GV) const override;
+
     /// Return true if the specified immediate is legal
     /// icmp immediate, that is the target has icmp instructions which can
     /// compare a register against the immediate without having to materialize
diff --git a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll
index 882111bf4f9bc8..52effc295b6474 100644
--- a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll
+++ b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll
@@ -13,88 +13,55 @@ declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
 define i32 @func_local_tls(i32 %arg0, i32 %arg1) {
 ; NOPIC-LABEL: func_local_tls:
 ; NOPIC:       # %bb.0: # %entry
-; NOPIC-NEXT:    pushq %r14
-; NOPIC-NEXT:    .cfi_def_cfa_offset 16
 ; NOPIC-NEXT:    pushq %rbx
-; NOPIC-NEXT:    .cfi_def_cfa_offset 24
-; NOPIC-NEXT:    pushq %rax
-; NOPIC-NEXT:    .cfi_def_cfa_offset 32
-; NOPIC-NEXT:    .cfi_offset %rbx, -24
-; NOPIC-NEXT:    .cfi_offset %r14, -16
+; NOPIC-NEXT:    .cfi_def_cfa_offset 16
+; NOPIC-NEXT:    .cfi_offset %rbx, -16
 ; NOPIC-NEXT:    movl %fs:foo_local@TPOFF, %ebx
 ; NOPIC-NEXT:    testl %edi, %edi
 ; NOPIC-NEXT:    movl %ebx, %eax
 ; NOPIC-NEXT:    jne .LBB0_2
 ; NOPIC-NEXT:  # %bb.1: # %if.then
-; NOPIC-NEXT:    movq %fs:0, %rax
-; NOPIC-NEXT:    leaq foo_local@TPOFF(%rax), %r14
 ; NOPIC-NEXT:    callq effect@PLT
-; NOPIC-NEXT:    movl (%r14), %eax
+; NOPIC-NEXT:    movl %fs:foo_local@TPOFF, %eax
 ; NOPIC-NEXT:  .LBB0_2: # %if.end
 ; NOPIC-NEXT:    addl %ebx, %eax
-; NOPIC-NEXT:    addq $8, %rsp
-; NOPIC-NEXT:    .cfi_def_cfa_offset 24
 ; NOPIC-NEXT:    popq %rbx
-; NOPIC-NEXT:    .cfi_def_cfa_offset 16
-; NOPIC-NEXT:    popq %r14
 ; NOPIC-NEXT:    .cfi_def_cfa_offset 8
 ; NOPIC-NEXT:    retq
 ;
 ; PIC-LABEL: func_local_tls:
 ; PIC:       # %bb.0: # %entry
-; PIC-NEXT:    pushq %r14
-; PIC-NEXT:    .cfi_def_cfa_offset 16
 ; PIC-NEXT:    pushq %rbx
-; PIC-NEXT:    .cfi_def_cfa_offset 24
-; PIC-NEXT:    pushq %rax
-; PIC-NEXT:    .cfi_def_cfa_offset 32
-; PIC-NEXT:    .cfi_offset %rbx, -24
-; PIC-NEXT:    .cfi_offset %r14, -16
+; PIC-NEXT:    .cfi_def_cfa_offset 16
+; PIC-NEXT:    .cfi_offset %rbx, -16
 ; PIC-NEXT:    movl %fs:.Lfoo_local$local@TPOFF, %ebx
 ; PIC-NEXT:    testl %edi, %edi
 ; PIC-NEXT:    movl %ebx, %eax
 ; PIC-NEXT:    jne .LBB0_2
 ; PIC-NEXT:  # %bb.1: # %if.then
-; PIC-NEXT:    movq %fs:0, %rax
-; PIC-NEXT:    leaq .Lfoo_local$local@TPOFF(%rax), %r14
 ; PIC-NEXT:    callq effect@PLT
-; PIC-NEXT:    movl (%r14), %eax
+; PIC-NEXT:    movl %fs:.Lfoo_local$local@TPOFF, %eax
 ; PIC-NEXT:  .LBB0_2: # %if.end
 ; PIC-NEXT:    addl %ebx, %eax
-; PIC-NEXT:    addq $8, %rsp
-; PIC-NEXT:    .cfi_def_cfa_offset 24
 ; PIC-NEXT:    popq %rbx
-; PIC-NEXT:    .cfi_def_cfa_offset 16
-; PIC-NEXT:    popq %r14
 ; PIC-NEXT:    .cfi_def_cfa_offset 8
 ; PIC-NEXT:    retq
 ;
 ; TLSDESC-LABEL: func_local_tls:
 ; TLSDESC:       # %bb.0: # %entry
-; TLSDESC-NEXT:    pushq %r14
-; TLSDESC-NEXT:    .cfi_def_cfa_offset 16
 ; TLSDESC-NEXT:    pushq %rbx
-; TLSDESC-NEXT:    .cfi_def_cfa_offset 24
-; TLSDESC-NEXT:    pushq %rax
-; TLSDESC-NEXT:    .cfi_def_cfa_offset 32
-; TLSDESC-NEXT:    .cfi_offset %rbx, -24
-; TLSDESC-NEXT:    .cfi_offset %r14, -16
+; TLSDESC-NEXT:    .cfi_def_cfa_offset 16
+; TLSDESC-NEXT:    .cfi_offset %rbx, -16
 ; TLSDESC-NEXT:    movl %fs:.Lfoo_local$local@TPOFF, %ebx
 ; TLSDESC-NEXT:    testl %edi, %edi
 ; TLSDESC-NEXT:    movl %ebx, %eax
 ; TLSDESC-NEXT:    jne .LBB0_2
 ; TLSDESC-NEXT:  # %bb.1: # %if.then
-; TLSDESC-NEXT:    movq %fs:0, %rax
-; TLSDESC-NEXT:    leaq .Lfoo_local$local@TPOFF(%rax), %r14
 ; TLSDESC-NEXT:    callq effect@PLT
-; TLSDESC-NEXT:    movl (%r14), %eax
+; TLSDESC-NEXT:    movl %fs:.Lfoo_local$local@TPOFF, %eax
 ; TLSDESC-NEXT:  .LBB0_2: # %if.end
 ; TLSDESC-NEXT:    addl %ebx, %eax
-; TLSDESC-NEXT:    addq $8, %rsp
-; TLSDESC-NEXT:    .cfi_def_cfa_offset 24
 ; TLSDESC-NEXT:    popq %rbx
-; TLSDESC-NEXT:    .cfi_def_cfa_offset 16
-; TLSDESC-NEXT:    popq %r14
 ; TLSDESC-NEXT:    .cfi_def_cfa_offset 8
 ; TLSDESC-NEXT:    retq
 entry:
@@ -135,9 +102,8 @@ define i32 @func_nonlocal_tls(i32 %arg0, i32 %arg1) {
 ; NOPIC-NEXT:    movl %ebx, %eax
 ; NOPIC-NEXT:    jne .LBB1_2
 ; NOPIC-NEXT:  # %bb.1: # %if.then
-; NOPIC-NEXT:    addq %fs:0, %r14
 ; NOPIC-NEXT:    callq effect@PLT
-; NOPIC-NEXT:    movl (%r14), %eax
+; NOPIC-NEXT:    movl %fs:(%r14), %eax
 ; NOPIC-NEXT:  .LBB1_2: # %if.end
 ; NOPIC-NEXT:    addl %ebx, %eax
 ; NOPIC-NEXT:    addq $8, %rsp
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll
new file mode 100644
index 00000000000000..0397240421340a
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' %s | FileCheck %s
+
+target triple = "x86_64--linux-gnu"
+
+@foo = dso_local thread_local(localexec) global i32 0, align 4
+
+declare void @effect()
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
+
+define i32 @func0(i32 %arg) {
+; CHECK-LABEL: define i32 @func0(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADDR:%.*]] = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i32, ptr [[ADDR]], align 4
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[ARG]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    tail call void @effect()
+; CHECK-NEXT:    [[TMP0:%.*]] = call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @foo)
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[TMP0]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[LOAD1]], [[IF_THEN]] ], [ [[LOAD0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RET:%.*]] = add i32 [[PHI]], [[LOAD0]]
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+entry:
+  %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
+  %load0 = load i32, ptr %addr, align 4
+  %cond = icmp eq i32 %arg, 0
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  tail call void @effect()
+  %load1 = load i32, ptr %addr, align 4
+  br label %if.end
+
+if.end:
+  %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
+  %ret = add i32 %phi, %load0
+  ret i32 %ret
+}
+
+define i32 @func1(i32 %arg0, i32 %arg1) {
+; CHECK-LABEL: define i32 @func1(
+; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADDR:%.*]] = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i32, ptr [[ADDR]], align 4
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[ARG0]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    tail call void @effect()
+; CHECK-NEXT:    [[X:%.*]] = add i32 [[ARG1]], 42
+; CHECK-NEXT:    [[ADDR1:%.*]] = getelementptr inbounds i32, ptr [[ADDR]], i32 [[X]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @foo)
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[TMP0]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[LOAD1]], [[IF_THEN]] ], [ [[LOAD0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RET:%.*]] = add i32 [[PHI]], [[LOAD0]]
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+entry:
+  %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
+  %load0 = load i32, ptr %addr, align 4
+  %cond = icmp eq i32 %arg0, 0
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  tail call void @effect()
+  %x = add i32 %arg1, 42
+  %addr1 = getelementptr inbounds i32, ptr %addr, i32 %x
+  %load1 = load i32, ptr %addr, align 4
+  br label %if.end
+
+if.end:
+  %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
+  %ret = add i32 %phi, %load0
+  ret i32 %ret
+}



More information about the llvm-commits mailing list