[llvm] CodeGenPrepare: Add support for llvm.threadlocal.address address-mode sinking (PR #87844)

Matthias Braun via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 16 13:19:04 PDT 2024


https://github.com/MatzeB updated https://github.com/llvm/llvm-project/pull/87844

>From 1fe6dd6ae3c3ba9f2222536e63d3c3aa3636122a Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Fri, 5 Apr 2024 14:28:21 -0700
Subject: [PATCH 1/3] Add test for TLS handling change

---
 .../X86/codegen-prepare-addrmode-tls.ll       | 196 ++++++++++++++++++
 1 file changed, 196 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll

diff --git a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll
new file mode 100644
index 00000000000000..25d77a05afb4a5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll
@@ -0,0 +1,196 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -o - %s | FileCheck %s --check-prefix=NOPIC
+; RUN: llc -o - %s -relocation-model=pic | FileCheck %s --check-prefix=PIC
+; RUN: llc -o - %s -relocation-model=pic -enable-tlsdesc | FileCheck %s --check-prefix=TLSDESC
+
+target triple = "x86_64--linux-gnu"
+
+declare void @effect()
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
+
+@foo_local = dso_local thread_local(localexec) global i32 0, align 4
+
+define i32 @func_local_tls(i32 %arg0, i64 %arg1) nounwind {
+; NOPIC-LABEL: func_local_tls:
+; NOPIC:       # %bb.0: # %entry
+; NOPIC-NEXT:    pushq %rbp
+; NOPIC-NEXT:    pushq %r14
+; NOPIC-NEXT:    pushq %rbx
+; NOPIC-NEXT:    movl %fs:foo_local@TPOFF, %ebp
+; NOPIC-NEXT:    testl %edi, %edi
+; NOPIC-NEXT:    movl %ebp, %eax
+; NOPIC-NEXT:    jne .LBB0_2
+; NOPIC-NEXT:  # %bb.1: # %if.then
+; NOPIC-NEXT:    movq %rsi, %rbx
+; NOPIC-NEXT:    movq %fs:0, %rax
+; NOPIC-NEXT:    leaq foo_local@TPOFF(%rax), %r14
+; NOPIC-NEXT:    callq effect@PLT
+; NOPIC-NEXT:    movl 168(%r14,%rbx,4), %eax
+; NOPIC-NEXT:  .LBB0_2: # %if.end
+; NOPIC-NEXT:    addl %ebp, %eax
+; NOPIC-NEXT:    popq %rbx
+; NOPIC-NEXT:    popq %r14
+; NOPIC-NEXT:    popq %rbp
+; NOPIC-NEXT:    retq
+;
+; PIC-LABEL: func_local_tls:
+; PIC:       # %bb.0: # %entry
+; PIC-NEXT:    pushq %rbp
+; PIC-NEXT:    pushq %r14
+; PIC-NEXT:    pushq %rbx
+; PIC-NEXT:    movl %fs:.Lfoo_local$local@TPOFF, %ebp
+; PIC-NEXT:    testl %edi, %edi
+; PIC-NEXT:    movl %ebp, %eax
+; PIC-NEXT:    jne .LBB0_2
+; PIC-NEXT:  # %bb.1: # %if.then
+; PIC-NEXT:    movq %rsi, %rbx
+; PIC-NEXT:    movq %fs:0, %rax
+; PIC-NEXT:    leaq .Lfoo_local$local@TPOFF(%rax), %r14
+; PIC-NEXT:    callq effect@PLT
+; PIC-NEXT:    movl 168(%r14,%rbx,4), %eax
+; PIC-NEXT:  .LBB0_2: # %if.end
+; PIC-NEXT:    addl %ebp, %eax
+; PIC-NEXT:    popq %rbx
+; PIC-NEXT:    popq %r14
+; PIC-NEXT:    popq %rbp
+; PIC-NEXT:    retq
+;
+; TLSDESC-LABEL: func_local_tls:
+; TLSDESC:       # %bb.0: # %entry
+; TLSDESC-NEXT:    pushq %rbp
+; TLSDESC-NEXT:    pushq %r14
+; TLSDESC-NEXT:    pushq %rbx
+; TLSDESC-NEXT:    movl %fs:.Lfoo_local$local@TPOFF, %ebp
+; TLSDESC-NEXT:    testl %edi, %edi
+; TLSDESC-NEXT:    movl %ebp, %eax
+; TLSDESC-NEXT:    jne .LBB0_2
+; TLSDESC-NEXT:  # %bb.1: # %if.then
+; TLSDESC-NEXT:    movq %rsi, %rbx
+; TLSDESC-NEXT:    movq %fs:0, %rax
+; TLSDESC-NEXT:    leaq .Lfoo_local$local@TPOFF(%rax), %r14
+; TLSDESC-NEXT:    callq effect@PLT
+; TLSDESC-NEXT:    movl 168(%r14,%rbx,4), %eax
+; TLSDESC-NEXT:  .LBB0_2: # %if.end
+; TLSDESC-NEXT:    addl %ebp, %eax
+; TLSDESC-NEXT:    popq %rbx
+; TLSDESC-NEXT:    popq %r14
+; TLSDESC-NEXT:    popq %rbp
+; TLSDESC-NEXT:    retq
+entry:
+  %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo_local)
+  %load0 = load i32, ptr %addr, align 4
+  %cond = icmp eq i32 %arg0, 0
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  tail call void @effect()
+  %x = add i64 %arg1, 42
+  %addr1 = getelementptr inbounds i32, ptr %addr, i64 %x
+  %load1 = load i32, ptr %addr1, align 4
+  br label %if.end
+
+if.end:
+  %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
+  %ret = add i32 %phi, %load0
+  ret i32 %ret
+}
+
+@foo_nonlocal = thread_local global i32 0, align 4
+
+define i32 @func_nonlocal_tls(i32 %arg0, i64 %arg1) nounwind {
+; NOPIC-LABEL: func_nonlocal_tls:
+; NOPIC:       # %bb.0: # %entry
+; NOPIC-NEXT:    pushq %rbp
+; NOPIC-NEXT:    pushq %r14
+; NOPIC-NEXT:    pushq %rbx
+; NOPIC-NEXT:    movq foo_nonlocal@GOTTPOFF(%rip), %r14
+; NOPIC-NEXT:    movl %fs:(%r14), %ebp
+; NOPIC-NEXT:    testl %edi, %edi
+; NOPIC-NEXT:    movl %ebp, %eax
+; NOPIC-NEXT:    jne .LBB1_2
+; NOPIC-NEXT:  # %bb.1: # %if.then
+; NOPIC-NEXT:    movq %rsi, %rbx
+; NOPIC-NEXT:    addq %fs:0, %r14
+; NOPIC-NEXT:    callq effect@PLT
+; NOPIC-NEXT:    movl 168(%r14,%rbx,4), %eax
+; NOPIC-NEXT:  .LBB1_2: # %if.end
+; NOPIC-NEXT:    addl %ebp, %eax
+; NOPIC-NEXT:    popq %rbx
+; NOPIC-NEXT:    popq %r14
+; NOPIC-NEXT:    popq %rbp
+; NOPIC-NEXT:    retq
+;
+; PIC-LABEL: func_nonlocal_tls:
+; PIC:       # %bb.0: # %entry
+; PIC-NEXT:    pushq %rbp
+; PIC-NEXT:    pushq %r15
+; PIC-NEXT:    pushq %r14
+; PIC-NEXT:    pushq %rbx
+; PIC-NEXT:    pushq %rax
+; PIC-NEXT:    movq %rsi, %rbx
+; PIC-NEXT:    movl %edi, %ebp
+; PIC-NEXT:    data16
+; PIC-NEXT:    leaq foo_nonlocal@TLSGD(%rip), %rdi
+; PIC-NEXT:    data16
+; PIC-NEXT:    data16
+; PIC-NEXT:    rex64
+; PIC-NEXT:    callq __tls_get_addr@PLT
+; PIC-NEXT:    movq %rax, %r14
+; PIC-NEXT:    movl (%rax), %r15d
+; PIC-NEXT:    testl %ebp, %ebp
+; PIC-NEXT:    movl %r15d, %eax
+; PIC-NEXT:    jne .LBB1_2
+; PIC-NEXT:  # %bb.1: # %if.then
+; PIC-NEXT:    callq effect@PLT
+; PIC-NEXT:    movl 168(%r14,%rbx,4), %eax
+; PIC-NEXT:  .LBB1_2: # %if.end
+; PIC-NEXT:    addl %r15d, %eax
+; PIC-NEXT:    addq $8, %rsp
+; PIC-NEXT:    popq %rbx
+; PIC-NEXT:    popq %r14
+; PIC-NEXT:    popq %r15
+; PIC-NEXT:    popq %rbp
+; PIC-NEXT:    retq
+;
+; TLSDESC-LABEL: func_nonlocal_tls:
+; TLSDESC:       # %bb.0: # %entry
+; TLSDESC-NEXT:    pushq %rbp
+; TLSDESC-NEXT:    pushq %r14
+; TLSDESC-NEXT:    pushq %rbx
+; TLSDESC-NEXT:    leaq foo_nonlocal@tlsdesc(%rip), %rax
+; TLSDESC-NEXT:    callq *foo_nonlocal@tlscall(%rax)
+; TLSDESC-NEXT:    movl %fs:(%rax), %ebp
+; TLSDESC-NEXT:    testl %edi, %edi
+; TLSDESC-NEXT:    movl %ebp, %ecx
+; TLSDESC-NEXT:    jne .LBB1_2
+; TLSDESC-NEXT:  # %bb.1: # %if.then
+; TLSDESC-NEXT:    movq %rsi, %rbx
+; TLSDESC-NEXT:    addq %fs:0, %rax
+; TLSDESC-NEXT:    movq %rax, %r14
+; TLSDESC-NEXT:    callq effect@PLT
+; TLSDESC-NEXT:    movl 168(%r14,%rbx,4), %ecx
+; TLSDESC-NEXT:  .LBB1_2: # %if.end
+; TLSDESC-NEXT:    addl %ebp, %ecx
+; TLSDESC-NEXT:    movl %ecx, %eax
+; TLSDESC-NEXT:    popq %rbx
+; TLSDESC-NEXT:    popq %r14
+; TLSDESC-NEXT:    popq %rbp
+; TLSDESC-NEXT:    retq
+entry:
+  %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo_nonlocal)
+  %load0 = load i32, ptr %addr, align 4
+  %cond = icmp eq i32 %arg0, 0
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  tail call void @effect()
+  %x = add i64 %arg1, 42
+  %addr1 = getelementptr inbounds i32, ptr %addr, i64 %x
+  %load1 = load i32, ptr %addr1, align 4
+  br label %if.end
+
+if.end:
+  %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
+  %ret = add i32 %phi, %load0
+  ret i32 %ret
+}

>From 4c0836d2e7b24a649e27f77a7a6c0c42942ac8d6 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Fri, 5 Apr 2024 11:59:23 -0700
Subject: [PATCH 2/3] CodeGenPrepare: Remove threadlocal_address intrinsic when
 cheap to recompute.

The `threadlocal_address` intrinsic is currently ignored/removed for
instruction selection by the `SelectionDAGBuilder` (see also
https://reviews.llvm.org/D125291 ).

However, being an Instruction means `SelectionDAG` will assign a register
to it and share the value across basic blocks. This sharing is
suboptimal in the "LocalExec" TLS model on x86 where it is cheaper to
just recompute the address. We saw a 0.5% regression in a codebase with
a lot of TLS usage (HHVM).

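This introduces a new `addressingModeSupportsTLS` target lowering
callback and teaches the addressing-mode matcher in `CodeGenPrepare` to
look through the `threadlocal_address` intrinsic, so that the TLS address
is recomputed next to the memory access instead of being shared across
basic blocks. This restores the efficient behavior from before the
introduction of the `threadlocal_address` intrinsic.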

This fixes #87437
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  6 ++
 llvm/lib/CodeGen/CodeGenPrepare.cpp           | 18 +++-
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 24 ++++++
 llvm/lib/Target/X86/X86ISelLowering.h         |  2 +
 .../X86/codegen-prepare-addrmode-tls.ll       | 11 +--
 .../CodeGenPrepare/X86/sink-addrmode-tls.ll   | 86 +++++++++++++++++++
 6 files changed, 138 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index a4dc097446186a..05b22289d6e247 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2798,6 +2798,12 @@ class TargetLoweringBase {
                                      Type *Ty, unsigned AddrSpace,
                                      Instruction *I = nullptr) const;
 
+  /// Returns true if the target's addressing modes can reference thread-local
+  /// storage (TLS) for the given global value. Defaults to false.
+  virtual bool addressingModeSupportsTLS(const GlobalValue &) const {
+    return false;
+  }
+
   /// Return the prefered common base offset.
   virtual int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset,
                                                  int64_t MaxOffset) const {
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index e657872c382848..256145e41b9f43 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -5082,6 +5082,15 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
     }
     return true;
   }
+  case Instruction::Call:
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(AddrInst)) {
+      if (II->getIntrinsicID() == Intrinsic::threadlocal_address) {
+        GlobalValue &GV = cast<GlobalValue>(*II->getArgOperand(0));
+        if (TLI.addressingModeSupportsTLS(GV))
+          return matchAddr(AddrInst->getOperand(0), Depth);
+      }
+    }
+    break;
   }
   return false;
 }
@@ -5620,11 +5629,16 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
         return Modified;
     }
 
-    if (AddrMode.BaseGV) {
+    GlobalValue *BaseGV = AddrMode.BaseGV;
+    if (BaseGV != nullptr) {
       if (ResultPtr)
         return Modified;
 
-      ResultPtr = AddrMode.BaseGV;
+      if (BaseGV->isThreadLocal()) {
+        ResultPtr = Builder.CreateThreadLocalAddress(BaseGV);
+      } else {
+        ResultPtr = BaseGV;
+      }
     }
 
     // If the real base value actually came from an inttoptr, then the matcher
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f274da6f6f7767..3358d7918f4b08 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -18928,6 +18928,30 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
   llvm_unreachable("TLS not implemented for this target.");
 }
 
+bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
+  if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
+    const TargetMachine &TM = getTargetMachine();
+    TLSModel::Model Model = TM.getTLSModel(&GV);
+    switch (Model) {
+    case TLSModel::LocalExec:
+    case TLSModel::InitialExec:
+      // We can include the %fs segment register in addressing modes.
+      return true;
+    case TLSModel::LocalDynamic:
+    case TLSModel::GeneralDynamic:
+      // These models do not result in %fs-relative addresses unless
+      // TLS descriptors are used.
+      //
+      // Even in the case of TLS descriptors we currently have no way to model
+      // the difference between the %fs access and the computations needed for
+      // the offset, and returning `true` for TLS-desc currently duplicates
+      // both, which is detrimental.
+      return false;
+    }
+  }
+  return false;
+}
+
 /// Lower SRA_PARTS and friends, which return two i32 values
 /// and take a 2 x i32 value to shift plus a shift amount.
 /// TODO: Can this be moved to general expansion code?
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 0a1e8ca4427314..e348ba6e8ac085 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1323,6 +1323,8 @@ namespace llvm {
                                Type *Ty, unsigned AS,
                                Instruction *I = nullptr) const override;
 
+    bool addressingModeSupportsTLS(const GlobalValue &GV) const override;
+
     /// Return true if the specified immediate is legal
     /// icmp immediate, that is the target has icmp instructions which can
     /// compare a register against the immediate without having to materialize
diff --git a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll
index 25d77a05afb4a5..0ca1da26fa89c7 100644
--- a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll
+++ b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll
@@ -14,22 +14,20 @@ define i32 @func_local_tls(i32 %arg0, i64 %arg1) nounwind {
 ; NOPIC-LABEL: func_local_tls:
 ; NOPIC:       # %bb.0: # %entry
 ; NOPIC-NEXT:    pushq %rbp
-; NOPIC-NEXT:    pushq %r14
 ; NOPIC-NEXT:    pushq %rbx
+; NOPIC-NEXT:    pushq %rax
 ; NOPIC-NEXT:    movl %fs:foo_local@TPOFF, %ebp
 ; NOPIC-NEXT:    testl %edi, %edi
 ; NOPIC-NEXT:    movl %ebp, %eax
 ; NOPIC-NEXT:    jne .LBB0_2
 ; NOPIC-NEXT:  # %bb.1: # %if.then
 ; NOPIC-NEXT:    movq %rsi, %rbx
-; NOPIC-NEXT:    movq %fs:0, %rax
-; NOPIC-NEXT:    leaq foo_local@TPOFF(%rax), %r14
 ; NOPIC-NEXT:    callq effect@PLT
-; NOPIC-NEXT:    movl 168(%r14,%rbx,4), %eax
+; NOPIC-NEXT:    movl %fs:foo_local@TPOFF+168(,%rbx,4), %eax
 ; NOPIC-NEXT:  .LBB0_2: # %if.end
 ; NOPIC-NEXT:    addl %ebp, %eax
+; NOPIC-NEXT:    addq $8, %rsp
 ; NOPIC-NEXT:    popq %rbx
-; NOPIC-NEXT:    popq %r14
 ; NOPIC-NEXT:    popq %rbp
 ; NOPIC-NEXT:    retq
 ;
@@ -110,9 +108,8 @@ define i32 @func_nonlocal_tls(i32 %arg0, i64 %arg1) nounwind {
 ; NOPIC-NEXT:    jne .LBB1_2
 ; NOPIC-NEXT:  # %bb.1: # %if.then
 ; NOPIC-NEXT:    movq %rsi, %rbx
-; NOPIC-NEXT:    addq %fs:0, %r14
 ; NOPIC-NEXT:    callq effect@PLT
-; NOPIC-NEXT:    movl 168(%r14,%rbx,4), %eax
+; NOPIC-NEXT:    movl %fs:168(%r14,%rbx,4), %eax
 ; NOPIC-NEXT:  .LBB1_2: # %if.end
 ; NOPIC-NEXT:    addl %ebp, %eax
 ; NOPIC-NEXT:    popq %rbx
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll
new file mode 100644
index 00000000000000..080c807cbad13f
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' %s | FileCheck %s
+
+target triple = "x86_64--linux-gnu"
+
+@foo = dso_local thread_local(localexec) global i32 0, align 4
+
+declare void @effect()
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
+
+define i32 @func0(i32 %arg) {
+; CHECK-LABEL: define i32 @func0(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADDR:%.*]] = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i32, ptr [[ADDR]], align 4
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[ARG]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    tail call void @effect()
+; CHECK-NEXT:    [[TMP0:%.*]] = call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @foo)
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[TMP0]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[LOAD1]], [[IF_THEN]] ], [ [[LOAD0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RET:%.*]] = add i32 [[PHI]], [[LOAD0]]
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+entry:
+  %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
+  %load0 = load i32, ptr %addr, align 4
+  %cond = icmp eq i32 %arg, 0
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  tail call void @effect()
+  %load1 = load i32, ptr %addr, align 4
+  br label %if.end
+
+if.end:
+  %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
+  %ret = add i32 %phi, %load0
+  ret i32 %ret
+}
+
+define i32 @func1(i32 %arg0, i32 %arg1) {
+; CHECK-LABEL: define i32 @func1(
+; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADDR:%.*]] = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i32, ptr [[ADDR]], align 4
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[ARG0]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    tail call void @effect()
+; CHECK-NEXT:    [[X:%.*]] = add i32 [[ARG1]], 42
+; CHECK-NEXT:    [[X64:%.*]] = sext i32 [[X]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @foo)
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[X64]], 4
+; CHECK-NEXT:    [[ADDR1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 [[SUNKADDR]]
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[ADDR1]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[LOAD1]], [[IF_THEN]] ], [ [[LOAD0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RET:%.*]] = add i32 [[PHI]], [[LOAD0]]
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+entry:
+  %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
+  %load0 = load i32, ptr %addr, align 4
+  %cond = icmp eq i32 %arg0, 0
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  tail call void @effect()
+  %x = add i32 %arg1, 42
+  %x64 = sext i32 %x to i64
+  %addr1 = getelementptr inbounds i32, ptr %addr, i64 %x64
+  %load1 = load i32, ptr %addr1, align 4
+  br label %if.end
+
+if.end:
+  %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
+  %ret = add i32 %phi, %load0
+  ret i32 %ret
+}

>From f2048f0a6fa4f9e6187d4f04b1d3231be05fb16c Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Tue, 16 Apr 2024 13:18:41 -0700
Subject: [PATCH 3/3] fix

---
 llvm/lib/CodeGen/CodeGenPrepare.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 256145e41b9f43..22a766f8d62524 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -5803,8 +5803,15 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
     }
 
     // Add in the BaseGV if present.
-    if (AddrMode.BaseGV) {
-      Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr");
+    GlobalValue *BaseGV = AddrMode.BaseGV;
+    if (BaseGV != nullptr) {
+      Value *BaseGVPtr;
+      if (BaseGV->isThreadLocal()) {
+        BaseGVPtr = Builder.CreateThreadLocalAddress(BaseGV);
+      } else {
+        BaseGVPtr = BaseGV;
+      }
+      Value *V = Builder.CreatePtrToInt(BaseGVPtr, IntPtrTy, "sunkaddr");
       if (Result)
         Result = Builder.CreateAdd(Result, V, "sunkaddr");
       else



More information about the llvm-commits mailing list