[llvm] 652bcf6 - CodeGenPrepare: Add support for llvm.threadlocal.address address-mode sinking (#87844)

Wed Apr 17 12:48:06 PDT 2024

Author: Matthias Braun
Date: 2024-04-17T12:48:02-07:00
New Revision: 652bcf685c72447f3cc46d93d6c9c1948e8499f3

URL: https://github.com/llvm/llvm-project/commit/652bcf685c72447f3cc46d93d6c9c1948e8499f3
DIFF: https://github.com/llvm/llvm-project/commit/652bcf685c72447f3cc46d93d6c9c1948e8499f3.diff

LOG: CodeGenPrepare: Add support for llvm.threadlocal.address address-mode sinking (#87844)

Depending on the TLSMode many thread-local accesses on x86 can be
expressed by adding a %fs: segment register to an addressing mode. Even
if there are multiple users of a `llvm.threadlocal.address` intrinsic it
is generally not worth sharing the value in a register but instead fold
the %fs access into multiple addressing modes.

Hence this changes CodeGenPrepare to duplicate the
`llvm.threadlocal.address` intrinsic as necessary.

Introduces a new `TargetLowering::addressingModeSupportsTLS` callback
that allows targets to indicate whether TLS accesses can be part of an
addressing mode.

This is fixing a performance problem, as this folding of TLS-accesses
into multiple addressing modes happened naturally before the
introduction of the `llvm.threadlocal.address` intrinsic, but regressed
due to `SelectionDAG` keeping things in registers when accessed across
basic blocks, so CodeGenPrepare needs to duplicate to mitigate this. We
see a ~0.5% recovery in a codebase with heavy TLS usage (HHVM).

This fixes most of #87437

Added: 
    llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll
    llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/CodeGenPrepare.cpp
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86ISelLowering.h

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e0ade02959025f..2dd978c7b58498 100644

--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2805,6 +2805,12 @@ class TargetLoweringBase {
                                      Type *Ty, unsigned AddrSpace,
                                      Instruction *I = nullptr) const;
 
+  /// Returns true if the target's addressing mode can target thread local
+  /// storage (TLS).
+  virtual bool addressingModeSupportsTLS(const GlobalValue &) const {
+    return false;
+  }
+
   /// Return the prefered common base offset.
   virtual int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset,
                                                  int64_t MaxOffset) const {

diff  --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index e657872c382848..22a766f8d62524 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -5082,6 +5082,15 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
     }
     return true;
   }
+  case Instruction::Call:
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(AddrInst)) {
+      if (II->getIntrinsicID() == Intrinsic::threadlocal_address) {
+        GlobalValue &GV = cast<GlobalValue>(*II->getArgOperand(0));
+        if (TLI.addressingModeSupportsTLS(GV))
+          return matchAddr(AddrInst->getOperand(0), Depth);
+      }
+    }
+    break;
   }
   return false;
 }
@@ -5620,11 +5629,16 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
         return Modified;
     }
 
-    if (AddrMode.BaseGV) {
+    GlobalValue *BaseGV = AddrMode.BaseGV;
+    if (BaseGV != nullptr) {
       if (ResultPtr)
         return Modified;
 
-      ResultPtr = AddrMode.BaseGV;
+      if (BaseGV->isThreadLocal()) {
+        ResultPtr = Builder.CreateThreadLocalAddress(BaseGV);
+      } else {
+        ResultPtr = BaseGV;
+      }
     }
 
     // If the real base value actually came from an inttoptr, then the matcher
@@ -5789,8 +5803,15 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
     }
 
     // Add in the BaseGV if present.
-    if (AddrMode.BaseGV) {
-      Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr");
+    GlobalValue *BaseGV = AddrMode.BaseGV;
+    if (BaseGV != nullptr) {
+      Value *BaseGVPtr;
+      if (BaseGV->isThreadLocal()) {
+        BaseGVPtr = Builder.CreateThreadLocalAddress(BaseGV);
+      } else {
+        BaseGVPtr = BaseGV;
+      }
+      Value *V = Builder.CreatePtrToInt(BaseGVPtr, IntPtrTy, "sunkaddr");
       if (Result)
         Result = Builder.CreateAdd(Result, V, "sunkaddr");
       else

diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 27107f554fccf1..bedec0c8974a85 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -18920,6 +18920,30 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
   llvm_unreachable("TLS not implemented for this target.");
 }
 
+bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
+  if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
+    const TargetMachine &TM = getTargetMachine();
+    TLSModel::Model Model = TM.getTLSModel(&GV);
+    switch (Model) {
+    case TLSModel::LocalExec:
+    case TLSModel::InitialExec:
+      // We can include the %fs segment register in addressing modes.
+      return true;
+    case TLSModel::LocalDynamic:
+    case TLSModel::GeneralDynamic:
+      // These models do not result in %fs relative addresses unless
+      // TLS descriptors are used.
+      //
+      // Even in the case of TLS descriptors we currently have no way to model
+      // the difference between %fs access and the computations needed for the
+      // offset and returning `true` for TLS-desc currently duplicates both
+      // which is detrimental :-/
+      return false;
+    }
+  }
+  return false;
+}
+
 /// Lower SRA_PARTS and friends, which return two i32 values
 /// and take a 2 x i32 value to shift plus a shift amount.
 /// TODO: Can this be moved to general expansion code?

diff  --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 0a1e8ca4427314..e348ba6e8ac085 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1323,6 +1323,8 @@ namespace llvm {
                                Type *Ty, unsigned AS,
                                Instruction *I = nullptr) const override;
 
+    bool addressingModeSupportsTLS(const GlobalValue &GV) const override;
+
     /// Return true if the specified immediate is legal
     /// icmp immediate, that is the target has icmp instructions which can
     /// compare a register against the immediate without having to materialize

diff  --git a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll
new file mode 100644
index 00000000000000..0ca1da26fa89c7
--- /dev/null
+++ b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -o - %s | FileCheck %s --check-prefix=NOPIC
+; RUN: llc -o - %s -relocation-model=pic | FileCheck %s --check-prefix=PIC
+; RUN: llc -o - %s -relocation-model=pic -enable-tlsdesc | FileCheck %s --check-prefix=TLSDESC
+
+target triple = "x86_64--linux-gnu"
+
+declare void @effect()
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
+
+@foo_local = dso_local thread_local(localexec) global i32 0, align 4
+
+define i32 @func_local_tls(i32 %arg0, i64 %arg1) nounwind {
+; NOPIC-LABEL: func_local_tls:
+; NOPIC:       # %bb.0: # %entry
+; NOPIC-NEXT:    pushq %rbp
+; NOPIC-NEXT:    pushq %rbx
+; NOPIC-NEXT:    pushq %rax
+; NOPIC-NEXT:    movl %fs:foo_local@TPOFF, %ebp
+; NOPIC-NEXT:    testl %edi, %edi
+; NOPIC-NEXT:    movl %ebp, %eax
+; NOPIC-NEXT:    jne .LBB0_2
+; NOPIC-NEXT:  # %bb.1: # %if.then
+; NOPIC-NEXT:    movq %rsi, %rbx
+; NOPIC-NEXT:    callq effect@PLT
+; NOPIC-NEXT:    movl %fs:foo_local@TPOFF+168(,%rbx,4), %eax
+; NOPIC-NEXT:  .LBB0_2: # %if.end
+; NOPIC-NEXT:    addl %ebp, %eax
+; NOPIC-NEXT:    addq $8, %rsp
+; NOPIC-NEXT:    popq %rbx
+; NOPIC-NEXT:    popq %rbp
+; NOPIC-NEXT:    retq
+;
+; PIC-LABEL: func_local_tls:
+; PIC:       # %bb.0: # %entry
+; PIC-NEXT:    pushq %rbp
+; PIC-NEXT:    pushq %r14
+; PIC-NEXT:    pushq %rbx
+; PIC-NEXT:    movl %fs:.Lfoo_local$local@TPOFF, %ebp
+; PIC-NEXT:    testl %edi, %edi
+; PIC-NEXT:    movl %ebp, %eax
+; PIC-NEXT:    jne .LBB0_2
+; PIC-NEXT:  # %bb.1: # %if.then
+; PIC-NEXT:    movq %rsi, %rbx
+; PIC-NEXT:    movq %fs:0, %rax
+; PIC-NEXT:    leaq .Lfoo_local$local@TPOFF(%rax), %r14
+; PIC-NEXT:    callq effect@PLT
+; PIC-NEXT:    movl 168(%r14,%rbx,4), %eax
+; PIC-NEXT:  .LBB0_2: # %if.end
+; PIC-NEXT:    addl %ebp, %eax
+; PIC-NEXT:    popq %rbx
+; PIC-NEXT:    popq %r14
+; PIC-NEXT:    popq %rbp
+; PIC-NEXT:    retq
+;
+; TLSDESC-LABEL: func_local_tls:
+; TLSDESC:       # %bb.0: # %entry
+; TLSDESC-NEXT:    pushq %rbp
+; TLSDESC-NEXT:    pushq %r14
+; TLSDESC-NEXT:    pushq %rbx
+; TLSDESC-NEXT:    movl %fs:.Lfoo_local$local@TPOFF, %ebp
+; TLSDESC-NEXT:    testl %edi, %edi
+; TLSDESC-NEXT:    movl %ebp, %eax
+; TLSDESC-NEXT:    jne .LBB0_2
+; TLSDESC-NEXT:  # %bb.1: # %if.then
+; TLSDESC-NEXT:    movq %rsi, %rbx
+; TLSDESC-NEXT:    movq %fs:0, %rax
+; TLSDESC-NEXT:    leaq .Lfoo_local$local@TPOFF(%rax), %r14
+; TLSDESC-NEXT:    callq effect@PLT
+; TLSDESC-NEXT:    movl 168(%r14,%rbx,4), %eax
+; TLSDESC-NEXT:  .LBB0_2: # %if.end
+; TLSDESC-NEXT:    addl %ebp, %eax
+; TLSDESC-NEXT:    popq %rbx
+; TLSDESC-NEXT:    popq %r14
+; TLSDESC-NEXT:    popq %rbp
+; TLSDESC-NEXT:    retq
+entry:
+  %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo_local)
+  %load0 = load i32, ptr %addr, align 4
+  %cond = icmp eq i32 %arg0, 0
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  tail call void @effect()
+  %x = add i64 %arg1, 42
+  %addr1 = getelementptr inbounds i32, ptr %addr, i64 %x
+  %load1 = load i32, ptr %addr1, align 4
+  br label %if.end
+
+if.end:
+  %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
+  %ret = add i32 %phi, %load0
+  ret i32 %ret
+}
+
+@foo_nonlocal = thread_local global i32 0, align 4
+
+define i32 @func_nonlocal_tls(i32 %arg0, i64 %arg1) nounwind {
+; NOPIC-LABEL: func_nonlocal_tls:
+; NOPIC:       # %bb.0: # %entry
+; NOPIC-NEXT:    pushq %rbp
+; NOPIC-NEXT:    pushq %r14
+; NOPIC-NEXT:    pushq %rbx
+; NOPIC-NEXT:    movq foo_nonlocal@GOTTPOFF(%rip), %r14
+; NOPIC-NEXT:    movl %fs:(%r14), %ebp
+; NOPIC-NEXT:    testl %edi, %edi
+; NOPIC-NEXT:    movl %ebp, %eax
+; NOPIC-NEXT:    jne .LBB1_2
+; NOPIC-NEXT:  # %bb.1: # %if.then
+; NOPIC-NEXT:    movq %rsi, %rbx
+; NOPIC-NEXT:    callq effect@PLT
+; NOPIC-NEXT:    movl %fs:168(%r14,%rbx,4), %eax
+; NOPIC-NEXT:  .LBB1_2: # %if.end
+; NOPIC-NEXT:    addl %ebp, %eax
+; NOPIC-NEXT:    popq %rbx
+; NOPIC-NEXT:    popq %r14
+; NOPIC-NEXT:    popq %rbp
+; NOPIC-NEXT:    retq
+;
+; PIC-LABEL: func_nonlocal_tls:
+; PIC:       # %bb.0: # %entry
+; PIC-NEXT:    pushq %rbp
+; PIC-NEXT:    pushq %r15
+; PIC-NEXT:    pushq %r14
+; PIC-NEXT:    pushq %rbx
+; PIC-NEXT:    pushq %rax
+; PIC-NEXT:    movq %rsi, %rbx
+; PIC-NEXT:    movl %edi, %ebp
+; PIC-NEXT:    data16
+; PIC-NEXT:    leaq foo_nonlocal@TLSGD(%rip), %rdi
+; PIC-NEXT:    data16
+; PIC-NEXT:    data16
+; PIC-NEXT:    rex64
+; PIC-NEXT:    callq __tls_get_addr@PLT
+; PIC-NEXT:    movq %rax, %r14
+; PIC-NEXT:    movl (%rax), %r15d
+; PIC-NEXT:    testl %ebp, %ebp
+; PIC-NEXT:    movl %r15d, %eax
+; PIC-NEXT:    jne .LBB1_2
+; PIC-NEXT:  # %bb.1: # %if.then
+; PIC-NEXT:    callq effect@PLT
+; PIC-NEXT:    movl 168(%r14,%rbx,4), %eax
+; PIC-NEXT:  .LBB1_2: # %if.end
+; PIC-NEXT:    addl %r15d, %eax
+; PIC-NEXT:    addq $8, %rsp
+; PIC-NEXT:    popq %rbx
+; PIC-NEXT:    popq %r14
+; PIC-NEXT:    popq %r15
+; PIC-NEXT:    popq %rbp
+; PIC-NEXT:    retq
+;
+; TLSDESC-LABEL: func_nonlocal_tls:
+; TLSDESC:       # %bb.0: # %entry
+; TLSDESC-NEXT:    pushq %rbp
+; TLSDESC-NEXT:    pushq %r14
+; TLSDESC-NEXT:    pushq %rbx
+; TLSDESC-NEXT:    leaq foo_nonlocal@tlsdesc(%rip), %rax
+; TLSDESC-NEXT:    callq *foo_nonlocal@tlscall(%rax)
+; TLSDESC-NEXT:    movl %fs:(%rax), %ebp
+; TLSDESC-NEXT:    testl %edi, %edi
+; TLSDESC-NEXT:    movl %ebp, %ecx
+; TLSDESC-NEXT:    jne .LBB1_2
+; TLSDESC-NEXT:  # %bb.1: # %if.then
+; TLSDESC-NEXT:    movq %rsi, %rbx
+; TLSDESC-NEXT:    addq %fs:0, %rax
+; TLSDESC-NEXT:    movq %rax, %r14
+; TLSDESC-NEXT:    callq effect@PLT
+; TLSDESC-NEXT:    movl 168(%r14,%rbx,4), %ecx
+; TLSDESC-NEXT:  .LBB1_2: # %if.end
+; TLSDESC-NEXT:    addl %ebp, %ecx
+; TLSDESC-NEXT:    movl %ecx, %eax
+; TLSDESC-NEXT:    popq %rbx
+; TLSDESC-NEXT:    popq %r14
+; TLSDESC-NEXT:    popq %rbp
+; TLSDESC-NEXT:    retq
+entry:
+  %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo_nonlocal)
+  %load0 = load i32, ptr %addr, align 4
+  %cond = icmp eq i32 %arg0, 0
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  tail call void @effect()
+  %x = add i64 %arg1, 42
+  %addr1 = getelementptr inbounds i32, ptr %addr, i64 %x
+  %load1 = load i32, ptr %addr1, align 4
+  br label %if.end
+
+if.end:
+  %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
+  %ret = add i32 %phi, %load0
+  ret i32 %ret
+}

diff  --git a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll
new file mode 100644
index 00000000000000..080c807cbad13f
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' %s | FileCheck %s
+
+target triple = "x86_64--linux-gnu"
+
+@foo = dso_local thread_local(localexec) global i32 0, align 4
+
+declare void @effect()
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
+
+define i32 @func0(i32 %arg) {
+; CHECK-LABEL: define i32 @func0(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADDR:%.*]] = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i32, ptr [[ADDR]], align 4
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[ARG]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    tail call void @effect()
+; CHECK-NEXT:    [[TMP0:%.*]] = call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @foo)
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[TMP0]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[LOAD1]], [[IF_THEN]] ], [ [[LOAD0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RET:%.*]] = add i32 [[PHI]], [[LOAD0]]
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+entry:
+  %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
+  %load0 = load i32, ptr %addr, align 4
+  %cond = icmp eq i32 %arg, 0
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  tail call void @effect()
+  %load1 = load i32, ptr %addr, align 4
+  br label %if.end
+
+if.end:
+  %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
+  %ret = add i32 %phi, %load0
+  ret i32 %ret
+}
+
+define i32 @func1(i32 %arg0, i32 %arg1) {
+; CHECK-LABEL: define i32 @func1(
+; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADDR:%.*]] = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i32, ptr [[ADDR]], align 4
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[ARG0]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    tail call void @effect()
+; CHECK-NEXT:    [[X:%.*]] = add i32 [[ARG1]], 42
+; CHECK-NEXT:    [[X64:%.*]] = sext i32 [[X]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @foo)
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[X64]], 4
+; CHECK-NEXT:    [[ADDR1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 [[SUNKADDR]]
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[ADDR1]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[LOAD1]], [[IF_THEN]] ], [ [[LOAD0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RET:%.*]] = add i32 [[PHI]], [[LOAD0]]
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+entry:
+  %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
+  %load0 = load i32, ptr %addr, align 4
+  %cond = icmp eq i32 %arg0, 0
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  tail call void @effect()
+  %x = add i32 %arg1, 42
+  %x64 = sext i32 %x to i64
+  %addr1 = getelementptr inbounds i32, ptr %addr, i64 %x64
+  %load1 = load i32, ptr %addr1, align 4
+  br label %if.end
+
+if.end:
+  %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
+  %ret = add i32 %phi, %load0
+  ret i32 %ret
+}