[llvm] 652bcf6 - CodeGenPrepare: Add support for llvm.threadlocal.address address-mode sinking (#87844)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 17 12:48:06 PDT 2024
Author: Matthias Braun
Date: 2024-04-17T12:48:02-07:00
New Revision: 652bcf685c72447f3cc46d93d6c9c1948e8499f3
URL: https://github.com/llvm/llvm-project/commit/652bcf685c72447f3cc46d93d6c9c1948e8499f3
DIFF: https://github.com/llvm/llvm-project/commit/652bcf685c72447f3cc46d93d6c9c1948e8499f3.diff
LOG: CodeGenPrepare: Add support for llvm.threadlocal.address address-mode sinking (#87844)
Depending on the TLS model, many thread-local accesses on x86 can be
expressed by adding a %fs: segment register to an addressing mode. Even
if there are multiple users of a `llvm.threadlocal.address` intrinsic,
it is generally not worth sharing the value in a register; instead the
%fs access should be folded into each addressing mode.
Hence this changes CodeGenPrepare to duplicate the
`llvm.threadlocal.address` intrinsic as necessary.
It also introduces a new `TargetLowering::addressingModeSupportsTLS`
callback that allows targets to indicate whether TLS accesses can be
part of an addressing mode.
This fixes a performance problem: folding TLS accesses into multiple
addressing modes happened naturally before the introduction of the
`llvm.threadlocal.address` intrinsic, but regressed because
`SelectionDAG` keeps values in registers when they are accessed across
basic blocks, so CodeGenPrepare needs to duplicate the intrinsic to
mitigate this. We see a ~0.5% recovery in a codebase with heavy TLS
usage (HHVM).
This fixes most of #87437.
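To make the effect concrete, here is a minimal sketch of the transformation
in LLVM IR (illustrative only, not taken from the patch: it assumes an
x86-64 ELF target with the local-exec TLS model, and the names @tls_var and
@example are made up):

  @tls_var = dso_local thread_local(localexec) global i32 0, align 4

  declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)

  define i32 @example(i1 %c) {
  entry:
    ; One intrinsic call whose result is addressed from two blocks.
    %addr = call ptr @llvm.threadlocal.address.p0(ptr @tls_var)
    %v0 = load i32, ptr %addr, align 4
    br i1 %c, label %then, label %exit

  then:
    ; Without duplication, %addr stays live in a register across the block
    ; boundary and the TLS access cannot be folded into the load below.
    %v1 = load i32, ptr %addr, align 4
    br label %exit

  exit:
    %r = phi i32 [ %v1, %then ], [ %v0, %entry ]
    ret i32 %r
  }

With this patch, when addressingModeSupportsTLS returns true for @tls_var,
CodeGenPrepare re-emits the intrinsic in %then:

    %addr.dup = call ptr @llvm.threadlocal.address.p0(ptr @tls_var)
    %v1 = load i32, ptr %addr.dup, align 4

so instruction selection can fold the %fs-relative address into each memory
operand (e.g. movl %fs:tls_var@TPOFF, %eax) instead of materializing the
address once and keeping it live in a register.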
Added:
llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll
llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll
Modified:
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/lib/CodeGen/CodeGenPrepare.cpp
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86ISelLowering.h
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e0ade02959025f..2dd978c7b58498 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2805,6 +2805,12 @@ class TargetLoweringBase {
Type *Ty, unsigned AddrSpace,
Instruction *I = nullptr) const;
+ /// Returns true if the target's addressing mode can target thread local
+ /// storage (TLS).
+ virtual bool addressingModeSupportsTLS(const GlobalValue &) const {
+ return false;
+ }
+
/// Return the preferred common base offset.
virtual int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset,
int64_t MaxOffset) const {
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index e657872c382848..22a766f8d62524 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -5082,6 +5082,15 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
}
return true;
}
+ case Instruction::Call:
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(AddrInst)) {
+ if (II->getIntrinsicID() == Intrinsic::threadlocal_address) {
+ GlobalValue &GV = cast<GlobalValue>(*II->getArgOperand(0));
+ if (TLI.addressingModeSupportsTLS(GV))
+ return matchAddr(AddrInst->getOperand(0), Depth);
+ }
+ }
+ break;
}
return false;
}
@@ -5620,11 +5629,16 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
return Modified;
}
- if (AddrMode.BaseGV) {
+ GlobalValue *BaseGV = AddrMode.BaseGV;
+ if (BaseGV != nullptr) {
if (ResultPtr)
return Modified;
- ResultPtr = AddrMode.BaseGV;
+ if (BaseGV->isThreadLocal()) {
+ ResultPtr = Builder.CreateThreadLocalAddress(BaseGV);
+ } else {
+ ResultPtr = BaseGV;
+ }
}
// If the real base value actually came from an inttoptr, then the matcher
@@ -5789,8 +5803,15 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
}
// Add in the BaseGV if present.
- if (AddrMode.BaseGV) {
- Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr");
+ GlobalValue *BaseGV = AddrMode.BaseGV;
+ if (BaseGV != nullptr) {
+ Value *BaseGVPtr;
+ if (BaseGV->isThreadLocal()) {
+ BaseGVPtr = Builder.CreateThreadLocalAddress(BaseGV);
+ } else {
+ BaseGVPtr = BaseGV;
+ }
+ Value *V = Builder.CreatePtrToInt(BaseGVPtr, IntPtrTy, "sunkaddr");
if (Result)
Result = Builder.CreateAdd(Result, V, "sunkaddr");
else
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 27107f554fccf1..bedec0c8974a85 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -18920,6 +18920,30 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("TLS not implemented for this target.");
}
+bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
+ if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
+ const TargetMachine &TM = getTargetMachine();
+ TLSModel::Model Model = TM.getTLSModel(&GV);
+ switch (Model) {
+ case TLSModel::LocalExec:
+ case TLSModel::InitialExec:
+ // We can include the %fs segment register in addressing modes.
+ return true;
+ case TLSModel::LocalDynamic:
+ case TLSModel::GeneralDynamic:
+ // These models do not result in %fs relative addresses unless
+ // TLS descriptors are used.
+ //
+ // Even in the case of TLS descriptors we currently have no way to model
+ // the difference between %fs access and the computations needed for the
+ // offset and returning `true` for TLS-desc currently duplicates both
+ // which is detrimental :-/
+ return false;
+ }
+ }
+ return false;
+}
+
/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
/// TODO: Can this be moved to general expansion code?
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 0a1e8ca4427314..e348ba6e8ac085 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1323,6 +1323,8 @@ namespace llvm {
Type *Ty, unsigned AS,
Instruction *I = nullptr) const override;
+ bool addressingModeSupportsTLS(const GlobalValue &GV) const override;
+
/// Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can
/// compare a register against the immediate without having to materialize
diff --git a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll
new file mode 100644
index 00000000000000..0ca1da26fa89c7
--- /dev/null
+++ b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -o - %s | FileCheck %s --check-prefix=NOPIC
+; RUN: llc -o - %s -relocation-model=pic | FileCheck %s --check-prefix=PIC
+; RUN: llc -o - %s -relocation-model=pic -enable-tlsdesc | FileCheck %s --check-prefix=TLSDESC
+
+target triple = "x86_64--linux-gnu"
+
+declare void @effect()
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
+
+@foo_local = dso_local thread_local(localexec) global i32 0, align 4
+
+define i32 @func_local_tls(i32 %arg0, i64 %arg1) nounwind {
+; NOPIC-LABEL: func_local_tls:
+; NOPIC: # %bb.0: # %entry
+; NOPIC-NEXT: pushq %rbp
+; NOPIC-NEXT: pushq %rbx
+; NOPIC-NEXT: pushq %rax
+; NOPIC-NEXT: movl %fs:foo_local@TPOFF, %ebp
+; NOPIC-NEXT: testl %edi, %edi
+; NOPIC-NEXT: movl %ebp, %eax
+; NOPIC-NEXT: jne .LBB0_2
+; NOPIC-NEXT: # %bb.1: # %if.then
+; NOPIC-NEXT: movq %rsi, %rbx
+; NOPIC-NEXT: callq effect@PLT
+; NOPIC-NEXT: movl %fs:foo_local@TPOFF+168(,%rbx,4), %eax
+; NOPIC-NEXT: .LBB0_2: # %if.end
+; NOPIC-NEXT: addl %ebp, %eax
+; NOPIC-NEXT: addq $8, %rsp
+; NOPIC-NEXT: popq %rbx
+; NOPIC-NEXT: popq %rbp
+; NOPIC-NEXT: retq
+;
+; PIC-LABEL: func_local_tls:
+; PIC: # %bb.0: # %entry
+; PIC-NEXT: pushq %rbp
+; PIC-NEXT: pushq %r14
+; PIC-NEXT: pushq %rbx
+; PIC-NEXT: movl %fs:.Lfoo_local$local@TPOFF, %ebp
+; PIC-NEXT: testl %edi, %edi
+; PIC-NEXT: movl %ebp, %eax
+; PIC-NEXT: jne .LBB0_2
+; PIC-NEXT: # %bb.1: # %if.then
+; PIC-NEXT: movq %rsi, %rbx
+; PIC-NEXT: movq %fs:0, %rax
+; PIC-NEXT: leaq .Lfoo_local$local@TPOFF(%rax), %r14
+; PIC-NEXT: callq effect@PLT
+; PIC-NEXT: movl 168(%r14,%rbx,4), %eax
+; PIC-NEXT: .LBB0_2: # %if.end
+; PIC-NEXT: addl %ebp, %eax
+; PIC-NEXT: popq %rbx
+; PIC-NEXT: popq %r14
+; PIC-NEXT: popq %rbp
+; PIC-NEXT: retq
+;
+; TLSDESC-LABEL: func_local_tls:
+; TLSDESC: # %bb.0: # %entry
+; TLSDESC-NEXT: pushq %rbp
+; TLSDESC-NEXT: pushq %r14
+; TLSDESC-NEXT: pushq %rbx
+; TLSDESC-NEXT: movl %fs:.Lfoo_local$local@TPOFF, %ebp
+; TLSDESC-NEXT: testl %edi, %edi
+; TLSDESC-NEXT: movl %ebp, %eax
+; TLSDESC-NEXT: jne .LBB0_2
+; TLSDESC-NEXT: # %bb.1: # %if.then
+; TLSDESC-NEXT: movq %rsi, %rbx
+; TLSDESC-NEXT: movq %fs:0, %rax
+; TLSDESC-NEXT: leaq .Lfoo_local$local@TPOFF(%rax), %r14
+; TLSDESC-NEXT: callq effect@PLT
+; TLSDESC-NEXT: movl 168(%r14,%rbx,4), %eax
+; TLSDESC-NEXT: .LBB0_2: # %if.end
+; TLSDESC-NEXT: addl %ebp, %eax
+; TLSDESC-NEXT: popq %rbx
+; TLSDESC-NEXT: popq %r14
+; TLSDESC-NEXT: popq %rbp
+; TLSDESC-NEXT: retq
+entry:
+ %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo_local)
+ %load0 = load i32, ptr %addr, align 4
+ %cond = icmp eq i32 %arg0, 0
+ br i1 %cond, label %if.then, label %if.end
+
+if.then:
+ tail call void @effect()
+ %x = add i64 %arg1, 42
+ %addr1 = getelementptr inbounds i32, ptr %addr, i64 %x
+ %load1 = load i32, ptr %addr1, align 4
+ br label %if.end
+
+if.end:
+ %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
+ %ret = add i32 %phi, %load0
+ ret i32 %ret
+}
+
+@foo_nonlocal = thread_local global i32 0, align 4
+
+define i32 @func_nonlocal_tls(i32 %arg0, i64 %arg1) nounwind {
+; NOPIC-LABEL: func_nonlocal_tls:
+; NOPIC: # %bb.0: # %entry
+; NOPIC-NEXT: pushq %rbp
+; NOPIC-NEXT: pushq %r14
+; NOPIC-NEXT: pushq %rbx
+; NOPIC-NEXT: movq foo_nonlocal@GOTTPOFF(%rip), %r14
+; NOPIC-NEXT: movl %fs:(%r14), %ebp
+; NOPIC-NEXT: testl %edi, %edi
+; NOPIC-NEXT: movl %ebp, %eax
+; NOPIC-NEXT: jne .LBB1_2
+; NOPIC-NEXT: # %bb.1: # %if.then
+; NOPIC-NEXT: movq %rsi, %rbx
+; NOPIC-NEXT: callq effect@PLT
+; NOPIC-NEXT: movl %fs:168(%r14,%rbx,4), %eax
+; NOPIC-NEXT: .LBB1_2: # %if.end
+; NOPIC-NEXT: addl %ebp, %eax
+; NOPIC-NEXT: popq %rbx
+; NOPIC-NEXT: popq %r14
+; NOPIC-NEXT: popq %rbp
+; NOPIC-NEXT: retq
+;
+; PIC-LABEL: func_nonlocal_tls:
+; PIC: # %bb.0: # %entry
+; PIC-NEXT: pushq %rbp
+; PIC-NEXT: pushq %r15
+; PIC-NEXT: pushq %r14
+; PIC-NEXT: pushq %rbx
+; PIC-NEXT: pushq %rax
+; PIC-NEXT: movq %rsi, %rbx
+; PIC-NEXT: movl %edi, %ebp
+; PIC-NEXT: data16
+; PIC-NEXT: leaq foo_nonlocal@TLSGD(%rip), %rdi
+; PIC-NEXT: data16
+; PIC-NEXT: data16
+; PIC-NEXT: rex64
+; PIC-NEXT: callq __tls_get_addr@PLT
+; PIC-NEXT: movq %rax, %r14
+; PIC-NEXT: movl (%rax), %r15d
+; PIC-NEXT: testl %ebp, %ebp
+; PIC-NEXT: movl %r15d, %eax
+; PIC-NEXT: jne .LBB1_2
+; PIC-NEXT: # %bb.1: # %if.then
+; PIC-NEXT: callq effect@PLT
+; PIC-NEXT: movl 168(%r14,%rbx,4), %eax
+; PIC-NEXT: .LBB1_2: # %if.end
+; PIC-NEXT: addl %r15d, %eax
+; PIC-NEXT: addq $8, %rsp
+; PIC-NEXT: popq %rbx
+; PIC-NEXT: popq %r14
+; PIC-NEXT: popq %r15
+; PIC-NEXT: popq %rbp
+; PIC-NEXT: retq
+;
+; TLSDESC-LABEL: func_nonlocal_tls:
+; TLSDESC: # %bb.0: # %entry
+; TLSDESC-NEXT: pushq %rbp
+; TLSDESC-NEXT: pushq %r14
+; TLSDESC-NEXT: pushq %rbx
+; TLSDESC-NEXT: leaq foo_nonlocal@tlsdesc(%rip), %rax
+; TLSDESC-NEXT: callq *foo_nonlocal@tlscall(%rax)
+; TLSDESC-NEXT: movl %fs:(%rax), %ebp
+; TLSDESC-NEXT: testl %edi, %edi
+; TLSDESC-NEXT: movl %ebp, %ecx
+; TLSDESC-NEXT: jne .LBB1_2
+; TLSDESC-NEXT: # %bb.1: # %if.then
+; TLSDESC-NEXT: movq %rsi, %rbx
+; TLSDESC-NEXT: addq %fs:0, %rax
+; TLSDESC-NEXT: movq %rax, %r14
+; TLSDESC-NEXT: callq effect@PLT
+; TLSDESC-NEXT: movl 168(%r14,%rbx,4), %ecx
+; TLSDESC-NEXT: .LBB1_2: # %if.end
+; TLSDESC-NEXT: addl %ebp, %ecx
+; TLSDESC-NEXT: movl %ecx, %eax
+; TLSDESC-NEXT: popq %rbx
+; TLSDESC-NEXT: popq %r14
+; TLSDESC-NEXT: popq %rbp
+; TLSDESC-NEXT: retq
+entry:
+ %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo_nonlocal)
+ %load0 = load i32, ptr %addr, align 4
+ %cond = icmp eq i32 %arg0, 0
+ br i1 %cond, label %if.then, label %if.end
+
+if.then:
+ tail call void @effect()
+ %x = add i64 %arg1, 42
+ %addr1 = getelementptr inbounds i32, ptr %addr, i64 %x
+ %load1 = load i32, ptr %addr1, align 4
+ br label %if.end
+
+if.end:
+ %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
+ %ret = add i32 %phi, %load0
+ ret i32 %ret
+}
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll
new file mode 100644
index 00000000000000..080c807cbad13f
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' %s | FileCheck %s
+
+target triple = "x86_64--linux-gnu"
+
+@foo = dso_local thread_local(localexec) global i32 0, align 4
+
+declare void @effect()
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
+
+define i32 @func0(i32 %arg) {
+; CHECK-LABEL: define i32 @func0(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ADDR:%.*]] = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
+; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[ADDR]], align 4
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[ARG]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: tail call void @effect()
+; CHECK-NEXT: [[TMP0:%.*]] = call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @foo)
+; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[TMP0]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[LOAD1]], [[IF_THEN]] ], [ [[LOAD0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[RET:%.*]] = add i32 [[PHI]], [[LOAD0]]
+; CHECK-NEXT: ret i32 [[RET]]
+;
+entry:
+ %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
+ %load0 = load i32, ptr %addr, align 4
+ %cond = icmp eq i32 %arg, 0
+ br i1 %cond, label %if.then, label %if.end
+
+if.then:
+ tail call void @effect()
+ %load1 = load i32, ptr %addr, align 4
+ br label %if.end
+
+if.end:
+ %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
+ %ret = add i32 %phi, %load0
+ ret i32 %ret
+}
+
+define i32 @func1(i32 %arg0, i32 %arg1) {
+; CHECK-LABEL: define i32 @func1(
+; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ADDR:%.*]] = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
+; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[ADDR]], align 4
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[ARG0]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: tail call void @effect()
+; CHECK-NEXT: [[X:%.*]] = add i32 [[ARG1]], 42
+; CHECK-NEXT: [[X64:%.*]] = sext i32 [[X]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @foo)
+; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[X64]], 4
+; CHECK-NEXT: [[ADDR1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 [[SUNKADDR]]
+; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[ADDR1]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[LOAD1]], [[IF_THEN]] ], [ [[LOAD0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[RET:%.*]] = add i32 [[PHI]], [[LOAD0]]
+; CHECK-NEXT: ret i32 [[RET]]
+;
+entry:
+ %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
+ %load0 = load i32, ptr %addr, align 4
+ %cond = icmp eq i32 %arg0, 0
+ br i1 %cond, label %if.then, label %if.end
+
+if.then:
+ tail call void @effect()
+ %x = add i32 %arg1, 42
+ %x64 = sext i32 %x to i64
+ %addr1 = getelementptr inbounds i32, ptr %addr, i64 %x64
+ %load1 = load i32, ptr %addr1, align 4
+ br label %if.end
+
+if.end:
+ %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
+ %ret = add i32 %phi, %load0
+ ret i32 %ret
+}