[llvm] [AIX][TLS] Produce a faster local-exec access sequence for the "aix-small-tls" global variable attribute (PR #83053)

Amy Kwan via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 20 06:08:13 PDT 2024


https://github.com/amy-kwan updated https://github.com/llvm/llvm-project/pull/83053

>From 8991522f386d4d2e50f630aa2f1a353bd63a01c3 Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1 at ibm.com>
Date: Sat, 24 Feb 2024 12:55:26 -0600
Subject: [PATCH 1/2] [AIX][TLS] Produce a faster local-exec access sequence
 for the "aix-small-tls" global variable attribute

Similar to 3f46e5453d9310b15d974e876f6132e3cf50c4b1, this patch allows the
backend to produce a faster access sequence for the local-exec TLS model,
where loading from the TOC can be avoided, for local-exec TLS variables that
are annotated with the "aix-small-tls" attribute.

Local-exec TLS variables are expected to be given this attribute through PGO.
Furthermore, the optimized access sequence is generated for a local-exec TLS
variable annotated with "aix-small-tls" only if the variable is less than
~32KB in size.
---
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp   |  34 ++-
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |  21 +-
 .../aix-small-tls-globalvarattr-funcattr.ll   | 197 ++++++++++++++
 .../aix-small-tls-globalvarattr-loadaddr.ll   | 251 ++++++++++++++++++
 .../aix-small-tls-globalvarattr-targetattr.ll | 104 ++++++++
 5 files changed, 593 insertions(+), 14 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-funcattr.ll
 create mode 100644 llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-loadaddr.ll
 create mode 100644 llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-targetattr.ll

diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 9e5f0b36616d1b..05f5d6ba7007a6 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -7573,6 +7573,22 @@ static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) {
   DAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), N->getOperand(0));
 }
 
+// Check if an SDValue has the 'aix-small-tls' global variable attribute.
+static bool hasAIXSmallTLSAttr(SDValue Val) {
+  GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Val);
+  if (!GA)
+    return false;
+
+  const GlobalVariable *GV = dyn_cast<GlobalVariable>(GA->getGlobal());
+  if (!GV)
+    return false;
+
+  if (!GV->hasAttribute("aix-small-tls"))
+    return false;
+
+  return true;
+}
+
 // Is an ADDI eligible for folding for non-TOC-based local-exec accesses?
 static bool isEligibleToFoldADDIForLocalExecAccesses(SelectionDAG *DAG,
                                                      SDValue ADDIToFold) {
@@ -7582,20 +7598,25 @@ static bool isEligibleToFoldADDIForLocalExecAccesses(SelectionDAG *DAG,
       (ADDIToFold.getMachineOpcode() != PPC::ADDI8))
     return false;
 
+  // Folding is only allowed for the AIX small-local-exec TLS target attribute
+  // or when the 'aix-small-tls' global variable attribute is present.
+  const PPCSubtarget &Subtarget =
+      DAG->getMachineFunction().getSubtarget<PPCSubtarget>();
+  SDValue TLSVarNode = ADDIToFold.getOperand(1);
+  if (!(Subtarget.hasAIXSmallLocalExecTLS() || hasAIXSmallTLSAttr(TLSVarNode)))
+    return false;
+
   // The first operand of the ADDIToFold should be the thread pointer.
   // This transformation is only performed if the first operand of the
   // addi is the thread pointer.
   SDValue TPRegNode = ADDIToFold.getOperand(0);
   RegisterSDNode *TPReg = dyn_cast<RegisterSDNode>(TPRegNode.getNode());
-  const PPCSubtarget &Subtarget =
-      DAG->getMachineFunction().getSubtarget<PPCSubtarget>();
   if (!TPReg || (TPReg->getReg() != Subtarget.getThreadPointerRegister()))
     return false;
 
   // The second operand of the ADDIToFold should be the global TLS address
   // (the local-exec TLS variable). We only perform the folding if the TLS
   // variable is the second operand.
-  SDValue TLSVarNode = ADDIToFold.getOperand(1);
   GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(TLSVarNode);
   if (!GA)
     return false;
@@ -7664,7 +7685,6 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
 
 void PPCDAGToDAGISel::PeepholePPC64() {
   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
-  bool HasAIXSmallLocalExecTLS = Subtarget->hasAIXSmallLocalExecTLS();
 
   while (Position != CurDAG->allnodes_begin()) {
     SDNode *N = &*--Position;
@@ -7676,8 +7696,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
       reduceVSXSwap(N, CurDAG);
 
     // This optimization is performed for non-TOC-based local-exec accesses.
-    if (HasAIXSmallLocalExecTLS)
-      foldADDIForLocalExecAccesses(N, CurDAG);
+    foldADDIForLocalExecAccesses(N, CurDAG);
 
     unsigned FirstOp;
     unsigned StorageOpcode = N->getMachineOpcode();
@@ -7836,8 +7855,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
                                             ImmOpnd.getValueType());
       } else if (Offset != 0) {
         // This optimization is performed for non-TOC-based local-exec accesses.
-        if (HasAIXSmallLocalExecTLS &&
-            isEligibleToFoldADDIForLocalExecAccesses(CurDAG, Base)) {
+        if (isEligibleToFoldADDIForLocalExecAccesses(CurDAG, Base)) {
           // Add the non-zero offset information into the load or store
           // instruction to be used for non-TOC-based local-exec accesses.
           GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 51becf1d5b8584..128cfa79ff95e4 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -3365,6 +3365,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   bool Is64Bit = Subtarget.isPPC64();
   bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS();
+  bool HasAIXSmallTLSGlobalAttr = false;
   TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
   bool IsTLSLocalExecModel = Model == TLSModel::LocalExec;
 
@@ -3373,6 +3374,11 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
         DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_FLAG);
     SDValue VariableOffset = getTOCEntry(DAG, dl, VariableOffsetTGA);
     SDValue TLSReg;
+
+    if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+      if (GVar->hasAttribute("aix-small-tls"))
+        HasAIXSmallTLSGlobalAttr = true;
+
     if (Is64Bit) {
       // For local-exec and initial-exec on AIX (64-bit), the sequence generated
       // involves a load of the variable offset (from the TOC), followed by an
@@ -3382,14 +3388,16 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
       //    add reg2, reg1, r13     // r13 contains the thread pointer
       TLSReg = DAG.getRegister(PPC::X13, MVT::i64);
 
-      // With the -maix-small-local-exec-tls option, produce a faster access
-      // sequence for local-exec TLS variables where the offset from the TLS
-      // base is encoded as an immediate operand.
+      // With the -maix-small-local-exec-tls option, or with the "aix-small-tls"
+      // global variable attribute, produce a faster access sequence for
+      // local-exec TLS variables where the offset from the TLS base is encoded
+      // as an immediate operand.
       //
       // We only utilize the faster local-exec access sequence when the TLS
       // variable has a size within the policy limit. We treat types that are
       // not sized or are empty as being over the policy size limit.
-      if (HasAIXSmallLocalExecTLS && IsTLSLocalExecModel) {
+      if ((HasAIXSmallLocalExecTLS || HasAIXSmallTLSGlobalAttr) &&
+          IsTLSLocalExecModel) {
         Type *GVType = GV->getValueType();
         if (GVType->isSized() && !GVType->isEmptyTy() &&
             GV->getParent()->getDataLayout().getTypeAllocSize(GVType) <=
@@ -3407,8 +3415,9 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
       TLSReg = DAG.getNode(PPCISD::GET_TPOINTER, dl, PtrVT);
 
       // We do not implement the 32-bit version of the faster access sequence
-      // for local-exec that is controlled by -maix-small-local-exec-tls.
-      if (HasAIXSmallLocalExecTLS)
+      // for local-exec that is controlled by the -maix-small-local-exec-tls
+      // option, or the "aix-small-tls" global variable attribute.
+      if (HasAIXSmallLocalExecTLS || HasAIXSmallTLSGlobalAttr)
         report_fatal_error("The small-local-exec TLS access sequence is "
                            "currently only supported on AIX (64-bit mode).");
     }
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-funcattr.ll b/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-funcattr.ll
new file mode 100644
index 00000000000000..55e486876e3373
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-funcattr.ll
@@ -0,0 +1,197 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff < %s \
+; RUN:      | FileCheck %s --check-prefix=CHECK-SMALLCM64
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff --code-model=large \
+; RUN:      < %s | FileCheck %s --check-prefix=CHECK-LARGECM64
+
+@mySmallLocalExecTLS6 = external thread_local(localexec) global [60 x i64], align 8
+@mySmallLocalExecTLS2 = external thread_local(localexec) global [3000 x i64], align 8 #0
+@MyTLSGDVar = thread_local global [800 x i64] zeroinitializer, align 8
+@mySmallLocalExecTLS3 = internal thread_local(localexec) global [3000 x i64] zeroinitializer, align 8
+@mySmallLocalExecTLS4 = internal thread_local(localexec) global [3000 x i64] zeroinitializer, align 8 #0
+@mySmallLocalExecTLS5 = thread_local(localexec) global [3000 x i64] zeroinitializer, align 8 #0
+@mySmallLocalExecTLS = thread_local(localexec) local_unnamed_addr global [7800 x i64] zeroinitializer, align 8 #0
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
+
+; All accesses use a "faster" local-exec sequence directly off the thread pointer.
+define i64 @StoreLargeAccess1() #1 {
+; CHECK-SMALLCM64-LABEL: StoreLargeAccess1:
+; CHECK-SMALLCM64:       # %bb.0: # %entry
+; CHECK-SMALLCM64-NEXT:    mflr r0
+; CHECK-SMALLCM64-NEXT:    stdu r1, -48(r1)
+; CHECK-SMALLCM64-NEXT:    li r3, 212
+; CHECK-SMALLCM64-NEXT:    li r4, 203
+; CHECK-SMALLCM64-NEXT:    std r0, 64(r1)
+; CHECK-SMALLCM64-NEXT:    std r3, mySmallLocalExecTLS6[UL]@le+424(r13)
+; CHECK-SMALLCM64-NEXT:    std r4, mySmallLocalExecTLS2[UL]@le+1200(r13)
+; CHECK-SMALLCM64-NEXT:    ld r3, L..C0(r2) # target-flags(ppc-tlsgdm) @MyTLSGDVar
+; CHECK-SMALLCM64-NEXT:    ld r4, L..C1(r2) # target-flags(ppc-tlsgd) @MyTLSGDVar
+; CHECK-SMALLCM64-NEXT:    bla .__tls_get_addr[PR]
+; CHECK-SMALLCM64-NEXT:    li r4, 44
+; CHECK-SMALLCM64-NEXT:    std r4, 440(r3)
+; CHECK-SMALLCM64-NEXT:    li r3, 6
+; CHECK-SMALLCM64-NEXT:    li r4, 100
+; CHECK-SMALLCM64-NEXT:    std r3, mySmallLocalExecTLS3[UL]@le+2000(r13)
+; CHECK-SMALLCM64-NEXT:    li r3, 882
+; CHECK-SMALLCM64-NEXT:    std r4, (mySmallLocalExecTLS4[UL]@le+6800)-65536(r13)
+; CHECK-SMALLCM64-NEXT:    std r3, (mySmallLocalExecTLS5[TL]@le+8400)-65536(r13)
+; CHECK-SMALLCM64-NEXT:    li r3, 1191
+; CHECK-SMALLCM64-NEXT:    addi r1, r1, 48
+; CHECK-SMALLCM64-NEXT:    ld r0, 16(r1)
+; CHECK-SMALLCM64-NEXT:    mtlr r0
+; CHECK-SMALLCM64-NEXT:    blr
+;
+; CHECK-LARGECM64-LABEL: StoreLargeAccess1:
+; CHECK-LARGECM64:       # %bb.0: # %entry
+; CHECK-LARGECM64-NEXT:    mflr r0
+; CHECK-LARGECM64-NEXT:    stdu r1, -48(r1)
+; CHECK-LARGECM64-NEXT:    li r3, 212
+; CHECK-LARGECM64-NEXT:    std r0, 64(r1)
+; CHECK-LARGECM64-NEXT:    addis r4, L..C0@u(r2)
+; CHECK-LARGECM64-NEXT:    ld r4, L..C0@l(r4)
+; CHECK-LARGECM64-NEXT:    std r3, mySmallLocalExecTLS6[UL]@le+424(r13)
+; CHECK-LARGECM64-NEXT:    li r3, 203
+; CHECK-LARGECM64-NEXT:    std r3, mySmallLocalExecTLS2[UL]@le+1200(r13)
+; CHECK-LARGECM64-NEXT:    addis r3, L..C1@u(r2)
+; CHECK-LARGECM64-NEXT:    ld r3, L..C1@l(r3)
+; CHECK-LARGECM64-NEXT:    bla .__tls_get_addr[PR]
+; CHECK-LARGECM64-NEXT:    li r4, 44
+; CHECK-LARGECM64-NEXT:    std r4, 440(r3)
+; CHECK-LARGECM64-NEXT:    li r3, 6
+; CHECK-LARGECM64-NEXT:    li r4, 100
+; CHECK-LARGECM64-NEXT:    std r3, mySmallLocalExecTLS3[UL]@le+2000(r13)
+; CHECK-LARGECM64-NEXT:    li r3, 882
+; CHECK-LARGECM64-NEXT:    std r4, (mySmallLocalExecTLS4[UL]@le+6800)-65536(r13)
+; CHECK-LARGECM64-NEXT:    std r3, (mySmallLocalExecTLS5[TL]@le+8400)-65536(r13)
+; CHECK-LARGECM64-NEXT:    li r3, 1191
+; CHECK-LARGECM64-NEXT:    addi r1, r1, 48
+; CHECK-LARGECM64-NEXT:    ld r0, 16(r1)
+; CHECK-LARGECM64-NEXT:    mtlr r0
+; CHECK-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS6)
+  %arrayidx = getelementptr inbounds [60 x i64], ptr %0, i64 0, i64 53
+  store i64 212, ptr %arrayidx, align 8
+  %1 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS2)
+  %arrayidx1 = getelementptr inbounds [3000 x i64], ptr %1, i64 0, i64 150
+  store i64 203, ptr %arrayidx1, align 8
+  %2 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @MyTLSGDVar)
+  %arrayidx2 = getelementptr inbounds [800 x i64], ptr %2, i64 0, i64 55
+  store i64 44, ptr %arrayidx2, align 8
+  %3 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS3)
+  %arrayidx3 = getelementptr inbounds [3000 x i64], ptr %3, i64 0, i64 250
+  store i64 6, ptr %arrayidx3, align 8
+  %4 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS4)
+  %arrayidx4 = getelementptr inbounds [3000 x i64], ptr %4, i64 0, i64 850
+  store i64 100, ptr %arrayidx4, align 8
+  %5 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS5)
+  %arrayidx5 = getelementptr inbounds [3000 x i64], ptr %5, i64 0, i64 1050
+  store i64 882, ptr %arrayidx5, align 8
+  %6 = load i64, ptr %arrayidx1, align 8
+  %7 = load i64, ptr %arrayidx3, align 8
+  %8 = load i64, ptr %arrayidx4, align 8
+  %add = add i64 %6, 882
+  %add9 = add i64 %add, %7
+  %add11 = add i64 %add9, %8
+  ret i64 %add11
+}
+
+; Since this function does not have the 'aix-small-local-exec-tls' attribute,
+; only some local-exec variables should have the small-local-exec TLS access
+; sequence (as opposed to all of them).
+define i64 @StoreLargeAccess2() {
+; CHECK-SMALLCM64-LABEL: StoreLargeAccess2:
+; CHECK-SMALLCM64:       # %bb.0: # %entry
+; CHECK-SMALLCM64-NEXT:    mflr r0
+; CHECK-SMALLCM64-NEXT:    stdu r1, -48(r1)
+; CHECK-SMALLCM64-NEXT:    ld r3, L..C2(r2) # target-flags(ppc-tprel) @mySmallLocalExecTLS6
+; CHECK-SMALLCM64-NEXT:    li r4, 212
+; CHECK-SMALLCM64-NEXT:    std r0, 64(r1)
+; CHECK-SMALLCM64-NEXT:    add r3, r13, r3
+; CHECK-SMALLCM64-NEXT:    std r4, 424(r3)
+; CHECK-SMALLCM64-NEXT:    ld r4, L..C1(r2) # target-flags(ppc-tlsgd) @MyTLSGDVar
+; CHECK-SMALLCM64-NEXT:    li r3, 203
+; CHECK-SMALLCM64-NEXT:    std r3, mySmallLocalExecTLS2[UL]@le+1200(r13)
+; CHECK-SMALLCM64-NEXT:    ld r3, L..C0(r2) # target-flags(ppc-tlsgdm) @MyTLSGDVar
+; CHECK-SMALLCM64-NEXT:    bla .__tls_get_addr[PR]
+; CHECK-SMALLCM64-NEXT:    li r4, 44
+; CHECK-SMALLCM64-NEXT:    std r4, 440(r3)
+; CHECK-SMALLCM64-NEXT:    ld r3, L..C3(r2) # target-flags(ppc-tprel) @mySmallLocalExecTLS3
+; CHECK-SMALLCM64-NEXT:    li r4, 6
+; CHECK-SMALLCM64-NEXT:    add r3, r13, r3
+; CHECK-SMALLCM64-NEXT:    std r4, 2000(r3)
+; CHECK-SMALLCM64-NEXT:    li r3, 100
+; CHECK-SMALLCM64-NEXT:    li r4, 882
+; CHECK-SMALLCM64-NEXT:    std r3, mySmallLocalExecTLS4[UL]@le+6800(r13)
+; CHECK-SMALLCM64-NEXT:    std r4, mySmallLocalExecTLS5[TL]@le+8400(r13)
+; CHECK-SMALLCM64-NEXT:    li r3, 1191
+; CHECK-SMALLCM64-NEXT:    addi r1, r1, 48
+; CHECK-SMALLCM64-NEXT:    ld r0, 16(r1)
+; CHECK-SMALLCM64-NEXT:    mtlr r0
+; CHECK-SMALLCM64-NEXT:    blr
+;
+; CHECK-LARGECM64-LABEL: StoreLargeAccess2:
+; CHECK-LARGECM64:       # %bb.0: # %entry
+; CHECK-LARGECM64-NEXT:    mflr r0
+; CHECK-LARGECM64-NEXT:    stdu r1, -48(r1)
+; CHECK-LARGECM64-NEXT:    addis r3, L..C2@u(r2)
+; CHECK-LARGECM64-NEXT:    li r4, 212
+; CHECK-LARGECM64-NEXT:    std r0, 64(r1)
+; CHECK-LARGECM64-NEXT:    ld r3, L..C2@l(r3)
+; CHECK-LARGECM64-NEXT:    add r3, r13, r3
+; CHECK-LARGECM64-NEXT:    std r4, 424(r3)
+; CHECK-LARGECM64-NEXT:    li r3, 203
+; CHECK-LARGECM64-NEXT:    addis r4, L..C0@u(r2)
+; CHECK-LARGECM64-NEXT:    ld r4, L..C0@l(r4)
+; CHECK-LARGECM64-NEXT:    std r3, mySmallLocalExecTLS2[UL]@le+1200(r13)
+; CHECK-LARGECM64-NEXT:    addis r3, L..C1@u(r2)
+; CHECK-LARGECM64-NEXT:    ld r3, L..C1@l(r3)
+; CHECK-LARGECM64-NEXT:    bla .__tls_get_addr[PR]
+; CHECK-LARGECM64-NEXT:    li r4, 44
+; CHECK-LARGECM64-NEXT:    std r4, 440(r3)
+; CHECK-LARGECM64-NEXT:    addis r3, L..C3@u(r2)
+; CHECK-LARGECM64-NEXT:    li r4, 6
+; CHECK-LARGECM64-NEXT:    ld r3, L..C3@l(r3)
+; CHECK-LARGECM64-NEXT:    add r3, r13, r3
+; CHECK-LARGECM64-NEXT:    std r4, 2000(r3)
+; CHECK-LARGECM64-NEXT:    li r3, 100
+; CHECK-LARGECM64-NEXT:    li r4, 882
+; CHECK-LARGECM64-NEXT:    std r3, mySmallLocalExecTLS4[UL]@le+6800(r13)
+; CHECK-LARGECM64-NEXT:    std r4, mySmallLocalExecTLS5[TL]@le+8400(r13)
+; CHECK-LARGECM64-NEXT:    li r3, 1191
+; CHECK-LARGECM64-NEXT:    addi r1, r1, 48
+; CHECK-LARGECM64-NEXT:    ld r0, 16(r1)
+; CHECK-LARGECM64-NEXT:    mtlr r0
+; CHECK-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS6)
+  %arrayidx = getelementptr inbounds [60 x i64], ptr %0, i64 0, i64 53
+  store i64 212, ptr %arrayidx, align 8
+  %1 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS2)
+  %arrayidx1 = getelementptr inbounds [3000 x i64], ptr %1, i64 0, i64 150
+  store i64 203, ptr %arrayidx1, align 8
+  %2 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @MyTLSGDVar)
+  %arrayidx2 = getelementptr inbounds [800 x i64], ptr %2, i64 0, i64 55
+  store i64 44, ptr %arrayidx2, align 8
+  %3 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS3)
+  %arrayidx3 = getelementptr inbounds [3000 x i64], ptr %3, i64 0, i64 250
+  store i64 6, ptr %arrayidx3, align 8
+  %4 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS4)
+  %arrayidx4 = getelementptr inbounds [3000 x i64], ptr %4, i64 0, i64 850
+  store i64 100, ptr %arrayidx4, align 8
+  %5 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS5)
+  %arrayidx5 = getelementptr inbounds [3000 x i64], ptr %5, i64 0, i64 1050
+  store i64 882, ptr %arrayidx5, align 8
+  %6 = load i64, ptr %arrayidx1, align 8
+  %7 = load i64, ptr %arrayidx3, align 8
+  %8 = load i64, ptr %arrayidx4, align 8
+  %add = add i64 %6, 882
+  %add9 = add i64 %add, %7
+  %add11 = add i64 %add9, %8
+  ret i64 %add11
+}
+
+attributes #0 = { "aix-small-tls" }
+attributes #1 = { "target-features"="+aix-small-local-exec-tls" }
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-loadaddr.ll b/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-loadaddr.ll
new file mode 100644
index 00000000000000..db4266958daff1
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-loadaddr.ll
@@ -0,0 +1,251 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff < %s \
+; RUN:      | FileCheck %s --check-prefix=SMALLCM64
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff --code-model=large \
+; RUN:      < %s | FileCheck %s --check-prefix=LARGECM64
+
+; Test that the 'aix-small-tls' global variable attribute generates the
+; optimized small-local-exec TLS sequence. Global variables without this
+; attribute should still generate a TOC-based local-exec access sequence.
+
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
+
+@a = thread_local(localexec) global [87 x i8] zeroinitializer, align 1 #0
+@a_noattr = thread_local(localexec) global [87 x i8] zeroinitializer, align 1
+@b = thread_local(localexec) global [87 x i16] zeroinitializer, align 2 #0
+@b_noattr = thread_local(localexec) global [87 x i16] zeroinitializer, align 2
+@c = thread_local(localexec) global [87 x i32] zeroinitializer, align 4 #0
+@c_noattr = thread_local(localexec) global [87 x i32] zeroinitializer, align 4
+@d = thread_local(localexec) global [87 x i64] zeroinitializer, align 8 #0
+@d_noattr = thread_local(localexec) global [87 x i64] zeroinitializer, align 8 #0
+
+@e = thread_local(localexec) global [87 x double] zeroinitializer, align 8 #0
+@e_noattr = thread_local(localexec) global [87 x double] zeroinitializer, align 8
+@f = thread_local(localexec) global [87 x float] zeroinitializer, align 4 #0
+@f_noattr = thread_local(localexec) global [87 x float] zeroinitializer, align 4
+
+define nonnull ptr @AddrTest1() local_unnamed_addr {
+; SMALLCM64-LABEL: AddrTest1:
+; SMALLCM64:       # %bb.0: # %entry
+; SMALLCM64-NEXT:    addi r3, r13, a[TL]@le+1
+; SMALLCM64-NEXT:    blr
+;
+; LARGECM64-LABEL: AddrTest1:
+; LARGECM64:       # %bb.0: # %entry
+; LARGECM64-NEXT:    addi r3, r13, a[TL]@le+1
+; LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 1 ptr @llvm.threadlocal.address.p0(ptr align 1 @a)
+  %arrayidx = getelementptr inbounds [87 x i8], ptr %0, i64 0, i64 1
+  ret ptr %arrayidx
+}
+
+define nonnull ptr @AddrTest1_NoAttr() local_unnamed_addr {
+; SMALLCM64-LABEL: AddrTest1_NoAttr:
+; SMALLCM64:       # %bb.0: # %entry
+; SMALLCM64-NEXT:    ld r3, L..C0(r2) # target-flags(ppc-tprel) @a_noattr
+; SMALLCM64-NEXT:    add r3, r13, r3
+; SMALLCM64-NEXT:    addi r3, r3, 1
+; SMALLCM64-NEXT:    blr
+;
+; LARGECM64-LABEL: AddrTest1_NoAttr:
+; LARGECM64:       # %bb.0: # %entry
+; LARGECM64-NEXT:    addis r3, L..C0@u(r2)
+; LARGECM64-NEXT:    ld r3, L..C0@l(r3)
+; LARGECM64-NEXT:    add r3, r13, r3
+; LARGECM64-NEXT:    addi r3, r3, 1
+; LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 1 ptr @llvm.threadlocal.address.p0(ptr align 1 @a_noattr)
+  %arrayidx = getelementptr inbounds [87 x i8], ptr %0, i64 0, i64 1
+  ret ptr %arrayidx
+}
+
+define nonnull ptr @AddrTest2() local_unnamed_addr {
+; SMALLCM64-LABEL: AddrTest2:
+; SMALLCM64:       # %bb.0: # %entry
+; SMALLCM64-NEXT:    addi r3, r13, b[TL]@le+4
+; SMALLCM64-NEXT:    blr
+;
+; LARGECM64-LABEL: AddrTest2:
+; LARGECM64:       # %bb.0: # %entry
+; LARGECM64-NEXT:    addi r3, r13, b[TL]@le+4
+; LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 2 ptr @llvm.threadlocal.address.p0(ptr align 2 @b)
+  %arrayidx = getelementptr inbounds [87 x i16], ptr %0, i64 0, i64 2
+  ret ptr %arrayidx
+}
+
+define nonnull ptr @AddrTest2_NoAttr() local_unnamed_addr {
+; SMALLCM64-LABEL: AddrTest2_NoAttr:
+; SMALLCM64:       # %bb.0: # %entry
+; SMALLCM64-NEXT:    ld r3, L..C1(r2) # target-flags(ppc-tprel) @b_noattr
+; SMALLCM64-NEXT:    add r3, r13, r3
+; SMALLCM64-NEXT:    addi r3, r3, 4
+; SMALLCM64-NEXT:    blr
+;
+; LARGECM64-LABEL: AddrTest2_NoAttr:
+; LARGECM64:       # %bb.0: # %entry
+; LARGECM64-NEXT:    addis r3, L..C1@u(r2)
+; LARGECM64-NEXT:    ld r3, L..C1@l(r3)
+; LARGECM64-NEXT:    add r3, r13, r3
+; LARGECM64-NEXT:    addi r3, r3, 4
+; LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 2 ptr @llvm.threadlocal.address.p0(ptr align 2 @b_noattr)
+  %arrayidx = getelementptr inbounds [87 x i16], ptr %0, i64 0, i64 2
+  ret ptr %arrayidx
+}
+
+define nonnull ptr @AddrTest3() local_unnamed_addr {
+; SMALLCM64-LABEL: AddrTest3:
+; SMALLCM64:       # %bb.0: # %entry
+; SMALLCM64-NEXT:    addi r3, r13, c[TL]@le+12
+; SMALLCM64-NEXT:    blr
+;
+; LARGECM64-LABEL: AddrTest3:
+; LARGECM64:       # %bb.0: # %entry
+; LARGECM64-NEXT:    addi r3, r13, c[TL]@le+12
+; LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @c)
+  %arrayidx = getelementptr inbounds [87 x i32], ptr %0, i64 0, i64 3
+  ret ptr %arrayidx
+}
+
+define nonnull ptr @AddrTest3_NoAttr() local_unnamed_addr {
+; SMALLCM64-LABEL: AddrTest3_NoAttr:
+; SMALLCM64:       # %bb.0: # %entry
+; SMALLCM64-NEXT:    ld r3, L..C2(r2) # target-flags(ppc-tprel) @c_noattr
+; SMALLCM64-NEXT:    add r3, r13, r3
+; SMALLCM64-NEXT:    addi r3, r3, 12
+; SMALLCM64-NEXT:    blr
+;
+; LARGECM64-LABEL: AddrTest3_NoAttr:
+; LARGECM64:       # %bb.0: # %entry
+; LARGECM64-NEXT:    addis r3, L..C2@u(r2)
+; LARGECM64-NEXT:    ld r3, L..C2@l(r3)
+; LARGECM64-NEXT:    add r3, r13, r3
+; LARGECM64-NEXT:    addi r3, r3, 12
+; LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @c_noattr)
+  %arrayidx = getelementptr inbounds [87 x i32], ptr %0, i64 0, i64 3
+  ret ptr %arrayidx
+}
+
+define nonnull ptr @AddrTest4() local_unnamed_addr {
+; SMALLCM64-LABEL: AddrTest4:
+; SMALLCM64:       # %bb.0: # %entry
+; SMALLCM64-NEXT:    addi r3, r13, c[TL]@le+56
+; SMALLCM64-NEXT:    blr
+;
+; LARGECM64-LABEL: AddrTest4:
+; LARGECM64:       # %bb.0: # %entry
+; LARGECM64-NEXT:    addi r3, r13, c[TL]@le+56
+; LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @c)
+  %arrayidx = getelementptr inbounds [87 x i64], ptr %0, i64 0, i64 7
+  ret ptr %arrayidx
+}
+
+define nonnull ptr @AddrTest4_NoAttr() local_unnamed_addr {
+; SMALLCM64-LABEL: AddrTest4_NoAttr:
+; SMALLCM64:       # %bb.0: # %entry
+; SMALLCM64-NEXT:    ld r3, L..C2(r2) # target-flags(ppc-tprel) @c_noattr
+; SMALLCM64-NEXT:    add r3, r13, r3
+; SMALLCM64-NEXT:    addi r3, r3, 56
+; SMALLCM64-NEXT:    blr
+;
+; LARGECM64-LABEL: AddrTest4_NoAttr:
+; LARGECM64:       # %bb.0: # %entry
+; LARGECM64-NEXT:    addis r3, L..C2@u(r2)
+; LARGECM64-NEXT:    ld r3, L..C2@l(r3)
+; LARGECM64-NEXT:    add r3, r13, r3
+; LARGECM64-NEXT:    addi r3, r3, 56
+; LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @c_noattr)
+  %arrayidx = getelementptr inbounds [87 x i64], ptr %0, i64 0, i64 7
+  ret ptr %arrayidx
+}
+
+define nonnull ptr @AddrTest5() local_unnamed_addr {
+; SMALLCM64-LABEL: AddrTest5:
+; SMALLCM64:       # %bb.0: # %entry
+; SMALLCM64-NEXT:    addi r3, r13, e[TL]@le+48
+; SMALLCM64-NEXT:    blr
+;
+; LARGECM64-LABEL: AddrTest5:
+; LARGECM64:       # %bb.0: # %entry
+; LARGECM64-NEXT:    addi r3, r13, e[TL]@le+48
+; LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @e)
+  %arrayidx = getelementptr inbounds [87 x double], ptr %0, i64 0, i64 6
+  ret ptr %arrayidx
+}
+
+define nonnull ptr @AddrTest5_NoAttr() local_unnamed_addr {
+; SMALLCM64-LABEL: AddrTest5_NoAttr:
+; SMALLCM64:       # %bb.0: # %entry
+; SMALLCM64-NEXT:    ld r3, L..C3(r2) # target-flags(ppc-tprel) @e_noattr
+; SMALLCM64-NEXT:    add r3, r13, r3
+; SMALLCM64-NEXT:    addi r3, r3, 48
+; SMALLCM64-NEXT:    blr
+;
+; LARGECM64-LABEL: AddrTest5_NoAttr:
+; LARGECM64:       # %bb.0: # %entry
+; LARGECM64-NEXT:    addis r3, L..C3@u(r2)
+; LARGECM64-NEXT:    ld r3, L..C3@l(r3)
+; LARGECM64-NEXT:    add r3, r13, r3
+; LARGECM64-NEXT:    addi r3, r3, 48
+; LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @e_noattr)
+  %arrayidx = getelementptr inbounds [87 x double], ptr %0, i64 0, i64 6
+  ret ptr %arrayidx
+}
+
+define nonnull ptr @AddrTest6() local_unnamed_addr {
+; SMALLCM64-LABEL: AddrTest6:
+; SMALLCM64:       # %bb.0: # %entry
+; SMALLCM64-NEXT:    addi r3, r13, f[TL]@le+16
+; SMALLCM64-NEXT:    blr
+;
+; LARGECM64-LABEL: AddrTest6:
+; LARGECM64:       # %bb.0: # %entry
+; LARGECM64-NEXT:    addi r3, r13, f[TL]@le+16
+; LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @f)
+  %arrayidx = getelementptr inbounds [87 x float], ptr %0, i64 0, i64 4
+  ret ptr %arrayidx
+}
+
+define nonnull ptr @AddrTest6_NoAttr() local_unnamed_addr {
+; SMALLCM64-LABEL: AddrTest6_NoAttr:
+; SMALLCM64:       # %bb.0: # %entry
+; SMALLCM64-NEXT:    ld r3, L..C4(r2) # target-flags(ppc-tprel) @f_noattr
+; SMALLCM64-NEXT:    add r3, r13, r3
+; SMALLCM64-NEXT:    addi r3, r3, 16
+; SMALLCM64-NEXT:    blr
+;
+; LARGECM64-LABEL: AddrTest6_NoAttr:
+; LARGECM64:       # %bb.0: # %entry
+; LARGECM64-NEXT:    addis r3, L..C4@u(r2)
+; LARGECM64-NEXT:    ld r3, L..C4@l(r3)
+; LARGECM64-NEXT:    add r3, r13, r3
+; LARGECM64-NEXT:    addi r3, r3, 16
+; LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @f_noattr)
+  %arrayidx = getelementptr inbounds [87 x float], ptr %0, i64 0, i64 4
+  ret ptr %arrayidx
+}
+
+attributes #0 = { "aix-small-tls" }
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-targetattr.ll b/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-targetattr.ll
new file mode 100644
index 00000000000000..8f617eac66ef6c
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-targetattr.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff -mattr=+aix-small-local-exec-tls < %s \
+; RUN:      | FileCheck %s --check-prefix=SMALL-LOCAL-EXEC-SMALLCM64
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff --code-model=large \
+; RUN:      -mattr=+aix-small-local-exec-tls < %s | FileCheck %s \
+; RUN:      --check-prefix=SMALL-LOCAL-EXEC-LARGECM64
+
+@mySmallLocalExecTLS6 = external thread_local(localexec) global [60 x i64], align 8
+@mySmallLocalExecTLS2 = external thread_local(localexec) global [3000 x i64], align 8 #0
+@MyTLSGDVar = thread_local global [800 x i64] zeroinitializer, align 8
+@mySmallLocalExecTLS3 = internal thread_local(localexec) global [3000 x i64] zeroinitializer, align 8
+@mySmallLocalExecTLS4 = internal thread_local(localexec) global [3000 x i64] zeroinitializer, align 8 #0
+@mySmallLocalExecTLS5 = thread_local(localexec) global [3000 x i64] zeroinitializer, align 8 #0
+@mySmallLocalExecTLS = thread_local(localexec) local_unnamed_addr global [7800 x i64] zeroinitializer, align 8 #0
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
+
+; Although only some global variables are annotated with 'aix-small-tls', the
+; aix-small-local-exec-tls target attribute is turned on, so all local-exec
+; accesses use the "faster" sequence directly off the thread pointer.
+define i64 @StoreLargeAccess1() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: StoreLargeAccess1:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    mflr r0
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stdu r1, -48(r1)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 212
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 203
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r0, 64(r1)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r3, mySmallLocalExecTLS6[UL]@le+424(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r4, mySmallLocalExecTLS2[UL]@le+1200(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # target-flags(ppc-tlsgdm) @MyTLSGDVar
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r4, L..C1(r2) # target-flags(ppc-tlsgd) @MyTLSGDVar
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    bla .__tls_get_addr[PR]
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 44
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r4, 440(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 6
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 100
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r3, mySmallLocalExecTLS3[UL]@le+2000(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 882
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r4, (mySmallLocalExecTLS4[UL]@le+6800)-65536(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r3, (mySmallLocalExecTLS5[TL]@le+8400)-65536(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 1191
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    addi r1, r1, 48
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r0, 16(r1)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    mtlr r0
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: StoreLargeAccess1:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    mflr r0
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stdu r1, -48(r1)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 212
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r0, 64(r1)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r4, L..C0@u(r2)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r4, L..C0@l(r4)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r3, mySmallLocalExecTLS6[UL]@le+424(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 203
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r3, mySmallLocalExecTLS2[UL]@le+1200(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r3, L..C1@u(r2)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, L..C1@l(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    bla .__tls_get_addr[PR]
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r4, 44
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r4, 440(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 6
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r4, 100
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r3, mySmallLocalExecTLS3[UL]@le+2000(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 882
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r4, (mySmallLocalExecTLS4[UL]@le+6800)-65536(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r3, (mySmallLocalExecTLS5[TL]@le+8400)-65536(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 1191
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addi r1, r1, 48
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r0, 16(r1)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    mtlr r0
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS6)
+  %arrayidx = getelementptr inbounds [60 x i64], ptr %0, i64 0, i64 53
+  store i64 212, ptr %arrayidx, align 8
+  %1 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS2)
+  %arrayidx1 = getelementptr inbounds [3000 x i64], ptr %1, i64 0, i64 150
+  store i64 203, ptr %arrayidx1, align 8
+  %2 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @MyTLSGDVar)
+  %arrayidx2 = getelementptr inbounds [800 x i64], ptr %2, i64 0, i64 55
+  store i64 44, ptr %arrayidx2, align 8
+  %3 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS3)
+  %arrayidx3 = getelementptr inbounds [3000 x i64], ptr %3, i64 0, i64 250
+  store i64 6, ptr %arrayidx3, align 8
+  %4 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS4)
+  %arrayidx4 = getelementptr inbounds [3000 x i64], ptr %4, i64 0, i64 850
+  store i64 100, ptr %arrayidx4, align 8
+  %5 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS5)
+  %arrayidx5 = getelementptr inbounds [3000 x i64], ptr %5, i64 0, i64 1050
+  store i64 882, ptr %arrayidx5, align 8
+  %6 = load i64, ptr %arrayidx1, align 8
+  %7 = load i64, ptr %arrayidx3, align 8
+  %8 = load i64, ptr %arrayidx4, align 8
+  %add = add i64 %6, 882
+  %add9 = add i64 %add, %7
+  %add11 = add i64 %add9, %8
+  ret i64 %add11
+}
+
+attributes #0 = { "aix-small-tls" }

>From ec013bd7ed1b10fbe73fb55a2d74860472cdbb74 Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1 at ibm.com>
Date: Wed, 20 Mar 2024 08:07:54 -0500
Subject: [PATCH 2/2] Address review comments: move variables around and
 simplify function to check for attribute

---
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 16 +++++-----------
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp |  4 ++--
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 05f5d6ba7007a6..56888ddcf9d050 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -7575,18 +7575,12 @@ static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) {
 
 // Check if an SDValue has the 'aix-small-tls' global variable attribute.
 static bool hasAIXSmallTLSAttr(SDValue Val) {
-  GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Val);
-  if (!GA)
-    return false;
-
-  const GlobalVariable *GV = dyn_cast<GlobalVariable>(GA->getGlobal());
-  if (!GV)
-    return false;
-
-  if (!GV->hasAttribute("aix-small-tls"))
-    return false;
+  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Val))
+    if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(GA->getGlobal()))
+      if (GV->hasAttribute("aix-small-tls"))
+        return true;
 
-  return true;
+  return false;
 }
 
 // Is an ADDI eligible for folding for non-TOC-based local-exec accesses?
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 128cfa79ff95e4..36c4def5e9ac36 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -3364,12 +3364,12 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
   const GlobalValue *GV = GA->getGlobal();
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   bool Is64Bit = Subtarget.isPPC64();
-  bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS();
-  bool HasAIXSmallTLSGlobalAttr = false;
   TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
   bool IsTLSLocalExecModel = Model == TLSModel::LocalExec;
 
   if (IsTLSLocalExecModel || Model == TLSModel::InitialExec) {
+    bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS();
+    bool HasAIXSmallTLSGlobalAttr = false;
     SDValue VariableOffsetTGA =
         DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_FLAG);
     SDValue VariableOffset = getTOCEntry(DAG, dl, VariableOffsetTGA);



More information about the llvm-commits mailing list