[llvm] [AIX][TLS] Optimize the small local-exec access sequence for non-zero offsets (PR #71485)

Amy Kwan via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 26 20:59:52 PST 2024


https://github.com/amy-kwan updated https://github.com/llvm/llvm-project/pull/71485

>From c1b473e61712e5b11f7d12a211cdc102fed64ced Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1 at ibm.com>
Date: Mon, 6 Nov 2023 22:19:58 -0600
Subject: [PATCH 01/16] [AIX][TLS] Optimize the -maix-small-local-exec-tls
 local-exec access sequence for non-zero offsets

This patch utilizes the -maix-small-local-exec-tls option to produce a faster,
non-TOC-based access sequence for the local-exec TLS model, specifically for
when the offsets from the TLS variable are non-zero.

In particular, this patch produces either a single:
- addi/la with a displacement off of R13 plus a non-zero offset for
  when an address is calculated, or
- load or store off of R13 plus a non-zero offset for when an address is
  calculated and used for further access
where R13 is the thread pointer.

In order to produce a single addi or load/store off of the thread pointer with
a non-zero offset, this patch also adds the necessary support in the assembly
printer when printing these instructions.

Specifically:
- The non-zero offset is added to the TLS variable address when the address of
  the TLS variable + its offset is less than 32KB.
- Otherwise, when the address of the TLS variable + its offset is greater than
  or equal to 32KB, a multiple of 64KB is subtracted from the non-zero offset
  before it is added to the TLS variable address.

This handling in the assembly printer is necessary to ensure that the TLS
address + the non-zero offset is between [-32768, 32768), so that the total
displacement can fit within the addi/load/store instructions.
---
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp     | 170 +++++++++++++-
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp   |  69 +++++-
 .../PowerPC/aix-small-local-exec-tls-char.ll  |   6 +-
 .../aix-small-local-exec-tls-double.ll        |   6 +-
 .../PowerPC/aix-small-local-exec-tls-float.ll |   6 +-
 .../PowerPC/aix-small-local-exec-tls-int.ll   |   6 +-
 .../aix-small-local-exec-tls-largeaccess.ll   | 211 ++++++++----------
 .../aix-small-local-exec-tls-largeaccess2.ll  | 160 +++++++++++++
 .../PowerPC/aix-small-local-exec-tls-short.ll |   6 +-
 9 files changed, 497 insertions(+), 143 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 780b22b4fbe65ef..63c03c93e202d9a 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -66,6 +66,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Process.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Threading.h"
@@ -155,6 +156,11 @@ class PPCAsmPrinter : public AsmPrinter {
       TOC;
   const PPCSubtarget *Subtarget = nullptr;
 
+  // Keep track of the number of TLS variables and their corresponding
+  // addresses, which is then used for the assembly printing of
+  // non-TOC-based local-exec variables.
+  MapVector<const GlobalValue *, uint64_t> TLSVarsToAddressMapping;
+
 public:
   explicit PPCAsmPrinter(TargetMachine &TM,
                          std::unique_ptr<MCStreamer> Streamer)
@@ -199,6 +205,8 @@ class PPCAsmPrinter : public AsmPrinter {
   void LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI);
   void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK);
   void EmitAIXTlsCallHelper(const MachineInstr *MI);
+  const MCExpr *getAdjustedLocalExecExpr(const MachineOperand &MO,
+                                         int64_t Offset);
   bool runOnMachineFunction(MachineFunction &MF) override {
     Subtarget = &MF.getSubtarget<PPCSubtarget>();
     bool Changed = AsmPrinter::runOnMachineFunction(MF);
@@ -1503,13 +1511,42 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
   case PPC::LWA: {
     // Verify alignment is legal, so we don't create relocations
     // that can't be supported.
-    unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
+    unsigned OpNum;
+    if (Subtarget->hasAIXSmallLocalExecTLS())
+      OpNum = 1;
+    else
+      OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
     const MachineOperand &MO = MI->getOperand(OpNum);
     if (MO.isGlobal()) {
       const DataLayout &DL = MO.getGlobal()->getParent()->getDataLayout();
       if (MO.getGlobal()->getPointerAlignment(DL) < 4)
         llvm_unreachable("Global must be word-aligned for LD, STD, LWA!");
+
+      // A faster non-TOC-based local-exec sequence is represented by
+      // `lwa`/`ld`/`std` directly loading or storing off of the thread
+      // pointer and with an immediate operand having the MO_TPREL_FLAG.
+      // Such instructions do not otherwise arise.
+      unsigned Flag = MO.getTargetFlags();
+      if (Flag == PPCII::MO_TPREL_FLAG) {
+        assert(Subtarget->hasAIXSmallLocalExecTLS() &&
+               "lwa/ld/std with thread-pointer only expected with "
+               "local-exec small TLS");
+        int64_t Offset = MO.getOffset();
+        // Non-zero offsets for lwa/ld/std require special handling and are
+        // handled here.
+        if (!Offset)
+          break;
+
+        LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
+        if (Offset) {
+          const MCExpr *Expr = getAdjustedLocalExecExpr(MO, Offset);
+          TmpInst.getOperand(1) = MCOperand::createExpr(Expr);
+        }
+        EmitToStreamer(*OutStreamer, TmpInst);
+        return;
+      }
     }
+
     // Now process the instruction normally.
     break;
   }
@@ -1523,19 +1560,58 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::EnforceIEIO));
     return;
   }
+  case PPC::LBZ:
+  case PPC::LBZ8:
+  case PPC::LHA:
+  case PPC::LHA8:
+  case PPC::LHZ:
+  case PPC::LHZ8:
+  case PPC::LWZ:
+  case PPC::LWZ8:
+  case PPC::STB:
+  case PPC::STB8:
+  case PPC::STH:
+  case PPC::STH8:
+  case PPC::STW:
+  case PPC::STW8:
+  case PPC::LFS:
+  case PPC::STFS:
+  case PPC::LFD:
+  case PPC::STFD:
   case PPC::ADDI8: {
-    // The faster non-TOC-based local-exec sequence is represented by `addi`
-    // with an immediate operand having the MO_TPREL_FLAG. Such an instruction
-    // does not otherwise arise.
-    unsigned Flag = MI->getOperand(2).getTargetFlags();
+    // A faster non-TOC-based local-exec sequence is represented by `addi`
+    // or a load/store instruction (that directly loads or stores off of the
+    // thread pointer) with an immediate operand having the MO_TPREL_FLAG.
+    // Such instructions do not otherwise arise.
+    bool IsMIADDI8 = MI->getOpcode() == PPC::ADDI8;
+    unsigned OpNum = IsMIADDI8 ? 2 : 1;
+    const MachineOperand &MO = MI->getOperand(OpNum);
+    unsigned Flag = MO.getTargetFlags();
     if (Flag == PPCII::MO_TPREL_FLAG ||
         Flag == PPCII::MO_GOT_TPREL_PCREL_FLAG ||
         Flag == PPCII::MO_TPREL_PCREL_FLAG) {
       assert(
           Subtarget->hasAIXSmallLocalExecTLS() &&
-          "addi with thread-pointer only expected with local-exec small TLS");
+          "addi, or load/stores with thread-pointer only expected with "
+          "local-exec small TLS");
+
+      int64_t Offset = MO.getOffset();
+      // Non-zero offsets for loads/stores require special handling and are
+      // handled here. For `addi`, all offsets are handled here.
+      if (!Offset && !IsMIADDI8)
+        break;
+
       LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
-      TmpInst.setOpcode(PPC::LA8);
+
+      if (Offset) {
+        const MCExpr *Expr = getAdjustedLocalExecExpr(MO, Offset);
+        TmpInst.getOperand(OpNum) = MCOperand::createExpr(Expr);
+      }
+
+      // Change the opcode to load address if the original opcode is an `addi`.
+      if (IsMIADDI8)
+        TmpInst.setOpcode(PPC::LA8);
+
       EmitToStreamer(*OutStreamer, TmpInst);
       return;
     }
@@ -1547,6 +1623,69 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
   EmitToStreamer(*OutStreamer, TmpInst);
 }
 
+// For non-TOC-based local-exec variables that have a non-zero offset,
+// we need to create a new MCExpr that adds the non-zero offset to the address
+// of the local-exec variable that will be used in either an addi, load or
+// store. However, the final displacement for these instructions must be
+// between [-32768, 32768), so if the TLS address + its non-zero offset is
+// greater than 32KB, a new MCExpr is produced to accommodate this situation.
+const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
+                                                      int64_t Offset) {
+  assert(MO.isGlobal() && "Only expecting a global MachineOperand here!");
+  const GlobalValue *GValue = MO.getGlobal();
+  TLSModel::Model Model = TM.getTLSModel(GValue);
+  assert(Model == TLSModel::LocalExec &&
+         "Only local-exec accesses are handled!");
+  MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_PPC_AIX_TLSLE;
+
+  const MCExpr *Expr = MCSymbolRefExpr::create(getSymbol(GValue), RefKind,
+                                               OutContext);
+
+  bool IsGlobalADeclaration = GValue->isDeclarationForLinker();
+  // Find the GlobalVariable that corresponds to the particular TLS variable
+  // in the TLS variable to address mapping. All TLS variables should exist
+  // within this map, with the exception of TLS variables marked as extern.
+  const auto TLSVarsMapEntryIter = TLSVarsToAddressMapping.find(GValue);
+  if (TLSVarsMapEntryIter == TLSVarsToAddressMapping.end())
+    assert(IsGlobalADeclaration &&
+           "Only expecting to find extern TLS variables not present in the TLS "
+           "variables to address map!");
+
+  unsigned TLSVarAddress = TLSVarsMapEntryIter->second;
+  ptrdiff_t FinalAddress = (TLSVarAddress + Offset);
+  // If the address of the TLS variable + the offset is less than 32KB,
+  // or if the TLS variable is extern, we simply produce an MCExpr to add the
+  // non-zero offset to the TLS variable address.
+  // For when TLS variables are extern, this is safe to do because we can
+  // assume that the address of extern TLS variables are zero.
+  if ((FinalAddress < 32768) || IsGlobalADeclaration)
+    Expr = MCBinaryExpr::createAdd(Expr,
+                                   MCConstantExpr::create(Offset, OutContext),
+                                   OutContext);
+  else {
+    // Handle the written offset for cases where:
+    //   address of the TLS variable + the offset is greater than 32KB.
+
+    // Get the address in the range of 0 to 64KB.
+    FinalAddress = FinalAddress & 0xFFFF;
+    // If the highest bit in the calculated address is set, subtract
+    // additional 64KB to ensure that the final address fits within
+    // [-32768,32768).
+    if (FinalAddress & 0x8000)
+      FinalAddress = FinalAddress - 0x10000;
+    assert((FinalAddress < 32768) || (FinalAddress >= -32768) &&
+           "Expecting the final address for local-exec TLS variables to be "
+           "between [-32768,32768)!");
+    // Get the offset that is actually written out in assembly by adding back
+    // the original address of the TLS variable.
+    ptrdiff_t WrittenOffset = FinalAddress - TLSVarAddress;
+    Expr = MCBinaryExpr::createAdd(
+        Expr, MCConstantExpr::create(WrittenOffset, OutContext), OutContext);
+  }
+
+  return Expr;
+}
+
 void PPCLinuxAsmPrinter::emitGNUAttributes(Module &M) {
   // Emit float ABI into GNU attribute
   Metadata *MD = M.getModuleFlag("float-abi");
@@ -2749,6 +2888,23 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) {
     Csect->ensureMinAlignment(GOAlign);
   };
 
+  // For all TLS variables, calculate their corresponding addresses and store
+  // them into TLSVarsToAddressMapping, which will be used to determine whether
+  // or not local-exec TLS variables require special assembly printing.
+  // This address calculation follows the same method seen within
+  // assignAddressesAndIndices() in XCOFFObjectWriter.cpp.
+  uint64_t Address = 0;
+  uint64_t TLSVarAddress = 0;
+  auto DL = M.getDataLayout();
+  for (const auto &G : M.globals()) {
+    if (G.isThreadLocal() && !G.isDeclaration()) {
+      TLSVarAddress = alignTo(Address, getGVAlignment(&G, DL));
+      unsigned GVSize = DL.getTypeAllocSize(G.getValueType());
+      Address = TLSVarAddress + GVSize;
+      TLSVarsToAddressMapping[&G] = TLSVarAddress;
+    }
+  }
+
   // We need to know, up front, the alignment of csects for the assembly path,
   // because once a .csect directive gets emitted, we could not change the
   // alignment value on it.
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 26ed74108ec36c5..427099376d72d96 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -7565,8 +7565,64 @@ static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) {
   DAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), N->getOperand(0));
 }
 
+// For non-TOC-based local-exec access where an addi is feeding into another
+// addi, fold this sequence into a single addi if possible.
+static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
+  const PPCSubtarget &Subtarget =
+      DAG->getMachineFunction().getSubtarget<PPCSubtarget>();
+  // This optimization is only performed for non-TOC-based local-exec accesses.
+  if (!Subtarget.hasAIXSmallLocalExecTLS())
+    return;
+
+  if (N->getMachineOpcode() != PPC::ADDI8)
+    return;
+
+  // InitialADDI is the addi feeding into N (also an addi), and the addi that
+  // we want optimized out.
+  SDValue InitialADDI = N->getOperand(0);
+  if (!InitialADDI.isMachineOpcode())
+    return;
+  if (InitialADDI.getMachineOpcode() != PPC::ADDI8)
+    return;
+
+  // The first operand of the InitialADDI will be the thread pointer.
+  // This transformation is only performed if the first operand of the
+  // addi is the thread pointer.
+  SDValue TPRegNode = InitialADDI.getOperand(0);
+  RegisterSDNode *TPReg =
+      dyn_cast_or_null<RegisterSDNode>(TPRegNode.getNode());
+  if (!TPReg)
+    return;
+  if (TPReg->getReg() != Subtarget.getThreadPointerRegister())
+    return;
+
+  // The second operand of the InitialADDI will be a TargetGlobalTLSAddress,
+  // (the local-exec TLS variable). We only perform the folding if the TLS
+  // variable is the second operand.
+  SDValue TLSVarNode = InitialADDI.getOperand(1);
+  GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(TLSVarNode);
+  if (!GA)
+    return;
+
+  unsigned TargetFlags = GA->getTargetFlags();
+  if ((TargetFlags & PPCII::MO_TPREL_FLAG) == 0)
+    return;
+  // The second operand of the addi that we want to preserve will be an
+  // immediate. We add this immediate together with the address of the TLS
+  // variable found in InitialADDI in order to preserve the correct TLS address
+  // information during assembly printing.
+  int Offset = N->getConstantOperandVal(1);
+  TLSVarNode = DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA), MVT::i64,
+                                           Offset, TargetFlags);
+
+  (void)DAG->UpdateNodeOperands(N, TPRegNode, TLSVarNode);
+  if (InitialADDI.getNode()->use_empty())
+    DAG->RemoveDeadNode(InitialADDI.getNode());
+}
+
 void PPCDAGToDAGISel::PeepholePPC64() {
   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
+  bool HasAIXSmallLocalExecTLS = Subtarget->hasAIXSmallLocalExecTLS();
 
   while (Position != CurDAG->allnodes_begin()) {
     SDNode *N = &*--Position;
@@ -7577,6 +7633,8 @@ void PPCDAGToDAGISel::PeepholePPC64() {
     if (isVSXSwap(SDValue(N, 0)))
       reduceVSXSwap(N, CurDAG);
 
+    foldADDIForLocalExecAccesses(N, CurDAG);
+
     unsigned FirstOp;
     unsigned StorageOpcode = N->getMachineOpcode();
     bool RequiresMod4Offset = false;
@@ -7733,7 +7791,16 @@ void PPCDAGToDAGISel::PeepholePPC64() {
         ImmOpnd = CurDAG->getTargetConstant(Offset, SDLoc(ImmOpnd),
                                             ImmOpnd.getValueType());
       } else if (Offset != 0) {
-        continue;
+        if (!HasAIXSmallLocalExecTLS)
+          continue;
+        // Add the non-zero offset information into the load or store
+        // instruction to be used for non-TOC-based local-exec accesses.
+        GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd);
+        if (!GA)
+          continue;
+        ImmOpnd = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA),
+                                                 MVT::i64, Offset,
+                                                 GA->getTargetFlags());
       }
     }
 
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-char.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-char.ll
index 6c05fb38ee16d83..c938b9485c25732 100644
--- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-char.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-char.ll
@@ -16,14 +16,12 @@ declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1
 define nonnull ptr @AddrTest1() local_unnamed_addr #0 {
 ; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: AddrTest1:
 ; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, c[TL]@le(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    addi r3, r3, 1
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, c[TL]@le+1(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
 ;
 ; SMALL-LOCAL-EXEC-LARGECM64-LABEL: AddrTest1:
 ; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, c[TL]@le(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addi r3, r3, 1
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, c[TL]@le+1(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
 entry:
   %0 = tail call align 1 ptr @llvm.threadlocal.address.p0(ptr align 1 @c)
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-double.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-double.ll
index 5cf359f68f8bd11..02d794fec75cc98 100644
--- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-double.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-double.ll
@@ -16,14 +16,12 @@ declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1
 define nonnull ptr @AddrTest1() local_unnamed_addr #0 {
 ; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: AddrTest1:
 ; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, f[TL]@le(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    addi r3, r3, 48
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, f[TL]@le+48(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
 ;
 ; SMALL-LOCAL-EXEC-LARGECM64-LABEL: AddrTest1:
 ; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, f[TL]@le(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addi r3, r3, 48
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, f[TL]@le+48(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
 entry:
   %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @f)
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-float.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-float.ll
index 1fc014edaf2bb5f..a1f6f4f974bd818 100644
--- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-float.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-float.ll
@@ -16,14 +16,12 @@ declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1
 define nonnull ptr @AddrTest1() local_unnamed_addr #0 {
 ; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: AddrTest1:
 ; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, e[TL]@le(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    addi r3, r3, 16
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, e[TL]@le+16(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
 ;
 ; SMALL-LOCAL-EXEC-LARGECM64-LABEL: AddrTest1:
 ; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, e[TL]@le(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addi r3, r3, 16
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, e[TL]@le+16(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
 entry:
   %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @e)
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-int.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-int.ll
index 40adf27d7ee39b3..c74abe93c18bf30 100644
--- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-int.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-int.ll
@@ -18,14 +18,12 @@ declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1
 define nonnull ptr @AddrTest1() local_unnamed_addr #0 {
 ; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: AddrTest1:
 ; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, a[TL]@le(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    addi r3, r3, 12
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, a[TL]@le+12(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
 ;
 ; SMALL-LOCAL-EXEC-LARGECM64-LABEL: AddrTest1:
 ; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, a[TL]@le(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addi r3, r3, 12
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, a[TL]@le+12(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
 entry:
   %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @a)
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll
index 55c69839515c439..3aa3ecc9f2b0d10 100644
--- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll
@@ -25,43 +25,33 @@ declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1
 define signext i32 @StoreArrays1() {
 ; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: StoreArrays1:
 ; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, mySmallLocalExecTLSv1[TL]@le(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 1
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r5, 4
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, mySmallLocalExecTLSv1[TL]@le(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r4, mySmallLocalExecTLS2[TL]@le(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r5, 24(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 1
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 4
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, mySmallLocalExecTLSv1[TL]@le(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 2
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, 320(r4)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, mySmallLocalExecTLS3[TL]@le(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 3
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, 324(r3)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, mySmallLocalExecTLS4[TL]@le(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 88
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r5, 328(r3)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, mySmallLocalExecTLS5[TL]@le(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, 332(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, mySmallLocalExecTLSv1[TL]@le+24(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, mySmallLocalExecTLS2[TL]@le-65216(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 3
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, mySmallLocalExecTLS3[TL]@le-65212(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 88
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, mySmallLocalExecTLS4[TL]@le-65208(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, mySmallLocalExecTLS5[TL]@le-65204(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 102
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
 ;
 ; SMALL-LOCAL-EXEC-LARGECM64-LABEL: StoreArrays1:
 ; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, mySmallLocalExecTLSv1[TL]@le(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r4, 1
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r5, 4
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r4, mySmallLocalExecTLSv1[TL]@le(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r4, mySmallLocalExecTLS2[TL]@le(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r5, 24(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 1
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r4, 4
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, mySmallLocalExecTLSv1[TL]@le(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 2
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, 320(r4)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, mySmallLocalExecTLS3[TL]@le(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r4, 3
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r4, 324(r3)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, mySmallLocalExecTLS4[TL]@le(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r4, 88
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r5, 328(r3)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, mySmallLocalExecTLS5[TL]@le(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r4, 332(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r4, mySmallLocalExecTLSv1[TL]@le+24(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, mySmallLocalExecTLS2[TL]@le-65216(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 3
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, mySmallLocalExecTLS3[TL]@le-65212(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 88
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r4, mySmallLocalExecTLS4[TL]@le-65208(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, mySmallLocalExecTLS5[TL]@le-65204(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 102
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
 entry:
@@ -98,46 +88,38 @@ entry:
 define signext i32 @StoreArrays2() {
 ; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: StoreArrays2:
 ; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # target-flags(ppc-tprel) @mySmallLocalExecTLSv2
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 1
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r5, 4
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    add r3, r13, r3
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, 0(r3)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r4, mySmallLocalExecTLS2[TL]@le(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r5, 24(r3)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 2
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, 320(r4)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, mySmallLocalExecTLS3[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r4, L..C0(r2) # target-flags(ppc-tprel) @mySmallLocalExecTLSv2
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 1
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    add r4, r13, r4
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, 0(r4)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 4
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, 24(r4)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 2
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, mySmallLocalExecTLS2[TL]@le-65216(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 3
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, 324(r3)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, mySmallLocalExecTLS4[TL]@le(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r4, mySmallLocalExecTLS5[TL]@le(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r5, 328(r3)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 88
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, 332(r4)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, mySmallLocalExecTLS3[TL]@le-65212(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 88
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, mySmallLocalExecTLS4[TL]@le-65208(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 102
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, mySmallLocalExecTLS5[TL]@le-65204(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
 ;
 ; SMALL-LOCAL-EXEC-LARGECM64-LABEL: StoreArrays2:
 ; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r4, L..C0 at u(r2)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 1
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r5, 4
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r4, L..C0 at l(r4)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    add r4, r13, r4
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, 0(r4)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, mySmallLocalExecTLS2[TL]@le(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r5, 24(r4)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r4, 2
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r4, 320(r3)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, mySmallLocalExecTLS3[TL]@le(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r4, 3
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r4, 324(r3)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, mySmallLocalExecTLS4[TL]@le(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r4, mySmallLocalExecTLS5[TL]@le(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r5, 328(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r3, L..C0 at u(r2)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r4, 1
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, L..C0 at l(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    add r3, r13, r3
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r4, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r4, 4
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r4, 24(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 2
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, mySmallLocalExecTLS2[TL]@le-65216(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 3
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, mySmallLocalExecTLS3[TL]@le-65212(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 88
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, 332(r4)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r4, mySmallLocalExecTLS4[TL]@le-65208(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, mySmallLocalExecTLS5[TL]@le-65204(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 102
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
 entry:
@@ -173,77 +155,76 @@ entry:
 ; DIS:      {{.*}}aix-small-local-exec-tls-largeaccess.ll.tmp.o:	file format aix5coff64-rs6000
 ; DIS:      Disassembly of section .text:
 ; DIS:      0000000000000000 (idx: 3) .StoreArrays1:
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 3, 13, 0
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 1
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 4, 4
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 3, 0(13)
 ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 15) mySmallLocalExecTLSv1[TL]
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 4, 1
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 5, 4
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 4, 0(13)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 2
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 4, 24(13)
 ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 15) mySmallLocalExecTLSv1[TL]
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 4, 13, 32748
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 3, -32468(13)
 ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 17) mySmallLocalExecTLS2[TL]
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 5, 24(3)
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 2
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 3, 320(4)
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 3, 13, -16788
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 3
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 3, -16464(13)
 ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 19) mySmallLocalExecTLS3[TL]
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 4, 3
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 4, 324(3)
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 3, 13, -788
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 88
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 4, -460(13)
 ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 21) mySmallLocalExecTLS4[TL]
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 4, 88
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 5, 328(3)
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 3, 13, 15212
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 3, 15544(13)
 ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 23) mySmallLocalExecTLS5[TL]
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 4, 332(3)
 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 102
 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                blr
 
-; DIS:      0000000000000050 (idx: 5) .StoreArrays2:
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addis 4, 2, 0
+; DIS:      0000000000000040 (idx: 5) .StoreArrays2:
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addis 3, 2, 0
 ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCU	(idx: 13) mySmallLocalExecTLSv2[TE]
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 1
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 5, 4
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                ld 4, 0(4)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 4, 1
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                ld 3, 0(3)
 ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCL	(idx: 13) mySmallLocalExecTLSv2[TE]
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                add 4, 13, 4
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 3, 0(4)
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 3, 13, 32748
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                add 3, 13, 3
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 4, 0(3)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 4, 4
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 4, 24(3)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 2
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 3, -32468(13)
 ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 17) mySmallLocalExecTLS2[TL]
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 5, 24(4)
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 4, 2
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 4, 320(3)
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 3, 13, -16788
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 3
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 3, -16464(13)
 ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 19) mySmallLocalExecTLS3[TL]
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 4, 3
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 4, 324(3)
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 3, 13, -788
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 88
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 4, -460(13)
 ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 21) mySmallLocalExecTLS4[TL]
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 4, 13, 15212
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 3, 15544(13)
 ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 23) mySmallLocalExecTLS5[TL]
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 5, 328(3)
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 88
-; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 3, 332(4)
 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 102
 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                blr
 
 ; DIS:      Disassembly of section .data:
-; DIS:      00000000000000a0 (idx: 7) StoreArrays1[DS]:
+; DIS:      0000000000000080 (idx: 7) StoreArrays1[DS]:
+; DIS-NEXT:       80: 00 00 00 00
+; DIS-NEXT: 0000000000000080:  R_POS	(idx: 3) .StoreArrays1
+; DIS-NEXT:       84: 00 00 00 00
+; DIS-NEXT:       88: 00 00 00 00
+; DIS-NEXT: 0000000000000088:  R_POS        (idx: 11) TOC[TC0]
+; DIS-NEXT:       8c: 00 00 00 b0
+
+; DIS:      0000000000000098 (idx: 9) StoreArrays2[DS]:
+; DIS-NEXT:       98: 00 00 00 00
+; DIS-NEXT: 0000000000000098:  R_POS	(idx: 5) .StoreArrays2
+; DIS-NEXT:       9c: 00 00 00 40
 ; DIS-NEXT:       a0: 00 00 00 00
-; DIS-NEXT: 00000000000000a0:  R_POS	(idx: 3) .StoreArrays1
-; DIS-NEXT:       a4: 00 00 00 00
-; DIS-NEXT:       a8: 00 00 00 00
-; DIS-NEXT: 00000000000000a8:  R_POS        (idx: 11) TOC[TC0]
-; DIS-NEXT:       ac: 00 00 00 d0
+; DIS-NEXT: 00000000000000a0:  R_POS        (idx: 11) TOC[TC0]
+; DIS-NEXT:       a4: 00 00 00 b0
 
-; DIS:      00000000000000b8 (idx: 9) StoreArrays2[DS]:
-; DIS-NEXT:       b8: 00 00 00 00
-; DIS-NEXT: 00000000000000b8:  R_POS	(idx: 5) .StoreArrays2
-; DIS-NEXT:       bc: 00 00 00 50
-; DIS-NEXT:       c0: 00 00 00 00
-; DIS-NEXT: 00000000000000c0:  R_POS        (idx: 11) TOC[TC0]
-; DIS-NEXT:       c4: 00 00 00 d0
+; DIS:      00000000000000b0 (idx: 13) mySmallLocalExecTLSv2[TE]:
+; DIS-NEXT:       b0: 00 00 00 00
+; DIS-NEXT: 00000000000000b0:  R_TLS_LE     (idx: 25) mySmallLocalExecTLSv2[TL]
+; DIS-NEXT:       b4: 00 01 79 ec
 
-; DIS:      00000000000000d0 (idx: 13) mySmallLocalExecTLSv2[TE]:
-; DIS-NEXT:       d0: 00 00 00 00
-; DIS-NEXT: 00000000000000d0:  R_TLS_LE     (idx: 25) mySmallLocalExecTLSv2[TL]
-; DIS-NEXT:       d4: 00 01 79 ec
+; DIS:      Disassembly of section .tdata:
+; DIS:      0000000000000000 (idx: 15) mySmallLocalExecTLSv1[TL]:
+; DIS:      0000000000007fec (idx: 17) mySmallLocalExecTLS2[TL]:
+; DIS:      000000000000be6c (idx: 19) mySmallLocalExecTLS3[TL]:
+; DIS:      000000000000fcec (idx: 21) mySmallLocalExecTLS4[TL]:
+; DIS:      0000000000013b6c (idx: 23) mySmallLocalExecTLS5[TL]:
+; DIS:      00000000000179ec (idx: 25) mySmallLocalExecTLSv2[TL]:
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll
new file mode 100644
index 000000000000000..c87b7acb6211c61
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll
@@ -0,0 +1,160 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff -mattr=+aix-small-local-exec-tls < %s \
+; RUN:      | FileCheck %s --check-prefix=SMALL-LOCAL-EXEC-SMALLCM64
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff --code-model=large \
+; RUN:      -mattr=+aix-small-local-exec-tls < %s | FileCheck %s \
+; RUN:      --check-prefix=SMALL-LOCAL-EXEC-LARGECM64
+
+; Test disassembly of object.
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=+aix-small-local-exec-tls \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff -xcoff-traceback-table=false \
+; RUN:      --code-model=large -filetype=obj -o %t.o < %s
+; RUN: llvm-objdump -D -r --symbol-description %t.o | FileCheck --check-prefix=DIS %s
+
+@mySmallLocalExecTLS6 = external thread_local(localexec) global [60 x i64], align 8
+@mySmallLocalExecTLS2 = thread_local(localexec) global [3000 x i64] zeroinitializer, align 8
+@MyTLSGDVar = thread_local global [800 x i64] zeroinitializer, align 8
+@mySmallLocalExecTLS3 = thread_local(localexec) global [3000 x i64] zeroinitializer, align 8
+@mySmallLocalExecTLS4 = thread_local(localexec) global [3000 x i64] zeroinitializer, align 8
+@mySmallLocalExecTLS5 = thread_local(localexec) global [3000 x i64] zeroinitializer, align 8
+@mySmallLocalExecTLS = thread_local(localexec) local_unnamed_addr global [7800 x i64] zeroinitializer, align 8
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1
+
+; All accesses use a "faster" local-exec sequence directly off the thread pointer.
+define i64 @StoreLargeAccess1() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: StoreLargeAccess1:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    mflr r0
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stdu r1, -48(r1)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 212
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 203
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r0, 64(r1)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r3, mySmallLocalExecTLS6[UL]@le+424(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r4, mySmallLocalExecTLS2[TL]@le+1200(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # target-flags(ppc-lo) @MyTLSGDVar
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r4, L..C1(r2) # target-flags(ppc-tlsgd) @MyTLSGDVar
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    bla .__tls_get_addr[PR]
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 44
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r4, 440(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 6
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 100
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r3, mySmallLocalExecTLS3[TL]@le+2000(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 882
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r4, mySmallLocalExecTLS4[TL]@le-58736(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r3, mySmallLocalExecTLS5[TL]@le-57136(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 1191
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    addi r1, r1, 48
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r0, 16(r1)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    mtlr r0
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: StoreLargeAccess1:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    mflr r0
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stdu r1, -48(r1)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 212
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r0, 64(r1)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r4, L..C0@u(r2)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r4, L..C0@l(r4)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r3, mySmallLocalExecTLS6[UL]@le+424(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 203
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r3, mySmallLocalExecTLS2[TL]@le+1200(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r3, L..C1@u(r2)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, L..C1@l(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    bla .__tls_get_addr[PR]
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r4, 44
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r4, 440(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 6
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r4, 100
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r3, mySmallLocalExecTLS3[TL]@le+2000(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 882
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r4, mySmallLocalExecTLS4[TL]@le-58736(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r3, mySmallLocalExecTLS5[TL]@le-57136(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 1191
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addi r1, r1, 48
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r0, 16(r1)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    mtlr r0
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS6)
+  %arrayidx = getelementptr inbounds [60 x i64], ptr %0, i64 0, i64 53
+  store i64 212, ptr %arrayidx, align 8
+  %1 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS2)
+  %arrayidx1 = getelementptr inbounds [3000 x i64], ptr %1, i64 0, i64 150
+  store i64 203, ptr %arrayidx1, align 8
+  %2 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @MyTLSGDVar)
+  %arrayidx2 = getelementptr inbounds [800 x i64], ptr %2, i64 0, i64 55
+  store i64 44, ptr %arrayidx2, align 8
+  %3 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS3)
+  %arrayidx3 = getelementptr inbounds [3000 x i64], ptr %3, i64 0, i64 250
+  store i64 6, ptr %arrayidx3, align 8
+  %4 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS4)
+  %arrayidx4 = getelementptr inbounds [3000 x i64], ptr %4, i64 0, i64 850
+  store i64 100, ptr %arrayidx4, align 8
+  %5 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS5)
+  %arrayidx5 = getelementptr inbounds [3000 x i64], ptr %5, i64 0, i64 1050
+  store i64 882, ptr %arrayidx5, align 8
+  %6 = load i64, ptr %arrayidx1, align 8
+  %7 = load i64, ptr %arrayidx3, align 8
+  %8 = load i64, ptr %arrayidx4, align 8
+  %add = add i64 %6, 882
+  %add9 = add i64 %add, %7
+  %add11 = add i64 %add9, %8
+  ret i64 %add11
+}
+
+; DIS:      0000000000000000 (idx: 7) .StoreLargeAccess1:
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                mflr 0
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stdu 1, -48(1)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 212
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                std 0, 64(1)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addis 4, 2, 0
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCU	(idx: 13) MyTLSGDVar[TE]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                ld 4, 0(4)
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCL	(idx: 13) MyTLSGDVar[TE]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                std 3, 424(13)
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 1) mySmallLocalExecTLS6[UL]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 203
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                std 3, 1200(13)
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 17) mySmallLocalExecTLS2[TL]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addis 3, 2, 0
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCU	(idx: 15) .MyTLSGDVar[TE]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                ld 3, 8(3)
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCL	(idx: 15) .MyTLSGDVar[TE]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                bla 0
+; DIS-NEXT: {{0*}}[[#ADDR]]: R_RBA  (idx: 3)      .__tls_get_addr[PR]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 4, 44
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                std 4, 440(3)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 6
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 4, 100
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                std 3, 32400(13)
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE       (idx: 21) mySmallLocalExecTLS3[TL]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 882
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                std 4, -4336(13)
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE       (idx: 23) mySmallLocalExecTLS4[TL]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                std 3, 21264(13)
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE       (idx: 25) mySmallLocalExecTLS5[TL]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 1191
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 1, 1, 48
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                ld 0, 16(1)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                mtlr 0
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                blr
+
+; DIS:      Disassembly of section .data:
+; DIS:      0000000000000068 (idx: 9) StoreLargeAccess1[DS]:
+; DIS-NEXT:       68: 00 00 00 00
+; DIS-NEXT: 0000000000000068:  R_POS    (idx: 7) .StoreLargeAccess1
+; DIS-NEXT:       6c: 00 00 00 00
+; DIS-NEXT:       70: 00 00 00 00
+; DIS-NEXT: 0000000000000070:  R_POS        (idx: 11) TOC[TC0]
+; DIS-NEXT:       74: 00 00 00 80
+
+; DIS:      Disassembly of section .tdata:
+; DIS:      0000000000000000 (idx: 17) mySmallLocalExecTLS2[TL]:
+; DIS:      0000000000005dc0 (idx: 19) MyTLSGDVar[TL]:
+; DIS:      00000000000076c0 (idx: 21) mySmallLocalExecTLS3[TL]:
+; DIS:      000000000000d480 (idx: 23) mySmallLocalExecTLS4[TL]:
+; DIS:      0000000000013240 (idx: 25) mySmallLocalExecTLS5[TL]:
+; DIS:      0000000000019000 (idx: 27) mySmallLocalExecTLS[TL]:
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-short.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-short.ll
index bf1b7fab308149c..b172c2985e69534 100644
--- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-short.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-short.ll
@@ -16,14 +16,12 @@ declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1
 define nonnull ptr @AddrTest1() local_unnamed_addr #0 {
 ; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: AddrTest1:
 ; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, b[TL]@le(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    addi r3, r3, 4
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, b[TL]@le+4(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
 ;
 ; SMALL-LOCAL-EXEC-LARGECM64-LABEL: AddrTest1:
 ; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, b[TL]@le(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addi r3, r3, 4
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, b[TL]@le+4(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
 entry:
   %0 = tail call align 2 ptr @llvm.threadlocal.address.p0(ptr align 2 @b)

>From 73fe305ff61a563de702ee0fa5e93f64716c41c8 Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1 at ibm.com>
Date: Mon, 6 Nov 2023 22:36:01 -0600
Subject: [PATCH 02/16] Apply formatting changes

---
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp   | 18 +++++++++---------
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp |  8 +++-----
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 63c03c93e202d9a..2e9bd71a011813c 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -68,8 +68,8 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Process.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Threading.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -1638,8 +1638,8 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
          "Only local-exec accesses are handled!");
   MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_PPC_AIX_TLSLE;
 
-  const MCExpr *Expr = MCSymbolRefExpr::create(getSymbol(GValue), RefKind,
-                                               OutContext);
+  const MCExpr *Expr =
+      MCSymbolRefExpr::create(getSymbol(GValue), RefKind, OutContext);
 
   bool IsGlobalADeclaration = GValue->isDeclarationForLinker();
   // Find the GlobalVariable that corresponds to the particular TLS variable
@@ -1659,9 +1659,8 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
   // For when TLS variables are extern, this is safe to do because we can
   // assume that the address of extern TLS variables are zero.
   if ((FinalAddress < 32768) || IsGlobalADeclaration)
-    Expr = MCBinaryExpr::createAdd(Expr,
-                                   MCConstantExpr::create(Offset, OutContext),
-                                   OutContext);
+    Expr = MCBinaryExpr::createAdd(
+        Expr, MCConstantExpr::create(Offset, OutContext), OutContext);
   else {
     // Handle the written offset for cases where:
     //   address of the TLS variable + the offset is greater than 32KB.
@@ -1673,9 +1672,10 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
     // [-32768,32768).
     if (FinalAddress & 0x8000)
       FinalAddress = FinalAddress - 0x10000;
-    assert((FinalAddress < 32768) || (FinalAddress >= -32768) &&
-           "Expecting the final address for local-exec TLS variables to be "
-           "between [-32768,32768)!");
+    assert((FinalAddress < 32768) ||
+           (FinalAddress >= -32768) &&
+               "Expecting the final address for local-exec TLS variables to be "
+               "between [-32768,32768)!");
     // Get the offset that is actually written out in assembly by adding back
     // the original address of the TLS variable.
     ptrdiff_t WrittenOffset = FinalAddress - TLSVarAddress;
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 427099376d72d96..acf4cb0f7e61b16 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -7589,8 +7589,7 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
   // This transformation is only performed if the first operand of the
   // addi is the thread pointer.
   SDValue TPRegNode = InitialADDI.getOperand(0);
-  RegisterSDNode *TPReg =
-      dyn_cast_or_null<RegisterSDNode>(TPRegNode.getNode());
+  RegisterSDNode *TPReg = dyn_cast_or_null<RegisterSDNode>(TPRegNode.getNode());
   if (!TPReg)
     return;
   if (TPReg->getReg() != Subtarget.getThreadPointerRegister())
@@ -7798,9 +7797,8 @@ void PPCDAGToDAGISel::PeepholePPC64() {
         GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd);
         if (!GA)
           continue;
-        ImmOpnd = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA),
-                                                 MVT::i64, Offset,
-                                                 GA->getTargetFlags());
+        ImmOpnd = CurDAG->getTargetGlobalAddress(
+            GA->getGlobal(), SDLoc(GA), MVT::i64, Offset, GA->getTargetFlags());
       }
     }
 

>From 23ad11246db37c02204d6571b3d933a2f2be5f65 Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1 at ibm.com>
Date: Wed, 15 Nov 2023 23:27:01 -0600
Subject: [PATCH 03/16] Address review comments: fix comments, simplify offset
 generation, remove unnecessary breaks

---
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 85 +++++++++++------------
 1 file changed, 39 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 2e9bd71a011813c..68ecf5e8e1456db 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -761,6 +761,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
   MCInst TmpInst;
   const bool IsPPC64 = Subtarget->isPPC64();
   const bool IsAIX = Subtarget->isAIXABI();
+  const bool HasAIXSmallLocalExecTLS = Subtarget->hasAIXSmallLocalExecTLS();
   const Module *M = MF->getFunction().getParent();
   PICLevel::Level PL = M->getPICLevel();
 
@@ -1511,11 +1512,15 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
   case PPC::LWA: {
     // Verify alignment is legal, so we don't create relocations
     // that can't be supported.
-    unsigned OpNum;
-    if (Subtarget->hasAIXSmallLocalExecTLS())
-      OpNum = 1;
-    else
-      OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
+    unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
+    // For non-TOC-based local-exec TLS accesses with non-zero offsets, the
+    // machine operand (which is a TargetGlobalTLSAddress) is expected to be
+    // the same operand for both loads and stores.
+    for (const MachineOperand &TempMO : MI->operands()) {
+      if (((TempMO.getTargetFlags() & PPCII::MO_TPREL_FLAG) != 0) &&
+          TempMO.getOperandNo() == 1)
+        OpNum = 1;
+    }
     const MachineOperand &MO = MI->getOperand(OpNum);
     if (MO.isGlobal()) {
       const DataLayout &DL = MO.getGlobal()->getParent()->getDataLayout();
@@ -1528,20 +1533,14 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
       // Such instructions do not otherwise arise.
       unsigned Flag = MO.getTargetFlags();
       if (Flag == PPCII::MO_TPREL_FLAG) {
-        assert(Subtarget->hasAIXSmallLocalExecTLS() &&
+        assert(HasAIXSmallLocalExecTLS &&
                "lwa/ld/std with thread-pointer only expected with "
                "local-exec small TLS");
         int64_t Offset = MO.getOffset();
-        // Non-zero offsets for lwa/ld/std require special handling and are
-        // handled here.
-        if (!Offset)
-          break;
-
         LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
-        if (Offset) {
-          const MCExpr *Expr = getAdjustedLocalExecExpr(MO, Offset);
-          TmpInst.getOperand(1) = MCOperand::createExpr(Expr);
-        }
+        const MCExpr *Expr = getAdjustedLocalExecExpr(MO, Offset);
+        if (Expr)
+          TmpInst.getOperand(OpNum) = MCOperand::createExpr(Expr);
         EmitToStreamer(*OutStreamer, TmpInst);
         return;
       }
@@ -1590,23 +1589,16 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     if (Flag == PPCII::MO_TPREL_FLAG ||
         Flag == PPCII::MO_GOT_TPREL_PCREL_FLAG ||
         Flag == PPCII::MO_TPREL_PCREL_FLAG) {
-      assert(
-          Subtarget->hasAIXSmallLocalExecTLS() &&
-          "addi, or load/stores with thread-pointer only expected with "
-          "local-exec small TLS");
+      assert(HasAIXSmallLocalExecTLS &&
+             "addi, or load/stores with thread-pointer only expected with "
+             "local-exec small TLS");
 
       int64_t Offset = MO.getOffset();
-      // Non-zero offsets for loads/stores require special handling and are
-      // handled here. For `addi`, all offsets are handled here.
-      if (!Offset && !IsMIADDI8)
-        break;
-
       LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
 
-      if (Offset) {
-        const MCExpr *Expr = getAdjustedLocalExecExpr(MO, Offset);
+      const MCExpr *Expr = getAdjustedLocalExecExpr(MO, Offset);
+      if (Expr)
         TmpInst.getOperand(OpNum) = MCOperand::createExpr(Expr);
-      }
 
       // Change the opcode to load address if the original opcode is an `addi`.
       if (IsMIADDI8)
@@ -1627,7 +1619,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
 // we need to create a new MCExpr that adds the non-zero offset to the address
 // of the local-exec variable that will be used in either an addi, load or
 // store. However, the final displacement for these instructions must be
-// between [-32768, 32768), so if the TLS address + it's non-zero offset is
+// between [-32768, 32768), so if the TLS address + its non-zero offset is
 // greater than 32KB, a new MCExpr is produced to accommodate this situation.
 const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
                                                       int64_t Offset) {
@@ -1638,6 +1630,10 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
          "Only local-exec accesses are handled!");
   MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_PPC_AIX_TLSLE;
 
+  // Non-zero offsets (for loads, stores or `addi`) require additional handling.
+  // When the offset is zero, there is no need to create an adjusted MCExpr.
+  if (!Offset)
+    return nullptr;
   const MCExpr *Expr =
       MCSymbolRefExpr::create(getSymbol(GValue), RefKind, OutContext);
 
@@ -1651,7 +1647,8 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
            "Only expecting to find extern TLS variables not present in the TLS "
            "variables to address map!");
 
-  unsigned TLSVarAddress = TLSVarsMapEntryIter->second;
+  unsigned TLSVarAddress =
+      IsGlobalADeclaration ? 0 : TLSVarsMapEntryIter->second;
   ptrdiff_t FinalAddress = (TLSVarAddress + Offset);
   // If the address of the TLS variable + the offset is less than 32KB,
   // or if the TLS variable is extern, we simply produce an MCExpr to add the
@@ -1663,24 +1660,20 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
         Expr, MCConstantExpr::create(Offset, OutContext), OutContext);
   else {
     // Handle the written offset for cases where:
-    //   address of the TLS variable + the offset is greater than 32KB.
-
-    // Get the address in the range of 0 to 64KB.
-    FinalAddress = FinalAddress & 0xFFFF;
-    // If the highest bit in the calculated address is set, subtract
-    // additional 64KB to ensure that the final address fits within
-    // [-32768,32768).
-    if (FinalAddress & 0x8000)
-      FinalAddress = FinalAddress - 0x10000;
-    assert((FinalAddress < 32768) ||
-           (FinalAddress >= -32768) &&
-               "Expecting the final address for local-exec TLS variables to be "
-               "between [-32768,32768)!");
-    // Get the offset that is actually written out in assembly by adding back
-    // the original address of the TLS variable.
-    ptrdiff_t WrittenOffset = FinalAddress - TLSVarAddress;
+    //   TLS variable address + Offset > 32KB.
+
+    // The assembly that is printed is actually:
+    //  TLSVar[storageMappingClass]@le + Offset - Delta
+    // where Delta is a multiple of 64KB: ((FinalAddress + 32768) & ~0xFFFF).
+    ptrdiff_t OffsetDelta = Offset - ((FinalAddress + 32768) & ~0xFFFF);
+    // Check that the total instruction displacement fits within [-32768,32768).
+    ptrdiff_t InstDisp = TLSVarAddress + OffsetDelta;
+    assert((InstDisp < 32768) ||
+           (InstDisp >= -32768) &&
+               "Expecting the instruction displacement for local-exec TLS "
+               "variables to be between [-32768, 32768)!");
     Expr = MCBinaryExpr::createAdd(
-        Expr, MCConstantExpr::create(WrittenOffset, OutContext), OutContext);
+        Expr, MCConstantExpr::create(OffsetDelta, OutContext), OutContext);
   }
 
   return Expr;

>From 30010bc19a96f295d3ba30dabde638587d568859 Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1 at ibm.com>
Date: Wed, 22 Nov 2023 09:30:45 -0600
Subject: [PATCH 04/16] Update comments and combine conditions

---
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp   |  2 +-
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 20 ++++++++++----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 68ecf5e8e1456db..1b3667accdcdb45 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1663,7 +1663,7 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
     //   TLS variable address + Offset > 32KB.
 
     // The assembly that is printed is actually:
-    //  TLSVar[storageMappingClass]@le + Offset - Delta
+    //  TLSVar@le + Offset - Delta
     // where Delta is a multiple of 64KB: ((FinalAddress + 32768) & ~0xFFFF).
     ptrdiff_t OffsetDelta = Offset - ((FinalAddress + 32768) & ~0xFFFF);
     // Check that the total instruction displacement fits within [-32768,32768).
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index acf4cb0f7e61b16..d7b5f33dad7d9a1 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -7580,22 +7580,19 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
   // InitialADDI is the addi feeding into N (also an addi), and the addi that
   // we want optimized out.
   SDValue InitialADDI = N->getOperand(0);
-  if (!InitialADDI.isMachineOpcode())
-    return;
-  if (InitialADDI.getMachineOpcode() != PPC::ADDI8)
+  if (!InitialADDI.isMachineOpcode() ||
+      (InitialADDI.getMachineOpcode() != PPC::ADDI8))
     return;
 
-  // The first operand of the InitialADDI will be the thread pointer.
+  // The first operand of the InitialADDI should be the thread pointer.
   // This transformation is only performed if the first operand of the
   // addi is the thread pointer.
   SDValue TPRegNode = InitialADDI.getOperand(0);
   RegisterSDNode *TPReg = dyn_cast_or_null<RegisterSDNode>(TPRegNode.getNode());
-  if (!TPReg)
-    return;
-  if (TPReg->getReg() != Subtarget.getThreadPointerRegister())
+  if (!TPReg || (TPReg->getReg() != Subtarget.getThreadPointerRegister()))
     return;
 
-  // The second operand of the InitialADDI will be a TargetGlobalTLSAddress,
+  // The second operand of the InitialADDI should be the global TLS address
   // (the local-exec TLS variable). We only perform the folding if the TLS
   // variable is the second operand.
   SDValue TLSVarNode = InitialADDI.getOperand(1);
@@ -7603,12 +7600,15 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
   if (!GA)
     return;
 
+  // The local-exec TLS variable should only have the MO_TPREL_FLAG target flag,
+  // so this optimization is not performed otherwise if the flag is not set.
   unsigned TargetFlags = GA->getTargetFlags();
   if ((TargetFlags & PPCII::MO_TPREL_FLAG) == 0)
     return;
+
   // The second operand of the addi that we want to preserve will be an
-  // immediate. We add this immediate together with the address of the TLS
-  // variable found in InitialADDI in order to preserve the correct TLS address
+  // immediate. We add this immediate, together with the address of the TLS
+  // variable found in InitialADDI, in order to preserve the correct TLS address
   // information during assembly printing.
   int Offset = N->getConstantOperandVal(1);
   TLSVarNode = DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA), MVT::i64,

>From f6699d6a4125ba66f9178443062f6a26aa415db1 Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1 at ibm.com>
Date: Wed, 22 Nov 2023 12:33:12 -0600
Subject: [PATCH 05/16] Remove unnecessary whitespace

---
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 1b3667accdcdb45..09c3048a6be73a9 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1545,7 +1545,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
         return;
       }
     }
-
     // Now process the instruction normally.
     break;
   }

>From 192f69e057ac22991480b4a17d5caef8eab938c3 Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1 at ibm.com>
Date: Wed, 29 Nov 2023 11:20:12 -0600
Subject: [PATCH 06/16] Address review comments by pulling out checks for if
 addi is eligible for folding

---
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp   |  6 +-
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 84 ++++++++++++++-------
 2 files changed, 58 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 09c3048a6be73a9..54f41abe4429626 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1528,13 +1528,13 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
         llvm_unreachable("Global must be word-aligned for LD, STD, LWA!");
 
       // A faster non-TOC-based local-exec sequence is represented by
-      // `lwa`/`ld`/`std` directingly loading or storing off of the thread
-      // pointer and with an immediate operand having the MO_TPREL_FLAG.
+      // directingly loading or storing off of the thread pointer and with
+      // an immediate operand having the MO_TPREL_FLAG.
       // Such instructions do not otherwise arise.
       unsigned Flag = MO.getTargetFlags();
       if (Flag == PPCII::MO_TPREL_FLAG) {
         assert(HasAIXSmallLocalExecTLS &&
-               "lwa/ld/std with thread-pointer only expected with "
+               "loads/stores with thread-pointer only expected with "
                "local-exec small TLS");
         int64_t Offset = MO.getOffset();
         LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index d7b5f33dad7d9a1..b8b58c4b0ab89b8 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -7565,51 +7565,76 @@ static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) {
   DAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), N->getOperand(0));
 }
 
-// For non-TOC-based local-exec access where an addi is feeding into another
-// addi, fold this sequence into a single addi if possible.
-static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
+// Is an ADDI eligible for folding for non-TOC-based local-exec accesses?
+static bool isEligibleToFoldADDIForLocalExecAccesses(SDNode *N,
+                                                     SelectionDAG *DAG,
+                                                     SDValue ADDIToFold) {
   const PPCSubtarget &Subtarget =
       DAG->getMachineFunction().getSubtarget<PPCSubtarget>();
   // This optimization is only performed for non-TOC-based local-exec accesses.
   if (!Subtarget.hasAIXSmallLocalExecTLS())
-    return;
-
-  if (N->getMachineOpcode() != PPC::ADDI8)
-    return;
+    return false;
 
-  // InitialADDI is the addi feeding into N (also an addi), and the addi that
-  // we want optimized out.
-  SDValue InitialADDI = N->getOperand(0);
-  if (!InitialADDI.isMachineOpcode() ||
-      (InitialADDI.getMachineOpcode() != PPC::ADDI8))
-    return;
+  // Check if ADDIToFold (the ADDI that we want to fold into local-exec
+  // accesses), is truly an ADDI.
+  if (!ADDIToFold.isMachineOpcode() ||
+      (ADDIToFold.getMachineOpcode() != PPC::ADDI8))
+    return false;
 
-  // The first operand of the InitialADDI should be the thread pointer.
+  // The first operand of the ADDIToFold should be the thread pointer.
   // This transformation is only performed if the first operand of the
   // addi is the thread pointer.
-  SDValue TPRegNode = InitialADDI.getOperand(0);
+  SDValue TPRegNode = ADDIToFold.getOperand(0);
   RegisterSDNode *TPReg = dyn_cast_or_null<RegisterSDNode>(TPRegNode.getNode());
   if (!TPReg || (TPReg->getReg() != Subtarget.getThreadPointerRegister()))
-    return;
+    return false;
 
-  // The second operand of the InitialADDI should be the global TLS address
+  // The second operand of the ADDIToFold should be the global TLS address
   // (the local-exec TLS variable). We only perform the folding if the TLS
   // variable is the second operand.
-  SDValue TLSVarNode = InitialADDI.getOperand(1);
+  SDValue TLSVarNode = ADDIToFold.getOperand(1);
   GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(TLSVarNode);
   if (!GA)
-    return;
+    return false;
 
   // The local-exec TLS variable should only have the MO_TPREL_FLAG target flag,
   // so this optimization is not performed otherwise if the flag is not set.
   unsigned TargetFlags = GA->getTargetFlags();
   if ((TargetFlags & PPCII::MO_TPREL_FLAG) == 0)
+    return false;
+
+  // If all conditions are satisfied, the ADDI is valid for folding.
+  return true;
+}
+
+// For non-TOC-based local-exec access where an addi is feeding into another
+// addi, fold this sequence into a single addi if possible.
+static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
+  if (N->getMachineOpcode() != PPC::ADDI8)
+    return;
+
+  // InitialADDI is the addi feeding into N (also an addi), and the addi that
+  // we want optimized out.
+  SDValue InitialADDI = N->getOperand(0);
+
+  if (!isEligibleToFoldADDIForLocalExecAccesses(N, DAG, InitialADDI))
     return;
 
+  // At this point, InitialADDI can be folded into a non-TOC-based local-exec
+  // access. The first operand of InitialADDI should be the thread pointer.
+  SDValue TPRegNode = InitialADDI.getOperand(0);
+
+  // The second operand of the InitialADDI should be the global TLS address
+  // (the local-exec TLS variable), with the MO_TPREL_FLAG target flag.
+  SDValue TLSVarNode = InitialADDI.getOperand(1);
+  GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(TLSVarNode);
+  unsigned TargetFlags = GA->getTargetFlags();
+
   // The second operand of the addi that we want to preserve will be an
   // immediate. We add this immediate, together with the address of the TLS
   // variable found in InitialADDI, in order to preserve the correct TLS address
-  // information during assembly printing.
+  // information during assembly printing. The offset is likely to be non-zero
+  // when we end up in this case.
   int Offset = N->getConstantOperandVal(1);
   TLSVarNode = DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA), MVT::i64,
                                            Offset, TargetFlags);
@@ -7621,7 +7646,6 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
 
 void PPCDAGToDAGISel::PeepholePPC64() {
   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
-  bool HasAIXSmallLocalExecTLS = Subtarget->hasAIXSmallLocalExecTLS();
 
   while (Position != CurDAG->allnodes_begin()) {
     SDNode *N = &*--Position;
@@ -7790,15 +7814,17 @@ void PPCDAGToDAGISel::PeepholePPC64() {
         ImmOpnd = CurDAG->getTargetConstant(Offset, SDLoc(ImmOpnd),
                                             ImmOpnd.getValueType());
       } else if (Offset != 0) {
-        if (!HasAIXSmallLocalExecTLS)
-          continue;
-        // Add the non-zero offset information into the load or store
-        // instruction to be used for non-TOC-based local-exec accesses.
-        GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd);
-        if (!GA)
+        if (isEligibleToFoldADDIForLocalExecAccesses(N, CurDAG, Base)) {
+          // Add the non-zero offset information into the load or store
+          // instruction to be used for non-TOC-based local-exec accesses.
+          GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd);
+          if (!GA)
+            continue;
+          ImmOpnd = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA),
+                                                   MVT::i64, Offset,
+                                                   GA->getTargetFlags());
+        } else
           continue;
-        ImmOpnd = CurDAG->getTargetGlobalAddress(
-            GA->getGlobal(), SDLoc(GA), MVT::i64, Offset, GA->getTargetFlags());
       }
     }
 

>From bcacf486d61d9277c0a666975b6ab57f09c195ea Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1 at ibm.com>
Date: Wed, 29 Nov 2023 15:05:36 -0600
Subject: [PATCH 07/16] Remove comment and add asserts

---
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp   |  2 --
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 12 +++++++++++-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 54f41abe4429626..4ab37db904ecac1 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -2883,8 +2883,6 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) {
   // For all TLS variables, calculate their corresponding addresses and store
   // them into TLSVarsToAddressMapping, which will be used to determine whether
   // or not local-exec TLS variables require special assembly printing.
-  // This address calculation follows the same method seen within
-  // assignAddressesAndIndices() in XCOFFObjectWriter.cpp.
   uint64_t Address = 0;
   uint64_t TLSVarAddress = 0;
   auto DL = M.getDataLayout();
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index b8b58c4b0ab89b8..f523f3c7fee6f49 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -7621,13 +7621,23 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
     return;
 
   // At this point, InitialADDI can be folded into a non-TOC-based local-exec
-  // access. The first operand of InitialADDI should be the thread pointer.
+  // access. The first operand of InitialADDI should be the thread pointer,
+  // which has been checked in isEligibleToFoldADDIForLocalExecAccesses().
   SDValue TPRegNode = InitialADDI.getOperand(0);
+  RegisterSDNode *TPReg = dyn_cast_or_null<RegisterSDNode>(TPRegNode.getNode());
+  const PPCSubtarget &Subtarget =
+      DAG->getMachineFunction().getSubtarget<PPCSubtarget>();
+  assert((TPReg && (TPReg->getReg() == Subtarget.getThreadPointerRegister())) &&
+         "Expecting the first operand to be a thread pointer for folding addi "
+         "in local-exec accesses!");
 
   // The second operand of the InitialADDI should be the global TLS address
   // (the local-exec TLS variable), with the MO_TPREL_FLAG target flag.
+  // This has been checked in isEligibleToFoldADDIForLocalExecAccesses().
   SDValue TLSVarNode = InitialADDI.getOperand(1);
   GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(TLSVarNode);
+  assert(GA && "Expecting a valid GlobalAddressSDNode when folding addi into "
+               "local-exec accesses!");
   unsigned TargetFlags = GA->getTargetFlags();
 
   // The second operand of the addi that we want to preserve will be an

>From 5c712a4abcec85462db73e47373582b23a96c879 Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1 at ibm.com>
Date: Thu, 7 Dec 2023 12:39:57 -0600
Subject: [PATCH 08/16] Update comments and condition

---
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp   | 4 ++--
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 5 +++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 4ab37db904ecac1..e74b5dbd2b5b5ae 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1528,7 +1528,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
         llvm_unreachable("Global must be word-aligned for LD, STD, LWA!");
 
       // A faster non-TOC-based local-exec sequence is represented by
-      // directingly loading or storing off of the thread pointer and with
+      // directly loading or storing off of the thread pointer and with
       // an immediate operand having the MO_TPREL_FLAG.
       // Such instructions do not otherwise arise.
       unsigned Flag = MO.getTargetFlags();
@@ -1654,7 +1654,7 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
   // non-zero offset to the TLS variable address.
   // For when TLS variables are extern, this is safe to do because we can
   // assume that the address of extern TLS variables are zero.
-  if ((FinalAddress < 32768) || IsGlobalADeclaration)
+  if (FinalAddress < 32768)
     Expr = MCBinaryExpr::createAdd(
         Expr, MCConstantExpr::create(Offset, OutContext), OutContext);
   else {
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index f523f3c7fee6f49..f073bbc08e6661c 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -7609,6 +7609,11 @@ static bool isEligibleToFoldADDIForLocalExecAccesses(SDNode *N,
 
 // For non-TOC-based local-exec access where an addi is feeding into another
 // addi, fold this sequence into a single addi if possible.
+// Before this optimization, the sequence appears as:
+//    addi rN, r13, sym@le
+//    addi rM, rN, imm
+// After this optimization, we can fold the two addi into a single one:
+//    addi rM, r13, sym@le + imm
 static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
   if (N->getMachineOpcode() != PPC::ADDI8)
     return;

>From f536d9dd4de9bde2a6a5384f1f8ded1640f36c64 Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1 at ibm.com>
Date: Thu, 7 Dec 2023 15:34:30 -0600
Subject: [PATCH 09/16] Update target flags for TLSGD variable

---
 .../CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll
index c87b7acb6211c61..2f4b05ec7b01610 100644
--- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll
@@ -33,7 +33,7 @@ define i64 @StoreLargeAccess1() {
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r0, 64(r1)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r3, mySmallLocalExecTLS6[UL]@le+424(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r4, mySmallLocalExecTLS2[TL]@le+1200(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # target-flags(ppc-lo) @MyTLSGDVar
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # target-flags(ppc-tlsgdm) @MyTLSGDVar
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r4, L..C1(r2) # target-flags(ppc-tlsgd) @MyTLSGDVar
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    bla .__tls_get_addr[PR]
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 44

>From 73fdf44e38891f5353f95f4b235d361671b45a76 Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1 at ibm.com>
Date: Thu, 7 Dec 2023 23:58:54 -0600
Subject: [PATCH 10/16] Print assembly in the tlsVar+Offset-Delta method
 instead

---
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp     | 15 ++++-----
 .../aix-small-local-exec-tls-largeaccess.ll   | 32 +++++++++----------
 .../aix-small-local-exec-tls-largeaccess2.ll  |  8 ++---
 3 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index e74b5dbd2b5b5ae..8c69a0fb4d22bde 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1654,25 +1654,24 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
   // non-zero offset to the TLS variable address.
   // For when TLS variables are extern, this is safe to do because we can
   // assume that the address of extern TLS variables are zero.
-  if (FinalAddress < 32768)
-    Expr = MCBinaryExpr::createAdd(
-        Expr, MCConstantExpr::create(Offset, OutContext), OutContext);
-  else {
+  Expr = MCBinaryExpr::createAdd(
+      Expr, MCConstantExpr::create(Offset, OutContext), OutContext);
+  if (FinalAddress >= 32768) {
     // Handle the written offset for cases where:
     //   TLS variable address + Offset > 32KB.
 
-    // The assembly that is printed is actually:
+    // The assembly that is printed will look like:
     //  TLSVar@le + Offset - Delta
     // where Delta is a multiple of 64KB: ((FinalAddress + 32768) & ~0xFFFF).
-    ptrdiff_t OffsetDelta = Offset - ((FinalAddress + 32768) & ~0xFFFF);
+    ptrdiff_t Delta = ((FinalAddress + 32768) & ~0xFFFF);
     // Check that the total instruction displacement fits within [-32768,32768).
-    ptrdiff_t InstDisp = TLSVarAddress + OffsetDelta;
+    ptrdiff_t InstDisp = TLSVarAddress + Offset - Delta;
     assert((InstDisp < 32768) ||
            (InstDisp >= -32768) &&
                "Expecting the instruction displacement for local-exec TLS "
                "variables to be between [-32768, 32768)!");
     Expr = MCBinaryExpr::createAdd(
-        Expr, MCConstantExpr::create(OffsetDelta, OutContext), OutContext);
+        Expr, MCConstantExpr::create(-Delta, OutContext), OutContext);
   }
 
   return Expr;
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll
index 3aa3ecc9f2b0d10..22b8503ef403c8a 100644
--- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll
@@ -30,12 +30,12 @@ define signext i32 @StoreArrays1() {
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, mySmallLocalExecTLSv1[TL]@le(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 2
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, mySmallLocalExecTLSv1[TL]@le+24(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, mySmallLocalExecTLS2[TL]@le-65216(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, (mySmallLocalExecTLS2[TL]@le+320)-65536(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 3
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, mySmallLocalExecTLS3[TL]@le-65212(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, (mySmallLocalExecTLS3[TL]@le+324)-65536(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 88
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, mySmallLocalExecTLS4[TL]@le-65208(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, mySmallLocalExecTLS5[TL]@le-65204(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, (mySmallLocalExecTLS4[TL]@le+328)-65536(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, (mySmallLocalExecTLS5[TL]@le+332)-65536(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 102
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
 ;
@@ -46,12 +46,12 @@ define signext i32 @StoreArrays1() {
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, mySmallLocalExecTLSv1[TL]@le(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 2
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r4, mySmallLocalExecTLSv1[TL]@le+24(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, mySmallLocalExecTLS2[TL]@le-65216(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, (mySmallLocalExecTLS2[TL]@le+320)-65536(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 3
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, mySmallLocalExecTLS3[TL]@le-65212(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, (mySmallLocalExecTLS3[TL]@le+324)-65536(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 88
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r4, mySmallLocalExecTLS4[TL]@le-65208(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, mySmallLocalExecTLS5[TL]@le-65204(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r4, (mySmallLocalExecTLS4[TL]@le+328)-65536(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, (mySmallLocalExecTLS5[TL]@le+332)-65536(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 102
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
 entry:
@@ -95,13 +95,13 @@ define signext i32 @StoreArrays2() {
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 4
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, 24(r4)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 2
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, mySmallLocalExecTLS2[TL]@le-65216(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, (mySmallLocalExecTLS2[TL]@le+320)-65536(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 3
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, mySmallLocalExecTLS3[TL]@le-65212(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, (mySmallLocalExecTLS3[TL]@le+324)-65536(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 88
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, mySmallLocalExecTLS4[TL]@le-65208(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, (mySmallLocalExecTLS4[TL]@le+328)-65536(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 102
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, mySmallLocalExecTLS5[TL]@le-65204(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, (mySmallLocalExecTLS5[TL]@le+332)-65536(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
 ;
 ; SMALL-LOCAL-EXEC-LARGECM64-LABEL: StoreArrays2:
@@ -114,12 +114,12 @@ define signext i32 @StoreArrays2() {
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r4, 4
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r4, 24(r3)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 2
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, mySmallLocalExecTLS2[TL]@le-65216(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, (mySmallLocalExecTLS2[TL]@le+320)-65536(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 3
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, mySmallLocalExecTLS3[TL]@le-65212(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, (mySmallLocalExecTLS3[TL]@le+324)-65536(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 88
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r4, mySmallLocalExecTLS4[TL]@le-65208(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, mySmallLocalExecTLS5[TL]@le-65204(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r4, (mySmallLocalExecTLS4[TL]@le+328)-65536(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, (mySmallLocalExecTLS5[TL]@le+332)-65536(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 102
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
 entry:
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll
index 2f4b05ec7b01610..725b6800549264e 100644
--- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll
@@ -42,8 +42,8 @@ define i64 @StoreLargeAccess1() {
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 100
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r3, mySmallLocalExecTLS3[TL]@le+2000(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 882
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r4, mySmallLocalExecTLS4[TL]@le-58736(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r3, mySmallLocalExecTLS5[TL]@le-57136(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r4, (mySmallLocalExecTLS4[TL]@le+6800)-65536(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r3, (mySmallLocalExecTLS5[TL]@le+8400)-65536(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 1191
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    addi r1, r1, 48
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r0, 16(r1)
@@ -70,8 +70,8 @@ define i64 @StoreLargeAccess1() {
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r4, 100
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r3, mySmallLocalExecTLS3[TL]@le+2000(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 882
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r4, mySmallLocalExecTLS4[TL]@le-58736(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r3, mySmallLocalExecTLS5[TL]@le-57136(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r4, (mySmallLocalExecTLS4[TL]@le+6800)-65536(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r3, (mySmallLocalExecTLS5[TL]@le+8400)-65536(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 1191
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addi r1, r1, 48
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r0, 16(r1)

>From bf985705e4ce432215b43c1946484894363a083e Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1 at ibm.com>
Date: Tue, 2 Jan 2024 13:18:56 -0600
Subject: [PATCH 11/16] Update target flag checks

---
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp   | 2 +-
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 8c69a0fb4d22bde..5c13f8a69d51525 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1517,7 +1517,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // machine operand (which is a TargetGlobalTLSAddress) is expected to be
     // the same operand for both loads and stores.
     for (const MachineOperand &TempMO : MI->operands()) {
-      if (((TempMO.getTargetFlags() & PPCII::MO_TPREL_FLAG) != 0) &&
+      if (((TempMO.getTargetFlags() == PPCII::MO_TPREL_FLAG)) &&
           TempMO.getOperandNo() == 1)
         OpNum = 1;
     }
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index f073bbc08e6661c..8b8c1a0832cf915 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -7600,7 +7600,7 @@ static bool isEligibleToFoldADDIForLocalExecAccesses(SDNode *N,
   // The local-exec TLS variable should only have the MO_TPREL_FLAG target flag,
   // so this optimization is not performed otherwise if the flag is not set.
   unsigned TargetFlags = GA->getTargetFlags();
-  if ((TargetFlags & PPCII::MO_TPREL_FLAG) == 0)
+  if (TargetFlags != PPCII::MO_TPREL_FLAG)
     return false;
 
   // If all conditions are satisfied, the ADDI is valid for folding.

>From d22274688195a06773ef113adf3c19732ffdd23a Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1 at ibm.com>
Date: Wed, 10 Jan 2024 08:53:43 -0600
Subject: [PATCH 12/16] Add hyphens to variable-to-address comment/assert

---
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 5c13f8a69d51525..3a836bf839bc189 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1638,13 +1638,13 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
 
   bool IsGlobalADeclaration = GValue->isDeclarationForLinker();
   // Find the GlobalVariable that corresponds to the particular TLS variable
-  // in the TLS variable to address mapping. All TLS variables should exist
+  // in the TLS variable-to-address mapping. All TLS variables should exist
   // within this map, with the exception of TLS variables marked as extern.
   const auto TLSVarsMapEntryIter = TLSVarsToAddressMapping.find(GValue);
   if (TLSVarsMapEntryIter == TLSVarsToAddressMapping.end())
     assert(IsGlobalADeclaration &&
            "Only expecting to find extern TLS variables not present in the TLS "
-           "variables to address map!");
+           "variable-to-address map!");
 
   unsigned TLSVarAddress =
       IsGlobalADeclaration ? 0 : TLSVarsMapEntryIter->second;

>From d88edb4086cab575fb59ff4295119566400031d7 Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1 at ibm.com>
Date: Thu, 18 Jan 2024 16:34:16 -0600
Subject: [PATCH 13/16] Address various comments: moving around variables,
 removing unnecessary variables, etc.

---
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp   | 25 +++++++++------------
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 24 +++++++++++---------
 2 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 3a836bf839bc189..6ef7628f85a8461 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1592,10 +1592,9 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
              "addi, or load/stores with thread-pointer only expected with "
              "local-exec small TLS");
 
-      int64_t Offset = MO.getOffset();
       LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
 
-      const MCExpr *Expr = getAdjustedLocalExecExpr(MO, Offset);
+      const MCExpr *Expr = getAdjustedLocalExecExpr(MO, MO.getOffset());
       if (Expr)
         TmpInst.getOperand(OpNum) = MCOperand::createExpr(Expr);
 
@@ -1622,19 +1621,15 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
 // greater than 32KB, a new MCExpr is produced to accommodate this situation.
 const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
                                                       int64_t Offset) {
-  assert(MO.isGlobal() && "Only expecting a global MachineOperand here!");
-  const GlobalValue *GValue = MO.getGlobal();
-  TLSModel::Model Model = TM.getTLSModel(GValue);
-  assert(Model == TLSModel::LocalExec &&
-         "Only local-exec accesses are handled!");
-  MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_PPC_AIX_TLSLE;
-
   // Non-zero offsets (for loads, stores or `addi`) require additional handling.
   // When the offset is zero, there is no need to create an adjusted MCExpr.
   if (!Offset)
     return nullptr;
-  const MCExpr *Expr =
-      MCSymbolRefExpr::create(getSymbol(GValue), RefKind, OutContext);
+
+  assert(MO.isGlobal() && "Only expecting a global MachineOperand here!");
+  const GlobalValue *GValue = MO.getGlobal();
+  assert(TM.getTLSModel(GValue) == TLSModel::LocalExec &&
+         "Only local-exec accesses are handled!");
 
   bool IsGlobalADeclaration = GValue->isDeclarationForLinker();
   // Find the GlobalVariable that corresponds to the particular TLS variable
@@ -1654,6 +1649,8 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
   // non-zero offset to the TLS variable address.
   // For when TLS variables are extern, this is safe to do because we can
   // assume that the address of extern TLS variables are zero.
+  const MCExpr *Expr = MCSymbolRefExpr::create(
+      getSymbol(GValue), MCSymbolRefExpr::VK_PPC_AIX_TLSLE, OutContext);
   Expr = MCBinaryExpr::createAdd(
       Expr, MCConstantExpr::create(Offset, OutContext), OutContext);
   if (FinalAddress >= 32768) {
@@ -2882,15 +2879,13 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) {
   // For all TLS variables, calculate their corresponding addresses and store
   // them into TLSVarsToAddressMapping, which will be used to determine whether
   // or not local-exec TLS variables require special assembly printing.
-  uint64_t Address = 0;
   uint64_t TLSVarAddress = 0;
   auto DL = M.getDataLayout();
   for (const auto &G : M.globals()) {
     if (G.isThreadLocal() && !G.isDeclaration()) {
-      TLSVarAddress = alignTo(Address, getGVAlignment(&G, DL));
-      unsigned GVSize = DL.getTypeAllocSize(G.getValueType());
-      Address = TLSVarAddress + GVSize;
+      TLSVarAddress = alignTo(TLSVarAddress, getGVAlignment(&G, DL));
       TLSVarsToAddressMapping[&G] = TLSVarAddress;
+      TLSVarAddress += DL.getTypeAllocSize(G.getValueType());
     }
   }
 
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 8b8c1a0832cf915..ac042b0e0dea9cf 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -7566,15 +7566,8 @@ static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) {
 }
 
 // Is an ADDI eligible for folding for non-TOC-based local-exec accesses?
-static bool isEligibleToFoldADDIForLocalExecAccesses(SDNode *N,
-                                                     SelectionDAG *DAG,
+static bool isEligibleToFoldADDIForLocalExecAccesses(SelectionDAG *DAG,
                                                      SDValue ADDIToFold) {
-  const PPCSubtarget &Subtarget =
-      DAG->getMachineFunction().getSubtarget<PPCSubtarget>();
-  // This optimization is only performed for non-TOC-based local-exec accesses.
-  if (!Subtarget.hasAIXSmallLocalExecTLS())
-    return false;
-
   // Check if ADDIToFold (the ADDI that we want to fold into local-exec
   // accesses), is truly an ADDI.
   if (!ADDIToFold.isMachineOpcode() ||
@@ -7586,6 +7579,8 @@ static bool isEligibleToFoldADDIForLocalExecAccesses(SDNode *N,
   // addi is the thread pointer.
   SDValue TPRegNode = ADDIToFold.getOperand(0);
   RegisterSDNode *TPReg = dyn_cast_or_null<RegisterSDNode>(TPRegNode.getNode());
+  const PPCSubtarget &Subtarget =
+      DAG->getMachineFunction().getSubtarget<PPCSubtarget>();
   if (!TPReg || (TPReg->getReg() != Subtarget.getThreadPointerRegister()))
     return false;
 
@@ -7622,7 +7617,7 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
   // we want optimized out.
   SDValue InitialADDI = N->getOperand(0);
 
-  if (!isEligibleToFoldADDIForLocalExecAccesses(N, DAG, InitialADDI))
+  if (!isEligibleToFoldADDIForLocalExecAccesses(DAG, InitialADDI))
     return;
 
   // At this point, InitialADDI can be folded into a non-TOC-based local-exec
@@ -7661,6 +7656,9 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
 
 void PPCDAGToDAGISel::PeepholePPC64() {
   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
+  const PPCSubtarget &Subtarget =
+      CurDAG->getMachineFunction().getSubtarget<PPCSubtarget>();
+  bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS();
 
   while (Position != CurDAG->allnodes_begin()) {
     SDNode *N = &*--Position;
@@ -7671,7 +7669,9 @@ void PPCDAGToDAGISel::PeepholePPC64() {
     if (isVSXSwap(SDValue(N, 0)))
       reduceVSXSwap(N, CurDAG);
 
-    foldADDIForLocalExecAccesses(N, CurDAG);
+    // This optimization is performed for non-TOC-based local-exec accesses.
+    if (HasAIXSmallLocalExecTLS)
+      foldADDIForLocalExecAccesses(N, CurDAG);
 
     unsigned FirstOp;
     unsigned StorageOpcode = N->getMachineOpcode();
@@ -7829,7 +7829,9 @@ void PPCDAGToDAGISel::PeepholePPC64() {
         ImmOpnd = CurDAG->getTargetConstant(Offset, SDLoc(ImmOpnd),
                                             ImmOpnd.getValueType());
       } else if (Offset != 0) {
-        if (isEligibleToFoldADDIForLocalExecAccesses(N, CurDAG, Base)) {
+        // This optimization is performed for non-TOC-based local-exec accesses.
+        if (HasAIXSmallLocalExecTLS &&
+            isEligibleToFoldADDIForLocalExecAccesses(CurDAG, Base)) {
           // Add the non-zero offset information into the load or store
           // instruction to be used for non-TOC-based local-exec accesses.
           GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd);

>From bc60aa8fafa6cc8a31ec152cf5f9fed76f5bef53 Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1 at ibm.com>
Date: Thu, 18 Jan 2024 17:09:10 -0600
Subject: [PATCH 14/16] Remove unnecessary subtarget variable.

---
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index ac042b0e0dea9cf..94d89c0fb89bccd 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -7656,9 +7656,7 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
 
 void PPCDAGToDAGISel::PeepholePPC64() {
   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
-  const PPCSubtarget &Subtarget =
-      CurDAG->getMachineFunction().getSubtarget<PPCSubtarget>();
-  bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS();
+  bool HasAIXSmallLocalExecTLS = Subtarget->hasAIXSmallLocalExecTLS();
 
   while (Position != CurDAG->allnodes_begin()) {
     SDNode *N = &*--Position;

>From 3563b6a0cc4a096a0373c47a76a8d91d4f77ecd9 Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1 at ibm.com>
Date: Wed, 24 Jan 2024 14:10:11 -0600
Subject: [PATCH 15/16] Add an assert and common up code from load/stores

---
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp   | 45 +++++++--------------
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp |  4 +-
 2 files changed, 17 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 6ef7628f85a8461..450bd68f8d395fa 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1526,37 +1526,11 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
       const DataLayout &DL = MO.getGlobal()->getParent()->getDataLayout();
       if (MO.getGlobal()->getPointerAlignment(DL) < 4)
         llvm_unreachable("Global must be word-aligned for LD, STD, LWA!");
-
-      // A faster non-TOC-based local-exec sequence is represented by
-      // directly loading or storing off of the thread pointer and with
-      // an immediate operand having the MO_TPREL_FLAG.
-      // Such instructions do not otherwise arise.
-      unsigned Flag = MO.getTargetFlags();
-      if (Flag == PPCII::MO_TPREL_FLAG) {
-        assert(HasAIXSmallLocalExecTLS &&
-               "loads/stores with thread-pointer only expected with "
-               "local-exec small TLS");
-        int64_t Offset = MO.getOffset();
-        LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
-        const MCExpr *Expr = getAdjustedLocalExecExpr(MO, Offset);
-        if (Expr)
-          TmpInst.getOperand(OpNum) = MCOperand::createExpr(Expr);
-        EmitToStreamer(*OutStreamer, TmpInst);
-        return;
-      }
     }
-    // Now process the instruction normally.
-    break;
-  }
-  case PPC::PseudoEIEIO: {
-    EmitToStreamer(
-        *OutStreamer,
-        MCInstBuilder(PPC::ORI).addReg(PPC::X2).addReg(PPC::X2).addImm(0));
-    EmitToStreamer(
-        *OutStreamer,
-        MCInstBuilder(PPC::ORI).addReg(PPC::X2).addReg(PPC::X2).addImm(0));
-    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::EnforceIEIO));
-    return;
+    // As these load/stores share common code with the following load/stores,
+    // fall through to the subsequent cases in order to either process the
+    // non-TOC-based local-exec sequence or to process the instruction normally.
+    [[fallthrough]];
   }
   case PPC::LBZ:
   case PPC::LBZ8:
@@ -1605,8 +1579,19 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
       EmitToStreamer(*OutStreamer, TmpInst);
       return;
     }
+    // Now process the instruction normally.
     break;
   }
+  case PPC::PseudoEIEIO: {
+    EmitToStreamer(
+        *OutStreamer,
+        MCInstBuilder(PPC::ORI).addReg(PPC::X2).addReg(PPC::X2).addImm(0));
+    EmitToStreamer(
+        *OutStreamer,
+        MCInstBuilder(PPC::ORI).addReg(PPC::X2).addReg(PPC::X2).addImm(0));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::EnforceIEIO));
+    return;
+  }
   }
 
   LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 94d89c0fb89bccd..49f49380ccdf189 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -7833,8 +7833,8 @@ void PPCDAGToDAGISel::PeepholePPC64() {
           // Add the non-zero offset information into the load or store
           // instruction to be used for non-TOC-based local-exec accesses.
           GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd);
-          if (!GA)
-            continue;
+          assert(GA && "Expecting a valid GlobalAddressSDNode when folding "
+                       "addi into local-exec accesses!");
           ImmOpnd = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA),
                                                    MVT::i64, Offset,
                                                    GA->getTargetFlags());

>From 859368e601f6d65ee720a2179625e677d322f32a Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1 at ibm.com>
Date: Fri, 26 Jan 2024 22:59:31 -0600
Subject: [PATCH 16/16] Update assert to early exit

---
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 450bd68f8d395fa..5098bc8bf053e27 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1555,6 +1555,8 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // or a load/store instruction (that directly loads or stores off of the
     // thread pointer) with an immediate operand having the MO_TPREL_FLAG.
     // Such instructions do not otherwise arise.
+    if (!HasAIXSmallLocalExecTLS)
+      break;
     bool IsMIADDI8 = MI->getOpcode() == PPC::ADDI8;
     unsigned OpNum = IsMIADDI8 ? 2 : 1;
     const MachineOperand &MO = MI->getOperand(OpNum);
@@ -1562,10 +1564,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     if (Flag == PPCII::MO_TPREL_FLAG ||
         Flag == PPCII::MO_GOT_TPREL_PCREL_FLAG ||
         Flag == PPCII::MO_TPREL_PCREL_FLAG) {
-      assert(HasAIXSmallLocalExecTLS &&
-             "addi, or load/stores with thread-pointer only expected with "
-             "local-exec small TLS");
-
       LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
 
       const MCExpr *Expr = getAdjustedLocalExecExpr(MO, MO.getOffset());



More information about the llvm-commits mailing list