[llvm] 3f46e54 - [AIX][TLS] Produce a faster local-exec access sequence with -maix-small-local-exec-tls (And optimize when load/store offsets are 0)

Amy Kwan via llvm-commits llvm-commits@lists.llvm.org
Thu Sep 7 18:06:00 PDT 2023


Author: Amy Kwan
Date: 2023-09-07T20:05:29-05:00
New Revision: 3f46e5453d9310b15d974e876f6132e3cf50c4b1

URL: https://github.com/llvm/llvm-project/commit/3f46e5453d9310b15d974e876f6132e3cf50c4b1
DIFF: https://github.com/llvm/llvm-project/commit/3f46e5453d9310b15d974e876f6132e3cf50c4b1.diff

LOG: [AIX][TLS] Produce a faster local-exec access sequence with -maix-small-local-exec-tls (And optimize when load/store offsets are 0)

This patch utilizes the -maix-small-local-exec-tls option added in
D155544 to produce a faster access sequence for the local-exec TLS
model, where loading from the TOC can be avoided.

The patch either produces an addi/la with a displacement off of r13
(the thread pointer) when the address is calculated, or it produces an
addi/la followed by a load/store when the address is calculated and
used for further accesses.

This patch also optimizes this sequence a bit more where we can remove
the addi/la when the load/store offset is 0. A follow up patch will
be posted to account for when the load/store offset is non-zero, and
currently in these situations we keep the addi/la that precedes the
load/store.

Furthermore, this access sequence is only performed for TLS variables
that are less than ~32KB in size.

Differential Revision: https://reviews.llvm.org/D155600

Added: 
    llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-char.ll
    llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-double.ll
    llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-float.ll
    llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-int.ll
    llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll
    llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-longlong.ll
    llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-short.ll

Modified: 
    llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
    llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
    llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
    llvm/lib/Target/PowerPC/PPCISelLowering.cpp
    llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
    llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-O0.ll
    llvm/test/CodeGen/PowerPC/ppc64-nonfunc-calls.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
index ffdbfc4ddb01341..065daf42fe6eb0c 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
@@ -69,6 +69,8 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
       return {XCOFF::RelocationType::R_TOCU, SignAndSizeForHalf16};
     case MCSymbolRefExpr::VK_PPC_L:
       return {XCOFF::RelocationType::R_TOCL, SignAndSizeForHalf16};
+    case MCSymbolRefExpr::VK_PPC_AIX_TLSLE:
+      return {XCOFF::RelocationType::R_TLS_LE, SignAndSizeForHalf16};
     }
   } break;
   case PPC::fixup_ppc_half16ds:
@@ -82,6 +84,8 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
       return {XCOFF::RelocationType::R_TOC, 15};
     case MCSymbolRefExpr::VK_PPC_L:
       return {XCOFF::RelocationType::R_TOCL, 15};
+    case MCSymbolRefExpr::VK_PPC_AIX_TLSLE:
+      return {XCOFF::RelocationType::R_TLS_LE, 15};
     }
   } break;
   case PPC::fixup_ppc_br24:

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index cf51fa9ac9c4267..4b97e3e1a09152f 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1534,6 +1534,22 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::EnforceIEIO));
     return;
   }
+  case PPC::ADDI8: {
+    // The faster non-TOC-based local-exec sequence is represented by `addi`
+    // with an immediate operand having the MO_TPREL_FLAG. Such an instruction
+    // does not otherwise arise.
+    const MachineOperand &MO = MI->getOperand(2);
+    if ((MO.getTargetFlags() & PPCII::MO_TPREL_FLAG) != 0) {
+      assert(
+          Subtarget->hasAIXSmallLocalExecTLS() &&
+          "addi with thread-pointer only expected with local-exec small TLS");
+      LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
+      TmpInst.setOpcode(PPC::LA8);
+      EmitToStreamer(*OutStreamer, TmpInst);
+      return;
+    }
+    break;
+  }
   }
 
   LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);

diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 1e9ddecb4d1527a..61a3138007db900 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -7654,13 +7654,6 @@ void PPCDAGToDAGISel::PeepholePPC64() {
       // is already in place on the operand, so copying the operand
       // is sufficient.
       ReplaceFlags = false;
-      // For these cases, the immediate may not be divisible by 4, in
-      // which case the fold is illegal for DS-form instructions.  (The
-      // other cases provide aligned addresses and are always safe.)
-      if (RequiresMod4Offset &&
-          (!isa<ConstantSDNode>(Base.getOperand(1)) ||
-           Base.getConstantOperandVal(1) % 4 != 0))
-        continue;
       break;
     case PPC::ADDIdtprelL:
       Flags = PPCII::MO_DTPREL_LO;
@@ -7712,6 +7705,18 @@ void PPCDAGToDAGISel::PeepholePPC64() {
         UpdateHBase = true;
       }
     } else {
+      // Global addresses can be folded, but only if they are sufficiently
+      // aligned.
+      if (RequiresMod4Offset) {
+        if (GlobalAddressSDNode *GA =
+                dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
+          const GlobalValue *GV = GA->getGlobal();
+          Align Alignment = GV->getPointerAlignment(CurDAG->getDataLayout());
+          if (Alignment < 4)
+            continue;
+        }
+      }
+
       // If we're directly folding the addend from an addi instruction, then:
       //  1. In general, the offset on the memory access must be zero.
       //  2. If the addend is a constant, then it can be combined with a

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 6bc89891c0dc44d..d4a2ee3641f3ccc 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -148,6 +148,12 @@ static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);
 
 static const char AIXSSPCanaryWordName[] = "__ssp_canary_word";
 
+// A faster local-exec TLS access sequence (enabled with the
+// -maix-small-local-exec-tls option) can be produced for TLS variables;
+// consistent with the IBM XL compiler, we apply a max size of slightly under
+// 32KB.
+constexpr uint64_t AIXSmallTlsPolicySizeLimit = 32751;
+
 // FIXME: Remove this once the bug has been fixed!
 extern cl::opt<bool> ANDIGlueBug;
 
@@ -3355,14 +3361,16 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
   const GlobalValue *GV = GA->getGlobal();
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   bool Is64Bit = Subtarget.isPPC64();
+  bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS();
   TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
+  bool IsTLSLocalExecModel = Model == TLSModel::LocalExec;
 
-  if (Model == TLSModel::LocalExec || Model == TLSModel::InitialExec) {
+  if (IsTLSLocalExecModel || Model == TLSModel::InitialExec) {
     SDValue VariableOffsetTGA =
         DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_FLAG);
     SDValue VariableOffset = getTOCEntry(DAG, dl, VariableOffsetTGA);
     SDValue TLSReg;
-    if (Is64Bit)
+    if (Is64Bit) {
       // For local-exec and initial-exec on AIX (64-bit), the sequence generated
       // involves a load of the variable offset (from the TOC), followed by an
       // add of the loaded variable offset to R13 (the thread pointer).
@@ -3370,7 +3378,22 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
       //    ld reg1,var[TC](2)
       //    add reg2, reg1, r13     // r13 contains the thread pointer
       TLSReg = DAG.getRegister(PPC::X13, MVT::i64);
-    else
+
+      // With the -maix-small-local-exec-tls option, produce a faster access
+      // sequence for local-exec TLS variables where the offset from the TLS
+      // base is encoded as an immediate operand.
+      //
+      // We only utilize the faster local-exec access sequence when the TLS
+      // variable has a size within the policy limit. We treat types that are
+      // not sized or are empty as being over the policy size limit.
+      if (HasAIXSmallLocalExecTLS && IsTLSLocalExecModel) {
+        Type *GVType = GV->getValueType();
+        if (GVType->isSized() && !GVType->isEmptyTy() &&
+            GV->getParent()->getDataLayout().getTypeAllocSize(GVType) <=
+                AIXSmallTlsPolicySizeLimit)
+          return DAG.getNode(PPCISD::Lo, dl, PtrVT, VariableOffsetTGA, TLSReg);
+      }
+    } else {
       // For local-exec and initial-exec on AIX (32-bit), the sequence generated
       // involves loading the variable offset from the TOC, generating a call to
       // .__get_tpointer to get the thread pointer (which will be in R3), and
@@ -3379,6 +3402,13 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
       //    bla .__get_tpointer
       //    add reg2, reg1, r3
       TLSReg = DAG.getNode(PPCISD::GET_TPOINTER, dl, PtrVT);
+
+      // We do not implement the 32-bit version of the faster access sequence
+      // for local-exec that is controlled by -maix-small-local-exec-tls.
+      if (HasAIXSmallLocalExecTLS)
+        report_fatal_error("The small-local-exec TLS access sequence is "
+                           "currently only supported on AIX (64-bit mode).");
+    }
     return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TLSReg, VariableOffset);
   }
 

diff --git a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
index 976effb96adc0d8..ba5465f8bb6edcd 100644
--- a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -31,22 +31,19 @@ using namespace llvm;
 
 static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO,
                                       AsmPrinter &AP) {
-  const TargetMachine &TM = AP.TM;
-  Mangler &Mang = TM.getObjFileLowering()->getMangler();
-  const DataLayout &DL = AP.getDataLayout();
-  MCContext &Ctx = AP.OutContext;
-
-  SmallString<128> Name;
-  if (!MO.isGlobal()) {
-    assert(MO.isSymbol() && "Isn't a symbol reference");
-    Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL);
-  } else {
+  if (MO.isGlobal()) {
     const GlobalValue *GV = MO.getGlobal();
-    TM.getNameWithPrefix(Name, GV, Mang);
+    return AP.getSymbol(GV);
   }
 
-  MCSymbol *Sym = Ctx.getOrCreateSymbol(Name);
+  assert(MO.isSymbol() && "Isn't a symbol reference");
+
+  SmallString<128> Name;
+  const DataLayout &DL = AP.getDataLayout();
+  Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL);
 
+  MCContext &Ctx = AP.OutContext;
+  MCSymbol *Sym = Ctx.getOrCreateSymbol(Name);
   return Sym;
 }
 
@@ -80,6 +77,8 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
       break;
   }
 
+  const TargetMachine &TM = Printer.TM;
+
   if (MO.getTargetFlags() == PPCII::MO_PLT)
     RefKind = MCSymbolRefExpr::VK_PLT;
   else if (MO.getTargetFlags() == PPCII::MO_PCREL_FLAG)
@@ -94,12 +93,21 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
     RefKind = MCSymbolRefExpr::VK_PPC_GOT_TLSLD_PCREL;
   else if (MO.getTargetFlags() == PPCII::MO_GOT_TPREL_PCREL_FLAG)
     RefKind = MCSymbolRefExpr::VK_PPC_GOT_TPREL_PCREL;
+  else if (MO.getTargetFlags() == PPCII::MO_TPREL_FLAG) {
+    assert(MO.isGlobal() && "Only expecting a global MachineOperand here!");
+    TLSModel::Model Model = TM.getTLSModel(MO.getGlobal());
+    // For the local-exec TLS model, we may generate the offset from the TLS
+    // base as an immediate operand (instead of using a TOC entry).
+    // Set the relocation type in case the result is used for purposes other
+    // than a TOC reference. In TOC reference cases, this result is discarded.
+    if (Model == TLSModel::LocalExec)
+      RefKind = MCSymbolRefExpr::VK_PPC_AIX_TLSLE;
+  }
 
   const MachineInstr *MI = MO.getParent();
   const MachineFunction *MF = MI->getMF();
   const Module *M = MF->getFunction().getParent();
   const PPCSubtarget *Subtarget = &(MF->getSubtarget<PPCSubtarget>());
-  const TargetMachine &TM = Printer.TM;
 
   unsigned MIOpcode = MI->getOpcode();
   assert((Subtarget->isUsingPCRelativeCalls() || MIOpcode != PPC::BL8_NOTOC) &&

diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-char.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-char.ll
new file mode 100644
index 000000000000000..e23549e5e87126c
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-char.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff -mattr=+aix-small-local-exec-tls < %s \
+; RUN:      | FileCheck %s --check-prefix=SMALL-LOCAL-EXEC-SMALLCM64
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff --code-model=large \
+; RUN:      -mattr=+aix-small-local-exec-tls < %s | FileCheck %s \
+; RUN:      --check-prefix=SMALL-LOCAL-EXEC-LARGECM64
+
+@ThreadLocalVarInit = thread_local(localexec) global i8 1, align 1
+@VarInit = local_unnamed_addr global i8 87, align 1
+@IThreadLocalVarInit = internal thread_local(localexec) global i8 1, align 1
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1
+@c = thread_local(localexec) global [87 x i8] zeroinitializer, align 1
+
+define nonnull ptr @AddrTest1() local_unnamed_addr #0 {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: AddrTest1:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, c[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    addi r3, r3, 1
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: AddrTest1:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, c[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addi r3, r3, 1
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 1 ptr @llvm.threadlocal.address.p0(ptr align 1 @c)
+  %arrayidx = getelementptr inbounds [87 x i8], ptr %0, i64 0, i64 1
+  ret ptr %arrayidx
+}
+
+define void @storeITLInit(i8 noundef zeroext %x) {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: storeITLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stb r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: storeITLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stb r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 1 ptr @llvm.threadlocal.address.p0(ptr align 1 @IThreadLocalVarInit)
+  store i8 %x, ptr %0, align 1
+  ret void
+}
+
+define void @storeTLInit(i8 noundef zeroext %x) {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: storeTLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stb r3, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: storeTLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stb r3, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 1 ptr @llvm.threadlocal.address.p0(ptr align 1 @ThreadLocalVarInit)
+  store i8 %x, ptr %0, align 1
+  ret void
+}
+
+define zeroext i8 @loadITLInit() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadITLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lbz r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadITLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lbz r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 1 ptr @llvm.threadlocal.address.p0(ptr align 1 @IThreadLocalVarInit)
+  %1 = load i8, ptr %0, align 1
+  ret i8 %1
+}
+
+define zeroext i8 @loadITLInit2() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadITLInit2:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # @VarInit
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lbz r4, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lbz r3, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    add r3, r3, r4
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    clrldi r3, r3, 56
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadITLInit2:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r3, L..C0@u(r2)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lbz r4, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, L..C0@l(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lbz r3, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    add r3, r3, r4
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    clrldi r3, r3, 56
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 1 ptr @llvm.threadlocal.address.p0(ptr align 1 @IThreadLocalVarInit)
+  %1 = load i8, ptr %0, align 1
+  %2 = load i8, ptr @VarInit, align 1
+  %add = add i8 %2, %1
+  ret i8 %add
+}
+
+define zeroext i8 @loadTLInit() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadTLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lbz r3, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadTLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lbz r3, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 1 ptr @llvm.threadlocal.address.p0(ptr align 1 @ThreadLocalVarInit)
+  %1 = load i8, ptr %0, align 1
+  ret i8 %1
+}
+
+define zeroext i8 @loadTLInit2() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadTLInit2:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # @VarInit
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lbz r4, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lbz r3, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    add r3, r3, r4
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    clrldi r3, r3, 56
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadTLInit2:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r3, L..C0@u(r2)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lbz r4, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, L..C0@l(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lbz r3, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    add r3, r3, r4
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    clrldi r3, r3, 56
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 1 ptr @llvm.threadlocal.address.p0(ptr align 1 @ThreadLocalVarInit)
+  %1 = load i8, ptr %0, align 1
+  %2 = load i8, ptr @VarInit, align 1
+  %add = add i8 %2, %1
+  ret i8 %add
+}
+
+define void @loadStore1(i8 noundef zeroext %x) {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadStore1:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lbz r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    addi r3, r3, 9
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stb r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadStore1:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lbz r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addi r3, r3, 9
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stb r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 1 ptr @llvm.threadlocal.address.p0(ptr align 1 @IThreadLocalVarInit)
+  %1 = load i8, ptr %0, align 1
+  %add = add i8 %1, 9
+  store i8 %add, ptr %0, align 1
+  ret void
+}

diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-double.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-double.ll
new file mode 100644
index 000000000000000..94c67a693b447d8
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-double.ll
@@ -0,0 +1,179 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff -mattr=+aix-small-local-exec-tls < %s \
+; RUN:      | FileCheck %s --check-prefix=SMALL-LOCAL-EXEC-SMALLCM64
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff --code-model=large \
+; RUN:      -mattr=+aix-small-local-exec-tls < %s | FileCheck %s \
+; RUN:      --check-prefix=SMALL-LOCAL-EXEC-LARGECM64
+
+@ThreadLocalVarInit = thread_local(localexec) global double 1.000000e+00, align 8
+@VarInit = local_unnamed_addr global double 8.700000e+01, align 8
+@IThreadLocalVarInit = internal thread_local(localexec) global double 1.000000e+00, align 8
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1
+@f = thread_local(localexec) global [87 x double] zeroinitializer, align 8
+
+define nonnull ptr @AddrTest1() local_unnamed_addr #0 {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: AddrTest1:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, f[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    addi r3, r3, 48
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: AddrTest1:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, f[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addi r3, r3, 48
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @f)
+  %arrayidx = getelementptr inbounds [87 x double], ptr %0, i64 0, i64 6
+  ret ptr %arrayidx
+}
+
+define void @storeITLInit(double noundef %x) {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: storeITLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stfd f1, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: storeITLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stfd f1, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @IThreadLocalVarInit)
+  store double %x, ptr %0, align 8
+  ret void
+}
+
+define void @storeTLInit(double noundef %x) {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: storeTLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stfd f1, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: storeTLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stfd f1, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @ThreadLocalVarInit)
+  store double %x, ptr %0, align 8
+  ret void
+}
+
+define double @loadITLInit() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadITLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lfd f1, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadITLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lfd f1, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @IThreadLocalVarInit)
+  %1 = load double, ptr %0, align 8
+  ret double %1
+}
+
+define double @loadITLInit2() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadITLInit2:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # @VarInit
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lfd f0, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lfd f1, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    xsadddp f1, f0, f1
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadITLInit2:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r3, L..C0@u(r2)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lfd f0, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, L..C0@l(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lfd f1, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    xsadddp f1, f0, f1
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @IThreadLocalVarInit)
+  %1 = load double, ptr %0, align 8
+  %2 = load double, ptr @VarInit, align 8
+  %add = fadd double %1, %2
+  ret double %add
+}
+
+define double @loadTLInit() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadTLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lfd f1, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadTLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lfd f1, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @ThreadLocalVarInit)
+  %1 = load double, ptr %0, align 8
+  ret double %1
+}
+
+define double @loadTLInit2() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadTLInit2:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # @VarInit
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lfd f0, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lfd f1, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    xsadddp f1, f0, f1
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadTLInit2:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r3, L..C0@u(r2)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lfd f0, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, L..C0@l(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lfd f1, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    xsadddp f1, f0, f1
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @ThreadLocalVarInit)
+  %1 = load double, ptr %0, align 8
+  %2 = load double, ptr @VarInit, align 8
+  %add = fadd double %1, %2
+  ret double %add
+}
+
+define void @loadStore1(double noundef %x) {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadStore1:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    vspltisw v2, 1
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lfd f1, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    xvcvsxwdp vs0, vs34
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    vspltisw v2, 8
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    xsadddp f0, f1, f0
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    xvcvsxwdp vs1, vs34
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    xsadddp f0, f0, f1
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stfd f0, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadStore1:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    vspltisw v2, 1
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lfd f1, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    xvcvsxwdp vs0, vs34
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    vspltisw v2, 8
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    xsadddp f0, f1, f0
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    xvcvsxwdp vs1, vs34
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    xsadddp f0, f0, f1
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stfd f0, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @IThreadLocalVarInit)
+  %1 = load double, ptr %0, align 8
+  %inc = fadd double %1, 1.000000e+00
+  %add = fadd double %inc, 8.000000e+00
+  store double %add, ptr %0, align 8
+  ret void
+}

diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-float.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-float.ll
new file mode 100644
index 000000000000000..6e177de29ecc16d
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-float.ll
@@ -0,0 +1,179 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff -mattr=+aix-small-local-exec-tls < %s \
+; RUN:      | FileCheck %s --check-prefix=SMALL-LOCAL-EXEC-SMALLCM64
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff --code-model=large \
+; RUN:      -mattr=+aix-small-local-exec-tls < %s | FileCheck %s \
+; RUN:      --check-prefix=SMALL-LOCAL-EXEC-LARGECM64
+
+@ThreadLocalVarInit = thread_local(localexec) global float 1.000000e+00, align 4
+@VarInit = local_unnamed_addr global float 8.700000e+01, align 4
+@IThreadLocalVarInit = internal thread_local(localexec) global float 1.000000e+00, align 4
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1
+@e = thread_local(localexec) global [87 x float] zeroinitializer, align 4
+
+define nonnull ptr @AddrTest1() local_unnamed_addr #0 {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: AddrTest1:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, e[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    addi r3, r3, 16
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: AddrTest1:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, e[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addi r3, r3, 16
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @e)
+  %arrayidx = getelementptr inbounds [87 x float], ptr %0, i64 0, i64 4
+  ret ptr %arrayidx
+}
+
+define void @storeITLInit(float noundef %x) {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: storeITLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stfs f1, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: storeITLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stfs f1, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @IThreadLocalVarInit)
+  store float %x, ptr %0, align 4
+  ret void
+}
+
+define void @storeTLInit(float noundef %x) {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: storeTLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stfs f1, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: storeTLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stfs f1, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @ThreadLocalVarInit)
+  store float %x, ptr %0, align 4
+  ret void
+}
+
+define float @loadITLInit() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadITLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lfs f1, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadITLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lfs f1, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @IThreadLocalVarInit)
+  %1 = load float, ptr %0, align 4
+  ret float %1
+}
+
+define float @loadITLInit2() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadITLInit2:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # @VarInit
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lfs f0, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lfs f1, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    fadds f1, f0, f1
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadITLInit2:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r3, L..C0@u(r2)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lfs f0, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, L..C0@l(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lfs f1, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    fadds f1, f0, f1
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @IThreadLocalVarInit)
+  %1 = load float, ptr %0, align 4
+  %2 = load float, ptr @VarInit, align 4
+  %add = fadd float %1, %2
+  ret float %add
+}
+
+define float @loadTLInit() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadTLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lfs f1, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadTLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lfs f1, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @ThreadLocalVarInit)
+  %1 = load float, ptr %0, align 4
+  ret float %1
+}
+
+define float @loadTLInit2() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadTLInit2:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # @VarInit
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lfs f0, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lfs f1, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    fadds f1, f0, f1
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadTLInit2:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r3, L..C0@u(r2)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lfs f0, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, L..C0@l(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lfs f1, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    fadds f1, f0, f1
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @ThreadLocalVarInit)
+  %1 = load float, ptr %0, align 4
+  %2 = load float, ptr @VarInit, align 4
+  %add = fadd float %1, %2
+  ret float %add
+}
+
+define void @loadStore1(float noundef %x) {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadStore1:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    vspltisw v2, 1
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lfs f1, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    xvcvsxwdp vs0, vs34
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    vspltisw v2, 8
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    xvcvsxwdp vs2, vs34
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    fadds f0, f1, f0
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    fadds f0, f0, f2
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stfs f0, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadStore1:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    vspltisw v2, 1
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lfs f1, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    xvcvsxwdp vs0, vs34
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    vspltisw v2, 8
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    xvcvsxwdp vs2, vs34
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    fadds f0, f1, f0
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    fadds f0, f0, f2
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stfs f0, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @IThreadLocalVarInit)
+  %1 = load float, ptr %0, align 4
+  %inc = fadd float %1, 1.000000e+00
+  %add = fadd float %inc, 8.000000e+00
+  store float %add, ptr %0, align 4
+  ret void
+}

diff  --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-int.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-int.ll
new file mode 100644
index 000000000000000..c3c919a5ca969eb
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-int.ll
@@ -0,0 +1,194 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff -mattr=+aix-small-local-exec-tls < %s \
+; RUN:      | FileCheck %s --check-prefix=SMALL-LOCAL-EXEC-SMALLCM64
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff --code-model=large \
+; RUN:      -mattr=+aix-small-local-exec-tls < %s | FileCheck %s \
+; RUN:      --check-prefix=SMALL-LOCAL-EXEC-LARGECM64
+
+@ThreadLocalVarInit = thread_local(localexec) global i32 1, align 4
+@VarInit = local_unnamed_addr global i32 87, align 4
+@IThreadLocalVarInit = internal thread_local(localexec) global i32 1, align 4
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1
+%struct.anon = type { i32 }
+@ThreadLocalStruct = thread_local(localexec) global %struct.anon zeroinitializer, align 1
+@a = thread_local(localexec) global [87 x i32] zeroinitializer, align 4
+
+define nonnull ptr @AddrTest1() local_unnamed_addr #0 {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: AddrTest1:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, a[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    addi r3, r3, 12
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: AddrTest1:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, a[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addi r3, r3, 12
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @a)
+  %arrayidx = getelementptr inbounds [87 x i32], ptr %0, i64 0, i64 3
+  ret ptr %arrayidx
+}
+
+define signext i32 @testUnaligned() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: testUnaligned:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, ThreadLocalStruct[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lwa r3, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: testUnaligned:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, ThreadLocalStruct[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lwa r3, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = call align 1 ptr @llvm.threadlocal.address.p0(ptr align 1 @ThreadLocalStruct)
+  %x = getelementptr inbounds %struct.anon, ptr %0, i32 0, i32 0
+  %1 = load i32, ptr %x, align 1
+  ret i32 %1
+}
+
+define void @storeITLInit(i32 noundef signext %x) {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: storeITLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: storeITLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @IThreadLocalVarInit)
+  store i32 %x, ptr %0, align 4
+  ret void
+}
+
+define void @storeTLInit(i32 noundef signext %x) {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: storeTLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: storeTLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @ThreadLocalVarInit)
+  store i32 %x, ptr %0, align 4
+  ret void
+}
+
+define signext i32 @loadITLInit() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadITLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lwa r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadITLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lwa r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @IThreadLocalVarInit)
+  %1 = load i32, ptr %0, align 4
+  ret i32 %1
+}
+
+define signext i32 @loadITLInit2() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadITLInit2:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # @VarInit
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lwz r4, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lwz r3, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    add r3, r3, r4
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    extsw r3, r3
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadITLInit2:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r3, L..C0@u(r2)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lwz r4, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, L..C0@l(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lwz r3, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    add r3, r3, r4
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    extsw r3, r3
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @IThreadLocalVarInit)
+  %1 = load i32, ptr %0, align 4
+  %2 = load i32, ptr @VarInit, align 4
+  %add = add nsw i32 %2, %1
+  ret i32 %add
+}
+
+define signext i32 @loadTLInit() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadTLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lwa r3, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadTLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lwa r3, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @ThreadLocalVarInit)
+  %1 = load i32, ptr %0, align 4
+  ret i32 %1
+}
+
+define signext i32 @loadTLInit2() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadTLInit2:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # @VarInit
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lwz r4, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lwz r3, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    add r3, r3, r4
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    extsw r3, r3
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadTLInit2:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r3, L..C0@u(r2)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lwz r4, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, L..C0@l(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lwz r3, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    add r3, r3, r4
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    extsw r3, r3
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @ThreadLocalVarInit)
+  %1 = load i32, ptr %0, align 4
+  %2 = load i32, ptr @VarInit, align 4
+  %add = add nsw i32 %2, %1
+  ret i32 %add
+}
+
+define void @loadStore1(i32 noundef signext %x) {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadStore1:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lwz r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    addi r3, r3, 9
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadStore1:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lwz r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addi r3, r3, 9
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @IThreadLocalVarInit)
+  %1 = load i32, ptr %0, align 4
+  %add = add nsw i32 %1, 9
+  store i32 %add, ptr %0, align 4
+  ret void
+}
+

diff  --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll
new file mode 100644
index 000000000000000..1f2b79413c1e440
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll
@@ -0,0 +1,249 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff -mattr=+aix-small-local-exec-tls < %s \
+; RUN:      | FileCheck %s --check-prefix=SMALL-LOCAL-EXEC-SMALLCM64
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff --code-model=large \
+; RUN:      -mattr=+aix-small-local-exec-tls < %s | FileCheck %s \
+; RUN:      --check-prefix=SMALL-LOCAL-EXEC-LARGECM64
+
+; Test disassembly of object.
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=+aix-small-local-exec-tls \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff -xcoff-traceback-table=false \
+; RUN:      --code-model=large -filetype=obj -o %t.o < %s
+; RUN: llvm-objdump -D -r --symbol-description %t.o | FileCheck --check-prefix=DIS %s
+
+@mySmallLocalExecTLSv1 = thread_local(localexec) global [8187 x i32] zeroinitializer, align 4
+@mySmallLocalExecTLS2 = thread_local(localexec) global [4000 x i32] zeroinitializer, align 4
+@mySmallLocalExecTLS3 = thread_local(localexec) global [4000 x i32] zeroinitializer, align 4
+@mySmallLocalExecTLS4 = thread_local(localexec) global [4000 x i32] zeroinitializer, align 4
+@mySmallLocalExecTLS5 = thread_local(localexec) global [4000 x i32] zeroinitializer, align 4
+@mySmallLocalExecTLSv2 = thread_local(localexec) global [9000 x i32] zeroinitializer, align 4
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1
+
+; All accesses use a "faster" local-exec sequence directly off the thread pointer.
+define signext i32 @StoreArrays1() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: StoreArrays1:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, mySmallLocalExecTLSv1[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 1
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r5, 4
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r6, mySmallLocalExecTLS2[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r7, 2
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, mySmallLocalExecTLSv1[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r4, mySmallLocalExecTLS3[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r5, 24(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 3
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r7, 320(r6)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, 324(r4)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r4, mySmallLocalExecTLS4[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r6, mySmallLocalExecTLS5[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r7, 88
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 102
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r5, 328(r4)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r7, 332(r6)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: StoreArrays1:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, mySmallLocalExecTLSv1[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r4, 1
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r5, 4
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r6, mySmallLocalExecTLS2[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r7, 2
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r4, mySmallLocalExecTLSv1[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r4, mySmallLocalExecTLS3[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r5, 24(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 3
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r7, 320(r6)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, 324(r4)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r4, mySmallLocalExecTLS4[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r6, mySmallLocalExecTLS5[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r7, 88
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 102
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r5, 328(r4)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r7, 332(r6)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @mySmallLocalExecTLSv1)
+  store i32 1, ptr %0, align 4
+  %arrayidx1 = getelementptr inbounds [8187 x i32], ptr %0, i64 0, i64 6
+  store i32 4, ptr %arrayidx1, align 4
+  %1 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @mySmallLocalExecTLS2)
+  %arrayidx2 = getelementptr inbounds [4000 x i32], ptr %1, i64 0, i64 80
+  store i32 2, ptr %arrayidx2, align 4
+  %2 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @mySmallLocalExecTLS3)
+  %arrayidx3 = getelementptr inbounds [4000 x i32], ptr %2, i64 0, i64 81
+  store i32 3, ptr %arrayidx3, align 4
+  %3 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @mySmallLocalExecTLS4)
+  %arrayidx4 = getelementptr inbounds [4000 x i32], ptr %3, i64 0, i64 82
+  store i32 4, ptr %arrayidx4, align 4
+  %4 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @mySmallLocalExecTLS5)
+  %arrayidx5 = getelementptr inbounds [4000 x i32], ptr %4, i64 0, i64 83
+  store i32 88, ptr %arrayidx5, align 4
+  %5 = load i32, ptr %0, align 4
+  %6 = load i32, ptr %arrayidx1, align 4
+  %7 = load i32, ptr %arrayidx2, align 4
+  %8 = load i32, ptr %arrayidx3, align 4
+  %9 = load i32, ptr %arrayidx4, align 4
+  %add = add i32 %5, 88
+  %add9 = add i32 %add, %6
+  %add11 = add i32 %add9, %7
+  %add13 = add i32 %add11, %8
+  %add15 = add i32 %add13, %9
+  ret i32 %add15
+}
+
+; Example of one access using the regular local-exec access from the TOC.
+define signext i32 @StoreArrays2() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: StoreArrays2:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # target-flags(ppc-tprel) @mySmallLocalExecTLSv2
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r4, 1
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r5, 4
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r6, mySmallLocalExecTLS2[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r7, 2
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    add r3, r13, r3
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r4, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r4, mySmallLocalExecTLS3[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r5, 24(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 3
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r7, 320(r6)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, 324(r4)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r4, mySmallLocalExecTLS4[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r6, mySmallLocalExecTLS5[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r7, 88
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    li r3, 102
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r5, 328(r4)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r7, 332(r6)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: StoreArrays2:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r3, L..C0@u(r2)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r4, 1
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r5, 4
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r6, mySmallLocalExecTLS2[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r7, 2
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, L..C0@l(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    add r3, r13, r3
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r4, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r4, mySmallLocalExecTLS3[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r5, 24(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 3
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r7, 320(r6)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, 324(r4)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r4, mySmallLocalExecTLS4[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r6, mySmallLocalExecTLS5[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r7, 88
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    li r3, 102
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r5, 328(r4)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r7, 332(r6)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @mySmallLocalExecTLSv2)
+  store i32 1, ptr %0, align 4
+  %arrayidx1 = getelementptr inbounds [9000 x i32], ptr %0, i64 0, i64 6
+  store i32 4, ptr %arrayidx1, align 4
+  %1 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @mySmallLocalExecTLS2)
+  %arrayidx2 = getelementptr inbounds [4000 x i32], ptr %1, i64 0, i64 80
+  store i32 2, ptr %arrayidx2, align 4
+  %2 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @mySmallLocalExecTLS3)
+  %arrayidx3 = getelementptr inbounds [4000 x i32], ptr %2, i64 0, i64 81
+  store i32 3, ptr %arrayidx3, align 4
+  %3 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @mySmallLocalExecTLS4)
+  %arrayidx4 = getelementptr inbounds [4000 x i32], ptr %3, i64 0, i64 82
+  store i32 4, ptr %arrayidx4, align 4
+  %4 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @mySmallLocalExecTLS5)
+  %arrayidx5 = getelementptr inbounds [4000 x i32], ptr %4, i64 0, i64 83
+  store i32 88, ptr %arrayidx5, align 4
+  %5 = load i32, ptr %0, align 4
+  %6 = load i32, ptr %arrayidx1, align 4
+  %7 = load i32, ptr %arrayidx2, align 4
+  %8 = load i32, ptr %arrayidx3, align 4
+  %9 = load i32, ptr %arrayidx4, align 4
+  %add = add i32 %5, 88
+  %add9 = add i32 %add, %6
+  %add11 = add i32 %add9, %7
+  %add13 = add i32 %add11, %8
+  %add15 = add i32 %add13, %9
+  ret i32 %add15
+}
+
+; DIS:      {{.*}}aix-small-local-exec-tls-largeaccess.ll.tmp.o:	file format aix5coff64-rs6000
+; DIS:      Disassembly of section .text:
+; DIS:      0000000000000000 (idx: 3) .StoreArrays1:
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 3, 13, 0
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 15) mySmallLocalExecTLSv1[TL]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 4, 1
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 5, 4
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 6, 13, 32748
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 17) mySmallLocalExecTLS2[TL]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 7, 2
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 4, 0(13)
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 15) mySmallLocalExecTLSv1[TL]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 4, 13, -16788
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 19) mySmallLocalExecTLS3[TL]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 5, 24(3)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 3
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 7, 320(6)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 3, 324(4)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 4, 13, -788
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 21) mySmallLocalExecTLS4[TL]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 6, 13, 15212
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 23) mySmallLocalExecTLS5[TL]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 7, 88
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 102
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 5, 328(4)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 7, 332(6)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                blr
+
+; DIS:      0000000000000050 (idx: 5) .StoreArrays2:
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addis 3, 2, 0
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCU	(idx: 13) mySmallLocalExecTLSv2[TE]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 4, 1
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 5, 4
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 6, 13, 32748
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 17) mySmallLocalExecTLS2[TL]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 7, 2
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                ld 3, 0(3)
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCL	(idx: 13) mySmallLocalExecTLSv2[TE]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                add 3, 13, 3
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 4, 0(3)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 4, 13, -16788
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 19) mySmallLocalExecTLS3[TL]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 5, 24(3)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 3
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 7, 320(6)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 3, 324(4)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 4, 13, -788
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 21) mySmallLocalExecTLS4[TL]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                addi 6, 13, 15212
+; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE	(idx: 23) mySmallLocalExecTLS5[TL]
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 7, 88
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                li 3, 102
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 5, 328(4)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                stw 7, 332(6)
+; DIS-NEXT: [[#%x, ADDR:]]: {{.*}}                blr
+
+; DIS:      Disassembly of section .data:
+; DIS:      00000000000000a0 (idx: 7) StoreArrays1[DS]:
+; DIS-NEXT:       a0: 00 00 00 00
+; DIS-NEXT: 00000000000000a0:  R_POS	(idx: 3) .StoreArrays1
+; DIS-NEXT:       a4: 00 00 00 00
+; DIS-NEXT:       a8: 00 00 00 00
+; DIS-NEXT: 00000000000000a8:  R_POS        (idx: 11) TOC[TC0]
+; DIS-NEXT:       ac: 00 00 00 d0
+
+; DIS:      00000000000000b8 (idx: 9) StoreArrays2[DS]:
+; DIS-NEXT:       b8: 00 00 00 00
+; DIS-NEXT: 00000000000000b8:  R_POS	(idx: 5) .StoreArrays2
+; DIS-NEXT:       bc: 00 00 00 50
+; DIS-NEXT:       c0: 00 00 00 00
+; DIS-NEXT: 00000000000000c0:  R_POS        (idx: 11) TOC[TC0]
+; DIS-NEXT:       c4: 00 00 00 d0
+
+; DIS:      00000000000000d0 (idx: 13) mySmallLocalExecTLSv2[TE]:
+; DIS-NEXT:       d0: 00 00 00 00
+; DIS-NEXT: 00000000000000d0:  R_TLS_LE     (idx: 25) mySmallLocalExecTLSv2[TL]
+; DIS-NEXT:       d4: 00 01 79 ec

diff  --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-longlong.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-longlong.ll
new file mode 100644
index 000000000000000..fd350838994b0c5
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-longlong.ll
@@ -0,0 +1,186 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff -mattr=+aix-small-local-exec-tls < %s \
+; RUN:      | FileCheck %s --check-prefix=SMALL-LOCAL-EXEC-SMALLCM64
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff --code-model=large \
+; RUN:      -mattr=+aix-small-local-exec-tls < %s | FileCheck %s \
+; RUN:      --check-prefix=SMALL-LOCAL-EXEC-LARGECM64
+
+ at ThreadLocalVarInit = thread_local(localexec) global i64 1, align 8
+ at VarInit = local_unnamed_addr global i64 87, align 8
+ at IThreadLocalVarInit = internal thread_local(localexec) global i64 1, align 8
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1
+%struct.anon = type { i64 }
+ at ThreadLocalStruct = thread_local(localexec) global %struct.anon zeroinitializer, align 1
+ at d = thread_local(localexec) global [87 x i64] zeroinitializer, align 8
+
+define nonnull ptr @AddrTest1() local_unnamed_addr #0 {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: AddrTest1:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, d[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: AddrTest1:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, d[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @d)
+  ret ptr %0
+}
+
+define i64 @testUnaligned() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: testUnaligned:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, ThreadLocalStruct[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: testUnaligned:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, ThreadLocalStruct[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = call align 1 ptr @llvm.threadlocal.address.p0(ptr align 1 @ThreadLocalStruct)
+  %x = getelementptr inbounds %struct.anon, ptr %0, i32 0, i32 0
+  %1 = load i64, ptr %x, align 1
+  ret i64 %1
+}
+
+define void @storeITLInit(i64 noundef %x) {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: storeITLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: storeITLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @IThreadLocalVarInit)
+  store i64 %x, ptr %0, align 8
+  ret void
+}
+
+define void @storeTLInit(i64 noundef %x) {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: storeTLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r3, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: storeTLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r3, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @ThreadLocalVarInit)
+  store i64 %x, ptr %0, align 8
+  ret void
+}
+
+define i64 @loadITLInit() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadITLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadITLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @IThreadLocalVarInit)
+  %1 = load i64, ptr %0, align 8
+  ret i64 %1
+}
+
+define i64 @loadITLInit2() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadITLInit2:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # @VarInit
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r4, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    add r3, r3, r4
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadITLInit2:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r3, L..C0@u(r2)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r4, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, L..C0@l(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    add r3, r3, r4
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @IThreadLocalVarInit)
+  %1 = load i64, ptr %0, align 8
+  %2 = load i64, ptr @VarInit, align 8
+  %add = add nsw i64 %2, %1
+  ret i64 %add
+}
+
+define i64 @loadTLInit() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadTLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadTLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @ThreadLocalVarInit)
+  %1 = load i64, ptr %0, align 8
+  ret i64 %1
+}
+
+define i64 @loadTLInit2() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadTLInit2:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # @VarInit
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r4, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    add r3, r3, r4
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadTLInit2:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r3, L..C0@u(r2)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r4, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, L..C0@l(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    add r3, r3, r4
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @ThreadLocalVarInit)
+  %1 = load i64, ptr %0, align 8
+  %2 = load i64, ptr @VarInit, align 8
+  %add = add nsw i64 %2, %1
+  ret i64 %add
+}
+
+define void @loadStore1(i64 noundef %x) {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadStore1:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    addi r3, r3, 9
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadStore1:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addi r3, r3, 9
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @IThreadLocalVarInit)
+  %1 = load i64, ptr %0, align 8
+  %add = add nsw i64 %1, 9
+  store i64 %add, ptr %0, align 8
+  ret void
+}

diff  --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-short.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-short.ll
new file mode 100644
index 000000000000000..06ed9837b1f1954
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-short.ll
@@ -0,0 +1,173 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff -mattr=+aix-small-local-exec-tls < %s \
+; RUN:      | FileCheck %s --check-prefix=SMALL-LOCAL-EXEC-SMALLCM64
+; RUN: llc  -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff --code-model=large \
+; RUN:      -mattr=+aix-small-local-exec-tls < %s | FileCheck %s \
+; RUN:      --check-prefix=SMALL-LOCAL-EXEC-LARGECM64
+
+ at ThreadLocalVarInit = thread_local(localexec) global i16 1, align 2
+ at VarInit = local_unnamed_addr global i16 87, align 2
+ at IThreadLocalVarInit = internal thread_local(localexec) global i16 1, align 2
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1
+ at b = thread_local(localexec) global [87 x i16] zeroinitializer, align 2
+
+define nonnull ptr @AddrTest1() local_unnamed_addr #0 {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: AddrTest1:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, b[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    addi r3, r3, 4
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: AddrTest1:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, b[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addi r3, r3, 4
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 2 ptr @llvm.threadlocal.address.p0(ptr align 2 @b)
+  %arrayidx = getelementptr inbounds [87 x i16], ptr %0, i64 0, i64 2
+  ret ptr %arrayidx
+}
+
+define void @storeITLInit(i16 noundef signext %x) {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: storeITLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    sth r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: storeITLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    sth r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 2 ptr @llvm.threadlocal.address.p0(ptr align 2 @IThreadLocalVarInit)
+  store i16 %x, ptr %0, align 2
+  ret void
+}
+
+define void @storeTLInit(i16 noundef signext %x) {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: storeTLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    sth r3, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: storeTLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    sth r3, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 2 ptr @llvm.threadlocal.address.p0(ptr align 2 @ThreadLocalVarInit)
+  store i16 %x, ptr %0, align 2
+  ret void
+}
+
+define signext i16 @loadITLInit() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadITLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lha r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadITLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lha r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 2 ptr @llvm.threadlocal.address.p0(ptr align 2 @IThreadLocalVarInit)
+  %1 = load i16, ptr %0, align 2
+  ret i16 %1
+}
+
+define signext i16 @loadITLInit2() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadITLInit2:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # @VarInit
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lhz r4, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lhz r3, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    add r3, r3, r4
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    extsh r3, r3
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadITLInit2:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r3, L..C0@u(r2)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lhz r4, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, L..C0@l(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lhz r3, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    add r3, r3, r4
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    extsh r3, r3
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 2 ptr @llvm.threadlocal.address.p0(ptr align 2 @IThreadLocalVarInit)
+  %1 = load i16, ptr %0, align 2
+  %2 = load i16, ptr @VarInit, align 2
+  %add = add i16 %2, %1
+  ret i16 %add
+}
+
+define signext i16 @loadTLInit() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadTLInit:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lha r3, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadTLInit:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lha r3, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 2 ptr @llvm.threadlocal.address.p0(ptr align 2 @ThreadLocalVarInit)
+  %1 = load i16, ptr %0, align 2
+  ret i16 %1
+}
+
+define signext i16 @loadTLInit2() {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadTLInit2:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # @VarInit
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lhz r4, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lhz r3, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    add r3, r3, r4
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    extsh r3, r3
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadTLInit2:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addis r3, L..C0@u(r2)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lhz r4, ThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, L..C0@l(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lhz r3, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    add r3, r3, r4
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    extsh r3, r3
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 2 ptr @llvm.threadlocal.address.p0(ptr align 2 @ThreadLocalVarInit)
+  %1 = load i16, ptr %0, align 2
+  %2 = load i16, ptr @VarInit, align 2
+  %add = add i16 %2, %1
+  ret i16 %add
+}
+
+define void @loadStore1(i16 noundef signext %x) {
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadStore1:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lhz r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    addi r3, r3, 9
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    sth r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadStore1:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lhz r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addi r3, r3, 9
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    sth r3, IThreadLocalVarInit[TL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
+entry:
+  %0 = tail call align 2 ptr @llvm.threadlocal.address.p0(ptr align 2 @IThreadLocalVarInit)
+  %1 = load i16, ptr %0, align 2
+  %add = add i16 %1, 9
+  store i16 %add, ptr %0, align 2
+  ret void
+}
+

diff  --git a/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-O0.ll b/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-O0.ll
index c74202eeb8beb95..02e916d1618c9d3 100644
--- a/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-O0.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-O0.ll
@@ -11,6 +11,13 @@
 ; RUN: llc  -O0 -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
 ; RUN:      -mtriple powerpc-ibm-aix-xcoff --code-model=large < %s \
 ; RUN:      | FileCheck %s --check-prefix=LARGE32-O0
+; RUN: llc  -O0 -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff -mattr=+aix-small-local-exec-tls < %s \
+; RUN:      | FileCheck %s --check-prefix=SMALL-LOCAL-EXEC-SMALLCM64
+; RUN: llc  -O0 -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
+; RUN:      -mtriple powerpc64-ibm-aix-xcoff --code-model=large \
+; RUN:      -mattr=+aix-small-local-exec-tls < %s | FileCheck %s \
+; RUN:      --check-prefix=SMALL-LOCAL-EXEC-LARGECM64
 
 @TLInt = internal thread_local(localexec) global i32 0, align 4
 @TLLongLong = internal thread_local(localexec) global i64 0, align 8
@@ -70,6 +77,20 @@ define void @storeInt(i32 noundef %x) {
 ; LARGE32-O0-NEXT:    lwz r0, 8(r1)
 ; LARGE32-O0-NEXT:    mtlr r0
 ; LARGE32-O0-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: storeInt:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r4, TLInt[UL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stw r3, 0(r4)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: storeInt:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r4, TLInt[UL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stw r3, 0(r4)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
 entry:
   %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @TLInt)
   store i32 %x, ptr %0, align 4
@@ -132,6 +153,18 @@ define void @storeLongLong(i64 noundef %x) {
 ; LARGE32-O0-NEXT:    lwz r0, 8(r1)
 ; LARGE32-O0-NEXT:    mtlr r0
 ; LARGE32-O0-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: storeLongLong:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r4, TLLongLong[UL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    std r3, 0(r4)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: storeLongLong:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r4, TLLongLong[UL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    std r3, 0(r4)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
 entry:
   %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @TLLongLong)
   store i64 %x, ptr %0, align 8
@@ -182,6 +215,18 @@ define void @storeDouble(double noundef %x) {
 ; LARGE32-O0-NEXT:    lwz r0, 8(r1)
 ; LARGE32-O0-NEXT:    mtlr r0
 ; LARGE32-O0-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: storeDouble:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, TLDouble[UL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stxsdx f1, 0, r3
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: storeDouble:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, TLDouble[UL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stxsdx f1, 0, r3
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
 entry:
   %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @TLDouble)
   store double %x, ptr %0, align 8
@@ -232,6 +277,18 @@ define void @storeFloat(float noundef %x) {
 ; LARGE32-O0-NEXT:    lwz r0, 8(r1)
 ; LARGE32-O0-NEXT:    mtlr r0
 ; LARGE32-O0-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: storeFloat:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, TLFloat[UL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stfs f1, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: storeFloat:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, TLFloat[UL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stfs f1, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
 entry:
   %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @TLFloat)
   store float %x, ptr %0, align 4
@@ -282,6 +339,18 @@ define i32 @loadInt() {
 ; LARGE32-O0-NEXT:    lwz r0, 8(r1)
 ; LARGE32-O0-NEXT:    mtlr r0
 ; LARGE32-O0-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadInt:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, TLInt[UL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lwz r3, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadInt:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, TLInt[UL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lwz r3, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
 entry:
   %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @TLInt)
   %1 = load i32, ptr %0, align 4
@@ -336,6 +405,22 @@ define i32 @loadLongLong() {
 ; LARGE32-O0-NEXT:    lwz r0, 8(r1)
 ; LARGE32-O0-NEXT:    mtlr r0
 ; LARGE32-O0-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadLongLong:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, TLLongLong[UL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    ld r3, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    clrldi r3, r3, 32
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadLongLong:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, TLLongLong[UL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    ld r3, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    clrldi r3, r3, 32
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
 entry:
   %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @TLLongLong)
   %1 = load i64, ptr %0, align 8
@@ -407,6 +492,30 @@ define i32 @loadDouble() {
 ; LARGE32-O0-NEXT:    lwz r0, 8(r1)
 ; LARGE32-O0-NEXT:    mtlr r0
 ; LARGE32-O0-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadDouble:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, TLDouble[UL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lfd f0, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    # kill: def $f1 killed $f0
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    xscvdpsxws f0, f0
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    addi r3, r1, -12
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stfiwx f0, 0, r3
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lwz r3, -12(r1)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    clrldi r3, r3, 32
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadDouble:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, TLDouble[UL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lfd f0, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    # kill: def $f1 killed $f0
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    xscvdpsxws f0, f0
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    addi r3, r1, -12
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stfiwx f0, 0, r3
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lwz r3, -12(r1)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    clrldi r3, r3, 32
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
 entry:
   %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @TLDouble)
   %1 = load double, ptr %0, align 8
@@ -474,6 +583,26 @@ define i32 @loadFloat() {
 ; LARGE32-O0-NEXT:    lwz r0, 8(r1)
 ; LARGE32-O0-NEXT:    mtlr r0
 ; LARGE32-O0-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: loadFloat:
+; SMALL-LOCAL-EXEC-SMALLCM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    la r3, TLFloat[UL]@le(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lfs f0, 0(r3)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    fctiwz f0, f0
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    stfd f0, -8(r1)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    lwa r3, -4(r1)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    clrldi r3, r3, 32
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT:    blr
+;
+; SMALL-LOCAL-EXEC-LARGECM64-LABEL: loadFloat:
+; SMALL-LOCAL-EXEC-LARGECM64:       # %bb.0: # %entry
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    la r3, TLFloat[UL]@le(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lfs f0, 0(r3)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    fctiwz f0, f0
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    stfd f0, -8(r1)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    lwa r3, -4(r1)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    clrldi r3, r3, 32
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT:    blr
 entry:
   %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @TLFloat)
   %1 = load float, ptr %0, align 4

diff  --git a/llvm/test/CodeGen/PowerPC/ppc64-nonfunc-calls.ll b/llvm/test/CodeGen/PowerPC/ppc64-nonfunc-calls.ll
index 8962e55623a518a..ed588d5f6b3e9bc 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-nonfunc-calls.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64-nonfunc-calls.ll
@@ -37,7 +37,7 @@ entry:
 ; CHECK-DAG: addis [[REG1:[0-9]+]], 13, tls_something@tprel@ha
 ; CHECK-DAG: std 2, 40(1)
 ; CHECK-DAG: addi [[REG3:[0-9]+]], [[REG1]], tls_something@tprel@l
-; CHECK-DAG: ld [[REG2:[0-9]+]], 0([[REG3]])
+; CHECK-DAG: ld [[REG2:[0-9]+]], tls_something@tprel@l([[REG1]])
 ; CHECK-DAG: ld 11, 16([[REG3]])
 ; CHECK-DAG: ld 2, 8([[REG3]])
 ; CHECK-DAG: mtctr [[REG2]]


        


More information about the llvm-commits mailing list