[llvm] 02df03c - [AArch64][SME] Add support for arm_locally_streaming functions.

Sander de Smalen via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 14 06:48:08 PDT 2022


Author: Sander de Smalen
Date: 2022-10-14T13:47:53Z
New Revision: 02df03c5b7ae9fa2e8b55369dd5ebd3871a60017

URL: https://github.com/llvm/llvm-project/commit/02df03c5b7ae9fa2e8b55369dd5ebd3871a60017
DIFF: https://github.com/llvm/llvm-project/commit/02df03c5b7ae9fa2e8b55369dd5ebd3871a60017.diff

LOG: [AArch64][SME] Add support for arm_locally_streaming functions.

Functions with `aarch64_pstate_sm_body` will emit an SMSTART at the start
of the function and an SMSTOP at the end of the function, so that all
operations use the correct value for vscale.
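
As an illustration, here is a reduced version of one of the tests added in
this patch (the function and callee names are taken from the test; the exact
callee-save spills around the mode switches are elided):

    define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body" nounwind {
      call void @streaming_compatible_callee()
      ret void
    }

    declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible"

which is expected to lower to roughly:

    locally_streaming_caller_streaming_callee:
      // spill d8-d15 and x30 (clobbered by the mode switch / call)
      smstart sm
      bl      streaming_compatible_callee
      smstop  sm
      // reload d8-d15 and x30
      ret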

Because the placement of these nodes is critically important (i.e. no
vscale-dependent operations may be performed before the SMSTART has been
issued), we require gluing the CopyFromReg nodes to the EntryToken node so
that the SMSTART can be inserted as part of that glued chain.
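
Concretely, the DAG for the formal arguments of such a function is expected
to look roughly as follows (this mirrors the comment added to
LowerFormalArguments in AArch64ISelLowering.cpp; node numbers are
illustrative):

            t0: ch,glue = EntryToken
          t1: res,ch,glue = CopyFromReg t0, ..., t0:1
         ...
       tn: res,ch,glue = CopyFromReg t0, ..., t(n-1):2
    t(n+1): ch,glue = SMSTART t0:0, ..., tn:2

where t(n+1) becomes the new Chain/Root node, so nothing that depends on the
streaming value of vscale can be scheduled before the SMSTART.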

More details about the SME attributes and design can be found
in D131562.

Reviewed By: aemerson

Differential Revision: https://reviews.llvm.org/D131582

Added: 
    llvm/test/CodeGen/AArch64/sme-streaming-body.ll

Modified: 
    llvm/include/llvm/CodeGen/SelectionDAG.h
    llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
    llvm/test/CodeGen/AArch64/sme-get-pstatesm.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll
    llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
    llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll
    llvm/test/CodeGen/X86/merge-store-partially-alias-loads.ll
    llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
    llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected
    llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_isel.ll.expected

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 5c83ead02e806..7259e6258df40 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -237,6 +237,12 @@ class SelectionDAG {
   ProfileSummaryInfo *PSI = nullptr;
   BlockFrequencyInfo *BFI = nullptr;
 
+  /// List of non-single value types.
+  FoldingSet<SDVTListNode> VTListMap;
+
+  /// Pool allocation for misc. objects that are created once per SelectionDAG.
+  BumpPtrAllocator Allocator;
+
   /// The starting token.
   SDNode EntryNode;
 
@@ -263,9 +269,6 @@ class SelectionDAG {
   BumpPtrAllocator OperandAllocator;
   ArrayRecycler<SDUse> OperandRecycler;
 
-  /// Pool allocation for misc. objects that are created once per SelectionDAG.
-  BumpPtrAllocator Allocator;
-
   /// Tracks dbg_value and dbg_label information through SDISel.
   SDDbgInfo *DbgInfo;
 
@@ -2281,9 +2284,6 @@ class SelectionDAG {
   SDNode *FindNodeOrInsertPos(const FoldingSetNodeID &ID, const SDLoc &DL,
                               void *&InsertPos);
 
-  /// List of non-single value types.
-  FoldingSet<SDVTListNode> VTListMap;
-
   /// Maps to auto-CSE operations.
   std::vector<CondCodeSDNode*> CondCodeNodes;
 

diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 7d9bd0a742680..9d225ad4e320f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -1162,7 +1162,6 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
 #endif
     llvm_unreachable("This target-independent node should have been selected!");
   case ISD::EntryToken:
-    llvm_unreachable("EntryToken should have been excluded from the schedule!");
   case ISD::MERGE_VALUES:
   case ISD::TokenFactor: // fall thru
     break;

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 93a74b757c585..9f01fa53a98ed 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1275,7 +1275,7 @@ Align SelectionDAG::getEVTAlign(EVT VT) const {
 // EntryNode could meaningfully have debug info if we can find it...
 SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
     : TM(tm), OptLevel(OL),
-      EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)),
+      EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other, MVT::Glue)),
       Root(getEntryNode()) {
   InsertNode(&EntryNode);
   DbgInfo = new SDDbgInfo();

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 949dc47cbfa12..3d12cf5a0807a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6037,6 +6037,13 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
     (void)Res;
   }
 
+  SMEAttrs Attrs(MF.getFunction());
+  bool IsLocallyStreaming =
+      !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
+  assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
+  SDValue Glue = Chain.getValue(1);
+
+  SmallVector<SDValue, 16> ArgValues;
   unsigned ExtraArgLocs = 0;
   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
@@ -6091,7 +6098,22 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
 
       // Transform the arguments in physical registers into virtual ones.
       Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
-      ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
+
+      if (IsLocallyStreaming) {
+        // LocallyStreamingFunctions must insert the SMSTART in the correct
+        // position, so we use Glue to ensure no instructions can be scheduled
+        // between the chain of:
+        //        t0: ch,glue = EntryNode
+        //      t1:  res,ch,glue = CopyFromReg
+        //     ...
+        //   tn: res,ch,glue = CopyFromReg t(n-1), ..
+        // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
+        // ^^^^^^
+        // This will be the new Chain/Root node.
+        ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
+        Glue = ArgValue.getValue(2);
+      } else
+        ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
 
       // If this is an 8, 16 or 32-bit value, it is really passed promoted
       // to 64 bits.  Insert an assert[sz]ext to capture this, then
@@ -6245,6 +6267,27 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   }
   assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
 
+  // Insert the SMSTART if this is a locally streaming function and
+  // make sure it is Glued to the last CopyFromReg value.
+  if (IsLocallyStreaming) {
+    const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+    Chain = DAG.getNode(
+        AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
+        {DAG.getRoot(),
+          DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32),
+         DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64),
+         DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()), Glue});
+    // Ensure that the SMSTART happens after the CopyWithChain such that its
+    // chain result is used.
+    for (unsigned I=0; I<InVals.size(); ++I) {
+      Register Reg = MF.getRegInfo().createVirtualRegister(
+          getRegClassFor(InVals[I].getValueType().getSimpleVT()));
+      SDValue X = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
+      InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
+                                     InVals[I].getValueType());
+    }
+  }
+
   // varargs
   if (isVarArg) {
     if (!Subtarget->isTargetDarwin() || IsWin64) {
@@ -7485,6 +7528,19 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
     }
   }
 
+  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+
+  // Emit SMSTOP before returning from a locally streaming function
+  SMEAttrs FuncAttrs(MF.getFunction());
+  if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
+    Chain = DAG.getNode(
+        AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
+        DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32),
+        DAG.getConstant(1, DL, MVT::i64), DAG.getConstant(0, DL, MVT::i64),
+        DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()));
+    Flag = Chain.getValue(1);
+  }
+
   SmallVector<SDValue, 4> RetOps(1, Chain);
   for (auto &RetVal : RetVals) {
     Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
@@ -7509,7 +7565,6 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
       DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
   }
 
-  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
   const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
   if (I) {
     for (; *I; ++I) {

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index f4e807f5057a8..adc03b7656bd3 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4256,6 +4256,8 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
     break;
   case AArch64::ADDVL_XXI:
   case AArch64::ADDPL_XXI:
+  case AArch64::ADDSVL_XXI:
+  case AArch64::ADDSPL_XXI:
     MaxEncoding = 31;
     ShiftSize = 0;
     if (Offset < 0) {
@@ -4270,9 +4272,9 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
 
   // `Offset` can be in bytes or in "scalable bytes".
   int VScale = 1;
-  if (Opc == AArch64::ADDVL_XXI)
+  if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
     VScale = 16;
-  else if (Opc == AArch64::ADDPL_XXI)
+  else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
     VScale = 2;
 
   // FIXME: If the offset won't fit in 24-bits, compute the offset into a
@@ -4369,6 +4371,14 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
                            bool NeedsWinCFI, bool *HasWinCFI,
                            bool EmitCFAOffset, StackOffset CFAOffset,
                            unsigned FrameReg) {
+  // If a function is marked as arm_locally_streaming, then the runtime value of
+  // vscale in the prologue/epilogue is different from the runtime value of vscale
+  // in the function's body. To avoid having to consider multiple vscales,
+  // we can use `addsvl` to allocate any scalable stack-slots, which under
+  // most circumstances will be only locals, not callee-save slots.
+  const Function &F = MBB.getParent()->getFunction();
+  bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
+
   int64_t Bytes, NumPredicateVectors, NumDataVectors;
   AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
       Offset, Bytes, NumPredicateVectors, NumDataVectors);
@@ -4399,8 +4409,9 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
 
   if (NumDataVectors) {
     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
-                       AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr,
-                       EmitCFAOffset, CFAOffset, FrameReg);
+                       UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI,
+                       TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
+                       CFAOffset, FrameReg);
     CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
     SrcReg = DestReg;
   }
@@ -4408,8 +4419,9 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
   if (NumPredicateVectors) {
     assert(DestReg != AArch64::SP && "Unaligned access to SP");
     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
-                       AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr,
-                       EmitCFAOffset, CFAOffset, FrameReg);
+                       UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI,
+                       TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
+                       CFAOffset, FrameReg);
   }
 }
 

diff --git a/llvm/test/CodeGen/AArch64/sme-get-pstatesm.ll b/llvm/test/CodeGen/AArch64/sme-get-pstatesm.ll
index a20abc8247217..d92290f581efc 100644
--- a/llvm/test/CodeGen/AArch64/sme-get-pstatesm.ll
+++ b/llvm/test/CodeGen/AArch64/sme-get-pstatesm.ll
@@ -22,7 +22,17 @@ define i64 @get_pstatesm_streaming() nounwind "aarch64_pstate_sm_enabled" {
 define i64 @get_pstatesm_locally_streaming() nounwind "aarch64_pstate_sm_body" {
 ; CHECK-LABEL: get_pstatesm_locally_streaming:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %pstate = call i64 @llvm.aarch64.sme.get.pstatesm()
   ret i64 %pstate

diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
new file mode 100644
index 0000000000000..450f42d4f8f0a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
@@ -0,0 +1,265 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false  -verify-machineinstrs < %s | FileCheck %s
+
+declare void @normal_callee();
+declare void @streaming_callee() "aarch64_pstate_sm_enabled";
+declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible";
+
+define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: locally_streaming_caller_streaming_callee:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    bl streaming_compatible_callee
+; CHECK-NEXT:    bl streaming_compatible_callee
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+
+  call void @streaming_compatible_callee();
+  call void @streaming_compatible_callee();
+  ret void;
+}
+
+; Test that for a function with both a streaming body and a streaming interface, no
+; smstart/smstop are emitted, because the function is already in streaming mode upon entry.
+define void @streaming_and_locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_enabled" "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: streaming_and_locally_streaming_caller_streaming_callee:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    bl streaming_callee
+; CHECK-NEXT:    bl streaming_callee
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  call void @streaming_callee();
+  call void @streaming_callee();
+  ret void;
+}
+
+define void @locally_streaming_multiple_exit(i64 %cond) "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: locally_streaming_multiple_exit:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    cmp x0, #1
+; CHECK-NEXT:    b.ne .LBB2_2
+; CHECK-NEXT:  // %bb.1: // %if.else
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB2_2: // %if.end
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+
+entry:
+  %tobool = icmp eq i64 %cond, 1
+  br i1 %tobool, label %if.else, label %if.end
+
+if.else:
+  ret void;
+
+if.end:
+  ret void;
+}
+
+; Do a fixed-width vector add on a NEON vector.
+; This tests that:
+; * Incoming vector in v0.d isn't clobbered by the change in streaming mode.
+; * Result vector is correctly preserved after smstop.
+define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: locally_streaming_caller_no_callee:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    adrp x8, .LCPI3_0
+; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #80
+; CHECK-NEXT:    ret
+
+  %add = add <2 x i64> %a, <i64 41, i64 42>;
+  ret <2 x i64> %add;
+}
+
+; Test that we use the interface (not the function's body) to determine which
+; streaming mode to enter the callee in. In this case the interface is normal, so
+; pstate.sm must be 0 on entry to the callee and is 0 upon return from the callee.
+define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: locally_streaming_caller_locally_streaming_callee:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    bl locally_streaming_caller_streaming_callee
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+
+  call void @locally_streaming_caller_streaming_callee();
+  ret void;
+}
+
+;
+; Test that a locally streaming function correctly retains the
+; argument/result registers, because smstart/smstop instructions that are
+; inserted to implement the arm_locally_streaming attribute clobber the
+; vector register contents.
+;
+
+define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: locally_streaming_caller_compatible_callee_vec_args_ret:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #96
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    bl streaming_compatible_callee_vec_args_ret
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #96
+; CHECK-NEXT:    ret
+  %res = call <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_compatible"
+  ret <2 x i64> %res;
+}
+
+declare <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64>) "aarch64_pstate_sm_compatible"
+
+define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct_arg_ret({<2 x i64>, <2 x i64>} %arg) "aarch64_pstate_sm_body" nounwind {
+; CHECK-LABEL: locally_streaming_caller_compatible_callee_struct_arg_ret:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #112
+; CHECK-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    bl streaming_compatible_callee_vec_arg_struct_ret
+; CHECK-NEXT:    stp q1, q0, [sp] // 32-byte Folded Spill
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    ret
+  %v1.arg = extractvalue {<2 x i64>, <2 x i64>} %arg, 1
+  %res = call {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64> %v1.arg) "aarch64_pstate_sm_compatible"
+  ret {<2 x i64>, <2 x i64>} %res;
+}
+
+declare {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64>) "aarch64_pstate_sm_compatible"
+
+; Test that we use `addsvl` for allocating any stack space for locals before `smstart`,
+; such that the correct amount of stack space is allocated.
+define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body" {
+; CHECK-LABEL: locally_streaming_caller_alloca:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    addsvl sp, sp, #-1
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    mov x0, sp
+; CHECK-NEXT:    bl use_ptr
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    addsvl sp, sp, #1
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  %alloca = alloca <vscale x 4 x i32>
+  call void @use_ptr(ptr %alloca) "aarch64_pstate_sm_compatible"
+  ret void
+}
+
+declare void @use_ptr(ptr) "aarch64_pstate_sm_compatible"
+
+define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_body" {
+; CHECK-LABEL: call_to_intrinsic_without_chain:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    str d0, [sp, #72] // 8-byte Folded Spill
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldr d0, [sp, #72] // 8-byte Folded Reload
+; CHECK-NEXT:    bl cos
+; CHECK-NEXT:    str d0, [sp, #72] // 8-byte Folded Spill
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr d0, [sp, #72] // 8-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = call fast double @llvm.cos.f64(double %x)
+  ret double %0
+}
+
+declare double @llvm.cos.f64(double)

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll
index 98ae39c5dc9b8..62681cd85927e 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll
@@ -10,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @foo(<8 x i64>* %a) #0 {
 ; CHECK-LABEL: foo:
 ; CHECK:       SelectionDAG has 14 nodes:
-; CHECK-NEXT:    t0: ch = EntryToken
+; CHECK-NEXT:    t0: ch,glue = EntryToken
 ; CHECK-NEXT:    t12: nxv2i1 = PTRUE_D TargetConstant:i32<31>
 ; CHECK-NEXT:    t2: i64,ch = CopyFromReg t0, Register:i64 %0
 ; CHECK-NEXT:    t18: nxv2i64,ch = LD1D_IMM<Mem:(volatile load (s512) from %ir.a)> t12, t2, TargetConstant:i64<0>, t0

diff --git a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
index 38d65337a47cc..25c93bd3327ec 100644
--- a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
@@ -7,7 +7,7 @@
 ; GCN: Initial selection DAG: %bb.0 'test_sdag_dump:entry'
 ; GCN: SelectionDAG has 10 nodes:
 
-; GCN-DEFAULT:  t0: ch = EntryToken
+; GCN-DEFAULT:  t0: ch,glue = EntryToken
 ; GCN-DEFAULT:  t2: f32,ch = CopyFromReg t0, Register:f32 %0
 ; GCN-DEFAULT:      t5: f32 = fadd t2, t2
 ; GCN-DEFAULT:      t4: f32,ch = CopyFromReg # D:1 t0, Register:f32 %1
@@ -15,7 +15,7 @@
 ; GCN-DEFAULT:  t8: ch,glue = CopyToReg # D:1 t0, Register:f32 $vgpr0, t6
 ; GCN-DEFAULT:  t9: ch = RETURN_TO_EPILOG # D:1 t8, Register:f32 $vgpr0, t8:1
 
-; GCN-VERBOSE:  t0: ch = EntryToken # D:0
+; GCN-VERBOSE:  t0: ch,glue = EntryToken # D:0
 ; GCN-VERBOSE:  t2: f32,ch = CopyFromReg [ORD=1] # D:0 t0, Register:f32 %0 # D:0
 ; GCN-VERBOSE:      t5: f32 = fadd [ORD=2] # D:0 t2, t2
 ; GCN-VERBOSE:      t4: f32,ch = CopyFromReg [ORD=1] # D:1 t0, Register:f32 %1 # D:0

diff --git a/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll b/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll
index 4e8bd2fd37de0..876bf78e88b7d 100644
--- a/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll
+++ b/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll
@@ -5,7 +5,7 @@
 ; inlineasm_br. Not sure how to get a MachineIR change so this reads the debug
 ; output from SelectionDAG.
 
-; CHECK: t0: ch = EntryToken
+; CHECK: t0: ch,glue = EntryToken
 ; CHECK-NEXT: t4: i32,ch = CopyFromReg t0, Register:i32 %3
 ; CHECK-NEXT: t10: i32 = add t4, Constant:i32<1>
 ; CHECK-NEXT: t12: ch = CopyToReg t0, Register:i32 %0, t10

diff --git a/llvm/test/CodeGen/X86/merge-store-partially-alias-loads.ll b/llvm/test/CodeGen/X86/merge-store-partially-alias-loads.ll
index d48c59a89c3d7..4d0d05e26cded 100644
--- a/llvm/test/CodeGen/X86/merge-store-partially-alias-loads.ll
+++ b/llvm/test/CodeGen/X86/merge-store-partially-alias-loads.ll
@@ -14,7 +14,7 @@
 ; X86-NEXT: retq
 
 ; DBGDAG-LABEL: Optimized legalized selection DAG: %bb.0 'merge_store_partial_overlap_load:'
-; DBGDAG: [[ENTRYTOKEN:t[0-9]+]]: ch = EntryToken
+; DBGDAG: [[ENTRYTOKEN:t[0-9]+]]: ch,glue = EntryToken
 ; DBGDAG-DAG: [[BASEPTR:t[0-9]+]]: i64,ch = CopyFromReg [[ENTRYTOKEN]],
 ; DBGDAG-DAG: [[ADDPTR:t[0-9]+]]: i64 = add {{(nuw )?}}[[BASEPTR]], Constant:i64<2>
 

diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
index 0cc5ede932a9a..38c20409191d7 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
@@ -4,7 +4,7 @@
 define i64 @i64_test(i64 %i) nounwind readnone {
 ; CHECK-LABEL: i64_test:
 ; CHECK:       SelectionDAG has 9 nodes:
-; CHECK-NEXT:    t0: ch = EntryToken
+; CHECK-NEXT:    t0: ch,glue = EntryToken
 ; CHECK-NEXT:    t11: ch,glue = CopyToReg t0, Register:i32 $vgpr0, IMPLICIT_DEF:i32
 ; CHECK-NEXT:    t17: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
 ; CHECK-NEXT:    t13: ch,glue = CopyToReg t11, Register:i32 $vgpr1, t17, t11:1
@@ -20,7 +20,7 @@ define i64 @i32_test(i32 %i) nounwind readnone {
 ; CHECK-LABEL: i32_test:
 ; CHECK:       SelectionDAG has 8 nodes:
 ; CHECK-NEXT:    t5: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT:    t0: ch = EntryToken
+; CHECK-NEXT:    t0: ch,glue = EntryToken
 ; CHECK-NEXT:    t7: ch,glue = CopyToReg t0, Register:i32 $vgpr0, t5
 ; CHECK-NEXT:    t9: ch,glue = CopyToReg t7, Register:i32 $vgpr1, t5, t7:1
 ; CHECK-NEXT:    t10: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t9, t9:1
@@ -36,7 +36,7 @@ define i64 @i16_test(i16 %i) nounwind readnone {
 ; CHECK-LABEL: i16_test:
 ; CHECK:       SelectionDAG has 8 nodes:
 ; CHECK-NEXT:    t5: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT:    t0: ch = EntryToken
+; CHECK-NEXT:    t0: ch,glue = EntryToken
 ; CHECK-NEXT:    t7: ch,glue = CopyToReg t0, Register:i32 $vgpr0, t5
 ; CHECK-NEXT:    t9: ch,glue = CopyToReg t7, Register:i32 $vgpr1, t5, t7:1
 ; CHECK-NEXT:    t10: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t9, t9:1
@@ -52,7 +52,7 @@ define i64 @i8_test(i8 %i) nounwind readnone {
 ; CHECK-LABEL: i8_test:
 ; CHECK:       SelectionDAG has 8 nodes:
 ; CHECK-NEXT:    t5: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT:    t0: ch = EntryToken
+; CHECK-NEXT:    t0: ch,glue = EntryToken
 ; CHECK-NEXT:    t7: ch,glue = CopyToReg t0, Register:i32 $vgpr0, t5
 ; CHECK-NEXT:    t9: ch,glue = CopyToReg t7, Register:i32 $vgpr1, t5, t7:1
 ; CHECK-NEXT:    t10: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t9, t9:1

diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected
index 62aa8da4b721c..7d152d9d3ec84 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected
@@ -4,7 +4,7 @@
 define i64 @i64_test(i64 %i) nounwind readnone {
 ; CHECK-LABEL: i64_test:
 ; CHECK:       SelectionDAG has 22 nodes:
-; CHECK-NEXT:    t0: ch = EntryToken
+; CHECK-NEXT:    t0: ch,glue = EntryToken
 ; CHECK-NEXT:    t5: i32,ch = LDW_RI<Mem:(load (s32) from %fixed-stack.0)> TargetFrameIndex:i32<-2>, TargetConstant:i32<0>, TargetConstant:i32<0>, t0
 ; CHECK-NEXT:    t7: i32 = ADD_I_LO TargetFrameIndex:i32<0>, TargetConstant:i32<0>
 ; CHECK-NEXT:    t29: i32 = OR_I_LO t7, TargetConstant:i32<4>
@@ -29,7 +29,7 @@ define i64 @i64_test(i64 %i) nounwind readnone {
 define i64 @i32_test(i32 %i) nounwind readnone {
 ; CHECK-LABEL: i32_test:
 ; CHECK:       SelectionDAG has 14 nodes:
-; CHECK-NEXT:    t0: ch = EntryToken
+; CHECK-NEXT:    t0: ch,glue = EntryToken
 ; CHECK-NEXT:    t21: i32,ch = CopyFromReg t0, Register:i32 $r0
 ; CHECK-NEXT:    t13: ch,glue = CopyToReg t0, Register:i32 $rv, t21
 ; CHECK-NEXT:    t3: i32,ch = LDW_RI<Mem:(load (s32) from %fixed-stack.0, align 8)> TargetFrameIndex:i32<-1>, TargetConstant:i32<0>, TargetConstant:i32<0>, t0
@@ -48,7 +48,7 @@ define i64 @i32_test(i32 %i) nounwind readnone {
 define i64 @i16_test(i16 %i) nounwind readnone {
 ; CHECK-LABEL: i16_test:
 ; CHECK:       SelectionDAG has 19 nodes:
-; CHECK-NEXT:    t0: ch = EntryToken
+; CHECK-NEXT:    t0: ch,glue = EntryToken
 ; CHECK-NEXT:    t33: i32,ch = CopyFromReg t0, Register:i32 $r0
 ; CHECK-NEXT:    t14: ch,glue = CopyToReg t0, Register:i32 $rv, t33
 ; CHECK-NEXT:    t1: i32 = ADD_I_LO TargetFrameIndex:i32<-1>, TargetConstant:i32<0>
@@ -71,7 +71,7 @@ define i64 @i16_test(i16 %i) nounwind readnone {
 define i64 @i8_test(i8 %i) nounwind readnone {
 ; CHECK-LABEL: i8_test:
 ; CHECK:       SelectionDAG has 20 nodes:
-; CHECK-NEXT:    t0: ch = EntryToken
+; CHECK-NEXT:    t0: ch,glue = EntryToken
 ; CHECK-NEXT:    t33: i32,ch = CopyFromReg t0, Register:i32 $r0
 ; CHECK-NEXT:    t14: ch,glue = CopyToReg t0, Register:i32 $rv, t33
 ; CHECK-NEXT:    t1: i32 = ADD_I_LO TargetFrameIndex:i32<-1>, TargetConstant:i32<0>

diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_isel.ll.expected
index 279c8ee2336eb..0175556cd17ea 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_isel.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_isel.ll.expected
@@ -5,7 +5,7 @@
 define i64 @i64_test(i64 %i) nounwind readnone {
 ; PIC-LABEL: i64_test:
 ; PIC:       SelectionDAG has 12 nodes:
-; PIC-NEXT:    t0: ch = EntryToken
+; PIC-NEXT:    t0: ch,glue = EntryToken
 ; PIC-NEXT:    t2: i64,ch = CopyFromReg t0, Register:i64 %0
 ; PIC-NEXT:    t7: i64,i32,ch = ADD64rm<Mem:(dereferenceable load (s64) from %ir.loc)> t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0
 ; PIC-NEXT:    t10: ch,glue = CopyToReg t0, Register:i64 $rax, t7
@@ -14,7 +14,7 @@ define i64 @i64_test(i64 %i) nounwind readnone {
 ;
 ; WIN-LABEL: i64_test:
 ; WIN:       SelectionDAG has 12 nodes:
-; WIN-NEXT:    t0: ch = EntryToken
+; WIN-NEXT:    t0: ch,glue = EntryToken
 ; WIN-NEXT:    t2: i64,ch = CopyFromReg t0, Register:i64 %0
 ; WIN-NEXT:    t7: i64,i32,ch = ADD64rm<Mem:(dereferenceable load (s64) from %ir.loc)> t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0
 ; WIN-NEXT:    t10: ch,glue = CopyToReg t0, Register:i64 $rax, t7
@@ -29,7 +29,7 @@ define i64 @i64_test(i64 %i) nounwind readnone {
 define i64 @i32_test(i32 %i) nounwind readnone {
 ; PIC-LABEL: i32_test:
 ; PIC:       SelectionDAG has 15 nodes:
-; PIC-NEXT:    t0: ch = EntryToken
+; PIC-NEXT:    t0: ch,glue = EntryToken
 ; PIC-NEXT:    t2: i32,ch = CopyFromReg t0, Register:i32 %0
 ; PIC-NEXT:    t7: i32,i32,ch = ADD32rm<Mem:(dereferenceable load (s32) from %ir.loc)> t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0
 ; PIC-NEXT:    t8: i64 = SUBREG_TO_REG TargetConstant:i64<0>, t7, TargetConstant:i32<6>
@@ -39,7 +39,7 @@ define i64 @i32_test(i32 %i) nounwind readnone {
 ;
 ; WIN-LABEL: i32_test:
 ; WIN:       SelectionDAG has 15 nodes:
-; WIN-NEXT:    t0: ch = EntryToken
+; WIN-NEXT:    t0: ch,glue = EntryToken
 ; WIN-NEXT:    t2: i32,ch = CopyFromReg t0, Register:i32 %0
 ; WIN-NEXT:    t7: i32,i32,ch = ADD32rm<Mem:(dereferenceable load (s32) from %ir.loc)> t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0
 ; WIN-NEXT:    t8: i64 = SUBREG_TO_REG TargetConstant:i64<0>, t7, TargetConstant:i32<6>
@@ -56,7 +56,7 @@ define i64 @i32_test(i32 %i) nounwind readnone {
 define i64 @i16_test(i16 %i) nounwind readnone {
 ; PIC-LABEL: i16_test:
 ; PIC:       SelectionDAG has 18 nodes:
-; PIC-NEXT:    t0: ch = EntryToken
+; PIC-NEXT:    t0: ch,glue = EntryToken
 ; PIC-NEXT:    t2: i32,ch = CopyFromReg t0, Register:i32 %0
 ; PIC-NEXT:    t3: i16 = EXTRACT_SUBREG t2, TargetConstant:i32<4>
 ; PIC-NEXT:    t8: i16,i32,ch = ADD16rm<Mem:(dereferenceable load (s16) from %ir.loc)> t3, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0
@@ -68,7 +68,7 @@ define i64 @i16_test(i16 %i) nounwind readnone {
 ;
 ; WIN-LABEL: i16_test:
 ; WIN:       SelectionDAG has 16 nodes:
-; WIN-NEXT:    t0: ch = EntryToken
+; WIN-NEXT:    t0: ch,glue = EntryToken
 ; WIN-NEXT:    t2: i16,ch = CopyFromReg t0, Register:i16 %0
 ; WIN-NEXT:    t7: i16,i32,ch = ADD16rm<Mem:(dereferenceable load (s16) from %ir.loc)> t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0
 ; WIN-NEXT:    t14: i32 = MOVZX32rr16 t7
@@ -86,7 +86,7 @@ define i64 @i16_test(i16 %i) nounwind readnone {
 define i64 @i8_test(i8 %i) nounwind readnone {
 ; PIC-LABEL: i8_test:
 ; PIC:       SelectionDAG has 18 nodes:
-; PIC-NEXT:    t0: ch = EntryToken
+; PIC-NEXT:    t0: ch,glue = EntryToken
 ; PIC-NEXT:    t2: i32,ch = CopyFromReg t0, Register:i32 %0
 ; PIC-NEXT:    t3: i8 = EXTRACT_SUBREG t2, TargetConstant:i32<1>
 ; PIC-NEXT:    t8: i8,i32,ch = ADD8rm<Mem:(dereferenceable load (s8) from %ir.loc)> t3, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0
@@ -98,7 +98,7 @@ define i64 @i8_test(i8 %i) nounwind readnone {
 ;
 ; WIN-LABEL: i8_test:
 ; WIN:       SelectionDAG has 16 nodes:
-; WIN-NEXT:    t0: ch = EntryToken
+; WIN-NEXT:    t0: ch,glue = EntryToken
 ; WIN-NEXT:    t2: i8,ch = CopyFromReg t0, Register:i8 %0
 ; WIN-NEXT:    t7: i8,i32,ch = ADD8rm<Mem:(dereferenceable load (s8) from %ir.loc)> t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0
 ; WIN-NEXT:    t14: i32 = MOVZX32rr8 t7


        

