[llvm] 9e9be99 - [AArch64][SME] Disable remat of VL-dependent ops when function changes streaming mode.

Sander de Smalen via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 1 05:15:39 PDT 2023


Author: Sander de Smalen
Date: 2023-09-01T12:13:27Z
New Revision: 9e9be99c972e125c640a30000731547beb006084

URL: https://github.com/llvm/llvm-project/commit/9e9be99c972e125c640a30000731547beb006084
DIFF: https://github.com/llvm/llvm-project/commit/9e9be99c972e125c640a30000731547beb006084.diff

LOG: [AArch64][SME] Disable remat of VL-dependent ops when function changes streaming mode.

This prevents the register allocator from rematerializing instructions whose
results depend on the runtime vector length inside a call sequence that
temporarily changes the streaming-SVE mode around the call.
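For example, when a streaming function calls a non-streaming callee, the call
is bracketed by SMSTOP/SMSTART. Without this change the allocator may
rematerialize a VL-dependent instruction such as 'cntd' inside that bracket,
where it would read the (potentially different) non-streaming vector length.
A minimal sketch of the problematic output (register assignments are
illustrative, not taken from the test added below):

    cntd    x8          // computed under the streaming vector length
    smstop  sm          // temporarily leave streaming mode for the call
    cntd    x8          // rematerialized here: reads the non-streaming VL
    bl      foo
    smstart sm          // re-enter streaming mode after the call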

I considered using BUNDLEs in Machine IR, but found that this is not
possible for a few reasons:
* Most passes don't look inside BUNDLEs, but some passes would need to
  look inside these call-sequence bundles, for example the PrologEpilog
  pass (to remove the CALLSEQ_START/END), a post-RA pass to remove COPY
  instructions, or the AArch64PseudoExpand pass.
* Within the streaming-mode-changing call sequence, one of the instructions
  is a CALLSEQ_END (AArch64::ADJCALLSTACKUP); the corresponding
  CALLSEQ_START (AArch64::ADJCALLSTACKDOWN) is outside this sequence. This
  means we'd end up with a BUNDLE containing [SMSTART, COPY, BL,
  ADJCALLSTACKUP, COPY, SMSTOP]. The MachineVerifier doesn't accept this,
  and we also can't move the CALLSEQ_START into the call sequence.

Maybe in the future we could model this differently by modelling the
runtime vector length as a value that's used by certain operations
(similar to e.g. the NZCV flags) and clobbered by SMSTART/SMSTOP, such that
the register allocator can consider these as actual dependences and avoid
rematerialization. For now we just want to address the immediate problem.
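For reference, a hedged sketch of that alternative modelling (this is not
what the patch implements): if the vector length were exposed as a piece of
register-like state,

    cntd    x8          // would carry an implicit use of the vector-length state
    smstop  sm          // would carry an implicit def (clobber) of that state

then the register allocator would see a true dependence and could not
legally rematerialize the 'cntd' below the 'smstop'.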

Reviewed By: paulwalker-arm, aemerson

Differential Revision: https://reviews.llvm.org/D159193

Added: 
    llvm/test/CodeGen/AArch64/sme-disable-rematerialize-with-streaming-mode-changes.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
    llvm/lib/Target/AArch64/AArch64InstrInfo.h
    llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index da9eda6903eb71..3e44eabd8a93c8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7164,6 +7164,10 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
 SDValue AArch64TargetLowering::changeStreamingMode(
     SelectionDAG &DAG, SDLoc DL, bool Enable,
     SDValue Chain, SDValue InGlue, SDValue PStateSM, bool Entry) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  FuncInfo->setHasStreamingModeChanges(true);
+
   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
   SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
   SDValue MSROp =

diff  --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 4b98dbc1a8dc2c..33a6906b0a0d3c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -8524,6 +8524,54 @@ unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
     return AArch64::BLR;
 }
 
+bool AArch64InstrInfo::isReallyTriviallyReMaterializable(
+    const MachineInstr &MI) const {
+  const MachineFunction &MF = *MI.getMF();
+  const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
+
+  // If the function contains changes to streaming mode, then there
+  // is a danger that rematerialised instructions end up between
+  // instruction sequences (e.g. call sequences, or prolog/epilogue)
+  // where the streaming-SVE mode is temporarily changed.
+  if (AFI.hasStreamingModeChanges()) {
+    // Avoid rematerializing rematerializable instructions that use/define
+    // scalable values, such as 'pfalse' or 'ptrue', which result in different
+    // results when the runtime vector length is different.
+    const MachineRegisterInfo &MRI = MF.getRegInfo();
+    if (any_of(MI.operands(), [&MRI](const MachineOperand &MO) {
+          if (!MO.isReg())
+            return false;
+
+          if (MO.getReg().isVirtual()) {
+            const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
+            return AArch64::ZPRRegClass.hasSubClassEq(RC) ||
+                   AArch64::PPRRegClass.hasSubClassEq(RC);
+          }
+          return AArch64::ZPRRegClass.contains(MO.getReg()) ||
+                 AArch64::PPRRegClass.contains(MO.getReg());
+        }))
+      return false;
+
+    // Avoid rematerializing instructions that return a value that is
+    // different depending on vector length, even when it is not returned
+    // in a scalable vector/predicate register.
+    switch (MI.getOpcode()) {
+    default:
+      break;
+    case AArch64::RDVLI_XI:
+    case AArch64::ADDVL_XXI:
+    case AArch64::ADDPL_XXI:
+    case AArch64::CNTB_XPiI:
+    case AArch64::CNTH_XPiI:
+    case AArch64::CNTW_XPiI:
+    case AArch64::CNTD_XPiI:
+      return false;
+    }
+  }
+
+  return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
+}
+
 #define GET_INSTRINFO_HELPERS
 #define GET_INSTRMAP_INFO
 #include "AArch64GenInstrInfo.inc"

diff  --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 92ea6cd7e2b60e..2ccadd35d7345d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -355,6 +355,8 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
   static void decomposeStackOffsetForDwarfOffsets(const StackOffset &Offset,
                                                   int64_t &ByteSized,
                                                   int64_t &VGSized);
+
+  bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override;
 #define GET_INSTRINFO_HELPER_DECLS
 #include "AArch64GenInstrInfo.inc"
 

diff  --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index d82fb436925ec6..8df95ff1e6eaea 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -185,6 +185,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   /// The frame-index for the TPIDR2 object used for lazy saves.
   Register LazySaveTPIDR2Obj = 0;
 
+  /// Whether this function changes streaming mode within the function.
+  bool HasStreamingModeChanges = false;
 
   /// True if the function need unwind information.
   mutable std::optional<bool> NeedsDwarfUnwindInfo;
@@ -447,6 +449,11 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   bool needsDwarfUnwindInfo(const MachineFunction &MF) const;
   bool needsAsyncDwarfUnwindInfo(const MachineFunction &MF) const;
 
+  bool hasStreamingModeChanges() const { return HasStreamingModeChanges; }
+  void setHasStreamingModeChanges(bool HasChanges) {
+    HasStreamingModeChanges = HasChanges;
+  }
+
 private:
   // Hold the lists of LOHs.
   MILOHContainer LOHContainerSet;

diff  --git a/llvm/test/CodeGen/AArch64/sme-disable-rematerialize-with-streaming-mode-changes.ll b/llvm/test/CodeGen/AArch64/sme-disable-rematerialize-with-streaming-mode-changes.ll
new file mode 100644
index 00000000000000..b3aeb1fcc42da1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-disable-rematerialize-with-streaming-mode-changes.ll
@@ -0,0 +1,71 @@
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64"
+
+
+define void @dont_rematerialize_cntd(i32 %N) #0 {
+; CHECK-LABEL: dont_rematerialize_cntd:
+; CHECK:        cntd
+; CHECK:        smstop sm
+; CHECK-NOT:    cntd
+; CHECK:        bl      foo
+; CHECK:        smstart  sm
+entry:
+  %cmp2 = icmp sgt i32 %N, 0
+  br i1 %cmp2, label %for.body, label %for.cond.cleanup
+
+for.body:                                         ; preds = %entry, %for.body
+  %index.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27}"() nounwind
+  %.tr = call i32 @llvm.vscale.i32()
+  %conv = shl nuw nsw i32 %.tr, 4
+  call void @foo(i32 %conv)
+  %inc = add nuw nsw i32 %index.03, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+}
+
+; This test doesn't strictly make sense, because it passes a scalable predicate
+; to a function, which makes little sense if the VL is not the same in/out of
+; streaming-SVE mode. If the VL is known to be the same, then we could just as
+; well rematerialize the `ptrue` inside the call sequence. However, the purpose
+; of this test is more to ensure that the logic works, which may also trigger
+; when the value is not being passed as argument (e.g. when it is hoisted from
+; a loop and placed inside the call sequence).
+;
+; FIXME: This test also exposes another bug, where the 'mul vl' addressing mode
+; is used before/after the smstop. This will be fixed in a future patch.
+define void @dont_rematerialize_ptrue(i32 %N) #0 {
+; CHECK-LABEL: dont_rematerialize_ptrue:
+; CHECK:        ptrue [[PTRUE:p[0-9]+]].b
+; CHECK:        str [[PTRUE]], [[[SPILL_ADDR:.*]]]
+; CHECK:        smstop sm
+; CHECK:        ldr p0, [[[SPILL_ADDR]]]
+; CHECK-NOT:    ptrue
+; CHECK:        bl      bar
+; CHECK:        smstart  sm
+entry:
+  %cmp2 = icmp sgt i32 %N, 0
+  br i1 %cmp2, label %for.body, label %for.cond.cleanup
+
+for.body:                                         ; preds = %entry, %for.body
+  %index.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27}"() nounwind
+  %ptrue.ins = insertelement <vscale x 16 x i1> poison, i1 1, i32 0
+  %ptrue = shufflevector <vscale x 16 x i1> %ptrue.ins, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  call void @bar(<vscale x 16 x i1> %ptrue)
+  %inc = add nuw nsw i32 %index.03, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+}
+declare void @foo(i32)
+declare void @bar(<vscale x 16 x i1>)
+declare i32 @llvm.vscale.i32()
+
+attributes #0 = { "aarch64_pstate_sm_enabled" "frame-pointer"="non-leaf" "target-features"="+sme,+sve" }


        

