[llvm] 991a36d - [AArch64][SME] Prevent SVE object address calculations between smstop and call

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 5 01:11:25 PDT 2022


Author: David Sherwood
Date: 2022-10-05T08:11:16Z
New Revision: 991a36da1b754dd7e51701253eff7da174cf4a27

URL: https://github.com/llvm/llvm-project/commit/991a36da1b754dd7e51701253eff7da174cf4a27
DIFF: https://github.com/llvm/llvm-project/commit/991a36da1b754dd7e51701253eff7da174cf4a27.diff

LOG: [AArch64][SME] Prevent SVE object address calculations between smstop and call

This patch introduces a new AArch64 ISD node (OBSCURE_COPY) that can
be used when we want to prevent SVE object address calculations
from being rematerialised between an smstop/smstart and a call.
At the moment we use COPY to copy the frame index to a register,
which leads to problems because the "simple register coalescing"
pass understands the COPY instruction and may rematerialise the
address calculation (an 'addvl' instruction) between an smstop and
a call. An 'addvl' executed on the other side of the mode change may
compute a different address, because the streaming SVE vector length
is not guaranteed to equal the non-streaming SVE vector length.
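
To illustrate the problem (a hand-written sketch, not actual compiler
output), rematerialisation could previously move the 'addvl' past the
'smstop', so the address was computed with the wrong vector length:

    smstop sm
    addvl  x0, sp, #2      // runs with the non-streaming VL, but the
                           // object was laid out with the streaming VL
    bl     foo

whereas the correct sequence computes the address while still in
streaming mode and only moves it into the argument register afterwards:

    addvl  x9, sp, #2      // computed with the streaming VL
    smstop sm
    mov    x0, x9
    bl     foo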

The new ISD opcode OBSCURE_COPY is lowered to a new pseudo
instruction, also called OBSCURE_COPY. This ensures the copy cannot
be rematerialised; we expand it into a simple register move very
late in the machine instruction pipeline, in the AArch64ExpandPseudo
pass.
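
For example (register names are illustrative), a pseudo such as

    $x0 = OBSCURE_COPY $x9

survives register coalescing untouched and is only expanded by the
AArch64ExpandPseudo pass into a plain register move:

    mov x0, x9             // encoded as 'orr x0, xzr, x9'; omitted
                           // entirely when source and destination match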

A new test is added to:

llvm/test/CodeGen/AArch64/sme-streaming-interface.ll

Differential Revision: https://reviews.llvm.org/D134940

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h
    llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
    llvm/test/CodeGen/AArch64/sme-streaming-interface.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 0b36f1f45fbd..318f8b8554db 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1377,6 +1377,17 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
        NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated.
      return true;
    }
+   case AArch64::OBSCURE_COPY: {
+     if (MI.getOperand(0).getReg() != MI.getOperand(1).getReg()) {
+       BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXrs))
+           .add(MI.getOperand(0))
+           .addReg(AArch64::XZR)
+           .add(MI.getOperand(1))
+           .addImm(0);
+     }
+     MI.eraseFromParent();
+     return true;
+   }
   }
   return false;
 }

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2b3449bd303b..6865be2d04ca 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2065,6 +2065,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((AArch64ISD::NodeType)Opcode) {
   case AArch64ISD::FIRST_NUMBER:
     break;
+    MAKE_CASE(AArch64ISD::OBSCURE_COPY)
     MAKE_CASE(AArch64ISD::SMSTART)
     MAKE_CASE(AArch64ISD::SMSTOP)
     MAKE_CASE(AArch64ISD::CALL)
@@ -7036,6 +7037,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
           return ArgReg.Reg == VA.getLocReg();
         });
       } else {
+        // Add an extra level of indirection for streaming mode changes by
+        // using a pseudo copy node that cannot be rematerialised between a
+        // smstart/smstop and the call by the simple register coalescer.
+        if (RequiresSMChange && isa<FrameIndexSDNode>(Arg))
+          Arg = DAG.getNode(AArch64ISD::OBSCURE_COPY, DL, MVT::i64, Arg);
         RegsToPass.emplace_back(VA.getLocReg(), Arg);
         RegsUsed.insert(VA.getLocReg());
         const TargetOptions &Options = DAG.getTarget().Options;

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index bf5bce71dbdc..786dd249fa8e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -58,6 +58,13 @@ enum NodeType : unsigned {
 
   CALL_BTI, // Function call followed by a BTI instruction.
 
+  // Essentially like a normal COPY that works on GPRs, but cannot be
+  // rematerialised by passes like the simple register coalescer. It's
+  // required for SME when lowering calls because we cannot allow frame
+  // index calculations using addvl to slip in between the smstart/smstop
+  // and the bl instruction. The scalable vector length may change across
+  // the smstart/smstop boundary.
+  OBSCURE_COPY,
   SMSTART,
   SMSTOP,
 

diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 69d1eda88937..f1359d9a3737 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -19,6 +19,8 @@ def AArch64_smstop  : SDNode<"AArch64ISD::SMSTOP", SDTypeProfile<0, 3,
                              [SDNPHasChain, SDNPSideEffect, SDNPVariadic,
                               SDNPOptInGlue, SDNPOutGlue]>;
 
+def AArch64ObscureCopy : SDNode<"AArch64ISD::OBSCURE_COPY", SDTypeProfile<1, 1, []>, []>;
+
 //===----------------------------------------------------------------------===//
 // Add vector elements horizontally or vertically to ZA tile.
 //===----------------------------------------------------------------------===//
@@ -202,6 +204,10 @@ def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val),
 def : Pat<(i64 (int_aarch64_sme_get_tpidr2)),
           (MRS 0xde85)>;
 
+def OBSCURE_COPY : Pseudo<(outs GPR64:$dst), (ins GPR64:$idx), []>, Sched<[]> { }
+def : Pat<(i64 (AArch64ObscureCopy (i64 GPR64:$idx))),
+          (OBSCURE_COPY GPR64:$idx)>;
+
 //===----------------------------------------------------------------------===//
 // SVE2 instructions
 //===----------------------------------------------------------------------===//

diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
index 2c44645abb7c..2f4c907c8724 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
@@ -360,4 +360,50 @@ define void @disable_tailcallopt() nounwind {
   ret void;
 }
 
+define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone %ptr) #1 {
+; CHECK-LABEL: call_to_non_streaming_pass_sve_objects:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    addvl x9, sp, #2
+; CHECK-NEXT:    addvl x10, sp, #1
+; CHECK-NEXT:    mov x11, sp
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    mov x0, x9
+; CHECK-NEXT:    mov x1, x10
+; CHECK-NEXT:    mov x2, x11
+; CHECK-NEXT:    mov x3, x8
+; CHECK-NEXT:    bl foo
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [sp, #2, mul vl]
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %Data1 = alloca <vscale x 16 x i8>, align 16
+  %Data2 = alloca <vscale x 16 x i8>, align 16
+  %Data3 = alloca <vscale x 16 x i8>, align 16
+  %0 = tail call i64 @llvm.aarch64.sme.cntsb()
+  call void @foo(ptr noundef nonnull %Data1, ptr noundef nonnull %Data2, ptr noundef nonnull %Data3, i64 noundef %0)
+  %1 = load <vscale x 16 x i8>, ptr %Data1, align 16
+  %vecext = extractelement <vscale x 16 x i8> %1, i64 0
+  ret i8 %vecext
+}
+
+declare i64 @llvm.aarch64.sme.cntsb()
+
+declare void @foo(ptr noundef, ptr noundef, ptr noundef, i64 noundef)
+
 attributes #0 = { nounwind "target-features"="+sve" }
attributes #1 = { nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" }

More information about the llvm-commits mailing list