[llvm] 26bf877 - [PowerPC] Fix spilling of vector registers in PEI of EH aware functions

Nemanja Ivanovic via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 7 12:42:09 PST 2020


Author: Nemanja Ivanovic
Date: 2020-02-07T14:41:52-06:00
New Revision: 26bf877ec5ce07eaaf2ebf19e78f26fa59a8e41a

URL: https://github.com/llvm/llvm-project/commit/26bf877ec5ce07eaaf2ebf19e78f26fa59a8e41a
DIFF: https://github.com/llvm/llvm-project/commit/26bf877ec5ce07eaaf2ebf19e78f26fa59a8e41a.diff

LOG: [PowerPC] Fix spilling of vector registers in PEI of EH aware functions

On little endian targets prior to Power9, we spill vector registers using a
swapping store (i.e. stxvd2x saves the vector with the two doublewords in
big endian order regardless of endianness). This is generally not a problem
since we restore them using the corresponding swapping load (lxvd2x). However,
if the restore is done by the unwinder, the vector register contains data in
the incorrect order.
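
To illustrate, here is a minimal, self-contained C++ sketch (not LLVM code;
all names are invented for this example) of why the pairing matters: a
swapping store followed by a plain byte-for-byte restore, which is what the
unwinder effectively does, hands back the doublewords swapped.

    #include <array>
    #include <cassert>
    #include <cstdint>

    using V2x64 = std::array<uint64_t, 2>;

    // Models stxvd2x on little endian: doublewords land in memory swapped.
    V2x64 swappingStore(const V2x64 &Reg) { return {Reg[1], Reg[0]}; }

    // Models lxvd2x: the paired swapping load undoes the swap.
    V2x64 swappingLoad(const V2x64 &Slot) { return {Slot[1], Slot[0]}; }

    // Models an unwinder restore (or Altivec lvx): a plain copy of the slot.
    V2x64 directLoad(const V2x64 &Slot) { return Slot; }

    int main() {
      V2x64 Reg = {0x1111222233334444ULL, 0x5555666677778888ULL};
      V2x64 Slot = swappingStore(Reg);
      assert(swappingLoad(Slot) == Reg); // compiler-paired reload: correct
      assert(directLoad(Slot) != Reg);   // unwinder reload: swapped
    }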

This patch fixes that by using Altivec loads/stores for vector saves and
restores in PEI (which keep the element order correct) under these specific
conditions (sketched in the snippet below):
- the function is EH aware
- the subtarget requires swaps for VSX memops (little endian prior to Power9)
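
Condensed, the selection added to PEI (see the PPCFrameLowering.cpp hunks
below) amounts to the following sketch; the enum and helper are illustrative
stand-ins, not LLVM API:

    #include <iostream>

    enum class SpillOpcode { STXVD2X /* VSX swapping store */, STVX /* Altivec */ };

    // NeedsSwaps stands in for Subtarget.needsSwapsForVSXMemOps();
    // HasNoUnwind for MF->getFunction().hasFnAttribute(Attribute::NoUnwind).
    SpillOpcode pickVectorSpill(bool NeedsSwaps, bool HasNoUnwind) {
      // An EH-aware function on a swapping subtarget must keep the in-memory
      // element order identical to the register, so use the Altivec opcode.
      if (NeedsSwaps && !HasNoUnwind)
        return SpillOpcode::STVX;
      return SpillOpcode::STXVD2X;
    }

    int main() {
      std::cout << (pickVectorSpill(true, false) == SpillOpcode::STVX) << '\n'; // prints 1
    }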

Differential revision: https://reviews.llvm.org/D73692

Added: 
    

Modified: 
    llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
    llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
    llvm/lib/Target/PowerPC/PPCInstrInfo.h
    llvm/test/CodeGen/PowerPC/CSR-fit.ll
    llvm/test/CodeGen/PowerPC/reg-scavenging.ll
    llvm/test/CodeGen/PowerPC/vsxD-Form-spills.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index 19564348bf0f..15b2d9cb5dc2 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -2241,8 +2241,15 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
         // Use !IsLiveIn for the kill flag.
         // We do not want to kill registers that are live in this function
         // before their use because they will become undefined registers.
-        TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn,
-                                CSI[i].getFrameIdx(), RC, TRI);
+        // Functions without NoUnwind need to preserve the order of elements in
+        // saved vector registers.
+        if (Subtarget.needsSwapsForVSXMemOps() &&
+            !MF->getFunction().hasFnAttribute(Attribute::NoUnwind))
+          TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn,
+                                       CSI[i].getFrameIdx(), RC, TRI);
+        else
+          TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, CSI[i].getFrameIdx(),
+                                  RC, TRI);
       }
     }
   }
@@ -2394,7 +2401,16 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
       } else {
        // Default behavior for non-CR saves.
         const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-        TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI);
+
+        // Functions without NoUnwind need to preserve the order of elements in
+        // saved vector registers.
+        if (Subtarget.needsSwapsForVSXMemOps() &&
+            !MF->getFunction().hasFnAttribute(Attribute::NoUnwind))
+          TII.loadRegFromStackSlotNoUpd(MBB, I, Reg, CSI[i].getFrameIdx(), RC,
+                                        TRI);
+        else
+          TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI);
+
         assert(I != MBB.begin() &&
                "loadRegFromStackSlot didn't insert any code!");
       }

diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index cd03fb66cfbe..1b3c52b4edf0 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -1222,24 +1222,13 @@ void PPCInstrInfo::StoreRegToStackSlot(
     FuncInfo->setHasNonRISpills();
 }
 
-void PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
-                                       MachineBasicBlock::iterator MI,
-                                       Register SrcReg, bool isKill,
-                                       int FrameIdx,
-                                       const TargetRegisterClass *RC,
-                                       const TargetRegisterInfo *TRI) const {
+void PPCInstrInfo::storeRegToStackSlotNoUpd(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg,
+    bool isKill, int FrameIdx, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
   SmallVector<MachineInstr *, 4> NewMIs;
 
-  // We need to avoid a situation in which the value from a VRRC register is
-  // spilled using an Altivec instruction and reloaded into a VSRC register
-  // using a VSX instruction. The issue with this is that the VSX
-  // load/store instructions swap the doublewords in the vector and the Altivec
-  // ones don't. The register classes on the spill/reload may be different if
-  // the register is defined using an Altivec instruction and is then used by a
-  // VSX instruction.
-  RC = updatedRC(RC);
-
   StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs);
 
   for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
@@ -1253,6 +1242,23 @@ void PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   NewMIs.back()->addMemOperand(MF, MMO);
 }
 
+void PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MI,
+                                       Register SrcReg, bool isKill,
+                                       int FrameIdx,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI) const {
+  // We need to avoid a situation in which the value from a VRRC register is
+  // spilled using an Altivec instruction and reloaded into a VSRC register
+  // using a VSX instruction. The issue with this is that the VSX
+  // load/store instructions swap the doublewords in the vector and the Altivec
+  // ones don't. The register classes on the spill/reload may be different if
+  // the register is defined using an Altivec instruction and is then used by a
+  // VSX instruction.
+  RC = updatedRC(RC);
+  storeRegToStackSlotNoUpd(MBB, MI, SrcReg, isKill, FrameIdx, RC, TRI);
+}
+
 void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
                                         unsigned DestReg, int FrameIdx,
                                         const TargetRegisterClass *RC,
@@ -1274,12 +1280,10 @@ void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
     FuncInfo->setHasNonRISpills();
 }
 
-void
-PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MI,
-                                   Register DestReg, int FrameIdx,
-                                   const TargetRegisterClass *RC,
-                                   const TargetRegisterInfo *TRI) const {
+void PPCInstrInfo::loadRegFromStackSlotNoUpd(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg,
+    int FrameIdx, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
   SmallVector<MachineInstr*, 4> NewMIs;
   DebugLoc DL;
@@ -1288,16 +1292,6 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
   FuncInfo->setHasSpills();
 
-  // We need to avoid a situation in which the value from a VRRC register is
-  // spilled using an Altivec instruction and reloaded into a VSRC register
-  // using a VSX instruction. The issue with this is that the VSX
-  // load/store instructions swap the doublewords in the vector and the Altivec
-  // ones don't. The register classes on the spill/reload may be different if
-  // the register is defined using an Altivec instruction and is then used by a
-  // VSX instruction.
-  if (Subtarget.hasVSX() && RC == &PPC::VRRCRegClass)
-    RC = &PPC::VSRCRegClass;
-
   LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs);
 
   for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
@@ -1311,6 +1305,23 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   NewMIs.back()->addMemOperand(MF, MMO);
 }
 
+void PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        Register DestReg, int FrameIdx,
+                                        const TargetRegisterClass *RC,
+                                        const TargetRegisterInfo *TRI) const {
+  // We need to avoid a situation in which the value from a VRRC register is
+  // spilled using an Altivec instruction and reloaded into a VSRC register
+  // using a VSX instruction. The issue with this is that the VSX
+  // load/store instructions swap the doublewords in the vector and the Altivec
+  // ones don't. The register classes on the spill/reload may be different if
+  // the register is defined using an Altivec instruction and is then used by a
+  // VSX instruction.
+  RC = updatedRC(RC);
+
+  loadRegFromStackSlotNoUpd(MBB, MI, DestReg, FrameIdx, RC, TRI);
+}
+
 bool PPCInstrInfo::
 reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
   assert(Cond.size() == 2 && "Invalid PPC branch opcode!");

diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index 2bc5e13e6640..12d7e27326a0 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -296,12 +296,30 @@ class PPCInstrInfo : public PPCGenInstrInfo {
                            const TargetRegisterClass *RC,
                            const TargetRegisterInfo *TRI) const override;
 
+  // Emits a register spill without updating the register class for vector
+  // registers. This ensures that when we spill a vector register the
+  // element order in the register is the same as it was in memory.
+  void storeRegToStackSlotNoUpd(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI,
+                                unsigned SrcReg, bool isKill, int FrameIndex,
+                                const TargetRegisterClass *RC,
+                                const TargetRegisterInfo *TRI) const;
+
   void loadRegFromStackSlot(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MBBI,
                             Register DestReg, int FrameIndex,
                             const TargetRegisterClass *RC,
                             const TargetRegisterInfo *TRI) const override;
 
+  // Emits a register reload without updating the register class for vector
+  // registers. This ensures that when we reload a vector register the
+  // element order in the register is the same as it was in memory.
+  void loadRegFromStackSlotNoUpd(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI,
+                                 unsigned DestReg, int FrameIndex,
+                                 const TargetRegisterClass *RC,
+                                 const TargetRegisterInfo *TRI) const;
+
   unsigned getStoreOpcodeForSpill(unsigned Reg,
                                   const TargetRegisterClass *RC = nullptr) const;
 

diff --git a/llvm/test/CodeGen/PowerPC/CSR-fit.ll b/llvm/test/CodeGen/PowerPC/CSR-fit.ll
index ebaffdb05346..0f17776ec05c 100644
--- a/llvm/test/CodeGen/PowerPC/CSR-fit.ll
+++ b/llvm/test/CodeGen/PowerPC/CSR-fit.ll
@@ -126,9 +126,9 @@ define dso_local signext i32 @caller3(i32 signext %a, i32 signext %b) local_unna
 ; CHECK-PWR8-NEXT:    .cfi_offset v20, -192
 ; CHECK-PWR8-NEXT:    .cfi_offset v21, -176
 ; CHECK-PWR8-NEXT:    li r5, 48
-; CHECK-PWR8-NEXT:    stxvd2x v20, r1, r5 # 16-byte Folded Spill
+; CHECK-PWR8-NEXT:    stvx v20, r1, r5 # 16-byte Folded Spill
 ; CHECK-PWR8-NEXT:    li r5, 64
-; CHECK-PWR8-NEXT:    stxvd2x v21, r1, r5 # 16-byte Folded Spill
+; CHECK-PWR8-NEXT:    stvx v21, r1, r5 # 16-byte Folded Spill
 ; CHECK-PWR8-NEXT:    #APP
 ; CHECK-PWR8-NEXT:    add r3, r3, r4
 ; CHECK-PWR8-NEXT:    #NO_APP
@@ -136,9 +136,9 @@ define dso_local signext i32 @caller3(i32 signext %a, i32 signext %b) local_unna
 ; CHECK-PWR8-NEXT:    bl callee
 ; CHECK-PWR8-NEXT:    nop
 ; CHECK-PWR8-NEXT:    li r4, 64
-; CHECK-PWR8-NEXT:    lxvd2x v21, r1, r4 # 16-byte Folded Reload
+; CHECK-PWR8-NEXT:    lvx v21, r1, r4 # 16-byte Folded Reload
 ; CHECK-PWR8-NEXT:    li r4, 48
-; CHECK-PWR8-NEXT:    lxvd2x v20, r1, r4 # 16-byte Folded Reload
+; CHECK-PWR8-NEXT:    lvx v20, r1, r4 # 16-byte Folded Reload
 ; CHECK-PWR8-NEXT:    addi r1, r1, 240
 ; CHECK-PWR8-NEXT:    ld r0, 16(r1)
 ; CHECK-PWR8-NEXT:    mtlr r0
@@ -184,9 +184,9 @@ define dso_local signext i32 @caller4(i32 signext %a, i32 signext %b) local_unna
 ; CHECK-PWR8-NEXT:    .cfi_offset v20, -192
 ; CHECK-PWR8-NEXT:    .cfi_offset v21, -176
 ; CHECK-PWR8-NEXT:    li r5, 48
-; CHECK-PWR8-NEXT:    stxvd2x v20, r1, r5 # 16-byte Folded Spill
+; CHECK-PWR8-NEXT:    stvx v20, r1, r5 # 16-byte Folded Spill
 ; CHECK-PWR8-NEXT:    li r5, 64
-; CHECK-PWR8-NEXT:    stxvd2x v21, r1, r5 # 16-byte Folded Spill
+; CHECK-PWR8-NEXT:    stvx v21, r1, r5 # 16-byte Folded Spill
 ; CHECK-PWR8-NEXT:    #APP
 ; CHECK-PWR8-NEXT:    add r3, r3, r4
 ; CHECK-PWR8-NEXT:    #NO_APP
@@ -194,9 +194,9 @@ define dso_local signext i32 @caller4(i32 signext %a, i32 signext %b) local_unna
 ; CHECK-PWR8-NEXT:    bl callee
 ; CHECK-PWR8-NEXT:    nop
 ; CHECK-PWR8-NEXT:    li r4, 64
-; CHECK-PWR8-NEXT:    lxvd2x v21, r1, r4 # 16-byte Folded Reload
+; CHECK-PWR8-NEXT:    lvx v21, r1, r4 # 16-byte Folded Reload
 ; CHECK-PWR8-NEXT:    li r4, 48
-; CHECK-PWR8-NEXT:    lxvd2x v20, r1, r4 # 16-byte Folded Reload
+; CHECK-PWR8-NEXT:    lvx v20, r1, r4 # 16-byte Folded Reload
 ; CHECK-PWR8-NEXT:    addi r1, r1, 240
 ; CHECK-PWR8-NEXT:    ld r0, 16(r1)
 ; CHECK-PWR8-NEXT:    mtlr r0
@@ -246,9 +246,9 @@ define dso_local signext i32 @caller_mixed(i32 signext %a, i32 signext %b) local
 ; CHECK-PWR8-NEXT:    li r5, 48
 ; CHECK-PWR8-NEXT:    std r14, 240(r1) # 8-byte Folded Spill
 ; CHECK-PWR8-NEXT:    stfd f14, 384(r1) # 8-byte Folded Spill
-; CHECK-PWR8-NEXT:    stxvd2x v20, r1, r5 # 16-byte Folded Spill
+; CHECK-PWR8-NEXT:    stvx v20, r1, r5 # 16-byte Folded Spill
 ; CHECK-PWR8-NEXT:    li r5, 64
-; CHECK-PWR8-NEXT:    stxvd2x v21, r1, r5 # 16-byte Folded Spill
+; CHECK-PWR8-NEXT:    stvx v21, r1, r5 # 16-byte Folded Spill
 ; CHECK-PWR8-NEXT:    #APP
 ; CHECK-PWR8-NEXT:    add r3, r3, r4
 ; CHECK-PWR8-NEXT:    #NO_APP
@@ -258,9 +258,9 @@ define dso_local signext i32 @caller_mixed(i32 signext %a, i32 signext %b) local
 ; CHECK-PWR8-NEXT:    li r4, 64
 ; CHECK-PWR8-NEXT:    lfd f14, 384(r1) # 8-byte Folded Reload
 ; CHECK-PWR8-NEXT:    ld r14, 240(r1) # 8-byte Folded Reload
-; CHECK-PWR8-NEXT:    lxvd2x v21, r1, r4 # 16-byte Folded Reload
+; CHECK-PWR8-NEXT:    lvx v21, r1, r4 # 16-byte Folded Reload
 ; CHECK-PWR8-NEXT:    li r4, 48
-; CHECK-PWR8-NEXT:    lxvd2x v20, r1, r4 # 16-byte Folded Reload
+; CHECK-PWR8-NEXT:    lvx v20, r1, r4 # 16-byte Folded Reload
 ; CHECK-PWR8-NEXT:    addi r1, r1, 528
 ; CHECK-PWR8-NEXT:    ld r0, 16(r1)
 ; CHECK-PWR8-NEXT:    mtlr r0

diff --git a/llvm/test/CodeGen/PowerPC/reg-scavenging.ll b/llvm/test/CodeGen/PowerPC/reg-scavenging.ll
index 6a32c176dc4c..75cf6cc4beba 100644
--- a/llvm/test/CodeGen/PowerPC/reg-scavenging.ll
+++ b/llvm/test/CodeGen/PowerPC/reg-scavenging.ll
@@ -12,7 +12,7 @@ define dso_local signext i32 @caller(i32 signext %a, i32 signext %b) local_unnam
 ; CHECK-NEXT:    .cfi_offset lr, 16
 ; CHECK-NEXT:    .cfi_offset v20, -192
 ; CHECK-NEXT:    li r5, 48
-; CHECK-NEXT:    stxvd2x v20, r1, r5 # 16-byte Folded Spill
+; CHECK-NEXT:    stvx v20, r1, r5 # 16-byte Folded Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    add r3, r3, r4
 ; CHECK-NEXT:    #NO_APP
@@ -20,7 +20,7 @@ define dso_local signext i32 @caller(i32 signext %a, i32 signext %b) local_unnam
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    li r4, 48
-; CHECK-NEXT:    lxvd2x v20, r1, r4 # 16-byte Folded Reload
+; CHECK-NEXT:    lvx v20, r1, r4 # 16-byte Folded Reload
 ; CHECK-NEXT:    addi r1, r1, 240
 ; CHECK-NEXT:    ld r0, 16(r1)
 ; CHECK-NEXT:    mtlr r0

diff --git a/llvm/test/CodeGen/PowerPC/vsxD-Form-spills.ll b/llvm/test/CodeGen/PowerPC/vsxD-Form-spills.ll
index 3c3fe1dc1e00..5f4cfd31a2e7 100644
--- a/llvm/test/CodeGen/PowerPC/vsxD-Form-spills.ll
+++ b/llvm/test/CodeGen/PowerPC/vsxD-Form-spills.ll
@@ -8,19 +8,19 @@ define <4 x i32> @testSpill(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-DAG:    li [[REG64:[0-9]+]], 64
 ; CHECK-DAG:    li [[REG80:[0-9]+]], 80
 ; CHECK-DAG:    li [[REG96:[0-9]+]], 96
-; CHECK-DAG:    stxvd2x 60, 1, [[REG48]] # 16-byte Folded Spill
-; CHECK-DAG:    stxvd2x 61, 1, [[REG64]] # 16-byte Folded Spill
-; CHECK-DAG:    stxvd2x 62, 1, [[REG80]] # 16-byte Folded Spill
-; CHECK-DAG:    stxvd2x 63, 1, [[REG96]] # 16-byte Folded Spill
+; CHECK-DAG:    stvx 28, 1, [[REG48]] # 16-byte Folded Spill
+; CHECK-DAG:    stvx 29, 1, [[REG64]] # 16-byte Folded Spill
+; CHECK-DAG:    stvx 30, 1, [[REG80]] # 16-byte Folded Spill
+; CHECK-DAG:    stvx 31, 1, [[REG96]] # 16-byte Folded Spill
 ; CHECK:        .LBB0_3
 ; CHECK-DAG:    li [[REG96_LD:[0-9]+]], 96
 ; CHECK-DAG:    li [[REG80_LD:[0-9]+]], 80
 ; CHECK-DAG:    li [[REG64_LD:[0-9]+]], 64
 ; CHECK-DAG:    li [[REG48_LD:[0-9]+]], 48
-; CHECK-DAG:    lxvd2x 63, 1, [[REG96_LD]] # 16-byte Folded Reload
-; CHECK-DAG:    lxvd2x 62, 1, [[REG80_LD]] # 16-byte Folded Reload
-; CHECK-DAG:    lxvd2x 61, 1, [[REG64_LD]] # 16-byte Folded Reload
-; CHECK-DAG:    lxvd2x 60, 1, [[REG48_LD]] # 16-byte Folded Reload
+; CHECK-DAG:    lvx 31, 1, [[REG96_LD]] # 16-byte Folded Reload
+; CHECK-DAG:    lvx 30, 1, [[REG80_LD]] # 16-byte Folded Reload
+; CHECK-DAG:    lvx 29, 1, [[REG64_LD]] # 16-byte Folded Reload
+; CHECK-DAG:    lvx 28, 1, [[REG48_LD]] # 16-byte Folded Reload
 ; CHECK:    mtlr 0
 ; CHECK-NEXT:    blr
 ;

