[llvm] r287792 - [X86] Allow folding of stack reloads when loading a subreg of the spilled reg

Thu Nov 24 19:50:53 PST 2016

On 11/23/2016 10:33 AM, Michael Kuperstein via llvm-commits wrote:
> Author: mkuper
> Date: Wed Nov 23 12:33:49 2016
> New Revision: 287792
>
> URL: http://llvm.org/viewvc/llvm-project?rev=287792&view=rev
> Log:
> [X86] Allow folding of stack reloads when loading a subreg of the spilled reg
>
> We did not support subregs in InlineSpiller::foldMemoryOperand() because targets
> may not deal with them correctly.
>
> This adds a target hook to let the spiller know that a target can handle
> subregs, and actually enables it for x86 for the case of stack slot reloads.
> This fixes PR30832.
This feels like a weird design. If I remember correctly,
foldMemoryOperand is allowed to do nothing if it doesn't know how to
fold. Given this, why not just update the in-tree targets to check for
a subreg load and bail out? Why do we need yet another target hook?

>
> Differential Revision: https://reviews.llvm.org/D26521
>
> Modified:
>      llvm/trunk/include/llvm/Target/TargetInstrInfo.h
>      llvm/trunk/lib/CodeGen/InlineSpiller.cpp
>      llvm/trunk/lib/CodeGen/TargetInstrInfo.cpp
>      llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
>      llvm/trunk/lib/Target/X86/X86InstrInfo.h
>      llvm/trunk/test/CodeGen/X86/partial-fold32.ll
>      llvm/trunk/test/CodeGen/X86/partial-fold64.ll
>      llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll
>
> Modified: llvm/trunk/include/llvm/Target/TargetInstrInfo.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetInstrInfo.h?rev=287792&r1=287791&r2=287792&view=diff
> ==============================================================================
> --- llvm/trunk/include/llvm/Target/TargetInstrInfo.h (original)
> +++ llvm/trunk/include/llvm/Target/TargetInstrInfo.h Wed Nov 23 12:33:49 2016
> @@ -817,6 +817,20 @@ public:
>     /// anything was changed.
>     virtual bool expandPostRAPseudo(MachineInstr &MI) const { return false; }
>   
> +  /// Check whether the target can fold a load that feeds a subreg operand
> +  /// (or a subreg operand that feeds a store).
> +  /// For example, X86 may want to return true if it can fold
> +  /// movl (%esp), %eax
> +  /// subb, %al, ...
> +  /// Into:
> +  /// subb (%esp), ...
> +  ///
> +  /// Ideally, we'd like the target implementation of foldMemoryOperand() to
> +  /// reject subregs - but since this behavior used to be enforced in the
> +  /// target-independent code, moving this responsibility to the targets
> +  /// has the potential of causing nasty silent breakage in out-of-tree targets.
> +  virtual bool isSubregFoldable() const { return false; }
> +
>     /// Attempt to fold a load or store of the specified stack
>     /// slot into the specified machine instruction for the specified operand(s).
>     /// If this is possible, a new instruction is returned with the specified
>
> Modified: llvm/trunk/lib/CodeGen/InlineSpiller.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/InlineSpiller.cpp?rev=287792&r1=287791&r2=287792&view=diff
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/InlineSpiller.cpp (original)
> +++ llvm/trunk/lib/CodeGen/InlineSpiller.cpp Wed Nov 23 12:33:49 2016
> @@ -739,9 +739,12 @@ foldMemoryOperand(ArrayRef<std::pair<Mac
>     bool WasCopy = MI->isCopy();
>     unsigned ImpReg = 0;
>   
> -  bool SpillSubRegs = (MI->getOpcode() == TargetOpcode::STATEPOINT ||
> -                       MI->getOpcode() == TargetOpcode::PATCHPOINT ||
> -                       MI->getOpcode() == TargetOpcode::STACKMAP);
> +  // Spill subregs if the target allows it.
> +  // We always want to spill subregs for stackmap/patchpoint pseudos.
> +  bool SpillSubRegs = TII.isSubregFoldable() ||
> +                      MI->getOpcode() == TargetOpcode::STATEPOINT ||
> +                      MI->getOpcode() == TargetOpcode::PATCHPOINT ||
> +                      MI->getOpcode() == TargetOpcode::STACKMAP;
>   
>     // TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied
>     // operands.
> @@ -754,7 +757,7 @@ foldMemoryOperand(ArrayRef<std::pair<Mac
>         ImpReg = MO.getReg();
>         continue;
>       }
> -    // FIXME: Teach targets to deal with subregs.
> +
>       if (!SpillSubRegs && MO.getSubReg())
>         return false;
>       // We cannot fold a load instruction into a def.
>
> Modified: llvm/trunk/lib/CodeGen/TargetInstrInfo.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/TargetInstrInfo.cpp?rev=287792&r1=287791&r2=287792&view=diff
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/TargetInstrInfo.cpp (original)
> +++ llvm/trunk/lib/CodeGen/TargetInstrInfo.cpp Wed Nov 23 12:33:49 2016
> @@ -515,6 +515,31 @@ MachineInstr *TargetInstrInfo::foldMemor
>     assert(MBB && "foldMemoryOperand needs an inserted instruction");
>     MachineFunction &MF = *MBB->getParent();
>   
> +  // If we're not folding a load into a subreg, the size of the load is the
> +  // size of the spill slot. But if we are, we need to figure out what the
> +  // actual load size is.
> +  int64_t MemSize = 0;
> +  const MachineFrameInfo &MFI = MF.getFrameInfo();
> +  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
> +
> +  if (Flags & MachineMemOperand::MOStore) {
> +    MemSize = MFI.getObjectSize(FI);
> +  } else {
> +    for (unsigned Idx : Ops) {
> +      int64_t OpSize = MFI.getObjectSize(FI);
> +
> +      if (auto SubReg = MI.getOperand(Idx).getSubReg()) {
> +        unsigned SubRegSize = TRI->getSubRegIdxSize(SubReg);
> +        if (SubRegSize > 0 && !(SubRegSize % 8))
> +          OpSize = SubRegSize / 8;
> +      }
> +
> +      MemSize = std::max(MemSize, OpSize);
> +    }
> +  }
> +
> +  assert(MemSize && "Did not expect a zero-sized stack slot");
> +
>     MachineInstr *NewMI = nullptr;
>   
>     if (MI.getOpcode() == TargetOpcode::STACKMAP ||
> @@ -538,10 +563,9 @@ MachineInstr *TargetInstrInfo::foldMemor
>       assert((!(Flags & MachineMemOperand::MOLoad) ||
>               NewMI->mayLoad()) &&
>              "Folded a use to a non-load!");
> -    const MachineFrameInfo &MFI = MF.getFrameInfo();
>       assert(MFI.getObjectOffset(FI) != -1);
>       MachineMemOperand *MMO = MF.getMachineMemOperand(
> -        MachinePointerInfo::getFixedStack(MF, FI), Flags, MFI.getObjectSize(FI),
> +        MachinePointerInfo::getFixedStack(MF, FI), Flags, MemSize,
>           MFI.getObjectAlignment(FI));
>       NewMI->addMemOperand(MF, MMO);
>   
> @@ -558,7 +582,6 @@ MachineInstr *TargetInstrInfo::foldMemor
>   
>     const MachineOperand &MO = MI.getOperand(1 - Ops[0]);
>     MachineBasicBlock::iterator Pos = MI;
> -  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
>   
>     if (Flags == MachineMemOperand::MOStore)
>       storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI);
>
> Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=287792&r1=287791&r2=287792&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Wed Nov 23 12:33:49 2016
> @@ -6843,6 +6843,14 @@ X86InstrInfo::foldMemoryOperandImpl(Mach
>     if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
>       return nullptr;
>   
> +  // Don't fold subreg spills, or reloads that use a high subreg.
> +  for (auto Op : Ops) {
> +    MachineOperand &MO = MI.getOperand(Op);
> +    auto SubReg = MO.getSubReg();
> +    if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
> +      return nullptr;
> +  }
> +
>     const MachineFrameInfo &MFI = MF.getFrameInfo();
>     unsigned Size = MFI.getObjectSize(FrameIndex);
>     unsigned Alignment = MFI.getObjectAlignment(FrameIndex);
> @@ -6967,6 +6975,14 @@ MachineInstr *X86InstrInfo::foldMemoryOp
>       MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
>       MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
>       LiveIntervals *LIS) const {
> +
> +  // TODO: Support the case where LoadMI loads a wide register, but MI
> +  // only uses a subreg.
> +  for (auto Op : Ops) {
> +    if (MI.getOperand(Op).getSubReg())
> +      return nullptr;
> +  }
> +
>     // If loading from a FrameIndex, fold directly from the FrameIndex.
>     unsigned NumOps = LoadMI.getDesc().getNumOperands();
>     int FrameIndex;
>
> Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.h?rev=287792&r1=287791&r2=287792&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86InstrInfo.h (original)
> +++ llvm/trunk/lib/Target/X86/X86InstrInfo.h Wed Nov 23 12:33:49 2016
> @@ -378,6 +378,10 @@ public:
>   
>     bool expandPostRAPseudo(MachineInstr &MI) const override;
>   
> +  /// Check whether the target can fold a load that feeds a subreg operand
> +  /// (or a subreg operand that feeds a store).
> +  bool isSubregFoldable() const override { return true; }
> +
>     /// foldMemoryOperand - If this target supports it, fold a load or store of
>     /// the specified stack slot into the specified machine instruction for the
>     /// specified operand(s).  If this is possible, the target should perform the
>
> Modified: llvm/trunk/test/CodeGen/X86/partial-fold32.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/partial-fold32.ll?rev=287792&r1=287791&r2=287792&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/partial-fold32.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/partial-fold32.ll Wed Nov 23 12:33:49 2016
> @@ -3,8 +3,7 @@
>   define fastcc i8 @fold32to8(i32 %add, i8 %spill) {
>   ; CHECK-LABEL: fold32to8:
>   ; CHECK:    movl %ecx, (%esp) # 4-byte Spill
> -; CHECK:    movl (%esp), %eax # 4-byte Reload
> -; CHECK:    subb %al, %dl
> +; CHECK:    subb (%esp), %dl  # 1-byte Folded Reload
>   entry:
>     tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
>     %trunc = trunc i32 %add to i8
>
> Modified: llvm/trunk/test/CodeGen/X86/partial-fold64.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/partial-fold64.ll?rev=287792&r1=287791&r2=287792&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/partial-fold64.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/partial-fold64.ll Wed Nov 23 12:33:49 2016
> @@ -3,8 +3,7 @@
>   define i32 @fold64to32(i64 %add, i32 %spill) {
>   ; CHECK-LABEL: fold64to32:
>   ; CHECK:    movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
> -; CHECK:    movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
> -; CHECK:    subl %eax, %esi
> +; CHECK:    subl -{{[0-9]+}}(%rsp), %esi # 4-byte Folded Reload
>   entry:
>     tail call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"()
>     %trunc = trunc i64 %add to i32
> @@ -15,8 +14,7 @@ entry:
>   define i8 @fold64to8(i64 %add, i8 %spill) {
>   ; CHECK-LABEL: fold64to8:
>   ; CHECK:    movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
> -; CHECK:    movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
> -; CHECK:    subb %al, %sil
> +; CHECK:    subb -{{[0-9]+}}(%rsp), %sil # 1-byte Folded Reload
>   entry:
>     tail call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"()
>     %trunc = trunc i64 %add to i8
>
> Modified: llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll?rev=287792&r1=287791&r2=287792&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll Wed Nov 23 12:33:49 2016
> @@ -4788,9 +4788,8 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x
>   ; AVX1-NEXT:    orl %ebx, %r14d
>   ; AVX1-NEXT:    shlq $32, %r14
>   ; AVX1-NEXT:    orq %r15, %r14
> -; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
> -; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
> -; AVX1-NEXT:    vzeroupper
> +; AVX1-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
> +; AVX1-NEXT:    # xmm0 = mem[1,0]
>   ; AVX1-NEXT:    callq __truncdfhf2
>   ; AVX1-NEXT:    movw %ax, %bx
>   ; AVX1-NEXT:    shll $16, %ebx
> @@ -4856,9 +4855,8 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x
>   ; AVX2-NEXT:    orl %ebx, %r14d
>   ; AVX2-NEXT:    shlq $32, %r14
>   ; AVX2-NEXT:    orq %r15, %r14
> -; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
> -; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
> -; AVX2-NEXT:    vzeroupper
> +; AVX2-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
> +; AVX2-NEXT:    # xmm0 = mem[1,0]
>   ; AVX2-NEXT:    callq __truncdfhf2
>   ; AVX2-NEXT:    movw %ax, %bx
>   ; AVX2-NEXT:    shll $16, %ebx
> @@ -5585,9 +5583,8 @@ define void @store_cvt_8f64_to_8i16(<8 x
>   ; AVX1-NEXT:    vzeroupper
>   ; AVX1-NEXT:    callq __truncdfhf2
>   ; AVX1-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
> -; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
> -; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
> -; AVX1-NEXT:    vzeroupper
> +; AVX1-NEXT:    vpermilpd $1, {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
> +; AVX1-NEXT:    # xmm0 = mem[1,0]
>   ; AVX1-NEXT:    callq __truncdfhf2
>   ; AVX1-NEXT:    movl %eax, %r12d
>   ; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
> @@ -5654,9 +5651,8 @@ define void @store_cvt_8f64_to_8i16(<8 x
>   ; AVX2-NEXT:    vzeroupper
>   ; AVX2-NEXT:    callq __truncdfhf2
>   ; AVX2-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
> -; AVX2-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
> -; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
> -; AVX2-NEXT:    vzeroupper
> +; AVX2-NEXT:    vpermilpd $1, {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
> +; AVX2-NEXT:    # xmm0 = mem[1,0]
>   ; AVX2-NEXT:    callq __truncdfhf2
>   ; AVX2-NEXT:    movl %eax, %r12d
>   ; AVX2-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits