[llvm] r366819 - [GlobalISel][AArch64] Teach GISel to handle shifts in load addressing modes

Jessica Paquette via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 23 09:09:42 PDT 2019


Author: paquette
Date: Tue Jul 23 09:09:42 2019
New Revision: 366819

URL: http://llvm.org/viewvc/llvm-project?rev=366819&view=rev
Log:
[GlobalISel][AArch64] Teach GISel to handle shifts in load addressing modes

When we select the XRO variants of loads, we can pull in very specific shifts:
shifts by the log2 of the size of the element being loaded. E.g.

```
ldr x1, [x2, x3, lsl #3]
```
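
For illustration only (this C++ is not part of the patch), source like the
following typically produces that addressing mode when indexing an array of
64-bit elements: the index must be scaled by 8, so it is shifted left by 3.

```
// Hypothetical example: indexing an array of 64-bit elements. With
// optimization enabled, the scaled access "p[i]" typically selects to a load
// whose offset register is shifted by 3 (log2 of the 8-byte element size),
// i.e. ldr x0, [x0, x1, lsl #3].
#include <cstdint>

int64_t load_scaled(const int64_t *p, int64_t i) { return p[i]; }
```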

This teaches GISel to handle these addressing modes when the offset register
comes from a shift specifically.

This adds a new addressing mode function, `selectAddrModeShiftedExtendXReg`,
which recognizes this pattern.
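
As a standalone sketch (simplified types, not the actual selector interface),
the shift amount is only foldable when it equals the log2 of the access size
and fits in the instruction's 3-bit shift field:

```
#include <cstdint>

// Standalone sketch of the legality check performed by the new addressing
// mode function: the folded shift must equal log2 of the access size in
// bytes and must fit in 3 bits. For an 8-byte load the only foldable shift
// amount is 3, matching "lsl #3" above.
static bool isFoldableShiftAmount(int64_t ShiftImm, unsigned SizeInBytes) {
  // Reject access sizes that are not a power of two, and byte-sized
  // accesses, whose legal shift would be 0 (the patch bails out there too).
  if (SizeInBytes == 0 || (SizeInBytes & (SizeInBytes - 1)) != 0)
    return false;
  unsigned LegalShift = 0;
  for (unsigned S = SizeInBytes; S > 1; S >>= 1)
    ++LegalShift;
  if (LegalShift == 0)
    return false;
  // The immediate must be non-negative, fit in 3 bits, and match exactly.
  return (ShiftImm & 0x7) == ShiftImm &&
         static_cast<unsigned>(ShiftImm) == LegalShift;
}
```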

It also packages this up with `selectAddrModeRegisterOffset` into
`selectAddrModeXRO`. This is intended to be equivalent to `selectAddrModeXRO`
in AArch64ISelDAGToDAG.

Also update load-addressing-modes.mir to show that all of the cases here work.
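
Those tests exercise the fold-worthiness heuristic added here: always fold a
single-use shift or when the function is minsize; otherwise only fold on
subtargets with the lsl-fast feature, and only if every use of the shift is a
memory op. A standalone sketch of that decision (hypothetical types, not
LLVM's API):

```
#include <vector>

// Hypothetical, simplified model of each use of the shifted value.
struct UseInfo {
  bool IsLoadOrStore;
};

// Sketch of the heuristic: folding duplicates the shift into each memory op,
// so it is only worthwhile when the shift has one use, we are optimizing for
// size, or shifts are cheap (lsl-fast) and every use is a load/store.
static bool worthFoldingShift(const std::vector<UseInfo> &Uses,
                              bool HasMinSize, bool HasLSLFast) {
  if (Uses.size() == 1 || HasMinSize)
    return true;
  if (!HasLSLFast)
    return false;
  for (const UseInfo &U : Uses)
    if (!U.IsLoadOrStore)
      return false;
  return true;
}
```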

Differential Revision: https://reviews.llvm.org/D65119

Modified:
    llvm/trunk/lib/Target/AArch64/AArch64InstructionSelector.cpp
    llvm/trunk/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir

Modified: llvm/trunk/lib/Target/AArch64/AArch64InstructionSelector.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64InstructionSelector.cpp?rev=366819&r1=366818&r2=366819&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstructionSelector.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64InstructionSelector.cpp Tue Jul 23 09:09:42 2019
@@ -187,7 +187,15 @@ private:
   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
     return selectAddrModeIndexed(Root, Width / 8);
   }
+
+  bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
+                                     const MachineRegisterInfo &MRI) const;
+  ComplexRendererFns
+  selectAddrModeShiftedExtendXReg(MachineOperand &Root,
+                                  unsigned SizeInBytes) const;
   ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
+  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
+                                       unsigned SizeInBytes) const;
 
   void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const;
 
@@ -1238,8 +1246,8 @@ bool AArch64InstructionSelector::earlySe
   if (DstSize != 64)
     return false;
 
-  // Check if we can do any folding from GEPs etc. into the load.
-  auto ImmFn = selectAddrModeRegisterOffset(I.getOperand(1));
+  // Check if we can do any folding from GEPs/shifts etc. into the load.
+  auto ImmFn = selectAddrModeXRO(I.getOperand(1), MemBytes);
   if (!ImmFn)
     return false;
 
@@ -3995,6 +4003,98 @@ AArch64InstructionSelector::selectArithI
   }};
 }
 
+/// Return true if it is worth folding MI into an extended register. That is,
+/// if it's safe to pull it into the addressing mode of a load or store as a
+/// shift.
+bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
+    MachineInstr &MI, const MachineRegisterInfo &MRI) const {
+  // Always fold if there is one use, or if we're optimizing for size.
+  Register DefReg = MI.getOperand(0).getReg();
+  if (MRI.hasOneUse(DefReg) ||
+      MI.getParent()->getParent()->getFunction().hasMinSize())
+    return true;
+
+  // It's better to avoid folding and recomputing shifts when we don't have a
+  // fastpath.
+  if (!STI.hasLSLFast())
+    return false;
+
+  // We have a fastpath, so folding a shift in and potentially computing it
+  // many times may be beneficial. Check if this is only used in memory ops.
+  // If it is, then we should fold.
+  return all_of(MRI.use_instructions(DefReg),
+                [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
+}
+
+/// This is used for computing addresses like this:
+///
+/// ldr x1, [x2, x3, lsl #3]
+///
+/// Where x2 is the base register, and x3 is an offset register. The shift-left
+/// is a constant value specific to this load instruction. That is, we'll never
+/// see anything other than a 3 here (which corresponds to the size of the
+/// element being loaded.)
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
+    MachineOperand &Root, unsigned SizeInBytes) const {
+  if (!Root.isReg())
+    return None;
+  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+  // Make sure that the memory op is a valid size.
+  int64_t LegalShiftVal = Log2_32(SizeInBytes);
+  if (LegalShiftVal == 0)
+    return None;
+
+  // We want to find something like this:
+  //
+  // val = G_CONSTANT LegalShiftVal
+  // shift = G_SHL off_reg val
+  // ptr = G_GEP base_reg shift
+  // x = G_LOAD ptr
+  //
+  // And fold it into this addressing mode:
+  //
+  // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
+
+  // Check if we can find the G_GEP.
+  MachineInstr *Gep = getOpcodeDef(TargetOpcode::G_GEP, Root.getReg(), MRI);
+  if (!Gep || !isWorthFoldingIntoExtendedReg(*Gep, MRI))
+    return None;
+
+  // Now try to match the G_SHL.
+  MachineInstr *Shl =
+      getOpcodeDef(TargetOpcode::G_SHL, Gep->getOperand(2).getReg(), MRI);
+  if (!Shl || !isWorthFoldingIntoExtendedReg(*Shl, MRI))
+    return None;
+
+  // Now, try to find the specific G_CONSTANT.
+  auto ValAndVReg =
+      getConstantVRegValWithLookThrough(Shl->getOperand(2).getReg(), MRI);
+  if (!ValAndVReg)
+    return None;
+
+  // The value must fit into 3 bits, and must be positive. Make sure that is
+  // true.
+  int64_t ImmVal = ValAndVReg->Value;
+  if ((ImmVal & 0x7) != ImmVal)
+    return None;
+
+  // We are only allowed to shift by LegalShiftVal. This shift value is built
+  // into the instruction, so we can't just use whatever we want.
+  if (ImmVal != LegalShiftVal)
+    return None;
+
+  // We can use the LHS of the GEP as the base, and the LHS of the shift as an
+  // offset. Signify that we are shifting by setting the shift flag to 1.
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.add(Gep->getOperand(1)); },
+      [=](MachineInstrBuilder &MIB) { MIB.add(Shl->getOperand(1)); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(1); },
+  }};
+}
+
 /// This is used for computing addresses like this:
 ///
 /// ldr x1, [x2, x3]
@@ -4008,11 +4108,6 @@ AArch64InstructionSelector::selectAddrMo
     MachineOperand &Root) const {
   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
 
-  // If we have a constant offset, then we probably don't want to match a
-  // register offset.
-  if (isBaseWithConstantOffset(Root, MRI))
-    return None;
-
   // We need a GEP.
   MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
   if (!Gep || Gep->getOpcode() != TargetOpcode::G_GEP)
@@ -4033,6 +4128,28 @@ AArch64InstructionSelector::selectAddrMo
   }};
 }
 
+/// This is intended to be equivalent to selectAddrModeXRO in
+/// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
+                                              unsigned SizeInBytes) const {
+  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+  // If we have a constant offset, then we probably don't want to match a
+  // register offset.
+  if (isBaseWithConstantOffset(Root, MRI))
+    return None;
+
+  // Try to fold shifts into the addressing mode.
+  auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
+  if (AddrModeFns)
+    return AddrModeFns;
+
+  // If that doesn't work, see if it's possible to fold in registers from
+  // a GEP.
+  return selectAddrModeRegisterOffset(Root);
+}
+
 /// Select a "register plus unscaled signed 9-bit immediate" address.  This
 /// should only match when there is an offset that is not valid for a scaled
 /// immediate addressing mode.  The "Size" argument is the size in bytes of the

Modified: llvm/trunk/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir?rev=366819&r1=366818&r2=366819&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir (original)
+++ llvm/trunk/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir Tue Jul 23 09:09:42 2019
@@ -5,6 +5,15 @@
   define void @ldrxrox_breg_oreg(i64* %addr) { ret void }
   define void @ldrdrox_breg_oreg(i64* %addr) { ret void }
   define void @more_than_one_use(i64* %addr) { ret void }
+  define void @ldrxrox_shl(i64* %addr) { ret void }
+  define void @ldrdrox_shl(i64* %addr) { ret void }
+  define void @more_than_one_use_shl_1(i64* %addr) { ret void }
+  define void @more_than_one_use_shl_2(i64* %addr) { ret void }
+  define void @more_than_one_use_shl_lsl_fast(i64* %addr) #1 { ret void }
+  define void @more_than_one_use_shl_lsl_slow(i64* %addr) { ret void }
+  define void @more_than_one_use_shl_minsize(i64* %addr) #0 { ret void }
+  attributes #0 = { optsize minsize }
+  attributes #1 = { "target-features"="+lsl-fast" }
 ...
 
 ---
@@ -88,3 +97,236 @@ body:             |
     %6:gpr(s64) = G_ADD %5, %4
     $x0 = COPY %6(s64)
     RET_ReallyLR implicit $x0
+
+...
+---
+name:            ldrxrox_shl
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: ldrxrox_shl
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+    ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+    ; CHECK: $x2 = COPY [[LDRXroX]]
+    ; CHECK: RET_ReallyLR implicit $x2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    $x2 = COPY %5(s64)
+    RET_ReallyLR implicit $x2
+
+...
+---
+name:            ldrdrox_shl
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $d2
+    ; CHECK-LABEL: name: ldrdrox_shl
+    ; CHECK: liveins: $x0, $x1, $d2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+    ; CHECK: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+    ; CHECK: $d2 = COPY [[LDRDroX]]
+    ; CHECK: RET_ReallyLR implicit $d2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:fpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    $d2 = COPY %5(s64)
+    RET_ReallyLR implicit $d2
+
+...
+---
+name:            more_than_one_use_shl_1
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    ; Show that we can still fall back to the register-register addressing
+    ; mode when we fail to pull in the shift.
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: more_than_one_use_shl_1
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+    ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[UBFMXri]], 0, 0 :: (load 8 from %ir.addr)
+    ; CHECK: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0
+    ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[ADDXri]]
+    ; CHECK: $x2 = COPY [[ADDXrr]]
+    ; CHECK: RET_ReallyLR implicit $x2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %6:gpr(s64) = G_ADD %2, %1
+    %7:gpr(s64) = G_ADD %5, %6
+    $x2 = COPY %7(s64)
+    RET_ReallyLR implicit $x2
+
+...
+---
+name:            more_than_one_use_shl_2
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    ; Show that when the GEP is used outside a memory op, we don't do any
+    ; folding at all.
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: more_than_one_use_shl_2
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+    ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]]
+    ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load 8 from %ir.addr)
+    ; CHECK: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0
+    ; CHECK: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[ADDXri]]
+    ; CHECK: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDXrr]]
+    ; CHECK: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[ADDXrr1]]
+    ; CHECK: $x2 = COPY [[ADDXrr2]]
+    ; CHECK: RET_ReallyLR implicit $x2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %6:gpr(s64) = G_ADD %2, %1
+    %7:gpr(s64) = G_ADD %5, %6
+    %8:gpr(s64) = G_PTRTOINT %4
+    %9:gpr(s64) = G_ADD %8, %7
+    $x2 = COPY %9(s64)
+    RET_ReallyLR implicit $x2
+
+...
+---
+name:            more_than_one_use_shl_lsl_fast
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    ; Show that when we have a fastpath for shift-left, we perform the folding
+    ; if it has more than one use.
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: more_than_one_use_shl_lsl_fast
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+    ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+    ; CHECK: [[LDRXroX1:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+    ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[LDRXroX1]]
+    ; CHECK: $x2 = COPY [[ADDXrr]]
+    ; CHECK: RET_ReallyLR implicit $x2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %6:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %7:gpr(s64) = G_ADD %5, %6
+    $x2 = COPY %7(s64)
+    RET_ReallyLR implicit $x2
+
+...
+---
+name:            more_than_one_use_shl_lsl_slow
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    ; Show that we don't fold into multiple memory ops when we don't have a
+    ; fastpath for shift-left.
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: more_than_one_use_shl_lsl_slow
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri [[COPY]], 61, 60
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+    ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]]
+    ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load 8 from %ir.addr)
+    ; CHECK: [[LDRXui1:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load 8 from %ir.addr)
+    ; CHECK: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[LDRXui1]]
+    ; CHECK: $x2 = COPY [[ADDXrr1]]
+    ; CHECK: RET_ReallyLR implicit $x2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %6:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %7:gpr(s64) = G_ADD %5, %6
+    $x2 = COPY %7(s64)
+    RET_ReallyLR implicit $x2
+
+...
+---
+name:            more_than_one_use_shl_minsize
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    ; Show that when we're optimizing for size, we'll do the folding no matter
+    ; what.
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: more_than_one_use_shl_minsize
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[UBFMXri]]
+    ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+    ; CHECK: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0
+    ; CHECK: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[ADDXri]]
+    ; CHECK: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr]], [[ADDXrr1]]
+    ; CHECK: $x2 = COPY [[ADDXrr2]]
+    ; CHECK: RET_ReallyLR implicit $x2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %6:gpr(s64) = G_ADD %2, %1
+    %7:gpr(s64) = G_ADD %5, %6
+    %8:gpr(s64) = G_PTRTOINT %4
+    %9:gpr(s64) = G_ADD %8, %7
+    $x2 = COPY %9(s64)
+    RET_ReallyLR implicit $x2



