[llvm] r231527 - [AArch64][LoadStoreOptimizer] Generate LDP + SXTW instead of LD[U]R + LD[U]RSW.

Fri Aug 7 13:32:01 PDT 2015

Cool!  Thanks for the quick response!  Let me know if I can be of
assistance..

 

From: Quentin Colombet [mailto:qcolombet at apple.com] 
Sent: Friday, August 07, 2015 4:30 PM
To: mcrosier at codeaurora.org
Cc: llvm-commits at lists.llvm.org
Subject: Re: [llvm] r231527 - [AArch64][LoadStoreOptimizer] Generate LDP +
SXTW instead of LD[U]R + LD[U]RSW.

 

Hi Chad,

 

Nice catch.

There is indeed a potential bug here.

 

Looking if I can produce a test case to expose the problem.

 

Thanks,

-Quentin

On Aug 7, 2015, at 12:12 PM, Chad Rosier <mcrosier at codeaurora.org
<mailto:mcrosier at codeaurora.org> > wrote:

 

+ <mailto:llvm-commits at lists.llvm.org> llvm-commits at lists.llvm.org

Hi Quentin,
I was eyeballing this change and I'm concerned we're not properly tracking
what registers are being clobbered and used.

Inline comments below.  Please let me know if I've missed something..

 Chad

-----Original Message-----
From:  <mailto:llvm-commits-bounces at cs.uiuc.edu>
llvm-commits-bounces at cs.uiuc.edu
[ <mailto:llvm-commits-bounces at cs.uiuc.edu>
mailto:llvm-commits-bounces at cs.uiuc.edu] On Behalf Of Quentin Colombet
Sent: Friday, March 06, 2015 5:42 PM
To:  <mailto:llvm-commits at cs.uiuc.edu> llvm-commits at cs.uiuc.edu
Subject: [llvm] r231527 - [AArch64][LoadStoreOptimizer] Generate LDP + SXTW
instead of LD[U]R + LD[U]RSW.

Author: qcolombet
Date: Fri Mar  6 16:42:10 2015
New Revision: 231527

URL: http://llvm.org/viewvc/llvm-project?rev=231527
<http://llvm.org/viewvc/llvm-project?rev=231527&view=rev> &view=rev
Log:
[AArch64][LoadStoreOptimizer] Generate LDP + SXTW instead of LD[U]R +
LD[U]RSW.
Teach the load store optimizer how to sign extend a result of a load pair
when it helps creating more pairs.
The rational is that loads are more expensive than sign extensions, so if we
gather some in one instruction this is better!

<rdar://problem/20072968>

Modified:
   llvm/trunk/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
   llvm/trunk/test/CodeGen/AArch64/arm64-ldp.ll

Modified: llvm/trunk/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
URL:
http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64Loa
dStoreOptimizer.cpp?rev=231527&r1=231526&r2=231527&view=diff
============================================================================
==

--- llvm/trunk/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp Fri Mar
+++ 6 16:42:10 2015
@@ -63,16 +63,24 @@ struct AArch64LoadStoreOpt : public Mach
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
+  // \p SExtIdx[out] gives the index of the result of the load pair 
+ that  // must be extended. The value of SExtIdx assumes that the 
+ paired load  // produces the value in this order: (I, returned 
+ iterator), i.e.,  // -1 means no value has to be extended, 0 means I, 
+ and 1 means the  // returned iterator.
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator
I,
-                                               bool &MergeForward,
+                                               bool &MergeForward, int 
+ &SExtIdx,
                                               unsigned Limit);
  // Merge the two instructions indicated into a single pair-wise
instruction.
  // If MergeForward is true, erase the first instruction and fold its
  // operation into the second. If false, the reverse. Return the
instruction
  // following the first instruction (which may change during processing).
+  // \p SExtIdx index of the result that must be extended for a paired
load.
+  // -1 means none, 0 means I, and 1 means Paired.
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
-                   MachineBasicBlock::iterator Paired, bool MergeForward);
+                   MachineBasicBlock::iterator Paired, bool MergeForward,
+                   int SExtIdx);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using @@
-181,6 +189,43 @@ int AArch64LoadStoreOpt::getMemSize(Mach
  }
}

+static unsigned getMatchingNonSExtOpcode(unsigned Opc,
+                                         bool *IsValidLdStrOpc =
+nullptr) {
+  if (IsValidLdStrOpc)
+    *IsValidLdStrOpc = true;
+  switch (Opc) {
+  default:
+    if (IsValidLdStrOpc)
+      *IsValidLdStrOpc = false;
+    return UINT_MAX;
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+  case AArch64::STRSui:
+  case AArch64::STURSi:
+  case AArch64::LDRSui:
+  case AArch64::LDURSi:
+    return Opc;
+  case AArch64::LDRSWui:
+    return AArch64::LDRWui;
+  case AArch64::LDURSWi:
+    return AArch64::LDURWi;
+  }
+}
+
static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
@@ -282,7 +327,7 @@ static unsigned getPostIndexedOpcode(uns
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
-                                      bool MergeForward) {
+                                      bool MergeForward, int SExtIdx) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
@@ -292,11 +337,13 @@ AArch64LoadStoreOpt::mergePairedInsns(Ma
  if (NextI == Paired)
    ++NextI;

-  bool IsUnscaled = isUnscaledLdst(I->getOpcode());
+  unsigned Opc =
+      SExtIdx == -1 ? I->getOpcode() : 
+ getMatchingNonSExtOpcode(I->getOpcode());
+  bool IsUnscaled = isUnscaledLdst(Opc);
  int OffsetStride =
      IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1;

-  unsigned NewOpc = getMatchingPairOpcode(I->getOpcode());
+  unsigned NewOpc = getMatchingPairOpcode(Opc);
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
@@ -311,6 +358,11 @@ AArch64LoadStoreOpt::mergePairedInsns(Ma
      Paired->getOperand(2).getImm() + OffsetStride) {
    RtMI = Paired;
    Rt2MI = I;
+    // Here we swapped the assumption made for SExtIdx.
+    // I.e., we turn ldp I, Paired into ldp Paired, I.
+    // Update the index accordingly.
+    if (SExtIdx != -1)
+      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = I;
    Rt2MI = Paired;
@@ -337,8 +389,47 @@ AArch64LoadStoreOpt::mergePairedInsns(Ma
  DEBUG(dbgs() << "    ");
  DEBUG(Paired->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
-  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
-  DEBUG(dbgs() << "\n");
+
+  if (SExtIdx != -1) {
+    // Generate the sign extension for the proper result of the ldp.
+    // I.e., with X1, that would be:
+    // %W1<def> = KILL %W1, %X1<imp-def>
+    // %X1<def> = SBFMXri %X1<kill>, 0, 31
+    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
+    // Right now, DstMO has the extended register, since it comes from an
+    // extended opcode.
+    unsigned DstRegX = DstMO.getReg();
+    // Get the W variant of that register.
+    unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
+    // Update the result of LDP to use the W instead of the X variant.
+    DstMO.setReg(DstRegW);
+    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+    DEBUG(dbgs() << "\n");
+    // Make the machine verifier happy by providing a definition for
+    // the X register.
+    // Insert this definition right after the generated LDP, i.e., before
+    // InsertionPoint.
+    MachineInstrBuilder MIBKill =
+        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                TII->get(TargetOpcode::KILL), DstRegW)
+            .addReg(DstRegW)
+            .addReg(DstRegX, RegState::Define);
+    MIBKill->getOperand(2).setImplicit();
+    // Create the sign extension.
+    MachineInstrBuilder MIBSXTW =
+        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                TII->get(AArch64::SBFMXri), DstRegX)
+            .addReg(DstRegX)
+            .addImm(0)
+            .addImm(31);
+    (void)MIBSXTW;
+    DEBUG(dbgs() << "  Extend operand:\n    ");
+    DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
+    DEBUG(dbgs() << "\n");
+  } else {
+    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+    DEBUG(dbgs() << "\n");
+  }

  // Erase the old instructions.
  I->eraseFromParent();
@@ -396,7 +487,8 @@ static int alignTo(int Num, int PowOf2)  /// be combined
with the current instruction into a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
-                                      bool &MergeForward, unsigned Limit) {
+                                      bool &MergeForward, int &SExtIdx,
+                                      unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr *FirstMI = I;
@@ -436,7 +528,19 @@ AArch64LoadStoreOpt::findMatchingInsn(Ma
    // Now that we know this is a real instruction, count it.
    ++Count;

-    if (Opc == MI->getOpcode() && MI->getOperand(2).isImm()) {
+    bool CanMergeOpc = Opc == MI->getOpcode();
+    SExtIdx = -1;
+    if (!CanMergeOpc) {
+      bool IsValidLdStrOpc;
+      unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc,
&IsValidLdStrOpc);
+      if (!IsValidLdStrOpc)

I believe we need to add the below code on the continue path to ensure the
register defs/uses and memory operations are tracked correctly.

       trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
       if (MI->mayLoadOrStore())
           MemInsns.push_back(MI);

Alternatively, we could allow the control flow to fall thru to the logic at
the end of the for loop that takes care of the above logic but negating the
condtion.  Something like..

       if (IsValidLdStrOpc) {
           // Opc will be the first instruction in the pair.
           SExtIdx = NonSExtOpc == (unsigned)Opc ? 1 : 0;
           CanMergeOpc = NonSExtOpc ==
getMatchingNonSExtOpcode(MI->getOpcode());
       }

Please let me know your thoughts, Quentin.  Again this is just something I
saw in passing and haven't proven it's actually a problem.

Chad

+        continue;
+      // Opc will be the first instruction in the pair.
+      SExtIdx = NonSExtOpc == (unsigned)Opc ? 1 : 0;
+      CanMergeOpc = NonSExtOpc ==
getMatchingNonSExtOpcode(MI->getOpcode());
+    }
+
+    if (CanMergeOpc && MI->getOperand(2).isImm()) {
      // If we've found another instruction with the same opcode, check to
see
      // if the base and offset are compatible with our starting
instruction.
      // These instructions all have scaled immediate operands, so we just
@@ -823,13 +927,14 @@ bool AArch64LoadStoreOpt::optimizeBlock(
      }
      // Look ahead up to ScanLimit instructions for a pairable
instruction.
      bool MergeForward = false;
+      int SExtIdx = -1;
      MachineBasicBlock::iterator Paired =
-          findMatchingInsn(MBBI, MergeForward, ScanLimit);
+          findMatchingInsn(MBBI, MergeForward, SExtIdx, ScanLimit);
      if (Paired != E) {
        // Merge the loads into a pair. Keeping the iterator straight is a
        // pain, so we let the merge routine tell us what the next
instruction
        // is after it's done mucking about.
-        MBBI = mergePairedInsns(MBBI, Paired, MergeForward);
+        MBBI = mergePairedInsns(MBBI, Paired, MergeForward, SExtIdx);

        Modified = true;
        ++NumPairCreated;

Modified: llvm/trunk/test/CodeGen/AArch64/arm64-ldp.ll
URL:
http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-ld
p.ll?rev=231527&r1=231526&r2=231527&view=diff
============================================================================
==
--- llvm/trunk/test/CodeGen/AArch64/arm64-ldp.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-ldp.ll Fri Mar  6 16:42:10
+++ 2015
@@ -24,6 +24,33 @@ define i64 @ldp_sext_int(i32* %p) nounwi
  ret i64 %add
}

+; CHECK-LABEL: ldp_half_sext_res0_int:
+; CHECK: ldp     w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0]
+; CHECK: sxtw     x[[DST1]], w[[DST1]]
+define i64 @ldp_half_sext_res0_int(i32* %p) nounwind {
+  %tmp = load i32, i32* %p, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  %tmp1 = load i32, i32* %add.ptr, align 4
+  %sexttmp = sext i32 %tmp to i64
+  %sexttmp1 = zext i32 %tmp1 to i64
+  %add = add nsw i64 %sexttmp1, %sexttmp
+  ret i64 %add
+}
+
+; CHECK-LABEL: ldp_half_sext_res1_int:
+; CHECK: ldp     w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0]
+; CHECK: sxtw     x[[DST2]], w[[DST2]]
+define i64 @ldp_half_sext_res1_int(i32* %p) nounwind {
+  %tmp = load i32, i32* %p, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  %tmp1 = load i32, i32* %add.ptr, align 4
+  %sexttmp = zext i32 %tmp to i64
+  %sexttmp1 = sext i32 %tmp1 to i64
+  %add = add nsw i64 %sexttmp1, %sexttmp
+  ret i64 %add
+}
+
+
; CHECK: ldp_long
; CHECK: ldp
define i64 @ldp_long(i64* %p) nounwind { @@ -83,6 +110,39 @@ define i64
@ldur_sext_int(i32* %a) nounw
  ret i64 %tmp3
}

+define i64 @ldur_half_sext_int_res0(i32* %a) nounwind { ; LDUR_CHK: 
+ldur_half_sext_int_res0
+; LDUR_CHK: ldp     w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-8]
+; LDUR_CHK: sxtw     x[[DST1]], w[[DST1]]
+; LDUR_CHK-NEXT: add     x{{[0-9]+}}, x[[DST2]], x[[DST1]]
+; LDUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds i32, i32* %a, i32 -1
+  %tmp1 = load i32, i32* %p1, align 2
+  %p2 = getelementptr inbounds i32, i32* %a, i32 -2
+  %tmp2 = load i32, i32* %p2, align 2
+  %sexttmp1 = zext i32 %tmp1 to i64
+  %sexttmp2 = sext i32 %tmp2 to i64
+  %tmp3 = add i64 %sexttmp1, %sexttmp2
+  ret i64 %tmp3
+}
+
+define i64 @ldur_half_sext_int_res1(i32* %a) nounwind { ; LDUR_CHK: 
+ldur_half_sext_int_res1
+; LDUR_CHK: ldp     w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-8]
+; LDUR_CHK: sxtw     x[[DST2]], w[[DST2]]
+; LDUR_CHK-NEXT: add     x{{[0-9]+}}, x[[DST2]], x[[DST1]]
+; LDUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds i32, i32* %a, i32 -1
+  %tmp1 = load i32, i32* %p1, align 2
+  %p2 = getelementptr inbounds i32, i32* %a, i32 -2
+  %tmp2 = load i32, i32* %p2, align 2
+  %sexttmp1 = sext i32 %tmp1 to i64
+  %sexttmp2 = zext i32 %tmp2 to i64
+  %tmp3 = add i64 %sexttmp1, %sexttmp2
+  ret i64 %tmp3
+}
+
+
define i64 @ldur_long(i64* %a) nounwind ssp {  ; LDUR_CHK: ldur_long
; LDUR_CHK: ldp     [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-16]
@@ -152,6 +212,40 @@ define i64 @pairUpBarelyInSext(i32* %a)
  %tmp3 = add i64 %sexttmp1, %sexttmp2
  ret i64 %tmp3
}
+
+define i64 @pairUpBarelyInHalfSextRes0(i32* %a) nounwind ssp { ;
+LDUR_CHK: pairUpBarelyInHalfSextRes0 ; LDUR_CHK-NOT: ldur
+; LDUR_CHK: ldp     w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-256]
+; LDUR_CHK: sxtw     x[[DST1]], w[[DST1]]
+; LDUR_CHK-NEXT: add     x{{[0-9]+}}, x[[DST2]], x[[DST1]]
+; LDUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds i32, i32* %a, i64 -63
+  %tmp1 = load i32, i32* %p1, align 2
+  %p2 = getelementptr inbounds i32, i32* %a, i64 -64
+  %tmp2 = load i32, i32* %p2, align 2
+  %sexttmp1 = zext i32 %tmp1 to i64
+  %sexttmp2 = sext i32 %tmp2 to i64
+  %tmp3 = add i64 %sexttmp1, %sexttmp2
+  ret i64 %tmp3
+}
+
+define i64 @pairUpBarelyInHalfSextRes1(i32* %a) nounwind ssp { ;
+LDUR_CHK: pairUpBarelyInHalfSextRes1 ; LDUR_CHK-NOT: ldur
+; LDUR_CHK: ldp     w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-256]
+; LDUR_CHK: sxtw     x[[DST2]], w[[DST2]]
+; LDUR_CHK-NEXT: add     x{{[0-9]+}}, x[[DST2]], x[[DST1]]
+; LDUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds i32, i32* %a, i64 -63
+  %tmp1 = load i32, i32* %p1, align 2
+  %p2 = getelementptr inbounds i32, i32* %a, i64 -64
+  %tmp2 = load i32, i32* %p2, align 2
+  %sexttmp1 = sext i32 %tmp1 to i64
+  %sexttmp2 = zext i32 %tmp2 to i64
+  %tmp3 = add i64 %sexttmp1, %sexttmp2
+  ret i64 %tmp3
+}

define i64 @pairUpBarelyOut(i64* %a) nounwind ssp {  ; LDUR_CHK:
pairUpBarelyOut


_______________________________________________
llvm-commits mailing list
llvm-commits at cs.uiuc.edu <mailto:llvm-commits at cs.uiuc.edu> 
http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits

 

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20150807/4bc77f8d/attachment.html>