[llvm] 006b3bd - [ARM] Deliberately prevent inline asm in low overhead loops. NFC

David Green via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 19 05:28:40 PST 2020


Author: David Green
Date: 2020-11-19T13:28:21Z
New Revision: 006b3bdeddb0234f53a1ab72e427ef74184461f5

URL: https://github.com/llvm/llvm-project/commit/006b3bdeddb0234f53a1ab72e427ef74184461f5
DIFF: https://github.com/llvm/llvm-project/commit/006b3bdeddb0234f53a1ab72e427ef74184461f5.diff

LOG: [ARM] Deliberately prevent inline asm in low overhead loops. NFC

Inline asm was already rejected by one of the "else" branches in
maybeLoweredToCall, so this patch is an NFC; it just makes the check
explicit and adds a test. We may want to support this in certain
situations in the future, but for the moment we simply do not try to
create low overhead loops that contain inline asm.

Differential Revision: https://reviews.llvm.org/D91257
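
For reference, inline asm reaches the IR as a call whose callee is an
llvm::InlineAsm value rather than a Function. Below is a minimal,
standalone sketch of how such calls can be spotted while scanning a
loop's blocks; it uses CallBase::isInlineAsm() rather than the exact
condition added by the patch, and the helper name is made up for
illustration.

  // Hypothetical helper, not part of the patch: walk a loop's blocks
  // and report whether any instruction is a call to inline asm.
  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/IR/InstrTypes.h"

  using namespace llvm;

  static bool loopContainsInlineAsm(const Loop *L) {
    for (const BasicBlock *BB : L->getBlocks())
      for (const Instruction &I : *BB)
        if (const auto *CB = dyn_cast<CallBase>(&I))
          if (CB->isInlineAsm()) // callee is an InlineAsm value
            return true;
    return false;
  }

The patch itself adds its check inside the ScanLoop lambda shown in
the diff below.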

Added: 
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/inlineasm.ll

Modified: 
    llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index d9f83a0b58ff..99ad8ae0c38e 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1694,7 +1694,8 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
   auto ScanLoop = [&](Loop *L) {
     for (auto *BB : L->getBlocks()) {
       for (auto &I : *BB) {
-        if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I)) {
+        if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
+            isa<InlineAsm>(I)) {
           LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
           return false;
         }

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inlineasm.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inlineasm.ll
new file mode 100644
index 000000000000..6e3499bda4cb
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inlineasm.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
+
+define i32 @test(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) {
+; CHECK-LABEL: test:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    cmp r2, #1
+; CHECK-NEXT:    blt .LBB0_4
+; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT:    mov lr, r0
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:  .LBB0_2: @ %for.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldrh r3, [r1], #2
+; CHECK-NEXT:    subs r2, #1
+; CHECK-NEXT:    ldrh r12, [lr], #2
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    add r3, r12
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    add r0, r3
+; CHECK-NEXT:    bne .LBB0_2
+; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
+; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:  .LBB0_4:
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %s.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  ret i32 %s.0.lcssa
+
+for.body:                                         ; preds = %entry, %for.body
+  %s.011 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.010
+  %0 = load i16, i16* %arrayidx, align 2
+  %arrayidx1 = getelementptr inbounds i16, i16* %y, i32 %i.010
+  %1 = load i16, i16* %arrayidx1, align 2
+  %2 = tail call i32 asm "add $0, $1, $2", "=r,r,r"(i16 %0, i16 %1) #1
+  %add = add nsw i32 %2, %s.011
+  %inc = add nuw nsw i32 %i.010, 1
+  %exitcond.not = icmp eq i32 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define i32 @testlr(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) {
+; CHECK-LABEL: testlr:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    cmp r2, #1
+; CHECK-NEXT:    blt .LBB1_4
+; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT:    mov r3, r0
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:  .LBB1_2: @ %for.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldrh r4, [r1], #2
+; CHECK-NEXT:    subs r2, #1
+; CHECK-NEXT:    ldrh r12, [r3], #2
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    add r4, r12
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    add r0, r4
+; CHECK-NEXT:    bne .LBB1_2
+; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
+; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:  .LBB1_4:
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    pop {r4, pc}
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %s.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  ret i32 %s.0.lcssa
+
+for.body:                                         ; preds = %entry, %for.body
+  %s.011 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.010
+  %0 = load i16, i16* %arrayidx, align 2
+  %arrayidx1 = getelementptr inbounds i16, i16* %y, i32 %i.010
+  %1 = load i16, i16* %arrayidx1, align 2
+  %2 = tail call i32 asm "add $0, $1, $2", "=r,r,r,~{lr}"(i16 %0, i16 %1) #1
+  %add = add nsw i32 %2, %s.011
+  %inc = add nuw nsw i32 %i.010, 1
+  %exitcond.not = icmp eq i32 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}

More information about the llvm-commits mailing list