[llvm-branch-commits] [llvm] [llvm][CodeGen] Address the issue discovered In window scheduling (#101665) (PR #102881)

Kai Yan via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Aug 12 04:12:48 PDT 2024


https://github.com/kaiyan96 created https://github.com/llvm/llvm-project/pull/102881

We have following bugfixes for window scheduler, do we need submit them by ourselves?
* [Added a new restriction for II by pragma in window scheduler](https://github.com/llvm/llvm-project/pull/99448)
* [Fixed a bug in stall cycle calculation for window scheduler](https://github.com/llvm/llvm-project/pull/99451)
* [Added missing initialization failure information for window scheduler](https://github.com/llvm/llvm-project/pull/99449)
* [Fixed max cycle calculation with zero-cost instructions for window scheduler ](https://github.com/llvm/llvm-project/pull/99454)
* [Address the issue of multiple resource reservations In window scheduling](https://github.com/llvm/llvm-project/pull/101665)


>From 9f1b8a25b51bdb6842f8bf27813569ca1c341d5f Mon Sep 17 00:00:00 2001
From: Kai Yan <aklkaiyan at tencent.com>
Date: Wed, 24 Jul 2024 12:06:35 +0800
Subject: [PATCH 1/5] [llvm][CodeGen] Added missing initialization failure
 information for window scheduler (#99449)

Added missing initialization failure information for window scheduler.
---
 llvm/lib/CodeGen/WindowScheduler.cpp        | 5 ++++-
 llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/WindowScheduler.cpp b/llvm/lib/CodeGen/WindowScheduler.cpp
index 0777480499e55b..3fe8a1aaafd128 100644
--- a/llvm/lib/CodeGen/WindowScheduler.cpp
+++ b/llvm/lib/CodeGen/WindowScheduler.cpp
@@ -232,8 +232,11 @@ bool WindowScheduler::initialize() {
       return false;
     }
     for (auto &Def : MI.all_defs())
-      if (Def.isReg() && Def.getReg().isPhysical())
+      if (Def.isReg() && Def.getReg().isPhysical()) {
+        LLVM_DEBUG(dbgs() << "Physical registers are not supported in "
+                             "window scheduling!\n");
         return false;
+      }
   }
   if (SchedInstrNum <= WindowRegionLimit) {
     LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n");
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir b/llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir
index 601b98dca8e20b..be75301b016ed9 100644
--- a/llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir
@@ -3,6 +3,7 @@
 # RUN: -window-sched=force -filetype=null -verify-machineinstrs 2>&1 \
 # RUN: | FileCheck %s
 
+# CHECK: Physical registers are not supported in window scheduling!
 # CHECK: The WindowScheduler failed to initialize!
 
 ---

>From 0c7e10d69547adeb460feeff88d10c774461cd94 Mon Sep 17 00:00:00 2001
From: Kai Yan <aklkaiyan at tencent.com>
Date: Wed, 24 Jul 2024 12:11:58 +0800
Subject: [PATCH 2/5] [llvm][CodeGen] Added a new restriction for II by pragma
 in window scheduler (#99448)

Added a new restriction for window scheduling.
Window scheduling is disabled when llvm.loop.pipeline.initiationinterval
is set.
---
 llvm/lib/CodeGen/MachinePipeliner.cpp         | 12 ++-
 ...swp-ws-pragma-initiation-interval-fail.mir | 83 +++++++++++++++++++
 2 files changed, 93 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/Hexagon/swp-ws-pragma-initiation-interval-fail.mir

diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 497e282bb97682..5c68711ff61938 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -528,8 +528,16 @@ bool MachinePipeliner::useSwingModuloScheduler() {
 }
 
 bool MachinePipeliner::useWindowScheduler(bool Changed) {
-  // WindowScheduler does not work when it is off or when SwingModuloScheduler
-  // is successfully scheduled.
+  // WindowScheduler does not work for following cases:
+  // 1. when it is off.
+  // 2. when SwingModuloScheduler is successfully scheduled.
+  // 3. when pragma II is enabled.
+  if (II_setByPragma) {
+    LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
+                         "llvm.loop.pipeline.initiationinterval is set.\n");
+    return false;
+  }
+
   return WindowSchedulingOption == WindowSchedulingFlag::WS_Force ||
          (WindowSchedulingOption == WindowSchedulingFlag::WS_On && !Changed);
 }
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-pragma-initiation-interval-fail.mir b/llvm/test/CodeGen/Hexagon/swp-ws-pragma-initiation-interval-fail.mir
new file mode 100644
index 00000000000000..6e69a76290fb1d
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-pragma-initiation-interval-fail.mir
@@ -0,0 +1,83 @@
+# RUN: llc  --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -filetype=null -verify-machineinstrs 2>&1 \
+# RUN: | FileCheck %s
+# REQUIRES: asserts
+
+# Test that checks no window scheduler is performed if the II set by pragma was
+# enabled
+
+# CHECK: Window scheduling is disabled when llvm.loop.pipeline.initiationinterval is set.
+
+--- |
+  define void @test_pragma_ii_fail(ptr %a0, i32 %a1) {
+  b0:
+    %v0 = icmp sgt i32 %a1, 1
+    br i1 %v0, label %b1, label %b4
+
+  b1:                                               ; preds = %b0
+    %v1 = load i32, ptr %a0, align 4
+    %v2 = add i32 %v1, 10
+    %v4 = add i32 %a1, -1
+    %cgep = getelementptr i32, ptr %a0, i32 1
+    br label %b2
+
+  b2:                                               ; preds = %b2, %b1
+    %v5 = phi i32 [ %v12, %b2 ], [ %v4, %b1 ]
+    %v6 = phi ptr [ %cgep2, %b2 ], [ %cgep, %b1 ]
+    %v7 = phi i32 [ %v10, %b2 ], [ %v2, %b1 ]
+    store i32 %v7, ptr %v6, align 4
+    %v8 = add i32 %v7, 10
+    %cgep1 = getelementptr i32, ptr %v6, i32 -1
+    store i32 %v8, ptr %cgep1, align 4
+    %v10 = add i32 %v7, 10
+    %v12 = add i32 %v5, -1
+    %v13 = icmp eq i32 %v12, 0
+    %cgep2 = getelementptr i32, ptr %v6, i32 1
+    br i1 %v13, label %b4, label %b2, !llvm.loop !0
+
+  b4:                                               ; preds = %b2, %b0
+    ret void
+  }
+
+  !0 = distinct !{!0, !1}
+  !1 = !{!"llvm.loop.pipeline.initiationinterval", i32 2}
+...
+---
+name:            test_pragma_ii_fail
+tracksRegLiveness: true
+body:             |
+  bb.0.b0:
+    successors: %bb.1(0x40000000), %bb.3(0x40000000)
+    liveins: $r0, $r1
+  
+    %0:intregs = COPY $r1
+    %1:intregs = COPY $r0
+    %2:predregs = C2_cmpgti %0, 1
+    J2_jumpf %2, %bb.3, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+  
+  bb.1.b1:
+    successors: %bb.2(0x80000000)
+  
+    %3:intregs, %4:intregs = L2_loadri_pi %1, 4
+    %5:intregs = A2_addi killed %3, 10
+    %6:intregs = A2_addi %0, -1
+    %7:intregs = COPY %6
+    J2_loop0r %bb.2, %7, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+  
+  bb.2.b2 (machine-block-address-taken):
+    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+  
+    %8:intregs = PHI %4, %bb.1, %9, %bb.2
+    %10:intregs = PHI %5, %bb.1, %11, %bb.2
+    S2_storeri_io %8, 0, %10
+    %11:intregs = A2_addi %10, 10
+    S2_storeri_io %8, -4, %11
+    %9:intregs = A2_addi %8, 4
+    ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.3, implicit-def dead $pc
+  
+  bb.3.b4:
+    PS_jmpret $r31, implicit-def dead $pc
+
+...

>From 907280997d216922c28d550fa56557623588d3b3 Mon Sep 17 00:00:00 2001
From: Kai Yan <aklkaiyan at tencent.com>
Date: Thu, 25 Jul 2024 19:16:23 +0800
Subject: [PATCH 3/5] [llvm][CodeGen] Fixed a bug in stall cycle calculation
 for window scheduler (#99451)

Fixed a bug in stall cycle calculation.
When a register defined by an instruction in the current iteration is
used by an instruction in the next iteration, we have modified the
number of stall cycle that need to be inserted.
---
 llvm/lib/CodeGen/WindowScheduler.cpp          |  7 ++-
 .../CodeGen/Hexagon/swp-ws-stall-cycle.mir    | 59 +++++++++++++++++++
 2 files changed, 63 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/Hexagon/swp-ws-stall-cycle.mir

diff --git a/llvm/lib/CodeGen/WindowScheduler.cpp b/llvm/lib/CodeGen/WindowScheduler.cpp
index 3fe8a1aaafd128..02275afff4e6dd 100644
--- a/llvm/lib/CodeGen/WindowScheduler.cpp
+++ b/llvm/lib/CodeGen/WindowScheduler.cpp
@@ -488,6 +488,7 @@ int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
 // ========================================
 int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
   int MaxStallCycle = 0;
+  int CurrentII = MaxCycle + 1;
   auto Range = getScheduleRange(Offset, SchedInstrNum);
   for (auto &MI : Range) {
     auto *SU = TripleDAG->getSUnit(&MI);
@@ -495,8 +496,8 @@ int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
     for (auto &Succ : SU->Succs) {
       if (Succ.isWeak() || Succ.getSUnit() == &TripleDAG->ExitSU)
         continue;
-      // If the expected cycle does not exceed MaxCycle, no check is needed.
-      if (DefCycle + (int)Succ.getLatency() <= MaxCycle)
+      // If the expected cycle does not exceed CurrentII, no check is needed.
+      if (DefCycle + (int)Succ.getLatency() <= CurrentII)
         continue;
       // If the cycle of the scheduled MI A is less than that of the scheduled
       // MI B, the scheduling will fail because the lifetime of the
@@ -506,7 +507,7 @@ int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
       if (DefCycle < UseCycle)
         return WindowIILimit;
       // Get the stall cycle introduced by the register between two trips.
-      int StallCycle = DefCycle + (int)Succ.getLatency() - MaxCycle - UseCycle;
+      int StallCycle = DefCycle + (int)Succ.getLatency() - CurrentII - UseCycle;
       MaxStallCycle = std::max(MaxStallCycle, StallCycle);
     }
   }
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-stall-cycle.mir b/llvm/test/CodeGen/Hexagon/swp-ws-stall-cycle.mir
new file mode 100644
index 00000000000000..ddba67d78eb58c
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-stall-cycle.mir
@@ -0,0 +1,59 @@
+# REQUIRES: asserts
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -filetype=null -verify-machineinstrs \
+# RUN: -window-region-limit=1 -window-search-ratio=100 -window-diff-limit=0 \
+# RUN: 2>&1 | FileCheck %s
+
+# CHECK-LABEL: Start analyzing II
+# CHECK: MaxStallCycle is 0
+# CHECK-LABEL: Start analyzing II
+# CHECK: MaxStallCycle is 0
+# CHECK-LABEL: Start analyzing II
+# CHECK: MaxStallCycle is 0
+
+---
+name:            test_window_stall_cycle
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.3(0x40000000), %bb.1(0x40000000)
+    liveins: $r0, $r1
+  
+    %0:intregs = COPY $r1
+    %1:intregs = COPY $r0
+    %2:intregs = nsw A2_add %0, %1
+    %3:intregs = S2_lsr_i_r_acc %2, %2, 31
+    %4:intregs = S2_asr_i_r killed %3, 1
+    %5:predregs = C2_cmpgt %1, %4
+    %6:intregs = A2_tfrsi 0
+    J2_jumpt killed %5, %bb.3, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+  
+  bb.1:
+    successors: %bb.2(0x80000000)
+  
+    %7:intregs = A2_addi %4, 2
+    %8:intregs = A2_tfrsi 0
+    %9:intregs = A2_sub %4, %1
+    %10:intregs = A2_addi %9, 1
+    %11:intregs = COPY %10
+    J2_loop0r %bb.2, %11, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+  
+  bb.2 (machine-block-address-taken):
+    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+  
+    %12:intregs = PHI %7, %bb.1, %13, %bb.2
+    %14:intregs = PHI %8, %bb.1, %15, %bb.2
+    %16:intregs = PHI %8, %bb.1, %17, %bb.2
+    %18:intregs, %13:intregs = L2_loadri_pi %12, -4
+    %17:intregs = nsw A2_add killed %18, %16
+    %15:intregs = A2_max %17, %14
+    ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.3, implicit-def dead $pc
+  
+  bb.3:
+    %19:intregs = PHI %6, %bb.0, %15, %bb.2
+    $r0 = COPY %19
+    PS_jmpret $r31, implicit-def dead $pc, implicit $r0
+
+...

>From 7bcec75a6f02a65b856231ff3723e8eaacabdfe3 Mon Sep 17 00:00:00 2001
From: Kai Yan <aklkaiyan at tencent.com>
Date: Wed, 24 Jul 2024 12:06:10 +0800
Subject: [PATCH 4/5] [llvm][CodeGen] Fixed max cycle calculation with
 zero-cost instructions for window scheduler (#99454)

We discovered some scheduling failures occurring when zero-cost
instructions were involved. This issue will be addressed by this patch.
---
 llvm/lib/CodeGen/WindowScheduler.cpp          | 16 ++++---
 .../test/CodeGen/Hexagon/swp-ws-zero-cost.mir | 45 +++++++++++++++++++
 2 files changed, 55 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/Hexagon/swp-ws-zero-cost.mir

diff --git a/llvm/lib/CodeGen/WindowScheduler.cpp b/llvm/lib/CodeGen/WindowScheduler.cpp
index 02275afff4e6dd..6a2c6537470db4 100644
--- a/llvm/lib/CodeGen/WindowScheduler.cpp
+++ b/llvm/lib/CodeGen/WindowScheduler.cpp
@@ -440,12 +440,16 @@ int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
       int PredCycle = getOriCycle(PredMI);
       ExpectCycle = std::max(ExpectCycle, PredCycle + (int)Pred.getLatency());
     }
-    // ResourceManager can be used to detect resource conflicts between the
-    // current MI and the previously inserted MIs.
-    while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) {
-      ++CurCycle;
-      if (CurCycle == (int)WindowIILimit)
-        return CurCycle;
+    // Zero cost instructions do not need to check resource.
+    if (!TII->isZeroCost(MI.getOpcode())) {
+      // ResourceManager can be used to detect resource conflicts between the
+      // current MI and the previously inserted MIs.
+      while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) {
+        ++CurCycle;
+        if (CurCycle == (int)WindowIILimit)
+          return CurCycle;
+      }
+      RM.reserveResources(*SU, CurCycle);
     }
     RM.reserveResources(*SU, CurCycle);
     OriToCycle[getOriMI(&MI)] = CurCycle;
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-zero-cost.mir b/llvm/test/CodeGen/Hexagon/swp-ws-zero-cost.mir
new file mode 100644
index 00000000000000..ecf49a83c69e15
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-zero-cost.mir
@@ -0,0 +1,45 @@
+# REQUIRES: asserts
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -filetype=null -verify-machineinstrs 2>&1 \
+# RUN: | FileCheck %s
+
+# CHECK-NOT: Can't find a valid II. Keep searching...
+# CHECK: Start analyzing II
+# CHECK: Start scheduling Phis
+# CHECK: Current window Offset is {{[0-9]+}} and II is {{[0-9]+}}
+
+---
+name:            relu
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.2(0x30000000), %bb.1(0x50000000)
+    liveins: $r0, $r1, $r2
+    %0:intregs = COPY $r2
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r0
+    %3:predregs = C2_cmpeqi %2, 0
+    J2_jumpt killed %3, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+  bb.1:
+    successors: %bb.3(0x80000000)
+    %4:hvxvr = V6_vd0
+    %5:intregs = A2_addi %2, 31
+    %6:intregs = S2_lsr_i_r %5, 5
+    %7:intregs = COPY %6
+    J2_loop0r %bb.3, %7, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+  bb.2:
+    PS_jmpret $r31, implicit-def dead $pc
+  bb.3 (machine-block-address-taken):
+    successors: %bb.3(0x7c000000), %bb.2(0x04000000)
+    %8:intregs = PHI %1, %bb.1, %9, %bb.3
+    %10:intregs = PHI %0, %bb.1, %14, %bb.3
+    %11:hvxvr, %9:intregs = V6_vL32b_pi %8, 128
+    %12:intregs = COPY %10
+    %13:hvxvr = V6_vmaxw killed %11, %4
+    %14:intregs = V6_vS32b_pi %12, 128, killed %13
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def dead $pc
+...
+

>From d1d92078bd7894d18c540135a26a801bd6de6f8d Mon Sep 17 00:00:00 2001
From: Kai Yan <aklkaiyan at tencent.com>
Date: Mon, 5 Aug 2024 17:44:05 +0800
Subject: [PATCH 5/5] [llvm][CodeGen] Address the issue of multiple resource
 reservations In window scheduling (#101665)

Address the issue of multiple resource reservations in window
scheduling.
---
 llvm/lib/CodeGen/WindowScheduler.cpp          |   1 -
 .../Hexagon/swp-ws-resource-reserve.mir       | 100 ++++++++++++++++++
 2 files changed, 100 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/Hexagon/swp-ws-resource-reserve.mir

diff --git a/llvm/lib/CodeGen/WindowScheduler.cpp b/llvm/lib/CodeGen/WindowScheduler.cpp
index 6a2c6537470db4..f1658e36ae1e92 100644
--- a/llvm/lib/CodeGen/WindowScheduler.cpp
+++ b/llvm/lib/CodeGen/WindowScheduler.cpp
@@ -451,7 +451,6 @@ int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
       }
       RM.reserveResources(*SU, CurCycle);
     }
-    RM.reserveResources(*SU, CurCycle);
     OriToCycle[getOriMI(&MI)] = CurCycle;
     LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S."
                       << getOriStage(getOriMI(&MI), Offset) << "]: " << MI);
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-resource-reserve.mir b/llvm/test/CodeGen/Hexagon/swp-ws-resource-reserve.mir
new file mode 100644
index 00000000000000..4a9a09c4148cb1
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-resource-reserve.mir
@@ -0,0 +1,100 @@
+# REQUIRES: asserts
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -filetype=null -verify-machineinstrs 2>&1 \
+# RUN: -window-search-ratio=100 -window-search-num=100 -window-diff-limit=1 \
+# RUN: | FileCheck %s
+
+# We want to verify that all three V6_vaddw instructions are emitted in the same cycle.
+# CHECK-LABEL: Current window Offset is 2
+# CHECK: Cycle [[CycleNum:[0-9]+]] [{{S.[0-9]+}}]: {{%[0-9]+}}:hvxvr = V6_vaddw {{%[0-9]+}}:hvxvr, {{%[0-9]+}}:hvxvr
+# CHECK: Cycle [[CycleNum]] [{{S.[0-9]+}}]: {{%[0-9]+}}:hvxvr = V6_vaddw {{%[0-9]+}}:hvxvr, {{%[0-9]+}}:hvxvr
+# CHECK: Cycle [[CycleNum]] [{{S.[0-9]+}}]: {{%[0-9]+}}:hvxvr = V6_vaddw {{%[0-9]+}}:hvxvr, {{%[0-9]+}}:hvxvr
+# CHECK-LABEL: Current window Offset is 3
+
+--- |
+  define void @add_parallel(i32 %N, ptr noalias %x, ptr noalias %y) {
+  entry:
+    %isZeroLength = icmp eq i32 %N, 0
+    br i1 %isZeroLength, label %loop.exit, label %loop.preheader
+
+  loop.preheader:                                   ; preds = %entry
+    %half_splat = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1056964608)
+    %one_splat = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1065353216)
+    %two_splat = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1073741824)
+    br label %loop.body
+
+  loop.exit:                                        ; preds = %loop.body, %entry
+    ret void
+
+  loop.body:                                        ; preds = %loop.body, %loop.preheader
+    %lsr.iv1 = phi ptr [ %cgep2, %loop.body ], [ %x, %loop.preheader ]
+    %lsr.iv = phi ptr [ %cgep1, %loop.body ], [ %y, %loop.preheader ]
+    %index = phi i32 [ 0, %loop.preheader ], [ %index.next, %loop.body ]
+    %vec_x1 = load <32 x i32>, ptr %lsr.iv1, align 128
+    %vec_add_1 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %one_splat, <32 x i32> %vec_x1)
+    %vec_add_2 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %half_splat, <32 x i32> %vec_x1)
+    %vec_add_3 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %two_splat, <32 x i32> %vec_x1)
+    %vec_add_4 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_add_1, <32 x i32> %vec_add_2)
+    %vec_add_5 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_add_1, <32 x i32> %vec_add_3)
+    %vec_add_6 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_add_5, <32 x i32> %vec_add_4)
+    store <32 x i32> %vec_add_6, ptr %lsr.iv, align 128
+    %index.next = add nuw i32 %index, 32
+    %continue = icmp ult i32 %index.next, %N
+    %cgep1 = getelementptr i8, ptr %lsr.iv, i32 128
+    %cgep2 = getelementptr i8, ptr %lsr.iv1, i32 128
+    br i1 %continue, label %loop.body, label %loop.exit
+  }
+
+  declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32)
+  declare <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32>, <32 x i32>)
+...
+---
+name:            add_parallel
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.2(0x30000000), %bb.1(0x50000000)
+    liveins: $r0, $r1, $r2
+
+    %0:intregs = COPY $r2
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r0
+    %3:predregs = C2_cmpeqi %2, 0
+    J2_jumpt killed %3, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1.loop.preheader:
+    successors: %bb.3(0x80000000)
+
+    %4:intregs = A2_tfrsi 1056964608
+    %5:hvxvr = V6_lvsplatw killed %4
+    %6:intregs = A2_tfrsi 1065353216
+    %7:hvxvr = V6_lvsplatw killed %6
+    %8:intregs = A2_tfrsi 1073741824
+    %9:hvxvr = V6_lvsplatw killed %8
+    %10:intregs = A2_addi %2, 31
+    %11:intregs = S2_lsr_i_r %10, 5
+    %12:intregs = COPY %11
+    J2_loop0r %bb.3, %12, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.2.loop.exit:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.3.loop.body (machine-block-address-taken):
+    successors: %bb.3(0x7c000000), %bb.2(0x04000000)
+
+    %13:intregs = PHI %1, %bb.1, %14, %bb.3
+    %15:intregs = PHI %0, %bb.1, %16, %bb.3
+    %17:hvxvr, %14:intregs = V6_vL32b_pi %13, 128 :: (load (s1024) from %ir.lsr.iv1)
+    %18:hvxvr = V6_vaddw %7, %17
+    %19:hvxvr = V6_vaddw %5, %17
+    %20:hvxvr = V6_vaddw %9, %17
+    %21:hvxvr = V6_vaddw %18, killed %19
+    %22:hvxvr = V6_vaddw %18, killed %20
+    %23:hvxvr = V6_vaddw killed %22, killed %21
+    %16:intregs = V6_vS32b_pi %15, 128, killed %23 :: (store (s1024) into %ir.lsr.iv)
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def dead $pc
+
+...



More information about the llvm-branch-commits mailing list