[llvm] [llvm][CodeGen] Add a new software pipeliner 'Window Scheduler' (PR #84443)

Mon Mar 11 22:44:59 PDT 2024

================
@@ -0,0 +1,124 @@
+# REQUIRES: asserts
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -O2 -debug-only=pipeliner \
+# RUN: -window-sched=force -o - 2>&1 | FileCheck %s
+
+# CHECK: Best window offset is {{[0-9]+}} and Best II is {{[0-9]+}}.
+
+--- |
+  define void @sqrt_approx(i32 noundef %N, ptr noalias %x, ptr noalias %y) #0 { ; single-block loop: load a <32 x i32> from %x, run a vaddw/vsubw/vmpyowh.rnd chain, store a <32 x i32> to %y
+  entry:
+    %isZeroLength = icmp eq i32 %N, 0
+    br i1 %isZeroLength, label %loop.exit, label %loop.preheader ; skip the loop entirely when N == 0
+
+  loop.preheader:                                   ; preds = %entry
+    %half_splat = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1056964608) ; 0x3F000000 (same bits as IEEE-754 float 0.5); NOTE(review): consumers below are word-integer intrinsics - confirm intended interpretation
+    %one_splat = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1065353216) ; 0x3F800000 (same bits as IEEE-754 float 1.0)
+    %two_splat = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1073741824) ; 0x40000000 (same bits as IEEE-754 float 2.0)
+    br label %loop.body
+
+  loop.exit:                                        ; preds = %loop.body, %entry
+    ret void
+
+  loop.body:                                        ; preds = %loop.body, %loop.preheader
+    %lsr.iv1 = phi ptr [ %cgep3, %loop.body ], [ %x, %loop.preheader ] ; load pointer, starts at %x
+    %lsr.iv = phi ptr [ %cgep, %loop.body ], [ %y, %loop.preheader ] ; store pointer, starts at %y
+    %index = phi i32 [ 0, %loop.preheader ], [ %index.next, %loop.body ]
+    %vec_x = load <32 x i32>, ptr %lsr.iv1, align 128
+    %vec_sqrt_1 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %one_splat, <32 x i32> %vec_x)
+    %vec_sqrt_2 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_1, <32 x i32> %half_splat)
+    %vec_recip_1 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_2, <32 x i32> %half_splat)
+    %vec_recip_2 = tail call <32 x i32> @llvm.hexagon.V6.vsubw.128B(<32 x i32> %two_splat, <32 x i32> %vec_recip_1)
+    %vec_y1 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_2, <32 x i32> %vec_recip_2)
+    %vec_recip_3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_2, <32 x i32> %vec_y1)
+    %vec_recop_4 = tail call <32 x i32> @llvm.hexagon.V6.vsubw.128B(<32 x i32> %two_splat, <32 x i32> %vec_recip_3) ; NOTE(review): "recop" looks like a typo for "recip" (cf. %vec_recip_3)
+    %vec_y2 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_y1, <32 x i32> %vec_recop_4)
+    %vec_sqrt_3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_x, <32 x i32> %vec_y2)
+    %vec_sqrt_4 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_y2, <32 x i32> %vec_sqrt_3)
+    %vec_sqrt_5 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_4, <32 x i32> %half_splat) ; the steps from here to %vec_sqrt_8 repeat the %vec_sqrt_2..%vec_sqrt_4 sequence, seeded with %vec_sqrt_5
+    %vec_recip_5 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_5, <32 x i32> %half_splat)
+    %vec_recip_6 = tail call <32 x i32> @llvm.hexagon.V6.vsubw.128B(<32 x i32> %two_splat, <32 x i32> %vec_recip_5)
+    %vec_y3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_5, <32 x i32> %vec_recip_6)
+    %vec_recip_7 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_5, <32 x i32> %vec_y3)
+    %vec_recop_8 = tail call <32 x i32> @llvm.hexagon.V6.vsubw.128B(<32 x i32> %two_splat, <32 x i32> %vec_recip_7) ; NOTE(review): "recop" looks like a typo for "recip" (cf. %vec_recip_7)
+    %vec_y4 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_y3, <32 x i32> %vec_recop_8)
+    %vec_sqrt_7 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_x, <32 x i32> %vec_y4)
+    %vec_sqrt_8 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_y4, <32 x i32> %vec_sqrt_7)
+    %vec_sqrt_9 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_8, <32 x i32> %half_splat)
+    store <32 x i32> %vec_sqrt_9, ptr %lsr.iv, align 128
+    %index.next = add nuw i32 %index, 32 ; step by 32, the lane count of one <32 x i32> vector
+    %continue = icmp ult i32 %index.next, %N ; unsigned compare: keep looping while index.next < N
+    %cgep = getelementptr i8, ptr %lsr.iv, i32 128 ; advance store pointer by 128 bytes (32 lanes * 4 bytes)
+    %cgep3 = getelementptr i8, ptr %lsr.iv1, i32 128 ; advance load pointer by 128 bytes
+    br i1 %continue, label %loop.body, label %loop.exit
+  }
+
+  declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32)
+  declare <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32>, <32 x i32>)
+  declare <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32>, <32 x i32>)
+  declare <32 x i32> @llvm.hexagon.V6.vsubw.128B(<32 x i32>, <32 x i32>)
+
+  attributes #0 = { "target-features"="+hvx-length128b,+hvxv69,+v66,-long-calls" }
+...
+---
+name:            sqrt_approx
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.2(0x30000000), %bb.1(0x50000000)
+    liveins: $r0, $r1, $r2
----------------
arsenm wrote:

Can you run this through -run-pass=none to compact the register numbers?

https://github.com/llvm/llvm-project/pull/84443