[llvm] [RISCV][LoopIdiomVectorize] Support VP intrinsics in LoopIdiomVectorize (PR #94082)
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 2 11:56:00 PDT 2024
================
@@ -0,0 +1,178 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv64 -mattr=+v < %s | FileCheck %s
+
+; Testing VFIRST patterns related to llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll
+
+define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %n) {
+; CHECK-LABEL: compare_bytes_simple:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sext.w a4, a3
+; CHECK-NEXT: addiw a5, a2, 1
+; CHECK-NEXT: bltu a4, a5, .LBB0_7
+; CHECK-NEXT: # %bb.1: # %mismatch_mem_check
+; CHECK-NEXT: slli a2, a5, 32
+; CHECK-NEXT: srli a2, a2, 32
+; CHECK-NEXT: slli a6, a3, 32
+; CHECK-NEXT: srli a6, a6, 32
+; CHECK-NEXT: add a7, a0, a2
+; CHECK-NEXT: add t0, a0, a6
+; CHECK-NEXT: srli a7, a7, 12
+; CHECK-NEXT: srli t0, t0, 12
+; CHECK-NEXT: bne a7, t0, .LBB0_7
+; CHECK-NEXT: # %bb.2: # %mismatch_mem_check
+; CHECK-NEXT: add a7, a1, a2
+; CHECK-NEXT: add t0, a1, a6
+; CHECK-NEXT: srli a7, a7, 12
+; CHECK-NEXT: srli t0, t0, 12
+; CHECK-NEXT: bne a7, t0, .LBB0_7
+; CHECK-NEXT: .LBB0_3: # %mismatch_vec_loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub a4, a6, a2
+; CHECK-NEXT: vsetvli a4, a4, e8, m2, ta, ma
+; CHECK-NEXT: add a5, a0, a2
+; CHECK-NEXT: vle8.v v8, (a5)
+; CHECK-NEXT: add a5, a1, a2
+; CHECK-NEXT: vle8.v v10, (a5)
+; CHECK-NEXT: vmsne.vv v12, v8, v10
+; CHECK-NEXT: vfirst.m a7, v12
+; CHECK-NEXT: mv a5, a4
+; CHECK-NEXT: bltz a7, .LBB0_5
+; CHECK-NEXT: # %bb.4: # %mismatch_vec_loop
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: mv a5, a7
+; CHECK-NEXT: .LBB0_5: # %mismatch_vec_loop
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: sext.w a7, a5
+; CHECK-NEXT: bne a7, a4, .LBB0_11
+; CHECK-NEXT: # %bb.6: # %mismatch_vec_loop_inc
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: add a2, a2, a4
+; CHECK-NEXT: bne a2, a6, .LBB0_3
+; CHECK-NEXT: j .LBB0_9
+; CHECK-NEXT: .LBB0_7: # %mismatch_loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: slli a2, a5, 32
+; CHECK-NEXT: srli a2, a2, 32
+; CHECK-NEXT: add a6, a0, a2
+; CHECK-NEXT: lbu a6, 0(a6)
+; CHECK-NEXT: add a2, a1, a2
+; CHECK-NEXT: lbu a2, 0(a2)
+; CHECK-NEXT: bne a6, a2, .LBB0_10
+; CHECK-NEXT: # %bb.8: # %mismatch_loop_inc
+; CHECK-NEXT: # in Loop: Header=BB0_7 Depth=1
+; CHECK-NEXT: addiw a5, a5, 1
+; CHECK-NEXT: bne a4, a5, .LBB0_7
+; CHECK-NEXT: .LBB0_9: # %while.end
+; CHECK-NEXT: mv a0, a3
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_10:
+; CHECK-NEXT: mv a0, a5
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_11: # %mismatch_vec_loop_found
+; CHECK-NEXT: slli a5, a5, 32
+; CHECK-NEXT: srli a3, a5, 32
+; CHECK-NEXT: add a0, a2, a3
+; CHECK-NEXT: ret
+entry:
+ %0 = add i32 %len, 1
+ br label %mismatch_min_it_check
+
+mismatch_min_it_check: ; preds = %entry
+ %1 = zext i32 %0 to i64
+ %2 = zext i32 %n to i64
+ %3 = icmp ule i32 %0, %n
+ br i1 %3, label %mismatch_mem_check, label %mismatch_loop_pre
+
+mismatch_mem_check: ; preds = %mismatch_min_it_check
+ %4 = getelementptr i8, ptr %a, i64 %1
+ %5 = getelementptr i8, ptr %b, i64 %1
+ %6 = ptrtoint ptr %5 to i64
+ %7 = ptrtoint ptr %4 to i64
+ %8 = getelementptr i8, ptr %a, i64 %2
+ %9 = getelementptr i8, ptr %b, i64 %2
+ %10 = ptrtoint ptr %8 to i64
+ %11 = ptrtoint ptr %9 to i64
+ %12 = lshr i64 %7, 12
+ %13 = lshr i64 %10, 12
+ %14 = lshr i64 %6, 12
+ %15 = lshr i64 %11, 12
+ %16 = icmp ne i64 %12, %13
+ %17 = icmp ne i64 %14, %15
+ %18 = or i1 %16, %17
+ br i1 %18, label %mismatch_loop_pre, label %mismatch_vec_loop_preheader
+
+mismatch_vec_loop_preheader: ; preds = %mismatch_mem_check
+ br label %mismatch_vec_loop
+
+mismatch_vec_loop: ; preds = %mismatch_vec_loop_inc, %mismatch_vec_loop_preheader
+ %mismatch_vector_index = phi i64 [ %1, %mismatch_vec_loop_preheader ], [ %25, %mismatch_vec_loop_inc ]
+ %avl = sub nuw nsw i64 %2, %mismatch_vector_index
+ %19 = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 16, i1 true)
+ %20 = getelementptr inbounds i8, ptr %a, i64 %mismatch_vector_index
+ %lhs.load = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr %20, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 %19)
+ %21 = getelementptr inbounds i8, ptr %b, i64 %mismatch_vector_index
+ %rhs.load = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr %21, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 %19)
+ %mismatch.cmp = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> %lhs.load, <vscale x 16 x i8> %rhs.load, metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 %19)
+ %22 = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> %mismatch.cmp, i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 %19)
+ %23 = icmp ne i32 %22, %19
+ br i1 %23, label %mismatch_vec_loop_found, label %mismatch_vec_loop_inc
+
+mismatch_vec_loop_inc: ; preds = %mismatch_vec_loop
+ %24 = zext i32 %19 to i64
+ %25 = add nuw nsw i64 %mismatch_vector_index, %24
+ %26 = icmp ne i64 %25, %2
+ br i1 %26, label %mismatch_vec_loop, label %mismatch_end
+
+mismatch_vec_loop_found: ; preds = %mismatch_vec_loop
+ %ctz = phi i32 [ %22, %mismatch_vec_loop ]
+ %mismatch_vector_index1 = phi i64 [ %mismatch_vector_index, %mismatch_vec_loop ]
+ %27 = zext i32 %ctz to i64
+ %28 = add nuw nsw i64 %mismatch_vector_index1, %27
+ %29 = trunc i64 %28 to i32
+ br label %mismatch_end
+
+mismatch_loop_pre: ; preds = %mismatch_mem_check, %mismatch_min_it_check
+ br label %mismatch_loop
+
+mismatch_loop: ; preds = %mismatch_loop_inc, %mismatch_loop_pre
+ %mismatch_index = phi i32 [ %0, %mismatch_loop_pre ], [ %36, %mismatch_loop_inc ]
+ %30 = zext i32 %mismatch_index to i64
+ %31 = getelementptr inbounds i8, ptr %a, i64 %30
+ %32 = load i8, ptr %31, align 1
+ %33 = getelementptr inbounds i8, ptr %b, i64 %30
+ %34 = load i8, ptr %33, align 1
+ %35 = icmp eq i8 %32, %34
+ br i1 %35, label %mismatch_loop_inc, label %mismatch_end
+
+mismatch_loop_inc: ; preds = %mismatch_loop
+ %36 = add i32 %mismatch_index, 1
+ %37 = icmp eq i32 %36, %n
+ br i1 %37, label %mismatch_end, label %mismatch_loop
+
+mismatch_end: ; preds = %mismatch_loop_inc, %mismatch_loop, %mismatch_vec_loop_found, %mismatch_vec_loop_inc
+ %mismatch_result = phi i32 [ %n, %mismatch_loop_inc ], [ %mismatch_index, %mismatch_loop ], [ %n, %mismatch_vec_loop_inc ], [ %29, %mismatch_vec_loop_found ]
+ br i1 true, label %byte.compare, label %while.cond
+
+while.cond: ; preds = %mismatch_end, %while.body
----------------
topperc wrote:
Not for this patch, but this code looks a lot like the scalar mismatch_loop. Is it possible to use the original loop in place of mismatch_loop and just insert the vector loop and checks on top of it? I think that's conceptually similar to how the normal vectorizer works.
https://github.com/llvm/llvm-project/pull/94082
More information about the llvm-commits
mailing list