[PATCH] D116740: [SLP]Improve reordering for the nodes being used in alternate vectorization.

Wed Feb 9 10:12:06 PST 2022

vdmitrie added subscribers: test, vdmitrie.
vdmitrie added a comment.

I have a question wrt this patch. Consider this test case:
; Reproducer for the SLP vectorizer. Reads elements 0..7 of the i32 arrays %0
; and %1 and writes four i32 results to %2[0..3]. With
;   t[i] = ((%1[i+4] - %0[i]) - %0[i+4]) + %1[i]      for i = 0..3
; the function stores
;   %2[0] = ((t2 + t3) + t0) + t1
;   %2[1] = ((t0 + t2) - t1) - t3
;   %2[2] = ((t0 + t1) - t3) - t2
;   %2[3] = ((t3 - t1) - t2) + t0
; The arguments are %0-%2 and the first instruction is %4; the unnamed entry
; block presumably takes the number %3.
define dso_local void @test(i32* noalias nocapture readonly %0, i32* noalias nocapture readonly %1, i32* noalias nocapture %2) {

  ; Scalar loads of %1[k] followed by %0[k], visiting k in the interleaved
  ; order 0, 4, 1, 5, 2, 6, 3, 7 (not consecutive memory order).
  %4 = getelementptr inbounds i32, i32* %1, i64 0
  %5 = load i32, i32* %4, align 4
  %6 = getelementptr inbounds i32, i32* %0, i64 0
  %7 = load i32, i32* %6, align 4
  %8 = getelementptr inbounds i32, i32* %1, i64 4
  %9 = load i32, i32* %8, align 4
  %10 = getelementptr inbounds i32, i32* %0, i64 4
  %11 = load i32, i32* %10, align 4
  %12 = getelementptr inbounds i32, i32* %1, i64 1
  %13 = load i32, i32* %12, align 4
  %14 = getelementptr inbounds i32, i32* %0, i64 1
  %15 = load i32, i32* %14, align 4
  %16 = getelementptr inbounds i32, i32* %1, i64 5
  %17 = load i32, i32* %16, align 4
  %18 = getelementptr inbounds i32, i32* %0, i64 5
  %19 = load i32, i32* %18, align 4
  %20 = getelementptr inbounds i32, i32* %1, i64 2
  %21 = load i32, i32* %20, align 4
  %22 = getelementptr inbounds i32, i32* %0, i64 2
  %23 = load i32, i32* %22, align 4
  %24 = getelementptr inbounds i32, i32* %1, i64 6
  %25 = load i32, i32* %24, align 4
  %26 = getelementptr inbounds i32, i32* %0, i64 6
  %27 = load i32, i32* %26, align 4
  %28 = getelementptr inbounds i32, i32* %1, i64 3
  %29 = load i32, i32* %28, align 4
  %30 = getelementptr inbounds i32, i32* %0, i64 3
  %31 = load i32, i32* %30, align 4
  %32 = getelementptr inbounds i32, i32* %1, i64 7
  %33 = load i32, i32* %32, align 4
  %34 = getelementptr inbounds i32, i32* %0, i64 7
  %35 = load i32, i32* %34, align 4
  ; Four sub/sub/add chains of identical shape, emitted from i = 3 down to 0.
  ; t3 = %38 = ((%1[7] - %0[3]) - %0[7]) + %1[3]
  %36 = sub i32 %33, %31
  %37 = sub i32 %36, %35
  %38 = add i32 %37, %29
  ; t2 = %41 = ((%1[6] - %0[2]) - %0[6]) + %1[2]
  %39 = sub i32 %25, %23
  %40 = sub i32 %39, %27
  %41 = add i32 %40, %21
  ; t1 = %44 = ((%1[5] - %0[1]) - %0[5]) + %1[1]
  %42 = sub i32 %17, %15
  %43 = sub i32 %42, %19
  %44 = add i32 %43, %13
  ; t0 = %47 = ((%1[4] - %0[0]) - %0[4]) + %1[0]
  %45 = sub i32 %9, %7
  %46 = sub i32 %45, %11
  %47 = add i32 %46, %5
  ; Each output combines all four t values with a different mix of add and sub.
  ; The stores are issued in index order 0, 2, 1, 3.
  ; %2[0] = ((t2 + t3) + t0) + t1
  %48 = getelementptr inbounds i32, i32* %2, i64 0
  %49 = add i32 %41, %38
  %50 = add i32 %49, %47
  %51 = add i32 %50, %44
  store i32 %51, i32* %48, align 4
  ; %2[2] = ((t0 + t1) - t3) - t2
  %52 = getelementptr inbounds i32, i32* %2, i64 2
  %53 = add i32 %47, %44
  %54 = sub i32 %53, %38
  %55 = sub i32 %54, %41
  store i32 %55, i32* %52, align 4
  ; %2[1] = ((t0 + t2) - t1) - t3
  %56 = getelementptr inbounds i32, i32* %2, i64 1
  %57 = add i32 %47, %41
  %58 = sub i32 %57, %44
  %59 = sub i32 %58, %38
  store i32 %59, i32* %56, align 4
  ; %2[3] = ((t3 - t1) - t2) + t0
  %60 = getelementptr inbounds i32, i32* %2, i64 3
  %61 = sub i32 %38, %44
  %62 = sub i32 %61, %41
  %63 = add i32 %62, %47
  store i32 %63, i32* %60, align 4
  ret void

}

opt -slp-vectorizer -dce -mtriple=x86_64-unknown-linux-gnu -mattr=+avx -S

After the patch, SLP produces more shufflevector instructions than before:

  %9 = load <4 x i32>, <4 x i32>* %8, align 4
  %shuffle2 = shufflevector <4 x i32> %9, <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
  %10 = bitcast i32* %5 to <4 x i32>*
  %11 = load <4 x i32>, <4 x i32>* %10, align 4
  %12 = bitcast i32* %6 to <4 x i32>*
  %13 = load <4 x i32>, <4 x i32>* %12, align 4
  %14 = bitcast i32* %7 to <4 x i32>*
  %15 = load <4 x i32>, <4 x i32>* %14, align 4
  %shuffle1 = shufflevector <4 x i32> %15, <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
  %16 = sub <4 x i32> %13, %11
  %shuffle = shufflevector <4 x i32> %16, <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
  %17 = sub <4 x i32> %shuffle, %shuffle1
  %18 = add <4 x i32> %17, %shuffle2

The instcombine pass then optimizes these shuffles, but the question is whether SLP should rely on that. Is this expected, or is it considered a regression in the SLP vectorizer?

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D116740/new/

https://reviews.llvm.org/D116740