[PATCH] D116740: [SLP]Improve reordering for the nodes being used in alternate vectorization.

Fri Apr 1 15:18:26 PDT 2022

vdmitrie added a comment.
Herald added a project: All.

I believe https://reviews.llvm.org/D120492 was supposed to fix the issue I reported earlier, but that did not happen. The test I sent earlier was a simplified one. I have just slightly modified it to show the misbehavior of the reordering.

This is okay:
define void @test(i32* %arg, i32* %arg1, i32* %arg2) {
bb:

  %i3 = load i32, i32* %arg, align 4
  %s3 = add i32 %i3, %i3
  %i4 = getelementptr inbounds i32, i32* %arg, i64 4
  %i5 = load i32, i32* %i4, align 4
  %s5 = add i32 %i5, %i5
  %i8 = getelementptr inbounds i32, i32* %arg, i64 1
  %i9 = load i32, i32* %i8, align 4
  %s9 = add i32 %i9, %i9
  %i10 = getelementptr inbounds i32, i32* %arg, i64 5
  %i11 = load i32, i32* %i10, align 4
  %s11 = add i32 %i11, %i11
  %i14 = getelementptr inbounds i32, i32* %arg, i64 2
  %i15 = load i32, i32* %i14, align 4
  %s15 = add i32 %i15, %i15
  %i16 = getelementptr inbounds i32, i32* %arg, i64 6
  %i17 = load i32, i32* %i16, align 4
  %s17 = add i32 %i17, %i17
  %i20 = getelementptr inbounds i32, i32* %arg, i64 3
  %i21 = load i32, i32* %i20, align 4
  %s21 = add i32 %i21, %i21
  %i22 = getelementptr inbounds i32, i32* %arg, i64 7
  %i23 = load i32, i32* %i22, align 4
  %s23 = add i32 %i23, %i23

  %i1 = load i32, i32* %arg1, align 4
  %i6 = getelementptr inbounds i32, i32* %arg1, i64 1
  %i7 = load i32, i32* %i6, align 4
  %i12 = getelementptr inbounds i32, i32* %arg1, i64 2
  %i13 = load i32, i32* %i12, align 4
  %i18 = getelementptr inbounds i32, i32* %arg1, i64 3
  %i19 = load i32, i32* %i18, align 4

  %i24 = sub i32 0, %s21
  %i25 = sub i32 %i24, %s23
  %i26 = add i32 %i25, %i19
  %i27 = sub i32 undef, %s15
  %i28 = sub i32 %i27, %s17
  %i29 = add i32 %i28, %i13
  %i30 = sub i32 0, %s9
  %i31 = sub i32 %i30, %s11
  %i32 = add i32 %i31, %i7
  %i33 = sub i32 0, %s3
  %i34 = sub i32 %i33, %s5
  %i35 = add i32 %i34, %i1
  %i36 = add i32 %i29, 1
  %i37 = add i32 %i36, 0
  %i38 = add i32 %i37, 0
  store i32 %i38, i32* %arg2, align 4
  %i39 = getelementptr inbounds i32, i32* %arg2, i64 2
  %i40 = add i32 0, %i32
  %i41 = sub i32 %i40, 0
  %i42 = sub i32 %i41, 0
  store i32 %i42, i32* %i39, align 4
  %i43 = getelementptr inbounds i32, i32* %arg2, i64 1
  %i44 = add i32 %i35, 0
  %i45 = sub i32 %i44, 0
  %i46 = sub i32 %i45, 0
  store i32 %i46, i32* %i43, align 4
  %i47 = getelementptr inbounds i32, i32* %arg2, i64 3
  %i48 = sub i32 %i26, 0
  %i49 = sub i32 %i48, 0
  %i50 = add i32 %i49, 0
  store i32 %i50, i32* %i47, align 4
  ret void

}

merely because of this if statement:

  if (UserTE->UserTreeIndices.size() != 1)
    break;

effectively returning to the behavior prior to the patch.

But this test still produces all these extra shuffles:
define void @test(i32* %arg, i32* %arg1, i32* %arg2) {
bb:

  %i3 = load i32, i32* %arg, align 4
  %s3 = add i32 %i3, 6
  %i4 = getelementptr inbounds i32, i32* %arg, i64 4
  %i5 = load i32, i32* %i4, align 4
  %s5 = add i32 %i5, 6
  %i8 = getelementptr inbounds i32, i32* %arg, i64 1
  %i9 = load i32, i32* %i8, align 4
  %s9 = add i32 %i9, 6
  %i10 = getelementptr inbounds i32, i32* %arg, i64 5
  %i11 = load i32, i32* %i10, align 4
  %s11 = add i32 %i11, 6
  %i14 = getelementptr inbounds i32, i32* %arg, i64 2
  %i15 = load i32, i32* %i14, align 4
  %s15 = add i32 %i15, 6
  %i16 = getelementptr inbounds i32, i32* %arg, i64 6
  %i17 = load i32, i32* %i16, align 4
  %s17 = add i32 %i17, 6
  %i20 = getelementptr inbounds i32, i32* %arg, i64 3
  %i21 = load i32, i32* %i20, align 4
  %s21 = add i32 %i21, 6
  %i22 = getelementptr inbounds i32, i32* %arg, i64 7
  %i23 = load i32, i32* %i22, align 4
  %s23 = add i32 %i23, 6

  %i1 = load i32, i32* %arg1, align 4
  %i6 = getelementptr inbounds i32, i32* %arg1, i64 1
  %i7 = load i32, i32* %i6, align 4
  %i12 = getelementptr inbounds i32, i32* %arg1, i64 2
  %i13 = load i32, i32* %i12, align 4
  %i18 = getelementptr inbounds i32, i32* %arg1, i64 3
  %i19 = load i32, i32* %i18, align 4

  %i24 = sub i32 0, %s21
  %i25 = sub i32 %i24, %s23
  %i26 = add i32 %i25, %i19
  %i27 = sub i32 undef, %s15
  %i28 = sub i32 %i27, %s17
  %i29 = add i32 %i28, %i13
  %i30 = sub i32 0, %s9
  %i31 = sub i32 %i30, %s11
  %i32 = add i32 %i31, %i7
  %i33 = sub i32 0, %s3
  %i34 = sub i32 %i33, %s5
  %i35 = add i32 %i34, %i1
  %i36 = add i32 %i29, 1
  %i37 = add i32 %i36, 0
  %i38 = add i32 %i37, 0
  store i32 %i38, i32* %arg2, align 4
  %i39 = getelementptr inbounds i32, i32* %arg2, i64 2
  %i40 = add i32 0, %i32
  %i41 = sub i32 %i40, 0
  %i42 = sub i32 %i41, 0
  store i32 %i42, i32* %i39, align 4
  %i43 = getelementptr inbounds i32, i32* %arg2, i64 1
  %i44 = add i32 %i35, 0
  %i45 = sub i32 %i44, 0
  %i46 = sub i32 %i45, 0
  store i32 %i46, i32* %i43, align 4
  %i47 = getelementptr inbounds i32, i32* %arg2, i64 3
  %i48 = sub i32 %i26, 0
  %i49 = sub i32 %i48, 0
  %i50 = add i32 %i49, 0
  store i32 %i50, i32* %i47, align 4
  ret void

}

================
Comment at: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:3073
+          return;
+        if (UserTE->UserTreeIndices.empty())
+          UserTE = nullptr;
----------------
This condition is always false, because if UserTE->UserTreeIndices.size() != 1 we exit the loop at line 3067.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D116740/new/

https://reviews.llvm.org/D116740