[llvm] [SLP] Fix incorrect operand order in interchangeable instruction. (PR #139225)

Fri May 9 01:46:31 PDT 2025

https://github.com/HanKuanChen created https://github.com/llvm/llvm-project/pull/139225

None

>From 27c980d2059f09646ad4e58ce5a32aa280112bb6 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Fri, 9 May 2025 00:11:34 -0700
Subject: [PATCH 1/2] [SLP] Pre-commit test.

---
 .../Transforms/SLPVectorizer/X86/pr139090.ll  | 113 ++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/pr139090.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr139090.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr139090.ll
new file mode 100644
index 0000000000000..bfbe9d1bcba7c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr139090.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=slp-vectorizer -S %s | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+@global = global i32 0
+
+define i64 @main() #0 {
+; CHECK-LABEL: @main(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 8
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr @global, align 8
+; CHECK-NEXT:    [[ADD7:%.*]] = add i32 1, 0
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 poison, i32 0>, i32 [[LOAD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[ADD7]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> <i32 -1, i32 0, i32 0, i32 0>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 0, [[TMP3]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[SUB]], 10
+; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[MUL]], 10
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> <i32 0, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 0>, <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 11, i32 12, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <8 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <8 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    br label [[BB26:%.*]]
+; CHECK:       bb21:
+; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr i8, ptr [[ALLOCA]], i64 40
+; CHECK-NEXT:    [[LOAD22:%.*]] = load i32, ptr [[GETELEMENTPTR]], align 8
+; CHECK-NEXT:    [[SEXT:%.*]] = sext i32 [[LOAD22]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; CHECK-NEXT:    [[SUB23:%.*]] = sub i32 1, [[TMP8]]
+; CHECK-NEXT:    [[SEXT24:%.*]] = sext i32 [[SUB23]] to i64
+; CHECK-NEXT:    [[OR25:%.*]] = or i64 [[SEXT]], [[SEXT24]]
+; CHECK-NEXT:    ret i64 [[OR25]]
+; CHECK:       bb26:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ 1, [[BB:%.*]] ], [ 0, [[BB26]] ]
+; CHECK-NEXT:    [[GETELEMENTPTR27:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 10
+; CHECK-NEXT:    store i32 [[ADD1]], ptr [[GETELEMENTPTR27]], align 4
+; CHECK-NEXT:    [[ADD28:%.*]] = add i64 [[PHI]], 10
+; CHECK-NEXT:    [[GETELEMENTPTR29:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[ADD28]]
+; CHECK-NEXT:    store <8 x i32> [[TMP7]], ptr [[GETELEMENTPTR29]], align 4
+; CHECK-NEXT:    br i1 true, label [[BB21:%.*]], label [[BB26]]
+;
+bb:
+  %alloca = alloca i32, align 8
+  %load = load i32, ptr @global, align 8
+  %add = add i32 1, %load
+  %sub = sub i32 0, %add
+  %mul = mul i32 %sub, 10
+  %add1 = add i32 %mul, 10
+  %mul2 = mul i32 0, 0
+  %or = or i32 %mul2, 0
+  %mul3 = mul i32 %add, 0
+  %or4 = or i32 %mul3, 0
+  %mul5 = mul i32 0, 0
+  %or6 = or i32 %mul5, 0
+  %add7 = add i32 1, 0
+  %sub8 = sub i32 0, 0
+  %mul9 = mul i32 %sub8, 0
+  %or10 = or i32 %mul9, 0
+  %sub11 = sub i32 0, %add7
+  %mul12 = mul i32 %sub11, 0
+  %or13 = or i32 %mul12, 0
+  %sub14 = sub i32 0, 0
+  %mul15 = mul i32 %sub14, 0
+  %or16 = or i32 %mul15, 0
+  %mul17 = mul i32 0, 0
+  %or18 = or i32 %mul17, 0
+  %mul19 = mul i32 0, 0
+  %or20 = or i32 %mul19, 0
+  br label %bb26
+
+bb21:                                             ; preds = %bb26
+  %getelementptr = getelementptr i8, ptr %alloca, i64 40
+  %load22 = load i32, ptr %getelementptr, align 8
+  %sext = sext i32 %load22 to i64
+  %sub23 = sub i32 1, %sub11
+  %sext24 = sext i32 %sub23 to i64
+  %or25 = or i64 %sext, %sext24
+  ret i64 %or25
+
+bb26:                                             ; preds = %bb26, %bb
+  %phi = phi i64 [ 1, %bb ], [ 0, %bb26 ]
+  %getelementptr27 = getelementptr i32, ptr %alloca, i64 10
+  store i32 %add1, ptr %getelementptr27, align 4
+  %add28 = add i64 %phi, 10
+  %getelementptr29 = getelementptr i32, ptr %alloca, i64 %add28
+  store i32 %or, ptr %getelementptr29, align 4
+  %add30 = add i64 %phi, 11
+  %getelementptr31 = getelementptr i32, ptr %alloca, i64 %add30
+  store i32 %or4, ptr %getelementptr31, align 4
+  %add32 = add i64 %phi, 12
+  %getelementptr33 = getelementptr i32, ptr %alloca, i64 %add32
+  store i32 %or6, ptr %getelementptr33, align 4
+  %add34 = add i64 %phi, 13
+  %getelementptr35 = getelementptr i32, ptr %alloca, i64 %add34
+  store i32 %or10, ptr %getelementptr35, align 4
+  %add36 = add i64 %phi, 14
+  %getelementptr37 = getelementptr i32, ptr %alloca, i64 %add36
+  store i32 %or13, ptr %getelementptr37, align 4
+  %add38 = add i64 %phi, 15
+  %getelementptr39 = getelementptr i32, ptr %alloca, i64 %add38
+  store i32 %or16, ptr %getelementptr39, align 4
+  %add40 = add i64 %phi, 16
+  %getelementptr41 = getelementptr i32, ptr %alloca, i64 %add40
+  store i32 %or18, ptr %getelementptr41, align 4
+  %add42 = add i64 %phi, 17
+  %getelementptr43 = getelementptr i32, ptr %alloca, i64 %add42
+  store i32 %or20, ptr %getelementptr43, align 4
+  br i1 true, label %bb21, label %bb26
+}
+
+attributes #0 = { "target-features"="+avx2" }

>From ebac9db2b1a9d778b5f61e8b559ad0b3b299c8a1 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Fri, 9 May 2025 00:16:46 -0700
Subject: [PATCH 2/2] [SLP] Fix incorrect operand order in interchangeable
 instruction.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  5 +-
 .../Transforms/SLPVectorizer/X86/pr139090.ll  |  6 +--
 .../X86/reorder_diamond_match.ll              | 47 ++++++++++++++-----
 3 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7fbbb2681b9ed..d8f7bc3e8e6ff 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -995,7 +995,10 @@ class BinOpSameOpcodeHelper {
       Value *LHS = I->getOperand(1 - Pos);
       Constant *RHS =
           ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
-      if (Pos == 1)
+      // constant + x cannot be -constant - x
+      // instead, it should be x - -constant
+      if (Pos == 1 ||
+          (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))
         return SmallVector<Value *>({LHS, RHS});
       return SmallVector<Value *>({RHS, LHS});
     }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr139090.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr139090.ll
index bfbe9d1bcba7c..6c3c7f89d9743 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr139090.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr139090.ll
@@ -11,9 +11,9 @@ define i64 @main() #0 {
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 8
 ; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr @global, align 8
 ; CHECK-NEXT:    [[ADD7:%.*]] = add i32 1, 0
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 poison, i32 0>, i32 [[LOAD]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[ADD7]], i32 2
-; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> <i32 -1, i32 0, i32 0, i32 0>, [[TMP1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 [[LOAD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> <i32 -1, i32 0, i32 poison, i32 0>, i32 [[ADD7]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[TMP0]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i32 0, [[TMP3]]
 ; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[SUB]], 10
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll
index fff2b72df613e..fd16a5200b868 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll
@@ -4,19 +4,40 @@
 define void @test() {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr undef, i64 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [4 x [4 x i32]], ptr undef, i64 0, i64 1, i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = sub <4 x i16> zeroinitializer, [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shl <4 x i16> [[TMP5]], zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i16> [[TMP6]], zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i16> [[TMP8]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = sub <4 x i16> [[TMP8]], [[TMP7]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT:    [[TMP13:%.*]] = sub <4 x i16> zeroinitializer, [[TMP11]]
-; CHECK-NEXT:    [[TMP15:%.*]] = sext <4 x i16> [[TMP13]] to <4 x i32>
-; CHECK-NEXT:    store <4 x i32> [[TMP15]], ptr [[TMP2]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nsw i32 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw i32 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr undef, i64 5
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP8]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = sub nsw i32 0, [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shl nsw i32 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = add nsw i32 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr undef, i64 6
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[TMP13]], align 1
+; CHECK-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP14]] to i32
+; CHECK-NEXT:    [[TMP16:%.*]] = sub nsw i32 0, [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = shl nsw i32 [[TMP16]], 0
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw i32 [[TMP17]], 0
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr undef, i64 7
+; CHECK-NEXT:    [[TMP20:%.*]] = load i8, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP21:%.*]] = zext i8 [[TMP20]] to i32
+; CHECK-NEXT:    [[TMP22:%.*]] = sub nsw i32 0, [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = shl nsw i32 [[TMP22]], 0
+; CHECK-NEXT:    [[TMP24:%.*]] = add nsw i32 [[TMP23]], 0
+; CHECK-NEXT:    [[TMP25:%.*]] = add nsw i32 [[TMP12]], [[TMP6]]
+; CHECK-NEXT:    [[TMP26:%.*]] = sub nsw i32 [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[TMP27:%.*]] = add nsw i32 [[TMP24]], [[TMP18]]
+; CHECK-NEXT:    [[TMP28:%.*]] = sub nsw i32 [[TMP18]], [[TMP24]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [4 x [4 x i32]], ptr undef, i64 0, i64 1, i64 0
+; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>, i32 [[TMP25]], i32 0
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP26]], i32 1
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 poison, i32 poison>, i32 [[TMP27]], i32 2
+; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP28]], i32 3
+; CHECK-NEXT:    [[TMP34:%.*]] = sub nsw <4 x i32> [[TMP31]], [[TMP33]]
+; CHECK-NEXT:    store <4 x i32> [[TMP34]], ptr [[TMP29]], align 16
 ; CHECK-NEXT:    ret void
 ;
   %1 = getelementptr inbounds i8, ptr undef, i64 4