[llvm] [SLP]Improve analysis of copyables operands for commutative main instruction (PR #185320)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 8 13:32:44 PDT 2026
https://github.com/alexey-bataev created https://github.com/llvm/llvm-project/pull/185320
For commutative copyables, instruction operands are always placed on the LHS
and the others on the RHS. But if some instruction is the main instruction and
has 2 instruction operands, and its RHS operand is more compatible with the
other LHS operands than its LHS operand is, such operands need to be swapped
for better analysis.
>From 74b009c29527161bc821b42a6e387bd5f9cb2cb3 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Sun, 8 Mar 2026 13:32:34 -0700
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
=?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.7
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 17 +++++++++++
.../X86/bswap-i64-by-i32-chunks.ll | 30 ++++++++++++-------
2 files changed, 36 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 75bfa34093d47..51a4f9841e331 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11503,18 +11503,35 @@ class InstructionsCompatibilityAnalysis {
// Check profitability if number of copyables > VL.size() / 2.
// 1. Reorder operands for better matching.
if (isCommutative(MainOp)) {
+ Value *BestFrontOp = nullptr;
for (auto [OpL, OpR] : zip(Operands.front(), Operands.back())) {
// Make instructions the first operands.
if (!isa<Instruction>(OpL) && isa<Instruction>(OpR)) {
+ BestFrontOp = OpR;
std::swap(OpL, OpR);
continue;
}
// Make constants the second operands.
if ((isa<Constant>(OpL) && !match(OpR, m_Zero())) ||
match(OpL, m_Zero())) {
+ BestFrontOp = OpR;
std::swap(OpL, OpR);
continue;
}
+ if (isa<Instruction>(OpL))
+ BestFrontOp = OpL;
+ }
+ // If some of the RHS operands better match most of LHS - swap such
+ // operands to increase matching rate.
+ if (auto *BestLHS = dyn_cast_if_present<Instruction>(BestFrontOp)) {
+ const unsigned BestOpcode = BestLHS->getOpcode();
+ for (auto [OpL, OpR] : zip(Operands.front(), Operands.back())) {
+ auto *OpRI = dyn_cast<Instruction>(OpR);
+ if (!OpRI)
+ continue;
+ if (OpRI->getOpcode() == BestOpcode)
+ std::swap(OpL, OpR);
+ }
}
}
// 2. Check, if operands can be vectorized.
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bswap-i64-by-i32-chunks.ll b/llvm/test/Transforms/SLPVectorizer/X86/bswap-i64-by-i32-chunks.ll
index 1ce883cb293be..754bab4107bee 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bswap-i64-by-i32-chunks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bswap-i64-by-i32-chunks.ll
@@ -5,19 +5,27 @@ define i64 @test(ptr %buf) {
; CHECK-LABEL: define i64 @test(
; CHECK-SAME: ptr [[BUF:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[BUF]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP0]])
-; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
-; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[BUF]], i64 4
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i8>, ptr [[ADD_PTR]], align 1
-; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds nuw i8, ptr [[BUF]], i64 6
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[BUF]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i64
+; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds nuw i8, ptr [[BUF]], i64 1
; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX22]], align 1
; CHECK-NEXT: [[CONV23:%.*]] = zext i8 [[TMP4]] to i64
-; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i8> [[TMP3]] to <2 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP7]], i64 [[CONV23]], i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = shl nuw <4 x i64> [[TMP8]], <i64 16, i64 24, i64 32, i64 8>
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[BUF]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; CHECK-NEXT: [[CONV5:%.*]] = zext i8 [[TMP2]] to i64
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[BUF]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> <i64 poison, i64 0, i64 0, i64 0>, i64 [[CONV]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw <4 x i64> [[TMP3]], <i64 24, i64 0, i64 0, i64 0>
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> <i64 poison, i64 0, i64 0, i64 0>, i64 [[CONV23]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw <4 x i64> [[TMP5]], <i64 16, i64 0, i64 0, i64 0>
+; CHECK-NEXT: [[TMP7:%.*]] = or disjoint <4 x i64> [[TMP6]], [[TMP14]]
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> <i64 poison, i64 0, i64 0, i64 0>, i64 [[CONV5]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw <4 x i64> [[TMP8]], <i64 8, i64 0, i64 0, i64 0>
+; CHECK-NEXT: [[TMP16:%.*]] = or disjoint <4 x i64> [[TMP7]], [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, ptr [[ARRAYIDX8]], align 1
+; CHECK-NEXT: [[TMP12:%.*]] = zext <4 x i8> [[TMP17]] to <4 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = or disjoint <4 x i64> [[TMP16]], [[TMP12]]
+; CHECK-NEXT: [[TMP9:%.*]] = shl nuw <4 x i64> [[TMP13]], <i64 32, i64 24, i64 16, i64 8>
; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds nuw i8, ptr [[BUF]], i64 7
; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX27]], align 1
; CHECK-NEXT: [[CONV28:%.*]] = zext i8 [[TMP10]] to i64
More information about the llvm-commits
mailing list