[llvm] [X86] LowerShuffle - don't call canonicalizeShuffleMaskWithHorizOp if we could shuffle whole lanes (PR #170838)

Fri Dec 5 03:21:25 PST 2025

https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/170838

canonicalizeShuffleMaskWithHorizOp was getting stuck: it was canonicalizing a SHUFFLE(HADD(X,X)) to only refer to the results of the LHS X, but the original shuffle was shuffling entire lanes (with VPERM2F128), and the canonicalized shuffle was then attempting to lower back to the original VPERM2F128 pattern.

I think we can drop this call to canonicalizeShuffleMaskWithHorizOp once #143000 is addressed as vectorcombine should fold away all the patterns this addresses.

Fixes #167793

>From 12090763b8ecb3c96b08eca1455316581986dbd0 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 5 Dec 2025 11:18:57 +0000
Subject: [PATCH] [X86] LowerShuffle - don't call
 canonicalizeShuffleMaskWithHorizOp if we could shuffle whole lanes

canonicalizeShuffleMaskWithHorizOp was getting stuck: it was canonicalizing a SHUFFLE(HADD(X,X)) to only refer to the results of the LHS X, but the original shuffle was shuffling entire lanes (with VPERM2F128), and the canonicalized shuffle was then attempting to lower back to the original VPERM2F128 pattern.

I think we can drop this call to canonicalizeShuffleMaskWithHorizOp once #143000 is addressed as vectorcombine should fold away all the patterns this addresses.

Fixes #167793
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 22 ++++++++++--------
 llvm/test/CodeGen/X86/pr167793.ll       | 30 +++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/pr167793.ll

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6e16bb148b5df..2722321a03e2e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -18462,16 +18462,20 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
   SmallVector<int> Mask(OrigMask);
 
   // Canonicalize the shuffle with any horizontal ops inputs.
+  // Don't attempt this if the shuffle can still be widened as we may lose
+  // whole lane shuffle patterns.
   // NOTE: This may update Ops and Mask.
-  if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
-          Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
-    return DAG.getBitcast(VT, HOp);
-
-  V1 = DAG.getBitcast(VT, Ops[0]);
-  V2 = DAG.getBitcast(VT, Ops[1]);
-  assert(NumElements == (int)Mask.size() &&
-         "canonicalizeShuffleMaskWithHorizOp "
-         "shouldn't alter the shuffle mask size");
+  if (!canWidenShuffleElements(Mask)) {
+    if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
+            Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
+      return DAG.getBitcast(VT, HOp);
+
+    V1 = DAG.getBitcast(VT, Ops[0]);
+    V2 = DAG.getBitcast(VT, Ops[1]);
+    assert(NumElements == (int)Mask.size() &&
+           "canonicalizeShuffleMaskWithHorizOp "
+           "shouldn't alter the shuffle mask size");
+  }
 
   // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
   // These will be materialized uniformly anyway, so make splat matching easier.
diff --git a/llvm/test/CodeGen/X86/pr167793.ll b/llvm/test/CodeGen/X86/pr167793.ll
new file mode 100644
index 0000000000000..9b394bfddc396
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr167793.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s
+
+define <4 x double> @PR167793(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: PR167793:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
+; CHECK-NEXT:    vhaddpd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; CHECK-NEXT:    retq
+  %i5 = shufflevector <4 x double> %a0, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %i6 = fadd <4 x double> %a0, %i5
+  %i8 = shufflevector <4 x double> %a1, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %i9 = fadd <4 x double> %a1, %i8
+  %i10 = shufflevector <4 x double> %i6, <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+  %i11 = shufflevector <4 x double> %i6, <4 x double> poison, <2 x i32> <i32 poison, i32 1>
+  %i12 = fadd <2 x double> %i10, %i11
+  %i13 = shufflevector <4 x double> %i9, <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+  %i14 = shufflevector <4 x double> %i9, <4 x double> poison, <2 x i32> <i32 poison, i32 1>
+  %i15 = fadd <2 x double> %i13, %i14
+  %i16 = shufflevector <4 x double> zeroinitializer, <4 x double> poison, <2 x i32> <i32 poison, i32 1>
+  %i18 = shufflevector <2 x double> %i15, <2 x double> %i16, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
+  %i19 = shufflevector <2 x double> %i12, <2 x double> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+  %i20 = shufflevector <4 x double> %i19, <4 x double> %i18, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x double> %i20
+}