[llvm] [X86] Remove single-use checks when combining xor and vfmulc/vcfmulc. (PR #128910)

Daniel Zabawa via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 26 08:53:56 PST 2025


https://github.com/daniel-zabawa created https://github.com/llvm/llvm-project/pull/128910

The current implementation to combine xor patterns for conjugation with complex multiplies will not perform the transformation when either the conjugate xor result or other multiplicand have other uses. This change eliminates both single-use checks.

The xor result check isn't required as even if the conjugate result is needed elsewhere, the transformation eliminates the dependence. The check of the other multiplicand isn't required for correctness and has no apparent performance implications.

>From 2e913a8fdd5e1006de84834c5665948d79a8e535 Mon Sep 17 00:00:00 2001
From: "Zabawa, Daniel" <daniel.zabawa at intel.com>
Date: Wed, 26 Feb 2025 08:52:41 -0800
Subject: [PATCH] [X86] Remove single-use checks when combining xor and
 vfmulc/vcfmulc.

The current implementation to combine xor patterns for conjugation with
complex multiplies will not perform the transformation when either the
conjugate xor result or other multiplicand have other uses. This change
eliminates both single-use checks.

The xor result check isn't required as even if the conjugate result is
needed elsewhere, the transformation eliminates the dependence. The
check of the other multiplicand isn't required for correctness and
has no apparent performance implications.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  4 ++--
 .../X86/avx512fp16-combine-xor-vfmulc.ll      | 20 +++++++++++++++++++
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9a259fef719f5..a53311a692dc0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53621,9 +53621,9 @@ static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
   int CombineOpcode =
       N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
   auto combineConjugation = [&](SDValue &r) {
-    if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
+    if (LHS->getOpcode() == ISD::BITCAST) {
       SDValue XOR = LHS.getOperand(0);
-      if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
+      if (XOR->getOpcode() == ISD::XOR) {
         KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
         if (XORRHS.isConstant()) {
           APInt ConjugationInt32 = APInt(32, 0x80000000);
diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll
index 946029ae921c0..d4608457f7e9a 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll
@@ -83,6 +83,26 @@ entry:
   ret <32 x half> %3
 }
 
+define dso_local <32 x half> @test6(<16 x i32> %a) local_unnamed_addr #0 {
+; CHECK-LABEL: test6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vfcmulcph %zmm0, %zmm3, %zmm1
+; CHECK-NEXT:    vfcmaddcph %zmm0, %zmm2, %zmm1
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+entry:
+  %0 = xor <16 x i32> %a, splat (i32 -2147483648)
+  %1 = bitcast <16 x i32> %0 to <16 x float>
+  %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> splat (float 1.000000e+00), <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
+  %3 = bitcast <16 x float> %2 to <32 x half>
+  %4 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> zeroinitializer, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
+  %5 = bitcast <16 x float> %4 to <32 x half>
+  %6 = fadd <32 x half> %3, %5
+  ret <32 x half> %6
+}
+
 declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
 declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
 declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)



More information about the llvm-commits mailing list