[llvm] [X86] Remove single-use checks when combining xor and vfmulc/vfcmulc. (PR #128910)

Thu Feb 27 06:10:27 PST 2025

https://github.com/daniel-zabawa updated https://github.com/llvm/llvm-project/pull/128910

>From d8da2e02b44de0120497775f66feaf25aee34c69 Mon Sep 17 00:00:00 2001
From: "Zabawa, Daniel" <daniel.zabawa at intel.com>
Date: Wed, 26 Feb 2025 08:52:41 -0800
Subject: [PATCH] [X86] Remove single-use checks when combining xor and
 vfmulc/vfcmulc.

The current implementation that combines xor-based conjugation patterns
with complex multiplies does not perform the transformation when either
the conjugating xor result or the other multiplicand has other uses.
This change eliminates both single-use checks.

The xor result check isn't required because, even if the conjugate
result is needed elsewhere, the transformation eliminates the multiply's
dependence on it. The check on the other multiplicand isn't required for
correctness and has no apparent performance implications.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  4 ++--
 .../X86/avx512fp16-combine-xor-vfmulc.ll      | 20 +++++++++++++++++++
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9a259fef719f5..a53311a692dc0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53621,9 +53621,9 @@ static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
   int CombineOpcode =
       N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
   auto combineConjugation = [&](SDValue &r) {
-    if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
+    if (LHS->getOpcode() == ISD::BITCAST) {
       SDValue XOR = LHS.getOperand(0);
-      if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
+      if (XOR->getOpcode() == ISD::XOR) {
         KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
         if (XORRHS.isConstant()) {
           APInt ConjugationInt32 = APInt(32, 0x80000000);
diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll
index 946029ae921c0..d4608457f7e9a 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll
@@ -83,6 +83,26 @@ entry:
   ret <32 x half> %3
 }
 
+define dso_local <32 x half> @test6(<16 x i32> %a) local_unnamed_addr #0 {
+; CHECK-LABEL: test6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vfcmulcph %zmm0, %zmm3, %zmm1
+; CHECK-NEXT:    vfcmaddcph %zmm0, %zmm2, %zmm1
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+entry:
+  %0 = xor <16 x i32> %a, splat (i32 -2147483648)
+  %1 = bitcast <16 x i32> %0 to <16 x float>
+  %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> splat (float 1.000000e+00), <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
+  %3 = bitcast <16 x float> %2 to <32 x half>
+  %4 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> zeroinitializer, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
+  %5 = bitcast <16 x float> %4 to <32 x half>
+  %6 = fadd <32 x half> %3, %5
+  ret <32 x half> %6
+}
+
 declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
 declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
 declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)