[llvm] [X86] Generate `vpmuludq` instead of `vpmullq` (PR #121456)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 2 00:12:30 PST 2025
https://github.com/abhishek-kaushik22 updated https://github.com/llvm/llvm-project/pull/121456
>From a0551f887bf63971ecb3bb16155b48972bb631b8 Mon Sep 17 00:00:00 2001
From: abhishek-kaushik22 <abhishek.kaushik at intel.com>
Date: Thu, 2 Jan 2025 13:05:07 +0530
Subject: [PATCH 1/3] [X86] Generate `vpmuludq` instead of `vpmullq`
When lowering the `_mm512_mul_epu32` intrinsic, if the generated value is later used in a vector shuffle, we generate `vpmullq` instead of `vpmuludq` (https://godbolt.org/z/WbaGMqs8e) because `SimplifyDemandedVectorElts` simplifies the arguments and the combine to `PMULDQ` then fails.
Added an override of `shouldSimplifyDemandedVectorElts` in `X86TargetLowering` that first checks whether we can combine the `MUL` to `PMULDQ`.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 21 +++++++++++++++++++++
llvm/lib/Target/X86/X86ISelLowering.h | 3 +++
2 files changed, 24 insertions(+)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a0514e93d6598b..e104264bcbf918 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -60832,3 +60832,24 @@ Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
return TargetLowering::getPrefLoopAlignment();
}
+
+bool X86TargetLowering::shouldSimplifyDemandedVectorElts( // Returns false only when a shuffle operand is a MUL that combineMulToPMULDQ converts; true otherwise.
+ SDValue Op, const TargetLoweringOpt &TLO) const { // Op: node being queried; TLO: supplies the SelectionDAG used below.
+ if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) { // Only vector shuffles are inspected; every other opcode returns true.
+ SDValue V0 = peekThroughBitcasts(Op.getOperand(0)); // Shuffle inputs with any bitcasts looked through.
+ SDValue V1 = peekThroughBitcasts(Op.getOperand(1));
+
+ if (V0.getOpcode() == ISD::MUL || V1.getOpcode() == ISD::MUL) {
+ SDNode *Mul = V0.getOpcode() == ISD::MUL ? V0.getNode() : V1.getNode(); // V0 wins when both are MULs; V1 is then never tried, even if the V0 combine fails.
+ SelectionDAG &DAG = TLO.DAG;
+ const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
+ const SDLoc DL(Mul);
+
+ if (SDValue V = combineMulToPMULDQ(Mul, DL, DAG, Subtarget)) { // A non-null result means the MUL was combined.
+ DAG.ReplaceAllUsesWith(Mul, V.getNode()); // NOTE(review): mutates the DAG from inside a const "should..." query hook - confirm callers tolerate this.
+ return false; // MUL was replaced: report that this shuffle should not be simplified.
+ }
+ }
+ }
+ return true; // Default: no objection to simplifying demanded vector elements.
+}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 2b7a8eaf249d83..0a6cd53f557bb2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1207,6 +1207,9 @@ namespace llvm {
bool hasBitTest(SDValue X, SDValue Y) const override;
+ bool shouldSimplifyDemandedVectorElts(
+ SDValue Op, const TargetLoweringOpt &TLO) const override;
+
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
unsigned OldShiftOpcode, unsigned NewShiftOpcode,
>From a8fc42049e4527f64bfb652cb8f986c4beac40c3 Mon Sep 17 00:00:00 2001
From: abhishek-kaushik22 <abhishek.kaushik at intel.com>
Date: Thu, 2 Jan 2025 13:11:43 +0530
Subject: [PATCH 2/3] Add test
---
llvm/test/CodeGen/X86/pr121456.ll | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/pr121456.ll
diff --git a/llvm/test/CodeGen/X86/pr121456.ll b/llvm/test/CodeGen/X86/pr121456.ll
new file mode 100644
index 00000000000000..ccb1309c96a6d7
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr121456.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512dq -O3 | FileCheck %s
+
+define <8 x i64> @pr121456(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: pr121456:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpsllq $32, %zmm0, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = and <8 x i64> %a, splat (i64 4294967295)
+ %1 = and <8 x i64> %b, splat (i64 4294967295)
+ %2 = mul nuw <8 x i64> %1, %0
+ %3 = bitcast <8 x i64> %2 to <16 x i32>
+ %4 = shufflevector <16 x i32> <i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison>, <16 x i32> %3, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+ %5 = bitcast <16 x i32> %4 to <8 x i64>
+ ret <8 x i64> %5
+}
>From 225b0c418a4a672a7c10c6d4ba57245e0c4f6a5b Mon Sep 17 00:00:00 2001
From: abhishek-kaushik22 <abhishek.kaushik at intel.com>
Date: Thu, 2 Jan 2025 13:42:16 +0530
Subject: [PATCH 3/3] Remove `-O3` from lit test
---
llvm/test/CodeGen/X86/pr121456.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/X86/pr121456.ll b/llvm/test/CodeGen/X86/pr121456.ll
index ccb1309c96a6d7..9f8211aa78785f 100644
--- a/llvm/test/CodeGen/X86/pr121456.ll
+++ b/llvm/test/CodeGen/X86/pr121456.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512dq -O3 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512dq | FileCheck %s
define <8 x i64> @pr121456(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: pr121456:
More information about the llvm-commits
mailing list