[llvm] c1a3960 - [X86] Add APX imulzu support. (#116806)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 25 16:26:27 PST 2024
Author: Daniel Zabawa
Date: 2024-11-26T08:26:23+08:00
New Revision: c1a3960abe5ca316e9a26e87cdc3a7f94e420dc6
URL: https://github.com/llvm/llvm-project/commit/c1a3960abe5ca316e9a26e87cdc3a7f94e420dc6
DIFF: https://github.com/llvm/llvm-project/commit/c1a3960abe5ca316e9a26e87cdc3a7f94e420dc6.diff
LOG: [X86] Add APX imulzu support. (#116806)
Add patterns to select 16b imulzu with -mapx-features=zu, including
folding of zero-extends of the result. IsDesirableToPromoteOp is changed
to leave 16b multiplies by constant un-promoted, as imulzu will not
cause partial-write stalls.
Added:
llvm/test/CodeGen/X86/apx/imulzu.ll
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86InstrCompiler.td
llvm/lib/Target/X86/X86InstrPredicates.td
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dcde5eeff02202..d1c4c394c756a8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -59285,6 +59285,15 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
return Ld->getBasePtr() == St->getBasePtr();
};
+ auto IsFoldableZext = [](SDValue Op) { // True iff Op's only user is a zext to i32/i64.
+ if (!Op.hasOneUse()) // Any other number of uses: report not foldable.
+ return false;
+ SDNode *User = *Op->use_begin(); // Inspect the first (and, per the check above, only) user.
+ EVT VT = User->getValueType(0);
+ return (User->getOpcode() == ISD::ZERO_EXTEND &&
+ (VT == MVT::i32 || VT == MVT::i64)); // Only i32/i64 zext results qualify.
+ };
+
bool Commute = false;
switch (Op.getOpcode()) {
default: return false;
@@ -59301,8 +59310,15 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
return false;
break;
}
- case ISD::ADD:
case ISD::MUL:
+ // When ZU is enabled, we prefer to not promote for MUL by a constant
+ // when there is an opportunity to fold a zext with imulzu.
+ if (Subtarget.hasZU() && IsFoldableZext(Op) &&
+ (isa<ConstantSDNode>(Op.getOperand(0)) ||
+ isa<ConstantSDNode>(Op.getOperand(1))))
+ return false;
+ [[fallthrough]];
+ case ISD::ADD:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index ea0b66c2f55162..7d4c5c0e10e492 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -2184,6 +2184,18 @@ multiclass EFLAGSDefiningPats<string suffix, Predicate p> {
defm : EFLAGSDefiningPats<"", NoNDD>;
defm : EFLAGSDefiningPats<"_ND", HasNDD>;
+let Predicates = [HasZU] in {
+ // Select zext-to-i32/i64 of (i16 mul reg-or-load, imm) as IMULZU16rri/IMULZU16rmi, so no separate zext is emitted.
+ def : Pat<(i32 (zext (i16 (mul GR16:$src1, imm:$src2)))),
+ (SUBREG_TO_REG (i32 0), (IMULZU16rri GR16:$src1, imm:$src2), sub_16bit)>;
+ def : Pat<(i32 (zext (i16 (mul (loadi16 addr:$src1), imm:$src2)))),
+ (SUBREG_TO_REG (i32 0), (IMULZU16rmi addr:$src1, imm:$src2), sub_16bit)>;
+ def : Pat<(i64 (zext (i16 (mul GR16:$src1, imm:$src2)))),
+ (SUBREG_TO_REG (i64 0), (IMULZU16rri GR16:$src1, imm:$src2), sub_16bit)>;
+ def : Pat<(i64 (zext (i16 (mul (loadi16 addr:$src1), imm:$src2)))),
+ (SUBREG_TO_REG (i64 0), (IMULZU16rmi addr:$src1, imm:$src2), sub_16bit)>;
+}
+
// mul reg, imm
def : Pat<(mul GR16:$src1, imm:$src2),
(IMUL16rri GR16:$src1, imm:$src2)>;
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index eb2e93a94b197c..5bdcf51be9dd84 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -45,6 +45,7 @@ def NoEGPR : Predicate<"!Subtarget->hasEGPR()">;
// entries, so that the NDD variant can be selected first to benefit RA.
def HasNDD : Predicate<"Subtarget->hasNDD()">;
def NoNDD : Predicate<"!Subtarget->hasNDD()">;
+def HasZU : Predicate<"Subtarget->hasZU()">;
def HasCF : Predicate<"Subtarget->hasCF()">;
def HasCMOV : Predicate<"Subtarget->canUseCMOV()">;
def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">;
diff --git a/llvm/test/CodeGen/X86/apx/imulzu.ll b/llvm/test/CodeGen/X86/apx/imulzu.ll
new file mode 100644
index 00000000000000..9a4a63750a1dbf
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/imulzu.ll
@@ -0,0 +1,226 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mattr=+zu | FileCheck %s --check-prefixes=CHECK,ZU
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu | FileCheck %s --check-prefixes=CHECK,NOZU
+
+; Test generation of 16b imulzu when -mattr=+zu is specified.
+; The mulzu_* tests check for basic generation, which is limited to cases where a
+; zero-extend of the result can be folded into imulzu.
+; The remaining tests are modifications of selected test/CodeGen/X86/imul.ll tests with
+; 16b multiplies, to check that common strength reductions in ISel are still performed
+; when -mattr=+zu is in effect.
+;
+; FIXME: several cases from imul.ll covering DAG combines, in particular those using LEA,
+; are not ported as X86's IsDesirableToPromoteOp has no way to accurately identify when
+; promotion will permit a better sequence than an unpromoted imulzu.
+; These cases should be added when they are implemented.
+
+define i32 @mulzu_16_32(i16 %A) {
+; ZU-LABEL: mulzu_16_32:
+; ZU: # %bb.0:
+; ZU-NEXT: imulzuw $1234, %di, %ax # imm = 0x4D2
+; ZU-NEXT: retq
+;
+; NOZU-LABEL: mulzu_16_32:
+; NOZU: # %bb.0:
+; NOZU-NEXT: imull $1234, %edi, %eax # imm = 0x4D2
+; NOZU-NEXT: movzwl %ax, %eax
+; NOZU-NEXT: retq
+ %mul = mul i16 %A, 1234
+ %r = zext i16 %mul to i32
+ ret i32 %r
+}
+
+define i64 @mulzu_16_64(i16 %A) {
+; ZU-LABEL: mulzu_16_64:
+; ZU: # %bb.0:
+; ZU-NEXT: imulzuw $1234, %di, %ax # imm = 0x4D2
+; ZU-NEXT: retq
+;
+; NOZU-LABEL: mulzu_16_64:
+; NOZU: # %bb.0:
+; NOZU-NEXT: imull $1234, %edi, %eax # imm = 0x4D2
+; NOZU-NEXT: movzwl %ax, %eax
+; NOZU-NEXT: retq
+ %mul = mul i16 %A, 1234
+ %r = zext i16 %mul to i64
+ ret i64 %r
+}
+
+define i32 @mulzu_16_32_mem(ptr %P) {
+; ZU-LABEL: mulzu_16_32_mem:
+; ZU: # %bb.0:
+; ZU-NEXT: imulzuw $1234, (%rdi), %ax # imm = 0x4D2
+; ZU-NEXT: retq
+;
+; NOZU-LABEL: mulzu_16_32_mem:
+; NOZU: # %bb.0:
+; NOZU-NEXT: movzwl (%rdi), %eax
+; NOZU-NEXT: imull $1234, %eax, %eax # imm = 0x4D2
+; NOZU-NEXT: movzwl %ax, %eax
+; NOZU-NEXT: retq
+ %gep = getelementptr i16, ptr %P, i64 0
+ %A = load i16, ptr %gep
+ %mul = mul i16 %A, 1234
+ %r = zext i16 %mul to i32
+ ret i32 %r
+}
+
+define i64 @mulzu_16_64_mem(ptr %P) {
+; ZU-LABEL: mulzu_16_64_mem:
+; ZU: # %bb.0:
+; ZU-NEXT: imulzuw $1234, (%rdi), %ax # imm = 0x4D2
+; ZU-NEXT: retq
+;
+; NOZU-LABEL: mulzu_16_64_mem:
+; NOZU: # %bb.0:
+; NOZU-NEXT: movzwl (%rdi), %eax
+; NOZU-NEXT: imull $1234, %eax, %eax # imm = 0x4D2
+; NOZU-NEXT: movzwl %ax, %eax
+; NOZU-NEXT: retq
+ %gep = getelementptr i16, ptr %P, i64 0
+ %A = load i16, ptr %gep
+ %mul = mul i16 %A, 1234
+ %r = zext i16 %mul to i64
+ ret i64 %r
+}
+
+; The following mulzu cases check that imulzu is not
+; generated in the absence of a single zext user. The ZU/NOZU
+; cases should match.
+
+define void @mulzu_16_store(i16 %A, ptr %R) {
+; CHECK-LABEL: mulzu_16_store:
+; CHECK: # %bb.0:
+; CHECK-NEXT: imull $1234, %edi, %eax # imm = 0x4D2
+; CHECK-NEXT: movw %ax, (%rsi)
+; CHECK-NEXT: retq
+ %gep = getelementptr i16, ptr %R, i64 0
+ %mul = mul i16 %A, 1234
+ store i16 %mul, ptr %gep
+ ret void
+}
+
+define i32 @mulzu_16_store_32(i16 %A, ptr %R) {
+; CHECK-LABEL: mulzu_16_store_32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: imull $1234, %edi, %eax # imm = 0x4D2
+; CHECK-NEXT: movw %ax, (%rsi)
+; CHECK-NEXT: movzwl %ax, %eax
+; CHECK-NEXT: retq
+ %gep = getelementptr i16, ptr %R, i64 0
+ %mul = mul i16 %A, 1234
+ store i16 %mul, ptr %gep
+ %r = zext i16 %mul to i32
+ ret i32 %r
+}
+
+define i64 @mulzu_16_store_64(i16 %A, ptr %R) {
+; CHECK-LABEL: mulzu_16_store_64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: imull $1234, %edi, %eax # imm = 0x4D2
+; CHECK-NEXT: movw %ax, (%rsi)
+; CHECK-NEXT: movzwl %ax, %eax
+; CHECK-NEXT: retq
+ %gep = getelementptr i16, ptr %R, i64 0
+ %mul = mul i16 %A, 1234
+ store i16 %mul, ptr %gep
+ %r = zext i16 %mul to i64
+ ret i64 %r
+}
+
+define i32 @mulzu_sext_16_32(i16 %A) {
+; CHECK-LABEL: mulzu_sext_16_32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: imull $1234, %edi, %eax # imm = 0x4D2
+; CHECK-NEXT: cwtl
+; CHECK-NEXT: retq
+ %mul = mul i16 %A, 1234
+ %r = sext i16 %mul to i32
+ ret i32 %r
+}
+
+define i64 @mulzu_sext_16_64(i16 %A) {
+; CHECK-LABEL: mulzu_sext_16_64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: imull $1234, %edi, %eax # imm = 0x4D2
+; CHECK-NEXT: movswq %ax, %rax
+; CHECK-NEXT: retq
+ %mul = mul i16 %A, 1234
+ %r = sext i16 %mul to i64
+ ret i64 %r
+}
+
+; Tests ported from test/CodeGen/X86/imul.ll follow from this point.
+; The generated code, which strength-reduces multiplies by certain
+; constants, should be unaffected by enabling zu.
+
+define i16 @mul4_16(i16 %A) {
+;
+; CHECK-LABEL: mul4_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: leal (,%rdi,4), %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+ %mul = mul i16 %A, 4
+ ret i16 %mul
+}
+
+define i16 @mul4096_16(i16 %A) {
+;
+; CHECK-LABEL: mul4096_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: shll $12, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+ %mul = mul i16 %A, 4096
+ ret i16 %mul
+}
+
+define i16 @mulmin4096_16(i16 %A) {
+;
+; CHECK-LABEL: mulmin4096_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: shll $12, %eax
+; CHECK-NEXT: negl %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+ %mul = mul i16 %A, -4096
+ ret i16 %mul
+}
+
+define i16 @mul4_16_minsize(i16 %A) minsize {
+;
+; CHECK-LABEL: mul4_16_minsize:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: leal (,%rdi,4), %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+ %mul = mul i16 %A, 4
+ ret i16 %mul
+}
+
+define i16 @mul0_16(i16 %A) {
+;
+; CHECK-LABEL: mul0_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retq
+ %mul = mul i16 %A, 0
+ ret i16 %mul
+}
+
+define i16 @mul4294967295_16(i16 %A) {
+;
+; CHECK-LABEL: mul4294967295_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: negl %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+ %mul = mul i16 %A, 4294967295
+ ret i16 %mul
+}
More information about the llvm-commits
mailing list