[llvm] [X86] Add APX imulzu support. (PR #116806)

Daniel Zabawa via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 25 07:58:54 PST 2024


https://github.com/daniel-zabawa updated https://github.com/llvm/llvm-project/pull/116806

>From 2066851877739f16e8f97e8b2d9de7af88c1a059 Mon Sep 17 00:00:00 2001
From: "Zabawa, Daniel" <daniel.zabawa at intel.com>
Date: Thu, 14 Nov 2024 11:53:53 -0800
Subject: [PATCH 1/3] [X86] Add APX imulzu support.

Add patterns to select the 16-bit imulzu with -mapx-feature=zu, including
folding of a zero-extend of the result. IsDesirableToPromoteOp is
changed to leave 16-bit multiplies by a constant un-promoted, as imulzu
will not cause partial-write stalls.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp   |   6 +
 llvm/lib/Target/X86/X86InstrCompiler.td   |  34 +++-
 llvm/lib/Target/X86/X86InstrPredicates.td |   2 +
 llvm/test/CodeGen/X86/apx/imulzu.ll       | 238 ++++++++++++++++++++++
 4 files changed, 276 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/apx/imulzu.ll

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 34bc5d76c15cea..7476c268e5acbc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58919,6 +58919,12 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
     if (IsFoldableAtomicRMW(N0, Op) ||
         (Commute && IsFoldableAtomicRMW(N1, Op)))
       return false;
+    // When ZU is enabled, we prefer to not promote for MUL by a constant,
+    // since a 16b imulzu will not incur partial-write stalls, and may be
+    // able to fold away a zero-extend of the 16b result.
+    if (Subtarget.hasZU() && Op.getOpcode() == ISD::MUL &&
+        (isa<ConstantSDNode>(N0) || isa<ConstantSDNode>(N1)))
+      return false;
   }
   }
 
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index a05c3f028442c0..885c9e98fce4a5 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -2184,17 +2184,43 @@ multiclass EFLAGSDefiningPats<string suffix, Predicate p> {
 defm : EFLAGSDefiningPats<"", NoNDD>;
 defm : EFLAGSDefiningPats<"_ND", HasNDD>;
 
+let Predicates = [HasZU] in {
+  // zext (mul reg/mem, imm) -> imulzu
+  def : Pat<(i32 (zext (i16 (mul GR16:$src1, imm:$src2)))),
+            (SUBREG_TO_REG (i32 0), (IMULZU16rri GR16:$src1, imm:$src2), sub_16bit)>;
+  def : Pat<(i32 (zext (i16 (mul (loadi16 addr:$src1), imm:$src2)))),
+            (SUBREG_TO_REG (i32 0), (IMULZU16rmi addr:$src1, imm:$src2), sub_16bit)>;
+  def : Pat<(i64 (zext (i16 (mul GR16:$src1, imm:$src2)))),
+            (SUBREG_TO_REG (i64 0), (IMULZU16rri GR16:$src1, imm:$src2), sub_16bit)>;
+  def : Pat<(i64 (zext (i16 (mul (loadi16 addr:$src1), imm:$src2)))),
+            (SUBREG_TO_REG (i64 0), (IMULZU16rmi addr:$src1, imm:$src2), sub_16bit)>;
+  
+  // (mul (reg/mem), imm) -> imulzu
+  // Note this pattern doesn't explicitly require the zero-upper behaviour of imulzu,
+  // but instead avoids the zero-extend of the reg/mem operand that would be 
+  // required if the multiply were promoted to 32b to avoid partial-write stalls.
+  // The imulzu here simply doesn't incur any partial-write stalls.
+  def : Pat<(mul GR16:$src1, imm:$src2),
+            (IMULZU16rri GR16:$src1, imm:$src2)>;
+  def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
+            (IMULZU16rmi addr:$src1, imm:$src2)>;
+}
+
 // mul reg, imm
-def : Pat<(mul GR16:$src1, imm:$src2),
-          (IMUL16rri GR16:$src1, imm:$src2)>;
+let Predicates = [NoZU] in {
+  def : Pat<(mul GR16:$src1, imm:$src2),
+            (IMUL16rri GR16:$src1, imm:$src2)>;
+}
 def : Pat<(mul GR32:$src1, imm:$src2),
           (IMUL32rri GR32:$src1, imm:$src2)>;
 def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
           (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
 
 // reg = mul mem, imm
-def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
-          (IMUL16rmi addr:$src1, imm:$src2)>;
+let Predicates = [NoZU] in {
+  def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
+            (IMUL16rmi addr:$src1, imm:$src2)>;
+}
 def : Pat<(mul (loadi32 addr:$src1), imm:$src2),
           (IMUL32rmi addr:$src1, imm:$src2)>;
 def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 7fb566fba51818..02cb4556ba13ba 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -45,6 +45,8 @@ def NoEGPR       : Predicate<"!Subtarget->hasEGPR()">;
 // entries, so that the NDD variant can be selected first to benefit RA.
 def HasNDD       : Predicate<"Subtarget->hasNDD()">;
 def NoNDD        : Predicate<"!Subtarget->hasNDD()">;
+def HasZU        : Predicate<"Subtarget->hasZU()">;
+def NoZU         : Predicate<"!Subtarget->hasZU()">;
 def HasCF        : Predicate<"Subtarget->hasCF()">;
 def HasCMOV      : Predicate<"Subtarget->canUseCMOV()">;
 def NoCMOV       : Predicate<"!Subtarget->canUseCMOV()">;
diff --git a/llvm/test/CodeGen/X86/apx/imulzu.ll b/llvm/test/CodeGen/X86/apx/imulzu.ll
new file mode 100644
index 00000000000000..5b598ebf35ab89
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/imulzu.ll
@@ -0,0 +1,238 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mattr=+zu | FileCheck %s --check-prefix=ZU
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu | FileCheck %s --check-prefix=NOZU
+
+; Test generation of 16b imulzu when -mattr=+zu is specified.
+; The mulzu_* tests check for basic generation, which will fold away a zero-extend of the
+; result if present.
+; The following tests are modifications of selected test/CodeGen/X86/imul.ll tests with
+; 16b multiplies, to check that common strength reductions in ISel are still performed
+; when -mattr=+zu is in effect.
+;
+; FIXME: several cases from imul.ll covering DAG combines, in particular those using LEA,
+; are not ported as X86's IsDesirableToPromoteOp has no way to accurately identify when
+; promotion will permit a better sequence than an unpromoted imulzu.
+; These cases should be added when they are implemented.
+
+define i32 @mulzu_16_32(i16 %A) {
+; ZU-LABEL: mulzu_16_32:
+; ZU:       # %bb.0:
+; ZU-NEXT:    imulzuw $1234, %di, %ax # imm = 0x4D2
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mulzu_16_32:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
+; NOZU-NEXT:    movzwl %ax, %eax
+; NOZU-NEXT:    retq
+    %mul = mul i16 %A, 1234
+    %r = zext i16 %mul to i32
+    ret i32 %r
+}
+
+define i64 @mulzu_16_64(i16 %A) {
+; ZU-LABEL: mulzu_16_64:
+; ZU:       # %bb.0:
+; ZU-NEXT:    imulzuw $1234, %di, %ax # imm = 0x4D2
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mulzu_16_64:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
+; NOZU-NEXT:    movzwl %ax, %eax
+; NOZU-NEXT:    retq
+    %mul = mul i16 %A, 1234
+    %r = zext i16 %mul to i64
+    ret i64 %r
+}
+
+define i32 @mulzu_16_32_mem(ptr %P) {
+; ZU-LABEL: mulzu_16_32_mem:
+; ZU:       # %bb.0:
+; ZU-NEXT:    imulzuw $1234, (%rdi), %ax # imm = 0x4D2
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mulzu_16_32_mem:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    movzwl (%rdi), %eax
+; NOZU-NEXT:    imull $1234, %eax, %eax # imm = 0x4D2
+; NOZU-NEXT:    movzwl %ax, %eax
+; NOZU-NEXT:    retq
+    %gep = getelementptr i16, ptr %P, i64 0
+    %A = load i16, ptr %gep
+    %mul = mul i16 %A, 1234
+    %r = zext i16 %mul to i32
+    ret i32 %r
+}
+
+define i64 @mulzu_16_64_mem(ptr %P) {
+; ZU-LABEL: mulzu_16_64_mem:
+; ZU:       # %bb.0:
+; ZU-NEXT:    imulzuw $1234, (%rdi), %ax # imm = 0x4D2
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mulzu_16_64_mem:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    movzwl (%rdi), %eax
+; NOZU-NEXT:    imull $1234, %eax, %eax # imm = 0x4D2
+; NOZU-NEXT:    movzwl %ax, %eax
+; NOZU-NEXT:    retq
+    %gep = getelementptr i16, ptr %P, i64 0
+    %A = load i16, ptr %gep
+    %mul = mul i16 %A, 1234
+    %r = zext i16 %mul to i64
+    ret i64 %r
+}
+
+define void @mulzu_16_store(i16 %A, ptr %R) {
+; ZU-LABEL: mulzu_16_store:
+; ZU:       # %bb.0:
+; ZU-NEXT:    imulzuw $1234, %di, %ax # imm = 0x4D2
+; ZU-NEXT:    movw %ax, (%rsi)
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mulzu_16_store:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
+; NOZU-NEXT:    movw %ax, (%rsi)
+; NOZU-NEXT:    retq
+    %gep = getelementptr i16, ptr %R, i64 0
+    %mul = mul i16 %A, 1234
+    store i16 %mul, ptr %gep
+    ret void
+}
+
+define void @mulzu_16_store_mem(ptr %P, ptr %R) {
+; ZU-LABEL: mulzu_16_store_mem:
+; ZU:       # %bb.0:
+; ZU-NEXT:    imulzuw $1234, (%rdi), %ax # imm = 0x4D2
+; ZU-NEXT:    movw %ax, (%rsi)
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mulzu_16_store_mem:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    movzwl (%rdi), %eax
+; NOZU-NEXT:    imull $1234, %eax, %eax # imm = 0x4D2
+; NOZU-NEXT:    movw %ax, (%rsi)
+; NOZU-NEXT:    retq
+    %gep = getelementptr i16, ptr %P, i64 0
+    %gep1 = getelementptr i16, ptr %R, i64 0
+    %A = load i16, ptr %gep
+    %mul = mul i16 %A, 1234
+    store i16 %mul, ptr %gep1
+    ret void
+}
+
+; Tests ported from test/CodeGen/X86/imul.ll follow from this point.
+
+define i16 @mul4_16(i16 %A) {
+;
+; ZU-LABEL: mul4_16:
+; ZU:       # %bb.0:
+; ZU-NEXT:    # kill: def $edi killed $edi def $rdi
+; ZU-NEXT:    leal (,%rdi,4), %eax
+; ZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mul4_16:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    # kill: def $edi killed $edi def $rdi
+; NOZU-NEXT:    leal (,%rdi,4), %eax
+; NOZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; NOZU-NEXT:    retq
+    %mul = mul i16 %A, 4
+    ret i16 %mul
+}
+
+define i16 @mul4096_16(i16 %A) {
+;
+; ZU-LABEL: mul4096_16:
+; ZU:       # %bb.0:
+; ZU-NEXT:    movl %edi, %eax
+; ZU-NEXT:    shll $12, %eax
+; ZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mul4096_16:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    movl %edi, %eax
+; NOZU-NEXT:    shll $12, %eax
+; NOZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; NOZU-NEXT:    retq
+    %mul = mul i16 %A, 4096
+    ret i16 %mul
+}
+
+define i16 @mulmin4096_16(i16 %A) {
+;
+; ZU-LABEL: mulmin4096_16:
+; ZU:       # %bb.0:
+; ZU-NEXT:    movl %edi, %eax
+; ZU-NEXT:    shll $12, %eax
+; ZU-NEXT:    negl %eax
+; ZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mulmin4096_16:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    movl %edi, %eax
+; NOZU-NEXT:    shll $12, %eax
+; NOZU-NEXT:    negl %eax
+; NOZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; NOZU-NEXT:    retq
+    %mul = mul i16 %A, -4096
+    ret i16 %mul
+}
+
+define i16 @mul4_16_minsize(i16 %A) minsize {
+;
+; ZU-LABEL: mul4_16_minsize:
+; ZU:       # %bb.0:
+; ZU-NEXT:    # kill: def $edi killed $edi def $rdi
+; ZU-NEXT:    leal (,%rdi,4), %eax
+; ZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mul4_16_minsize:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    # kill: def $edi killed $edi def $rdi
+; NOZU-NEXT:    leal (,%rdi,4), %eax
+; NOZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; NOZU-NEXT:    retq
+    %mul = mul i16 %A, 4
+    ret i16 %mul
+}
+
+define i16 @mul0_16(i16 %A) {
+;
+; ZU-LABEL: mul0_16:
+; ZU:       # %bb.0:
+; ZU-NEXT:    xorl %eax, %eax
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mul0_16:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    xorl %eax, %eax
+; NOZU-NEXT:    retq
+    %mul = mul i16 %A, 0
+    ret i16 %mul
+}
+
+define i16 @mul4294967295_16(i16 %A) {
+;
+; ZU-LABEL: mul4294967295_16:
+; ZU:       # %bb.0:
+; ZU-NEXT:    movl %edi, %eax
+; ZU-NEXT:    negl %eax
+; ZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mul4294967295_16:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    movl %edi, %eax
+; NOZU-NEXT:    negl %eax
+; NOZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; NOZU-NEXT:    retq
+    %mul = mul i16 %A, 4294967295
+    ret i16 %mul
+}

>From 8b3753a9e9be8835f5f5689e261db94356e40f5a Mon Sep 17 00:00:00 2001
From: "Zabawa, Daniel" <daniel.zabawa at intel.com>
Date: Thu, 21 Nov 2024 08:39:16 -0800
Subject: [PATCH 2/3] Remove imulzu usage outside of zero extends and simplify
 code.

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 24 +++++--
 llvm/lib/Target/X86/X86InstrCompiler.td | 24 ++-----
 llvm/test/CodeGen/X86/apx/imulzu.ll     | 91 +++++++++++++++++++++----
 3 files changed, 98 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7476c268e5acbc..ebdb07609ae2e7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58881,6 +58881,15 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
     return Ld->getBasePtr() == St->getBasePtr();
   };
 
+  auto IsFoldableZext = [](SDValue Op) {
+    if (!Op.hasOneUse())
+      return false;
+    SDNode *User = *Op->use_begin();
+    EVT VT = User->getValueType(0);
+    return (User->getOpcode() == ISD::ZERO_EXTEND &&
+            (VT == MVT::i32 || VT == MVT::i64));
+  };
+
   bool Commute = false;
   switch (Op.getOpcode()) {
   default: return false;
@@ -58897,8 +58906,15 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
       return false;
     break;
   }
-  case ISD::ADD:
   case ISD::MUL:
+    // When ZU is enabled, we prefer to not promote for MUL by a constant
+    // when there is an opportunity to fold a zext with imulzu.
+    if (Subtarget.hasZU() && IsFoldableZext(Op) &&
+        (isa<ConstantSDNode>(Op.getOperand(0)) ||
+         isa<ConstantSDNode>(Op.getOperand(1))))
+      return false;
+    [[fallthrough]];
+  case ISD::ADD:
   case ISD::AND:
   case ISD::OR:
   case ISD::XOR:
@@ -58919,12 +58935,6 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
     if (IsFoldableAtomicRMW(N0, Op) ||
         (Commute && IsFoldableAtomicRMW(N1, Op)))
       return false;
-    // When ZU is enabled, we prefer to not promote for MUL by a constant,
-    // since a 16b imulzu will not incur partial-write stalls, and may be
-    // able to fold away a zero-extend of the 16b result.
-    if (Subtarget.hasZU() && Op.getOpcode() == ISD::MUL &&
-        (isa<ConstantSDNode>(N0) || isa<ConstantSDNode>(N1)))
-      return false;
   }
   }
 
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 885c9e98fce4a5..c56329db3e1464 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -2194,33 +2194,19 @@ let Predicates = [HasZU] in {
             (SUBREG_TO_REG (i64 0), (IMULZU16rri GR16:$src1, imm:$src2), sub_16bit)>;
   def : Pat<(i64 (zext (i16 (mul (loadi16 addr:$src1), imm:$src2)))),
             (SUBREG_TO_REG (i64 0), (IMULZU16rmi addr:$src1, imm:$src2), sub_16bit)>;
-  
-  // (mul (reg/mem), imm) -> imulzu
-  // Note this pattern doesn't explicitly require the zero-upper behaviour of imulzu,
-  // but instead avoids the zero-extend of the reg/mem operand that would be 
-  // required if the multiply were promoted to 32b to avoid partial-write stalls.
-  // The imulzu here simply doesn't incur any partial-write stalls.
-  def : Pat<(mul GR16:$src1, imm:$src2),
-            (IMULZU16rri GR16:$src1, imm:$src2)>;
-  def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
-            (IMULZU16rmi addr:$src1, imm:$src2)>;
 }
-
+  
 // mul reg, imm
-let Predicates = [NoZU] in {
-  def : Pat<(mul GR16:$src1, imm:$src2),
-            (IMUL16rri GR16:$src1, imm:$src2)>;
-}
+def : Pat<(mul GR16:$src1, imm:$src2),
+          (IMUL16rri GR16:$src1, imm:$src2)>;
 def : Pat<(mul GR32:$src1, imm:$src2),
           (IMUL32rri GR32:$src1, imm:$src2)>;
 def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
           (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
 
 // reg = mul mem, imm
-let Predicates = [NoZU] in {
-  def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
-            (IMUL16rmi addr:$src1, imm:$src2)>;
-}
+def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
+          (IMUL16rmi addr:$src1, imm:$src2)>;
 def : Pat<(mul (loadi32 addr:$src1), imm:$src2),
           (IMUL32rmi addr:$src1, imm:$src2)>;
 def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
diff --git a/llvm/test/CodeGen/X86/apx/imulzu.ll b/llvm/test/CodeGen/X86/apx/imulzu.ll
index 5b598ebf35ab89..766993fdc5ba7d 100644
--- a/llvm/test/CodeGen/X86/apx/imulzu.ll
+++ b/llvm/test/CodeGen/X86/apx/imulzu.ll
@@ -3,9 +3,9 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu | FileCheck %s --check-prefix=NOZU
 
 ; Test generation of 16b imulzu when -mattr=+zu is specified.
-; The mulzu_* tests check for basic generation, which will fold away a zero-extend of the
-; result if present.
-; The following tests are modifications of selected test/CodeGen/X86/imul.ll tests with
+; The mulzu_* tests check for basic generation, which is limited to cases where a
+; zero-extend of the result can be folded into imulzu.
+; The remaining tests are modifications of selected test/CodeGen/X86/imul.ll tests with
 ; 16b multiplies, to check that common strength reductions in ISel are still performed
 ; when -mattr=+zu is in effect.
 ;
@@ -84,10 +84,14 @@ define i64 @mulzu_16_64_mem(ptr %P) {
     ret i64 %r
 }
 
+; The following mulzu cases check that imulzu is not
+; generated in the absence of a single zext user. The ZU/NOZU
+; cases should match.
+
 define void @mulzu_16_store(i16 %A, ptr %R) {
 ; ZU-LABEL: mulzu_16_store:
 ; ZU:       # %bb.0:
-; ZU-NEXT:    imulzuw $1234, %di, %ax # imm = 0x4D2
+; ZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
 ; ZU-NEXT:    movw %ax, (%rsi)
 ; ZU-NEXT:    retq
 ;
@@ -102,28 +106,85 @@ define void @mulzu_16_store(i16 %A, ptr %R) {
     ret void
 }
 
-define void @mulzu_16_store_mem(ptr %P, ptr %R) {
-; ZU-LABEL: mulzu_16_store_mem:
+define i32 @mulzu_16_store_32(i16 %A, ptr %R) {
+; ZU-LABEL: mulzu_16_store_32:
 ; ZU:       # %bb.0:
-; ZU-NEXT:    imulzuw $1234, (%rdi), %ax # imm = 0x4D2
+; ZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
 ; ZU-NEXT:    movw %ax, (%rsi)
+; ZU-NEXT:    movzwl %ax, %eax
 ; ZU-NEXT:    retq
 ;
-; NOZU-LABEL: mulzu_16_store_mem:
+; NOZU-LABEL: mulzu_16_store_32:
 ; NOZU:       # %bb.0:
-; NOZU-NEXT:    movzwl (%rdi), %eax
-; NOZU-NEXT:    imull $1234, %eax, %eax # imm = 0x4D2
+; NOZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
 ; NOZU-NEXT:    movw %ax, (%rsi)
+; NOZU-NEXT:    movzwl %ax, %eax
 ; NOZU-NEXT:    retq
-    %gep = getelementptr i16, ptr %P, i64 0
-    %gep1 = getelementptr i16, ptr %R, i64 0
-    %A = load i16, ptr %gep
+    %gep = getelementptr i16, ptr %R, i64 0
     %mul = mul i16 %A, 1234
-    store i16 %mul, ptr %gep1
-    ret void
+    store i16 %mul, ptr %gep
+    %r = zext i16 %mul to i32
+    ret i32 %r
+}
+
+define i64 @mulzu_16_store_64(i16 %A, ptr %R) {
+; ZU-LABEL: mulzu_16_store_64:
+; ZU:       # %bb.0:
+; ZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
+; ZU-NEXT:    movw %ax, (%rsi)
+; ZU-NEXT:    movzwl %ax, %eax
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mulzu_16_store_64:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
+; NOZU-NEXT:    movw %ax, (%rsi)
+; NOZU-NEXT:    movzwl %ax, %eax
+; NOZU-NEXT:    retq
+    %gep = getelementptr i16, ptr %R, i64 0
+    %mul = mul i16 %A, 1234
+    store i16 %mul, ptr %gep
+    %r = zext i16 %mul to i64
+    ret i64 %r
+}
+
+define i32 @mulzu_sext_16_32(i16 %A) {
+; ZU-LABEL: mulzu_sext_16_32:
+; ZU:       # %bb.0:
+; ZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
+; ZU-NEXT:    cwtl
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mulzu_sext_16_32:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
+; NOZU-NEXT:    cwtl
+; NOZU-NEXT:    retq
+    %mul = mul i16 %A, 1234
+    %r = sext i16 %mul to i32
+    ret i32 %r
+}
+
+define i64 @mulzu_sext_16_64(i16 %A) {
+; ZU-LABEL: mulzu_sext_16_64:
+; ZU:       # %bb.0:
+; ZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
+; ZU-NEXT:    movswq %ax, %rax
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mulzu_sext_16_64:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
+; NOZU-NEXT:    movswq %ax, %rax
+; NOZU-NEXT:    retq
+    %mul = mul i16 %A, 1234
+    %r = sext i16 %mul to i64
+    ret i64 %r
 }
 
 ; Tests ported from test/CodeGen/X86/imul.ll follow from this point.
+; The generated code, which strength-reduces multiplies by certain
+; constants, should be unaffected by enabling zu.
 
 define i16 @mul4_16(i16 %A) {
 ;

>From 9c3615f47d9c7fb601ae33a8519d4cfb1df104af Mon Sep 17 00:00:00 2001
From: "Zabawa, Daniel" <daniel.zabawa at intel.com>
Date: Mon, 25 Nov 2024 07:58:24 -0800
Subject: [PATCH 3/3] Remove unused predicate and simplify lit test

---
 llvm/lib/Target/X86/X86InstrPredicates.td |   1 -
 llvm/test/CodeGen/X86/apx/imulzu.ll       | 201 +++++++---------------
 2 files changed, 64 insertions(+), 138 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 02cb4556ba13ba..f8a03ec7223917 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -46,7 +46,6 @@ def NoEGPR       : Predicate<"!Subtarget->hasEGPR()">;
 def HasNDD       : Predicate<"Subtarget->hasNDD()">;
 def NoNDD        : Predicate<"!Subtarget->hasNDD()">;
 def HasZU        : Predicate<"Subtarget->hasZU()">;
-def NoZU         : Predicate<"!Subtarget->hasZU()">;
 def HasCF        : Predicate<"Subtarget->hasCF()">;
 def HasCMOV      : Predicate<"Subtarget->canUseCMOV()">;
 def NoCMOV       : Predicate<"!Subtarget->canUseCMOV()">;
diff --git a/llvm/test/CodeGen/X86/apx/imulzu.ll b/llvm/test/CodeGen/X86/apx/imulzu.ll
index 766993fdc5ba7d..9a4a63750a1dbf 100644
--- a/llvm/test/CodeGen/X86/apx/imulzu.ll
+++ b/llvm/test/CodeGen/X86/apx/imulzu.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mattr=+zu | FileCheck %s --check-prefix=ZU
-; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu | FileCheck %s --check-prefix=NOZU
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mattr=+zu | FileCheck %s --check-prefixes=CHECK,ZU
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu | FileCheck %s --check-prefixes=CHECK,NOZU
 
 ; Test generation of 16b imulzu when -mattr=+zu is specified.
 ; The mulzu_* tests check for basic generation, which is limited to cases where a
@@ -89,17 +89,11 @@ define i64 @mulzu_16_64_mem(ptr %P) {
 ; cases should match.
 
 define void @mulzu_16_store(i16 %A, ptr %R) {
-; ZU-LABEL: mulzu_16_store:
-; ZU:       # %bb.0:
-; ZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
-; ZU-NEXT:    movw %ax, (%rsi)
-; ZU-NEXT:    retq
-;
-; NOZU-LABEL: mulzu_16_store:
-; NOZU:       # %bb.0:
-; NOZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
-; NOZU-NEXT:    movw %ax, (%rsi)
-; NOZU-NEXT:    retq
+; CHECK-LABEL: mulzu_16_store:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
+; CHECK-NEXT:    movw %ax, (%rsi)
+; CHECK-NEXT:    retq
     %gep = getelementptr i16, ptr %R, i64 0
     %mul = mul i16 %A, 1234
     store i16 %mul, ptr %gep
@@ -107,19 +101,12 @@ define void @mulzu_16_store(i16 %A, ptr %R) {
 }
 
 define i32 @mulzu_16_store_32(i16 %A, ptr %R) {
-; ZU-LABEL: mulzu_16_store_32:
-; ZU:       # %bb.0:
-; ZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
-; ZU-NEXT:    movw %ax, (%rsi)
-; ZU-NEXT:    movzwl %ax, %eax
-; ZU-NEXT:    retq
-;
-; NOZU-LABEL: mulzu_16_store_32:
-; NOZU:       # %bb.0:
-; NOZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
-; NOZU-NEXT:    movw %ax, (%rsi)
-; NOZU-NEXT:    movzwl %ax, %eax
-; NOZU-NEXT:    retq
+; CHECK-LABEL: mulzu_16_store_32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
+; CHECK-NEXT:    movw %ax, (%rsi)
+; CHECK-NEXT:    movzwl %ax, %eax
+; CHECK-NEXT:    retq
     %gep = getelementptr i16, ptr %R, i64 0
     %mul = mul i16 %A, 1234
     store i16 %mul, ptr %gep
@@ -128,19 +115,12 @@ define i32 @mulzu_16_store_32(i16 %A, ptr %R) {
 }
 
 define i64 @mulzu_16_store_64(i16 %A, ptr %R) {
-; ZU-LABEL: mulzu_16_store_64:
-; ZU:       # %bb.0:
-; ZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
-; ZU-NEXT:    movw %ax, (%rsi)
-; ZU-NEXT:    movzwl %ax, %eax
-; ZU-NEXT:    retq
-;
-; NOZU-LABEL: mulzu_16_store_64:
-; NOZU:       # %bb.0:
-; NOZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
-; NOZU-NEXT:    movw %ax, (%rsi)
-; NOZU-NEXT:    movzwl %ax, %eax
-; NOZU-NEXT:    retq
+; CHECK-LABEL: mulzu_16_store_64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
+; CHECK-NEXT:    movw %ax, (%rsi)
+; CHECK-NEXT:    movzwl %ax, %eax
+; CHECK-NEXT:    retq
     %gep = getelementptr i16, ptr %R, i64 0
     %mul = mul i16 %A, 1234
     store i16 %mul, ptr %gep
@@ -149,34 +129,22 @@ define i64 @mulzu_16_store_64(i16 %A, ptr %R) {
 }
 
 define i32 @mulzu_sext_16_32(i16 %A) {
-; ZU-LABEL: mulzu_sext_16_32:
-; ZU:       # %bb.0:
-; ZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
-; ZU-NEXT:    cwtl
-; ZU-NEXT:    retq
-;
-; NOZU-LABEL: mulzu_sext_16_32:
-; NOZU:       # %bb.0:
-; NOZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
-; NOZU-NEXT:    cwtl
-; NOZU-NEXT:    retq
+; CHECK-LABEL: mulzu_sext_16_32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
+; CHECK-NEXT:    cwtl
+; CHECK-NEXT:    retq
     %mul = mul i16 %A, 1234
     %r = sext i16 %mul to i32
     ret i32 %r
 }
 
 define i64 @mulzu_sext_16_64(i16 %A) {
-; ZU-LABEL: mulzu_sext_16_64:
-; ZU:       # %bb.0:
-; ZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
-; ZU-NEXT:    movswq %ax, %rax
-; ZU-NEXT:    retq
-;
-; NOZU-LABEL: mulzu_sext_16_64:
-; NOZU:       # %bb.0:
-; NOZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
-; NOZU-NEXT:    movswq %ax, %rax
-; NOZU-NEXT:    retq
+; CHECK-LABEL: mulzu_sext_16_64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
+; CHECK-NEXT:    movswq %ax, %rax
+; CHECK-NEXT:    retq
     %mul = mul i16 %A, 1234
     %r = sext i16 %mul to i64
     ret i64 %r
@@ -188,112 +156,71 @@ define i64 @mulzu_sext_16_64(i16 %A) {
 
 define i16 @mul4_16(i16 %A) {
 ;
-; ZU-LABEL: mul4_16:
-; ZU:       # %bb.0:
-; ZU-NEXT:    # kill: def $edi killed $edi def $rdi
-; ZU-NEXT:    leal (,%rdi,4), %eax
-; ZU-NEXT:    # kill: def $ax killed $ax killed $eax
-; ZU-NEXT:    retq
-;
-; NOZU-LABEL: mul4_16:
-; NOZU:       # %bb.0:
-; NOZU-NEXT:    # kill: def $edi killed $edi def $rdi
-; NOZU-NEXT:    leal (,%rdi,4), %eax
-; NOZU-NEXT:    # kill: def $ax killed $ax killed $eax
-; NOZU-NEXT:    retq
+; CHECK-LABEL: mul4_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    leal (,%rdi,4), %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    retq
     %mul = mul i16 %A, 4
     ret i16 %mul
 }
 
 define i16 @mul4096_16(i16 %A) {
 ;
-; ZU-LABEL: mul4096_16:
-; ZU:       # %bb.0:
-; ZU-NEXT:    movl %edi, %eax
-; ZU-NEXT:    shll $12, %eax
-; ZU-NEXT:    # kill: def $ax killed $ax killed $eax
-; ZU-NEXT:    retq
-;
-; NOZU-LABEL: mul4096_16:
-; NOZU:       # %bb.0:
-; NOZU-NEXT:    movl %edi, %eax
-; NOZU-NEXT:    shll $12, %eax
-; NOZU-NEXT:    # kill: def $ax killed $ax killed $eax
-; NOZU-NEXT:    retq
+; CHECK-LABEL: mul4096_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shll $12, %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    retq
     %mul = mul i16 %A, 4096
     ret i16 %mul
 }
 
 define i16 @mulmin4096_16(i16 %A) {
 ;
-; ZU-LABEL: mulmin4096_16:
-; ZU:       # %bb.0:
-; ZU-NEXT:    movl %edi, %eax
-; ZU-NEXT:    shll $12, %eax
-; ZU-NEXT:    negl %eax
-; ZU-NEXT:    # kill: def $ax killed $ax killed $eax
-; ZU-NEXT:    retq
-;
-; NOZU-LABEL: mulmin4096_16:
-; NOZU:       # %bb.0:
-; NOZU-NEXT:    movl %edi, %eax
-; NOZU-NEXT:    shll $12, %eax
-; NOZU-NEXT:    negl %eax
-; NOZU-NEXT:    # kill: def $ax killed $ax killed $eax
-; NOZU-NEXT:    retq
+; CHECK-LABEL: mulmin4096_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shll $12, %eax
+; CHECK-NEXT:    negl %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    retq
     %mul = mul i16 %A, -4096
     ret i16 %mul
 }
 
 define i16 @mul4_16_minsize(i16 %A) minsize {
 ;
-; ZU-LABEL: mul4_16_minsize:
-; ZU:       # %bb.0:
-; ZU-NEXT:    # kill: def $edi killed $edi def $rdi
-; ZU-NEXT:    leal (,%rdi,4), %eax
-; ZU-NEXT:    # kill: def $ax killed $ax killed $eax
-; ZU-NEXT:    retq
-;
-; NOZU-LABEL: mul4_16_minsize:
-; NOZU:       # %bb.0:
-; NOZU-NEXT:    # kill: def $edi killed $edi def $rdi
-; NOZU-NEXT:    leal (,%rdi,4), %eax
-; NOZU-NEXT:    # kill: def $ax killed $ax killed $eax
-; NOZU-NEXT:    retq
+; CHECK-LABEL: mul4_16_minsize:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    leal (,%rdi,4), %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    retq
     %mul = mul i16 %A, 4
     ret i16 %mul
 }
 
 define i16 @mul0_16(i16 %A) {
 ;
-; ZU-LABEL: mul0_16:
-; ZU:       # %bb.0:
-; ZU-NEXT:    xorl %eax, %eax
-; ZU-NEXT:    retq
-;
-; NOZU-LABEL: mul0_16:
-; NOZU:       # %bb.0:
-; NOZU-NEXT:    xorl %eax, %eax
-; NOZU-NEXT:    retq
+; CHECK-LABEL: mul0_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    retq
     %mul = mul i16 %A, 0
     ret i16 %mul
 }
 
 define i16 @mul4294967295_16(i16 %A) {
 ;
-; ZU-LABEL: mul4294967295_16:
-; ZU:       # %bb.0:
-; ZU-NEXT:    movl %edi, %eax
-; ZU-NEXT:    negl %eax
-; ZU-NEXT:    # kill: def $ax killed $ax killed $eax
-; ZU-NEXT:    retq
-;
-; NOZU-LABEL: mul4294967295_16:
-; NOZU:       # %bb.0:
-; NOZU-NEXT:    movl %edi, %eax
-; NOZU-NEXT:    negl %eax
-; NOZU-NEXT:    # kill: def $ax killed $ax killed $eax
-; NOZU-NEXT:    retq
+; CHECK-LABEL: mul4294967295_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    negl %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    retq
     %mul = mul i16 %A, 4294967295
     ret i16 %mul
 }



More information about the llvm-commits mailing list