[llvm] c3c24be - [AArch64] Split zero cycle zeoring per register class (#154561)

Thu Aug 28 05:57:11 PDT 2025

Author: Tomer Shafir
Date: 2025-08-28T15:57:06+03:00
New Revision: c3c24be13f7928460ca1e2fe613a1146c868854e

URL: https://github.com/llvm/llvm-project/commit/c3c24be13f7928460ca1e2fe613a1146c868854e
DIFF: https://github.com/llvm/llvm-project/commit/c3c24be13f7928460ca1e2fe613a1146c868854e.diff

LOG: [AArch64] Split zero cycle zeoring per register class (#154561)

This change improves LLVM's model accuracy by splitting AArch64
subtarget features of zero cycle zeroing per register class. This aligns
with how uarch is designed (each register bank has unique capabilities).
Similarly to how we improved ZCM modeling.

It splits `HasZeroCycleZeroingGP` to `HasZeroCycleZeroingGPR32` and
`HasZeroCycleZeroingGPR64`, removes opaque `FeatureZCZeroing`, and
infers `FeatureNoZCZeroingFP` to be `FeatureNoZCZeroingFPR64` based on
the single usage in `AArch64AsmPrinter.cpp`.

It also splits `arm64-zero-cycle-zeroing.ll` into 2 tests one `-gpr` and
one `-fpr`, similarly to ZCM, to make the tests more focused and
managable in correspondance with the new modeling.

The test cases are updated as well, exlpoiting the fact that this is a
refactor patch:

- remove redundant functions that just mix isolated ones (t1-4)
- specialize check prefixes
- replace `apple-a10` with `apple-m1`
- add a `-mtriple=arm64-apple-macosx -mcpu=generic` test case for GPR
- isolate `mtriple=arm64-apple-ios -mcpu=cyclone` FP workaround test
case and move `-fullfp16` to another non-workaround test case

Added: 
    llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll
    llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
    llvm/lib/Target/AArch64/AArch64Features.td
    llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
    llvm/lib/Target/AArch64/AArch64Processors.td
    llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir
    llvm/test/CodeGen/AArch64/f16-imm.ll

Removed: 
    llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 80efd01391556..da344305f39d9 100644

--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -1829,8 +1829,8 @@ void AArch64AsmPrinter::emitMOVK(Register Dest, uint64_t Imm, unsigned Shift) {
 
 void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) {
   Register DestReg = MI.getOperand(0).getReg();
-  if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround() &&
-      STI->isNeonAvailable()) {
+  if (STI->hasZeroCycleZeroingFPR64() &&
+      !STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
     // Convert H/S register to corresponding D register
     if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
       DestReg = AArch64::D0 + (DestReg - AArch64::H0);

diff  --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 55aea17d29f55..00cf039096d32 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -630,19 +630,18 @@ def FeatureZCRegMoveFPR64 : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFP
 def FeatureZCRegMoveFPR32 : SubtargetFeature<"zcm-fpr32", "HasZeroCycleRegMoveFPR32", "true",
                                         "Has zero-cycle register moves for FPR32 registers">;
 
-def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
-                                        "Has zero-cycle zeroing instructions for generic registers">;
+def FeatureZCZeroingGPR64 : SubtargetFeature<"zcz-gpr64", "HasZeroCycleZeroingGPR64", "true",
+                                        "Has zero-cycle zeroing instructions for GPR64 registers">;
+
+def FeatureZCZeroingGPR32 : SubtargetFeature<"zcz-gpr32", "HasZeroCycleZeroingGPR32", "true",
+                                        "Has zero-cycle zeroing instructions for GPR32 registers">;
 
 // It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0".
 // as movi is more efficient across all cores. Newer cores can eliminate
 // fmovs early and there is no 
diff erence with movi, but this not true for
 // all implementations.
-def FeatureNoZCZeroingFP : SubtargetFeature<"no-zcz-fp", "HasZeroCycleZeroingFP", "false",
-                                        "Has no zero-cycle zeroing instructions for FP registers">;
-
-def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
-                                        "Has zero-cycle zeroing instructions",
-                                        [FeatureZCZeroingGP]>;
+def FeatureNoZCZeroingFPR64 : SubtargetFeature<"no-zcz-fpr64", "HasZeroCycleZeroingFPR64", "false",
+                                        "Has no zero-cycle zeroing instructions for FPR64 registers">;
 
 /// ... but the floating-point version doesn't quite work in rare cases on older
 /// CPUs.

diff  --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 1aa180688dcdd..68f708c25a241 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5075,7 +5075,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
             .addImm(0)
             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
       }
-    } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
+    } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGPR32()) {
       BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
           .addImm(0)
           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
@@ -5202,7 +5202,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
           .addReg(SrcReg, getKillRegState(KillSrc))
           .addImm(0)
           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
-    } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
+    } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGPR64()) {
       BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
           .addImm(0)
           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));

diff  --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index b7e08dbe7c792..e4a7d83e7f807 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -322,7 +322,8 @@ def TuneAppleA7  : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
                                     FeatureZCRegMoveFPR128,
-                                    FeatureZCZeroing,
+                                    FeatureZCZeroingGPR32,
+                                    FeatureZCZeroingGPR64,
                                     FeatureZCZeroingFPWorkaround]>;
 
 def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
@@ -336,7 +337,8 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
                                     FeatureZCRegMoveFPR128,
-                                    FeatureZCZeroing]>;
+                                    FeatureZCZeroingGPR32,
+                                    FeatureZCZeroingGPR64]>;
 
 def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
                                     "Apple A11", [
@@ -349,7 +351,8 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
                                     FeatureZCRegMoveFPR128,
-                                    FeatureZCZeroing]>;
+                                    FeatureZCZeroingGPR32,
+                                    FeatureZCZeroingGPR64]>;
 
 def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
                                     "Apple A12", [
@@ -362,7 +365,8 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
                                     FeatureZCRegMoveFPR128,
-                                    FeatureZCZeroing]>;
+                                    FeatureZCZeroingGPR32,
+                                    FeatureZCZeroingGPR64]>;
 
 def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
                                     "Apple A13", [
@@ -375,7 +379,8 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
                                     FeatureZCRegMoveFPR128,
-                                    FeatureZCZeroing]>;
+                                    FeatureZCZeroingGPR32,
+                                    FeatureZCZeroingGPR64]>;
 
 def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
                                     "Apple A14", [
@@ -393,7 +398,8 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
                                     FeatureZCRegMoveFPR128,
-                                    FeatureZCZeroing]>;
+                                    FeatureZCZeroingGPR32,
+                                    FeatureZCZeroingGPR64]>;
 
 def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
                                     "Apple A15", [
@@ -411,7 +417,8 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
                                     FeatureZCRegMoveFPR128,
-                                    FeatureZCZeroing]>;
+                                    FeatureZCZeroingGPR32,
+                                    FeatureZCZeroingGPR64]>;
 
 def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
                                     "Apple A16", [
@@ -429,7 +436,8 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
                                     FeatureZCRegMoveFPR128,
-                                    FeatureZCZeroing]>;
+                                    FeatureZCZeroingGPR32,
+                                    FeatureZCZeroingGPR64]>;
 
 def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
                                     "Apple A17", [
@@ -447,7 +455,8 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
                                     FeatureZCRegMoveFPR128,
-                                    FeatureZCZeroing]>;
+                                    FeatureZCZeroingGPR32,
+                                    FeatureZCZeroingGPR64]>;
 
 def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
                                      "Apple M4", [
@@ -464,8 +473,8 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
                                      FeatureFuseLiterals,
                                      FeatureZCRegMoveGPR64,
                                      FeatureZCRegMoveFPR128,
-                                     FeatureZCZeroing
-                                     ]>;
+                                     FeatureZCZeroingGPR32,
+                                     FeatureZCZeroingGPR64]>;
 
 def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
                                     "Samsung Exynos-M3 processors",
@@ -497,13 +506,15 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
                                      FeatureStorePairSuppress,
                                      FeatureALULSLFast,
                                      FeaturePostRAScheduler,
-                                     FeatureZCZeroing]>;
+                                     FeatureZCZeroingGPR32,
+                                     FeatureZCZeroingGPR64]>;
 
 def TuneKryo    : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
                                    "Qualcomm Kryo processors", [
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
-                                   FeatureZCZeroing,
+                                   FeatureZCZeroingGPR32,
+                                   FeatureZCZeroingGPR64,
                                    FeatureALULSLFast,
                                    FeatureStorePairSuppress]>;
 
@@ -511,7 +522,8 @@ def TuneFalkor  : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
                                    "Qualcomm Falkor processors", [
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
-                                   FeatureZCZeroing,
+                                   FeatureZCZeroingGPR32,
+                                   FeatureZCZeroingGPR64,
                                    FeatureStorePairSuppress,
                                    FeatureALULSLFast,
                                    FeatureSlowSTRQro]>;
@@ -607,7 +619,8 @@ def TuneSaphira  : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
                                    "Qualcomm Saphira processors", [
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
-                                   FeatureZCZeroing,
+                                   FeatureZCZeroingGPR32,
+                                   FeatureZCZeroingGPR64,
                                    FeatureStorePairSuppress,
                                    FeatureALULSLFast]>;
 

diff  --git a/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir b/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir
index 76b5b76130657..284d624a4e68f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir
+++ b/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir
@@ -1,15 +1,15 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz" %s \
+# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
 # RUN:    | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz" %s \
+# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
 # RUN:    | FileCheck --check-prefix=CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz" %s \
+# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
 # RUN:    | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz" %s \
+# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
 # RUN:    | FileCheck --check-prefix=CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz" %s \
+# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \
 # RUN:    | FileCheck --check-prefix=CHECK-NO-ZCM-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz" %s \
+# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \
 # RUN:    | FileCheck --check-prefix=CHECK-ZCM-ZCZ %s
 
 --- |

diff  --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll
new file mode 100644
index 0000000000000..2a75976d58549
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll
@@ -0,0 +1,153 @@
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+fullfp16 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-FULLFP16
+; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
+; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,FP-WORKAROUND
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
+
+define half @tf16() {
+entry:
+; ALL-LABEL: tf16:
+; FP-WORKAROUND: mov s0, wzr
+; NOZCZ-FPR64: mov s0, wzr
+; NOZCZ-FPR64-FULLFP16: mov h0, wzr
+; ZCZ-FPR64: movi d0, #0
+  ret half 0.0
+}
+
+define float @tf32() {
+entry:
+; ALL-LABEL: tf32:
+; FP-WORKAROUND: mov s0, wzr
+; NOZCZ-FPR64: mov s0, wzr
+; ZCZ-FPR64: movi d0, #0
+  ret float 0.0
+}
+
+define double @td64() {
+entry:
+; ALL-LABEL: td64:
+; FP-WORKAROUND: mov d0, xzr
+; NOZCZ-FPR64: mov d0, xzr
+; ZCZ-FPR64: movi d0, #0
+  ret double 0.0
+}
+
+define <8 x i8> @tv8i8() {
+entry:
+; ALL-LABEL: tv8i8:
+; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
+; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+  ret <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+}
+
+define <4 x i16> @tv4i16() {
+entry:
+; ALL-LABEL: tv4i16:
+; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
+; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+  ret <4 x i16> <i16 0, i16 0, i16 0, i16 0>
+}
+
+define <2 x i32> @tv2i32() {
+entry:
+; ALL-LABEL: tv2i32:
+; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
+; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+  ret <2 x i32> <i32 0, i32 0>
+}
+
+define <2 x float> @tv2f32() {
+entry:
+; ALL-LABEL: tv2f32:
+; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
+; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+  ret <2 x float> <float 0.0, float 0.0>
+}
+
+define <16 x i8> @tv16i8() {
+entry:
+; ALL-LABEL: tv16i8:
+; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
+; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+  ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+}
+
+define <8 x i16> @tv8i16() {
+entry:
+; ALL-LABEL: tv8i16:
+; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
+; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+  ret <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+}
+
+define <4 x i32> @tv4i32() {
+entry:
+; ALL-LABEL: tv4i32:
+; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
+; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+  ret <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+}
+
+define <2 x i64> @tv2i64() {
+entry:
+; ALL-LABEL: tv2i64:
+; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
+; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+  ret <2 x i64> <i64 0, i64 0>
+}
+
+define <4 x float> @tv4f32() {
+entry:
+; ALL-LABEL: tv4f32:
+; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
+; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+  ret <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>
+}
+
+define <2 x double> @tv2d64() {
+entry:
+; ALL-LABEL: tv2d64:
+; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
+; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+  ret <2 x double> <double 0.0, double 0.0>
+}
+
+; We used to produce spills+reloads for a Q register with zero cycle zeroing
+; enabled.
+; ALL-LABEL: foo:
+; ALL-NOT: str q{{[0-9]+}}
+; ALL-NOT: ldr q{{[0-9]+}}
+define double @foo(i32 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %phi0 = phi double [ 1.0, %entry ], [ %v0, %for.body ]
+  %i.076 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %conv21 = sitofp i32 %i.076 to double
+  %call = tail call fast double @sin(double %conv21)
+  %cmp.i = fcmp fast olt double %phi0, %call
+  %v0 = select i1 %cmp.i, double %call, double %phi0
+  %inc = add nuw nsw i32 %i.076, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret double %v0
+}
+
+declare double @sin(double)

diff  --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll
new file mode 100644
index 0000000000000..dc643062d8697
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-gpr32 | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-gpr64 | FileCheck %s -check-prefixes=ALL,ZCZ-GPR64
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=generic | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR
+; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64
+
+define i8 @ti8() {
+entry:
+; ALL-LABEL: ti8:
+; NOZCZ-GPR: mov w0, wzr
+; ZCZ-GPR32: mov w0, #0
+  ret i8 0
+}
+
+define i16 @ti16() {
+entry:
+; ALL-LABEL: ti16:
+; NOZCZ-GPR: mov w0, wzr
+; ZCZ-GPR32: mov w0, #0
+  ret i16 0
+}
+
+define i32 @ti32() {
+entry:
+; ALL-LABEL: ti32:
+; NOZCZ-GPR: mov w0, wzr
+; ZCZ-GPR32: mov w0, #0
+  ret i32 0
+}
+
+define i64 @ti64() {
+entry:
+; ALL-LABEL: ti64:
+; NOZCZ-GPR: mov x0, xzr
+; ZCZ-GPR64: mov x0, #0
+  ret i64 0
+}

diff  --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
deleted file mode 100644
index 6c3cd4766d799..0000000000000
--- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
+++ /dev/null
@@ -1,231 +0,0 @@
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=-zcz-gp,+no-zcz-fp      | FileCheck %s -check-prefixes=ALL,NONEGP,NONEFP
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz                    | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz -mattr=+fullfp16   | FileCheck %s -check-prefixes=ALL,ZEROGP,ZERO16
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-gp,+no-zcz-fp      | FileCheck %s -check-prefixes=ALL,ZEROGP,NONEFP
-; RUN: llc < %s -mtriple=aarch64-linux-gnu                                | FileCheck %s -check-prefixes=ALL,NONEGP,ZEROFP
-; RUN: llc < %s -mtriple=arm64-apple-ios   -mcpu=cyclone                  | FileCheck %s -check-prefixes=ALL,ZEROGP,NONEFP
-; RUN: llc < %s -mtriple=arm64-linux-gnu   -mcpu=apple-a10                | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP
-; RUN: llc < %s -mtriple=arm64-apple-ios   -mcpu=cyclone -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZEROGP,NONE16
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3                | FileCheck %s -check-prefixes=ALL,NONEGP,ZEROFP
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo                     | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor                   | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP
-
-declare void @bar(half, float, double, <2 x double>)
-declare void @bari(i32, i32)
-declare void @barl(i64, i64)
-declare void @barf(float, float)
-
-define void @t1() nounwind ssp {
-entry:
-; ALL-LABEL: t1:
-; ALL-NOT: fmov
-; NONEFP-DAG: fmov s0, wzr
-; NONEFP-DAG: fmov s1, wzr
-; NONEFP-DAG: fmov d2, xzr
-; NONEFP-DAG: movi{{(.16b)?}} v3{{(.2d)?}}, #0
-; NONE16: fmov h0, wzr
-; NONE16: fmov s1, wzr
-; NONE16: fmov d2, xzr
-; NONE16: movi{{(.16b)?}} v3{{(.2d)?}}, #0
-; ZEROFP-DAG: movi d0, #0
-; ZEROFP-DAG: movi d1, #0
-; ZEROFP-DAG: movi d2, #0
-; ZEROFP-DAG: movi v3.2d, #0
-; ZERO16: movi d0, #0
-; ZERO16: movi d1, #0
-; ZERO16: movi d2, #0
-; ZERO16: movi v3.2d, #0
-  tail call void @bar(half 0.000000e+00, float 0.000000e+00, double 0.000000e+00, <2 x double> <double 0.000000e+00, double 0.000000e+00>) nounwind
-  ret void
-}
-
-define void @t2() nounwind ssp {
-entry:
-; ALL-LABEL: t2:
-; NONEGP: mov w0, wzr
-; NONEGP: mov w1, wzr
-; ZEROGP: mov w0, #0
-; ZEROGP: mov w1, #0
-  tail call void @bari(i32 0, i32 0) nounwind
-  ret void
-}
-
-define void @t3() nounwind ssp {
-entry:
-; ALL-LABEL: t3:
-; NONEGP: mov x0, xzr
-; NONEGP: mov x1, xzr
-; ZEROGP: mov x0, #0
-; ZEROGP: mov x1, #0
-  tail call void @barl(i64 0, i64 0) nounwind
-  ret void
-}
-
-define void @t4() nounwind ssp {
-; ALL-LABEL: t4:
-; NONEFP: fmov s{{[0-3]+}}, wzr
-; NONEFP: fmov s{{[0-3]+}}, wzr
-; ZEROFP: movi d0, #0
-; ZEROFP: movi d1, #0
-  tail call void @barf(float 0.000000e+00, float 0.000000e+00) nounwind
-  ret void
-}
-
-declare double @sin(double)
-
-; We used to produce spills+reloads for a Q register with zero cycle zeroing
-; enabled.
-; ALL-LABEL: foo:
-; ALL-NOT: str q{{[0-9]+}}
-; ALL-NOT: ldr q{{[0-9]+}}
-define double @foo(i32 %n) {
-entry:
-  br label %for.body
-
-for.body:
-  %phi0 = phi double [ 1.0, %entry ], [ %v0, %for.body ]
-  %i.076 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %conv21 = sitofp i32 %i.076 to double
-  %call = tail call fast double @sin(double %conv21)
-  %cmp.i = fcmp fast olt double %phi0, %call
-  %v0 = select i1 %cmp.i, double %call, double %phi0
-  %inc = add nuw nsw i32 %i.076, 1
-  %cmp = icmp slt i32 %inc, %n
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:
-  ret double %v0
-}
-
-define <2 x i64> @t6() {
-; ALL-LABEL: t6:
-; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
-  ret <2 x i64> zeroinitializer
-}
-
-define i1 @ti1() {
-entry:
-; ALL-LABEL: ti1:
-; NONEGP: mov w0, wzr
-; ZEROGP: mov w0, #0
-  ret i1 false
-}
-
-define i8 @ti8() {
-entry:
-; ALL-LABEL: ti8:
-; NONEGP: mov w0, wzr
-; ZEROGP: mov w0, #0
-  ret i8 0
-}
-
-define i16 @ti16() {
-entry:
-; ALL-LABEL: ti16:
-; NONEGP: mov w0, wzr
- ; ZEROGP: mov w0, #0
-  ret i16 0
-}
-
-define i32 @ti32() {
-entry:
-; ALL-LABEL: ti32:
-; NONEGP: mov w0, wzr
-; ZEROGP: mov w0, #0
-  ret i32 0
-}
-
-define i64 @ti64() {
-entry:
-; ALL-LABEL: ti64:
-; NONEGP: mov x0, xzr
-; ZEROGP: mov x0, #0
-  ret i64 0
-}
-
-define float @tf32() {
-entry:
-; ALL-LABEL: tf32:
-; NONEFP: mov s0, wzr
-; ZEROFP: movi d0, #0
-  ret float 0.0
-}
-
-define double @td64() {
-entry:
-; ALL-LABEL: td64:
-; NONEFP: mov d0, xzr
-; ZEROFP: movi d0, #0
-  ret double 0.0
-}
-
-define <8 x i8> @tv8i8() {
-entry:
-; ALL-LABEL: tv8i8:
-; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
-  ret <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
-}
-
-define <4 x i16> @tv4i16() {
-entry:
-; ALL-LABEL: tv4i16:
-; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
-  ret <4 x i16> <i16 0, i16 0, i16 0, i16 0>
-}
-
-define <2 x i32> @tv2i32() {
-entry:
-; ALL-LABEL: tv2i32:
-; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
-  ret <2 x i32> <i32 0, i32 0>
-}
-
-define <2 x float> @tv2f32() {
-entry:
-; ALL-LABEL: tv2f32:
-; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
-  ret <2 x float> <float 0.0, float 0.0>
-}
-
-define <16 x i8> @tv16i8() {
-entry:
-; ALL-LABEL: tv16i8:
-; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
-  ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
-}
-
-define <8 x i16> @tv8i16() {
-entry:
-; ALL-LABEL: tv8i16:
-; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
-  ret <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
-}
-
-define <4 x i32> @tv4i32() {
-entry:
-; ALL-LABEL: tv4i32:
-; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
-  ret <4 x i32> <i32 0, i32 0, i32 0, i32 0>
-}
-
-define <2 x i64> @tv2i64() {
-entry:
-; ALL-LABEL: tv2i64:
-; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
-  ret <2 x i64> <i64 0, i64 0>
-}
-
-define <4 x float> @tv4f32() {
-entry:
-; ALL-LABEL: tv4f32:
-; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
-  ret <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>
-}
-
-define <2 x double> @tv2d64() {
-entry:
-; ALL-LABEL: tv2d64:
-; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
-  ret <2 x double> <double 0.0, double 0.0>
-}
-

diff  --git a/llvm/test/CodeGen/AArch64/f16-imm.ll b/llvm/test/CodeGen/AArch64/f16-imm.ll
index 58793bf19f3a6..68873f9b7c3de 100644
--- a/llvm/test/CodeGen/AArch64/f16-imm.ll
+++ b/llvm/test/CodeGen/AArch64/f16-imm.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+no-zcz-fp | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-NOZCZ
-; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+zcz | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-ZCZ
+; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+no-zcz-fpr64 | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-NOZCZ
+; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+zcz-gpr32,+zcz-gpr64 | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-ZCZ
 ; RUN: llc < %s -mtriple=aarch64 -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK-NOFP16
 
 define half @Const0() {