[llvm] 22c3ba4 - [SVE] Add patterns for shift intrinsics with FalseLanesZero mode

via llvm-commits llvm-commits at lists.llvm.org
Sat Mar 18 22:49:23 PDT 2023


Author: lizhijin
Date: 2023-03-19T13:49:01+08:00
New Revision: 22c3ba4bb519e12395c676ffe436ea4b8400234a

URL: https://github.com/llvm/llvm-project/commit/22c3ba4bb519e12395c676ffe436ea4b8400234a
DIFF: https://github.com/llvm/llvm-project/commit/22c3ba4bb519e12395c676ffe436ea4b8400234a.diff

LOG: [SVE] Add patterns for shift intrinsics with FalseLanesZero mode

This patch adds patterns that remove redundant mov and sel instructions
for shift intrinsics operating in FalseLanesZero mode, when
FeatureExperimentalZeroingPseudos is supported.

For example, before:

mov     z1.b, #0
sel     z0.b, p0, z0.b, z1.b
asr     z0.b, p0/m, z0.b, #7

After:

movprfx z0.b, p0/z, z0.b
asr     z0.b, p0/m, z0.b, #7
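
For reference, the new patterns match IR of the following shape (a minimal
sketch modeled on the asr_i8 case in the added test file, with the immediate
set to 7 to match the assembly above; the function name is illustrative).
The select against zeroinitializer expresses the FalseLanesZero semantics,
and the splatted constant supplies the immediate shift amount:

  define <vscale x 16 x i8> @asr_imm_zero(<vscale x 16 x i8> %a, <vscale x 16 x i1> %pg) {
    ; Inactive lanes are forced to zero before the shift.
    %vsel = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> zeroinitializer
    ; Splat of the shift amount, recognized by the SVEShiftImmR8 complex pattern.
    %ele = insertelement <vscale x 16 x i8> poison, i8 7, i32 0
    %splat = shufflevector <vscale x 16 x i8> %ele, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
    %res = call <vscale x 16 x i8> @llvm.aarch64.sve.asr.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %vsel, <vscale x 16 x i8> %splat)
    ret <vscale x 16 x i8> %res
  }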

Reviewed By: paulwalker-arm
Differential Revision: https://reviews.llvm.org/D145551

Added: 
    llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm-zero.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
    llvm/lib/Target/AArch64/SVEInstrFormats.td

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 278f74ef341f8..858b352c8c72e 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2060,6 +2060,10 @@ let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in {
   defm LSR_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsr>;
   defm LSL_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsl>;
   defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<AArch64asrd_m1>;
+
+  defm ASR_ZPZI  : sve_int_bin_pred_imm_zeroing_bhsd<int_aarch64_sve_asr, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>;
+  defm LSR_ZPZI  : sve_int_bin_pred_imm_zeroing_bhsd<int_aarch64_sve_lsr, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>;
+  defm LSL_ZPZI  : sve_int_bin_pred_imm_zeroing_bhsd<int_aarch64_sve_lsl, SVEShiftImmL8, SVEShiftImmL16, SVEShiftImmL32, SVEShiftImmL64>;
 } // End HasSVEorSME, UseExperimentalZeroingPseudos
 
 let Predicates = [HasSVEorSME] in {

diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 4c97ae88f192a..1d3bf9150ca41 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -571,6 +571,12 @@ class SVE_Shift_DupImm_Any_Predicate_Pat<ValueType vt, SDPatternOperator op,
 : Pat<(vt (op (pt (SVEAnyPredicate)), vt:$Rn, (vt (splat_vector (it (cast i32:$imm)))))),
       (inst $Rn, i32:$imm)>;
 
+class SVE_2_Op_Imm_Pat_Zero<ValueType vt, SDPatternOperator op, ValueType pt,
+                            ValueType it, ComplexPattern cpx, Instruction inst>
+: Pat<(vt (op pt:$Pg, (vselect pt:$Pg, vt:$Op1, (SVEDup0)),
+                      (vt (splat_vector (it (cpx i32:$imm)))))),
+      (inst $Pg, $Op1, i32:$imm)>;
+
 class SVE_2_Op_Fp_Imm_Pat<ValueType vt, SDPatternOperator op,
                           ValueType pt, ValueType it,
                           FPImmLeaf immL, int imm,
@@ -5894,6 +5900,20 @@ multiclass sve_int_bin_pred_zeroing_bhsd<SDPatternOperator op> {
   def : SVE_3_Op_Pat_SelZero<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _ZERO_D)>;
 }
 
+multiclass sve_int_bin_pred_imm_zeroing_bhsd<SDPatternOperator op,
+                                   ComplexPattern imm_b, ComplexPattern imm_h,
+                                   ComplexPattern imm_s, ComplexPattern imm_d> {
+  def _ZERO_B : PredTwoOpImmPseudo<NAME # _B, ZPR8,  Operand<i32>, FalseLanesZero>;
+  def _ZERO_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, Operand<i32>, FalseLanesZero>;
+  def _ZERO_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, Operand<i32>, FalseLanesZero>;
+  def _ZERO_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, Operand<i32>, FalseLanesZero>;
+
+  def : SVE_2_Op_Imm_Pat_Zero<nxv16i8, op, nxv16i1, i32, imm_b, !cast<Pseudo>(NAME # _ZERO_B)>;
+  def : SVE_2_Op_Imm_Pat_Zero<nxv8i16, op, nxv8i1,  i32, imm_h, !cast<Pseudo>(NAME # _ZERO_H)>;
+  def : SVE_2_Op_Imm_Pat_Zero<nxv4i32, op, nxv4i1,  i32, imm_s, !cast<Pseudo>(NAME # _ZERO_S)>;
+  def : SVE_2_Op_Imm_Pat_Zero<nxv2i64, op, nxv2i1,  i64, imm_d, !cast<Pseudo>(NAME # _ZERO_D)>;
+}
+
 multiclass sve_int_bin_pred_shift_wide<bits<3> opc, string asm,
                                   SDPatternOperator op> {
   def _B : sve_int_bin_pred_shift<0b00, 0b1, opc, asm, ZPR8, ZPR64>;

diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm-zero.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm-zero.ll
new file mode 100644
index 0000000000000..6593978b03d41
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm-zero.ll
@@ -0,0 +1,176 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+use-experimental-zeroing-pseudos < %s | FileCheck %s
+
+;; ASR
+define <vscale x 16 x i8> @asr_i8(<vscale x 16 x i8> %a, <vscale x 16 x i1> %pg) {
+; CHECK-LABEL: asr_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.b, p0/z, z0.b
+; CHECK-NEXT:    asr z0.b, p0/m, z0.b, #8
+; CHECK-NEXT:    ret
+  %vsel = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> zeroinitializer
+  %ele = insertelement <vscale x 16 x i8> poison, i8 8, i32 0
+  %shuffle = shufflevector <vscale x 16 x i8> %ele, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.asr.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %vsel, <vscale x 16 x i8> %shuffle)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @asr_i16(<vscale x 8 x i16> %a, <vscale x 8 x i1> %pg) {
+; CHECK-LABEL: asr_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.h, p0/z, z0.h
+; CHECK-NEXT:    asr z0.h, p0/m, z0.h, #16
+; CHECK-NEXT:    ret
+  %vsel = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> zeroinitializer
+  %ele = insertelement <vscale x 8 x i16> poison, i16 16, i32 0
+  %shuffle = shufflevector <vscale x 8 x i16> %ele, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.asr.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %vsel, <vscale x 8 x i16> %shuffle)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 4 x i32> @asr_i32(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg) local_unnamed_addr #0 {
+; CHECK-LABEL: asr_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.s, p0/z, z0.s
+; CHECK-NEXT:    asr z0.s, p0/m, z0.s, #32
+; CHECK-NEXT:    ret
+  %vsel = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> zeroinitializer
+  %ele = insertelement <vscale x 4 x i32> poison, i32 32, i32 0
+  %shuffle = shufflevector <vscale x 4 x i32> %ele, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.asr.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %vsel, <vscale x 4 x i32> %shuffle)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @asr_i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg) {
+; CHECK-LABEL: asr_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.d, p0/z, z0.d
+; CHECK-NEXT:    asr z0.d, p0/m, z0.d, #64
+; CHECK-NEXT:    ret
+  %vsel = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer
+  %ele = insertelement <vscale x 2 x i64> poison, i64 64, i32 0
+  %shuffle = shufflevector <vscale x 2 x i64> %ele, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.asr.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %vsel, <vscale x 2 x i64> %shuffle)
+  ret <vscale x 2 x i64> %res
+}
+
+;; LSL
+define <vscale x 16 x i8> @lsl_i8(<vscale x 16 x i8> %a, <vscale x 16 x i1> %pg) {
+; CHECK-LABEL: lsl_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.b, p0/z, z0.b
+; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, #7
+; CHECK-NEXT:    ret
+  %vsel = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> zeroinitializer
+  %ele = insertelement <vscale x 16 x i8> poison, i8 7, i32 0
+  %shuffle = shufflevector <vscale x 16 x i8> %ele, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.lsl.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %vsel, <vscale x 16 x i8> %shuffle)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @lsl_i16(<vscale x 8 x i16> %a, <vscale x 8 x i1> %pg) {
+; CHECK-LABEL: lsl_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.h, p0/z, z0.h
+; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, #15
+; CHECK-NEXT:    ret
+  %vsel = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> zeroinitializer
+  %ele = insertelement <vscale x 8 x i16> poison, i16 15, i32 0
+  %shuffle = shufflevector <vscale x 8 x i16> %ele, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.lsl.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %vsel, <vscale x 8 x i16> %shuffle)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 4 x i32> @lsl_i32(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg) local_unnamed_addr #0 {
+; CHECK-LABEL: lsl_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.s, p0/z, z0.s
+; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, #31
+; CHECK-NEXT:    ret
+  %vsel = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> zeroinitializer
+  %ele = insertelement <vscale x 4 x i32> poison, i32 31, i32 0
+  %shuffle = shufflevector <vscale x 4 x i32> %ele, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.lsl.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %vsel, <vscale x 4 x i32> %shuffle)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @lsl_i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg) {
+; CHECK-LABEL: lsl_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.d, p0/z, z0.d
+; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, #63
+; CHECK-NEXT:    ret
+  %vsel = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer
+  %ele = insertelement <vscale x 2 x i64> poison, i64 63, i32 0
+  %shuffle = shufflevector <vscale x 2 x i64> %ele, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.lsl.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %vsel, <vscale x 2 x i64> %shuffle)
+  ret <vscale x 2 x i64> %res
+}
+
+;; LSR
+define <vscale x 16 x i8> @lsr_i8(<vscale x 16 x i8> %a, <vscale x 16 x i1> %pg) {
+; CHECK-LABEL: lsr_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.b, p0/z, z0.b
+; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, #8
+; CHECK-NEXT:    ret
+  %vsel = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> zeroinitializer
+  %ele = insertelement <vscale x 16 x i8> poison, i8 8, i32 0
+  %shuffle = shufflevector <vscale x 16 x i8> %ele, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.lsr.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %vsel, <vscale x 16 x i8> %shuffle)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @lsr_i16(<vscale x 8 x i16> %a, <vscale x 8 x i1> %pg) {
+; CHECK-LABEL: lsr_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.h, p0/z, z0.h
+; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, #16
+; CHECK-NEXT:    ret
+  %vsel = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> zeroinitializer
+  %ele = insertelement <vscale x 8 x i16> poison, i16 16, i32 0
+  %shuffle = shufflevector <vscale x 8 x i16> %ele, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.lsr.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %vsel, <vscale x 8 x i16> %shuffle)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 4 x i32> @lsr_i32(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg) local_unnamed_addr #0 {
+; CHECK-LABEL: lsr_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.s, p0/z, z0.s
+; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, #32
+; CHECK-NEXT:    ret
+  %vsel = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> zeroinitializer
+  %ele = insertelement <vscale x 4 x i32> poison, i32 32, i32 0
+  %shuffle = shufflevector <vscale x 4 x i32> %ele, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %vsel, <vscale x 4 x i32> %shuffle)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @lsr_i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg) {
+; CHECK-LABEL: lsr_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.d, p0/z, z0.d
+; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, #64
+; CHECK-NEXT:    ret
+  %vsel = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer
+  %ele = insertelement <vscale x 2 x i64> poison, i64 64, i32 0
+  %shuffle = shufflevector <vscale x 2 x i64> %ele, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.lsr.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %vsel, <vscale x 2 x i64> %shuffle)
+  ret <vscale x 2 x i64> %res
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.asr.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.asr.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.asr.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.asr.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.lsl.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.lsl.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.lsl.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.lsl.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.lsr.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.lsr.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.lsr.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
