[llvm] ffb109b - [AArch64][SVE] Support logical operation BIC with DestructiveBinary patterns

via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 10 17:16:42 PST 2022


Author: zhongyunde
Date: 2022-11-11T09:15:50+08:00
New Revision: ffb109b6852d248c9d2e3202477dccf20aac7151

URL: https://github.com/llvm/llvm-project/commit/ffb109b6852d248c9d2e3202477dccf20aac7151
DIFF: https://github.com/llvm/llvm-project/commit/ffb109b6852d248c9d2e3202477dccf20aac7151.diff

LOG: [AArch64][SVE] Support logical operation BIC with DestructiveBinary patterns

Support for the logical operation BIC with DestructiveBinary patterns was
temporarily removed because it triggered an assert (commit 3c382ed71f15);
this patch fixes that.
The underlying problem is that pseudo instructions which have no real
instruction (including a movprfx'd one) covering every combination of
register allocation cannot be expanded correctly. This is the main reason
the zeroing pseudos are an experimental feature: they have known bugs.
So, when necessary, we emit an extra LSL when expanding
BIC_ZPZZ_ZERO A, P, A, A with movprfx:
  movprfx	z0.s, p0/z, z0.s
  lsl	z0.s, p0/m, z0.s, #0
  bic	z0.s, p0/m, z0.s, z0.s
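
As a rough illustration, here is a standalone sketch (not LLVM source; the
helper name and the register placeholders are made up) of the prefix
selection that expand_DestructiveOp performs for these pseudos after this
patch:

  // choose_prefix.cpp - sketch of the movprfx selection for a destructive
  // SVE pseudo; builds with any C++11 compiler.
  #include <iostream>
  #include <string>

  std::string choosePrefix(bool FalseZero, bool DstEqualsDOP,
                           bool DOPRegIsUnique) {
    if (FalseZero) {
      // Zeroing pseudo: fold the "select with zero" into a zeroing MOVPRFX.
      std::string Seq = "movprfx zd, pg/z, zdop";
      // If the destructive operand is also another source operand (so it is
      // not unique), a MOVPRFX'd BIC would be an illegal pairing; zero the
      // inactive lanes with a predicated LSL #0 instead and leave the BIC
      // itself unprefixed.
      if (!DOPRegIsUnique)
        Seq += " ; lsl zd, pg/m, zd, #0";
      return Seq;
    }
    if (!DstEqualsDOP)
      return "movprfx zd, zdop"; // plain copy of the destructive operand
    return "";                   // destination already holds the DOP
  }

  int main() {
    // The BIC_ZPZZ_ZERO A, P, A, A case described above.
    std::cout << choosePrefix(true, true, false) << "\n";
  }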

Depends on D88595

Added: 
    llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.mir

Modified: 
    llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
    llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
    llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 62a0ae5a264e2..7bfb8c8f3c8be 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -447,15 +447,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
   uint64_t DType = TII->get(Opcode).TSFlags & AArch64::DestructiveInstTypeMask;
   uint64_t FalseLanes = MI.getDesc().TSFlags & AArch64::FalseLanesMask;
   bool FalseZero = FalseLanes == AArch64::FalseLanesZero;
-
   Register DstReg = MI.getOperand(0).getReg();
   bool DstIsDead = MI.getOperand(0).isDead();
-
-  if (DType == AArch64::DestructiveBinary)
-    assert(DstReg != MI.getOperand(3).getReg());
-
   bool UseRev = false;
   unsigned PredIdx, DOPIdx, SrcIdx, Src2Idx;
+
   switch (DType) {
   case AArch64::DestructiveBinaryComm:
   case AArch64::DestructiveBinaryCommWithRev:
@@ -489,12 +485,13 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
     llvm_unreachable("Unsupported Destructive Operand type");
   }
 
-#ifndef NDEBUG
   // MOVPRFX can only be used if the destination operand
   // is the destructive operand, not as any other operand,
   // so the Destructive Operand must be unique.
   bool DOPRegIsUnique = false;
   switch (DType) {
+  case AArch64::DestructiveBinary:
+    DOPRegIsUnique = DstReg != MI.getOperand(SrcIdx).getReg();
+    break;
   case AArch64::DestructiveBinaryComm:
   case AArch64::DestructiveBinaryCommWithRev:
     DOPRegIsUnique =
@@ -512,7 +509,6 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
          MI.getOperand(DOPIdx).getReg() != MI.getOperand(Src2Idx).getReg());
     break;
   }
-#endif
 
   // Resolve the reverse opcode
   if (UseRev) {
@@ -527,23 +523,27 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
 
   // Get the right MOVPRFX
   uint64_t ElementSize = TII->getElementSizeForOpcode(Opcode);
-  unsigned MovPrfx, MovPrfxZero;
+  unsigned MovPrfx, LSLZero, MovPrfxZero;
   switch (ElementSize) {
   case AArch64::ElementSizeNone:
   case AArch64::ElementSizeB:
     MovPrfx = AArch64::MOVPRFX_ZZ;
+    LSLZero = AArch64::LSL_ZPmI_B;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_B;
     break;
   case AArch64::ElementSizeH:
     MovPrfx = AArch64::MOVPRFX_ZZ;
+    LSLZero = AArch64::LSL_ZPmI_H;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_H;
     break;
   case AArch64::ElementSizeS:
     MovPrfx = AArch64::MOVPRFX_ZZ;
+    LSLZero = AArch64::LSL_ZPmI_S;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_S;
     break;
   case AArch64::ElementSizeD:
     MovPrfx = AArch64::MOVPRFX_ZZ;
+    LSLZero = AArch64::LSL_ZPmI_D;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_D;
     break;
   default:
@@ -555,9 +555,10 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
   //
   MachineInstrBuilder PRFX, DOP;
   if (FalseZero) {
-#ifndef NDEBUG
-    assert(DOPRegIsUnique && "The destructive operand should be unique");
-#endif
+    // If we cannot prefix the requested instruction, we will instead emit a
+    // prefixed zeroing mov (a zeroing MOVPRFX followed by LSL #0) for
+    // DestructiveBinary.
+    assert((DOPRegIsUnique || AArch64::DestructiveBinary == DType) &&
+           "The destructive operand should be unique");
     assert(ElementSize != AArch64::ElementSizeNone &&
            "This instruction is unpredicated");
 
@@ -569,10 +570,19 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
 
     // After the movprfx, the destructive operand is same as Dst
     DOPIdx = 0;
+
+    // Create the additional LSL to zero the inactive lanes when the
+    // destructive operand register is not unique, i.e. zero the lanes in z0
+    // that are not active in p0 with the sequence:
+    //   movprfx z0.b, p0/z, z0.b ; lsl z0.b, p0/m, z0.b, #0
+    if (DType == AArch64::DestructiveBinary && !DOPRegIsUnique) {
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LSLZero))
+          .addReg(DstReg, RegState::Define)
+          .add(MI.getOperand(PredIdx))
+          .addReg(DstReg)
+          .addImm(0);
+    }
   } else if (DstReg != MI.getOperand(DOPIdx).getReg()) {
-#ifndef NDEBUG
     assert(DOPRegIsUnique && "The destructive operand should be unique");
-#endif
     PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx))
                .addReg(DstReg, RegState::Define)
                .addReg(MI.getOperand(DOPIdx).getReg());
@@ -591,6 +601,7 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
         .add(MI.getOperand(PredIdx))
         .add(MI.getOperand(SrcIdx));
     break;
+  case AArch64::DestructiveBinary:
   case AArch64::DestructiveBinaryImm:
   case AArch64::DestructiveBinaryComm:
   case AArch64::DestructiveBinaryCommWithRev:

diff  --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 43ea225f0725d..c0764937acb46 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -433,7 +433,7 @@ let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in {
   defm ORR_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_orr>;
   defm EOR_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_eor>;
   defm AND_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_and>;
-  defm BIC_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<null_frag>;
+  defm BIC_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_bic>;
 } // End HasSVEorSME, UseExperimentalZeroingPseudos
 
 let Predicates = [HasSVEorSME] in {

diff  --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll
index 9f13980a8b2c0..42c65a2246778 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll
@@ -344,8 +344,7 @@ define <vscale x 2 x i64> @and_i64_zero(<vscale x 2 x i1> %pg, <vscale x 2 x i64
 define <vscale x 16 x i8> @bic_i8_zero(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: bic_i8_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.b, #0 // =0x0
-; CHECK-NEXT:    sel z0.b, p0, z0.b, z2.b
+; CHECK-NEXT:    movprfx z0.b, p0/z, z0.b
 ; CHECK-NEXT:    bic z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    ret
   %a_z = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> zeroinitializer
@@ -358,8 +357,7 @@ define <vscale x 16 x i8> @bic_i8_zero(<vscale x 16 x i1> %pg, <vscale x 16 x i8
 define <vscale x 8 x i16> @bic_i16_zero(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
 ; CHECK-LABEL: bic_i16_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.h, #0 // =0x0
-; CHECK-NEXT:    sel z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    movprfx z0.h, p0/z, z0.h
 ; CHECK-NEXT:    bic z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> zeroinitializer
@@ -372,8 +370,7 @@ define <vscale x 8 x i16> @bic_i16_zero(<vscale x 8 x i1> %pg, <vscale x 8 x i16
 define <vscale x 4 x i32> @bic_i32_zero(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: bic_i32_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.s, #0 // =0x0
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z2.s
+; CHECK-NEXT:    movprfx z0.s, p0/z, z0.s
 ; CHECK-NEXT:    bic z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %a_z = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> zeroinitializer
@@ -386,8 +383,7 @@ define <vscale x 4 x i32> @bic_i32_zero(<vscale x 4 x i1> %pg, <vscale x 4 x i32
 define <vscale x 2 x i64> @bic_i64_zero(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: bic_i64_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.d, #0 // =0x0
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z2.d
+; CHECK-NEXT:    movprfx z0.d, p0/z, z0.d
 ; CHECK-NEXT:    bic z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %a_z = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer
@@ -397,6 +393,39 @@ define <vscale x 2 x i64> @bic_i64_zero(<vscale x 2 x i1> %pg, <vscale x 2 x i64
   ret <vscale x 2 x i64> %out
 }
 
+; A movprfx'd BIC whose destination register is also a source (i.e. A & ~A) is
+; an illegal pairing, so the codegen here depends on the IR seen before pseudo
+; expansion.
+define <vscale x 2 x i64> @bic_i64_zero_no_unique_reg(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: bic_i64_zero_no_unique_reg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p0/m, z0.d
+; CHECK-NEXT:    movprfx z0.d, p0/z, z0.d
+; CHECK-NEXT:    bic z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.bic.nxv2i64(<vscale x 2 x i1> %pg,
+                                                               <vscale x 2 x i64> %a_z,
+                                                               <vscale x 2 x i64> %a_z)
+  ret <vscale x 2 x i64> %out
+}
+
+; BIC (i.e. A & ~B) is not commutative, so the destructive form is not used
+; when the destination operand is not the destructive operand.
+define <vscale x 2 x i64> @bic_i64_zero_no_comm(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: bic_i64_zero_no_comm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.d, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z2.d
+; CHECK-NEXT:    bic z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.bic.nxv2i64(<vscale x 2 x i1> %pg,
+                                                               <vscale x 2 x i64> %b,
+                                                               <vscale x 2 x i64> %a_z)
+  ret <vscale x 2 x i64> %out
+}
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.add.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.add.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.add.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)

diff  --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.mir b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.mir
new file mode 100644
index 0000000000000..da40014144cd7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.mir
@@ -0,0 +1,39 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -mattr=+sve -mattr=+use-experimental-zeroing-pseudos -run-pass=aarch64-expand-pseudo %s -o - | FileCheck %s
+
+# An additional LSL should be created to zero the inactive lanes, as the
+# destination register is also a source (the destructive operand is not unique).
+
+--- |
+  define <vscale x 8 x i16> @bic_i16_zero(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a){
+    %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> zeroinitializer
+    %out = call <vscale x 8 x i16> @llvm.aarch64.sve.bic.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a_z, <vscale x 8 x i16> %a_z)
+    ret <vscale x 8 x i16> %out
+  }
+
+  declare <vscale x 8 x i16> @llvm.aarch64.sve.bic.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+...
+---
+name:            bic_i16_zero
+alignment:       4
+tracksRegLiveness: true
+tracksDebugUserValues: true
+registers:       []
+liveins:
+  - { reg: '$p0', virtual-reg: '' }
+  - { reg: '$z0', virtual-reg: '' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $p0, $z0
+
+    ; CHECK-LABEL: name: bic_i16_zero
+    ; CHECK: liveins: $p0, $z0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: BUNDLE implicit-def $z0, implicit-def $q0, implicit-def $d0, implicit-def $s0, implicit-def $h0, implicit-def $b0, implicit-def $z0_hi, implicit killed $p0, implicit $z0 {
+    ; CHECK-NEXT:   $z0 = MOVPRFX_ZPzZ_H $p0, $z0
+    ; CHECK-NEXT:   $z0 = LSL_ZPmI_H killed renamable $p0, internal $z0, 0
+    ; CHECK-NEXT:   $z0 = BIC_ZPmZ_H killed renamable $p0, internal killed $z0, internal killed renamable $z0
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: RET undef $lr, implicit $z0
+    renamable $z0 = BIC_ZPZZ_ZERO_H killed renamable $p0, killed renamable $z0, killed renamable $z0
+    RET_ReallyLR implicit $z0
+...

