[llvm] [AArch64][SVE2] Generate XAR (PR #77160)
Usman Nadeem via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 8 18:46:56 PST 2024
https://github.com/UsmanNadeem updated https://github.com/llvm/llvm-project/pull/77160
>From b3e26b39271841a343e1f78b02ca17f69a3eb8ab Mon Sep 17 00:00:00 2001
From: "Nadeem, Usman" <mnadeem at quicinc.com>
Date: Thu, 21 Dec 2023 13:55:21 -0800
Subject: [PATCH 1/4] [AArch64][SVE2] Generate XAR
Bitwise exclusive OR and rotate right by immediate
Add a new ISD node for XAR and lower the following rotate pattern
to XAR for appropriate types:
rotr (xor(x, y), imm) -> xar (x, y, imm)
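
For illustration only (not part of the patch): scalar code of the shape
below is the kind of input that, after SVE vectorization, can produce the
fshl/fshr-by-splat-constant IR exercised in the new test file. The
function and its name are hypothetical:

    #include <cstdint>

    // v = x ^ y followed by a rotate right by 4. With this patch and
    // +sve2, the vectorized loop body can select a single XAR per
    // vector instead of an eor/lsr/lsl/orr sequence.
    void xor_rotr4(uint64_t *dst, const uint64_t *x, const uint64_t *y,
                   int n) {
      for (int i = 0; i < n; ++i) {
        uint64_t v = x[i] ^ y[i];
        dst[i] = (v >> 4) | (v << 60); // rotr(v, 4) for 64-bit v
      }
    }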
Change-Id: If1f649b1bf5365b575dc9fa3e6618e97dc19a066
---
.../Target/AArch64/AArch64ISelLowering.cpp | 32 +++
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 4 +
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 11 +-
llvm/lib/Target/AArch64/AArch64Subtarget.h | 1 +
llvm/test/CodeGen/AArch64/sve2-xar.ll | 213 ++++++++++++++++++
5 files changed, 260 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AArch64/sve2-xar.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 102fd0c3dae2ab..cd51ce01caee9f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1648,6 +1648,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FLDEXP, MVT::f16, Custom);
}
+ if (Subtarget->hasSVE2orSME()) {
+ for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64})
+ setOperationAction(ISD::ROTL, VT, Custom);
+ }
+
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
IsStrictFPEnabled = true;
@@ -2645,6 +2650,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::MSRR)
MAKE_CASE(AArch64ISD::RSHRNB_I)
MAKE_CASE(AArch64ISD::CTTZ_ELTS)
+ MAKE_CASE(AArch64ISD::XAR_I)
}
#undef MAKE_CASE
return nullptr;
@@ -3741,6 +3747,30 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
return std::make_pair(Value, Overflow);
}
+SDValue AArch64TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isScalableVector() && "Expected a scalable vector.");
+ assert(Subtarget->hasSVE2orSME() && "Custom lowering only for SVE2.");
+
+ // rotr (xor(x, y), imm) -> xar (x, y, imm)
+ SDValue Xor = Op.getOperand(0);
+ SDValue RotlValue = Op.getOperand(1);
+
+ if (Xor.getOpcode() != ISD::XOR || RotlValue.getOpcode() != ISD::SPLAT_VECTOR)
+ return SDValue();
+ if (!isa<ConstantSDNode>(RotlValue.getOperand(0).getNode()))
+ return SDValue();
+
+ uint64_t RotrAmt =
+ (VT.getScalarSizeInBits() - RotlValue->getConstantOperandVal(0)) %
+ VT.getScalarSizeInBits();
+
+ SDLoc DL(Op);
+ SDValue Ops[] = {Xor.getOperand(0), Xor.getOperand(1),
+ DAG.getTargetConstant(RotrAmt, DL, MVT::i32)};
+ return DAG.getNode(AArch64ISD::XAR_I, DL, VT, Ops);
+}
+
SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
if (useSVEForFixedLengthVectorVT(Op.getValueType(),
!Subtarget->isNeonAvailable()))
@@ -6414,6 +6444,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerFunnelShift(Op, DAG);
case ISD::FLDEXP:
return LowerFLDEXP(Op, DAG);
+ case ISD::ROTL:
+ return LowerROTL(Op, DAG);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 6ddbcd41dcb769..0d9ebad4ada905 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -215,6 +215,9 @@ enum NodeType : unsigned {
// Vector narrowing shift by immediate (bottom)
RSHRNB_I,
+ // Vector bitwise xor and rotate right by immediate
+ XAR_I,
+
// Vector shift by constant and insert
VSLI,
VSRI,
@@ -1143,6 +1146,7 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 344a153890631e..6e018afe18bd40 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -454,6 +454,15 @@ def AArch64eor3 : PatFrags<(ops node:$op1, node:$op2, node:$op3),
[(int_aarch64_sve_eor3 node:$op1, node:$op2, node:$op3),
(xor node:$op1, (xor node:$op2, node:$op3))]>;
+def SDT_AArch64xar_Imm : SDTypeProfile<1, 3, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>,
+ SDTCisSameAs<0,1>, SDTCisSameAs<1,2>]>;
+def AArch64xar_node : SDNode<"AArch64ISD::XAR_I", SDT_AArch64xar_Imm>;
+def AArch64xar : PatFrags<(ops node:$op1, node:$op2, node:$op3),
+ [(int_aarch64_sve_xar node:$op1, node:$op2, node:$op3),
+ (AArch64xar_node node:$op1, node:$op2, node:$op3)]>;
+
+
def AArch64fmla_m1 : PatFrags<(ops node:$pg, node:$za, node:$zn, node:$zm),
[(int_aarch64_sve_fmla node:$pg, node:$za, node:$zn, node:$zm),
(vselect node:$pg, (AArch64fadd_p_contract (SVEAllActive), node:$za, (AArch64fmul_p_oneuse (SVEAllActive), node:$zn, node:$zm)), node:$za),
@@ -3721,7 +3730,7 @@ let Predicates = [HasSVE2orSME] in {
defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", int_aarch64_sve_nbsl>;
// SVE2 bitwise xor and rotate right by immediate
- defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar", int_aarch64_sve_xar>;
+ defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar", AArch64xar>;
// SVE2 extract vector (immediate offset, constructive)
def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index b17e215e200dea..a131cf8a6f5402 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -394,6 +394,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
void mirFileLoaded(MachineFunction &MF) const override;
bool hasSVEorSME() const { return hasSVE() || hasSME(); }
+ bool hasSVE2orSME() const { return hasSVE2() || hasSME(); }
// Return the known range for the bit length of SVE data registers. A value
of 0 means nothing is known about that particular limit beyond what's
diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll
new file mode 100644
index 00000000000000..4b032f74b3a244
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-xar.ll
@@ -0,0 +1,213 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s -o - | FileCheck --check-prefix=SVE %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s -o - | FileCheck --check-prefix=SVE2 %s
+
+define <vscale x 2 x i64> @xar_nxv2i64_l(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; SVE-LABEL: xar_nxv2i64_l:
+; SVE: // %bb.0:
+; SVE-NEXT: eor z0.d, z0.d, z1.d
+; SVE-NEXT: lsr z1.d, z0.d, #4
+; SVE-NEXT: lsl z0.d, z0.d, #60
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv2i64_l:
+; SVE2: // %bb.0:
+; SVE2-NEXT: xar z0.d, z0.d, z1.d, #4
+; SVE2-NEXT: ret
+ %a = xor <vscale x 2 x i64> %x, %y
+ %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 60, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
+ ret <vscale x 2 x i64> %b
+}
+
+define <vscale x 2 x i64> @xar_nxv2i64_r(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; SVE-LABEL: xar_nxv2i64_r:
+; SVE: // %bb.0:
+; SVE-NEXT: eor z0.d, z0.d, z1.d
+; SVE-NEXT: lsl z1.d, z0.d, #60
+; SVE-NEXT: lsr z0.d, z0.d, #4
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv2i64_r:
+; SVE2: // %bb.0:
+; SVE2-NEXT: xar z0.d, z0.d, z1.d, #4
+; SVE2-NEXT: ret
+ %a = xor <vscale x 2 x i64> %x, %y
+ %b = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 4, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
+ ret <vscale x 2 x i64> %b
+}
+
+
+define <vscale x 4 x i32> @xar_nxv4i32_l(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
+; SVE-LABEL: xar_nxv4i32_l:
+; SVE: // %bb.0:
+; SVE-NEXT: eor z0.d, z0.d, z1.d
+; SVE-NEXT: lsr z1.s, z0.s, #4
+; SVE-NEXT: lsl z0.s, z0.s, #28
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv4i32_l:
+; SVE2: // %bb.0:
+; SVE2-NEXT: xar z0.s, z0.s, z1.s, #4
+; SVE2-NEXT: ret
+ %a = xor <vscale x 4 x i32> %x, %y
+ %b = call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 28, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+ ret <vscale x 4 x i32> %b
+}
+
+define <vscale x 4 x i32> @xar_nxv4i32_r(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
+; SVE-LABEL: xar_nxv4i32_r:
+; SVE: // %bb.0:
+; SVE-NEXT: eor z0.d, z0.d, z1.d
+; SVE-NEXT: lsl z1.s, z0.s, #28
+; SVE-NEXT: lsr z0.s, z0.s, #4
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv4i32_r:
+; SVE2: // %bb.0:
+; SVE2-NEXT: xar z0.s, z0.s, z1.s, #4
+; SVE2-NEXT: ret
+ %a = xor <vscale x 4 x i32> %x, %y
+ %b = call <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 4, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+ ret <vscale x 4 x i32> %b
+}
+
+define <vscale x 8 x i16> @xar_nxv8i16_l(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
+; SVE-LABEL: xar_nxv8i16_l:
+; SVE: // %bb.0:
+; SVE-NEXT: eor z0.d, z0.d, z1.d
+; SVE-NEXT: lsr z1.h, z0.h, #4
+; SVE-NEXT: lsl z0.h, z0.h, #12
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv8i16_l:
+; SVE2: // %bb.0:
+; SVE2-NEXT: xar z0.h, z0.h, z1.h, #4
+; SVE2-NEXT: ret
+ %a = xor <vscale x 8 x i16> %x, %y
+ %b = call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 12, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer))
+ ret <vscale x 8 x i16> %b
+}
+
+define <vscale x 8 x i16> @xar_nxv8i16_r(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
+; SVE-LABEL: xar_nxv8i16_r:
+; SVE: // %bb.0:
+; SVE-NEXT: eor z0.d, z0.d, z1.d
+; SVE-NEXT: lsl z1.h, z0.h, #12
+; SVE-NEXT: lsr z0.h, z0.h, #4
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv8i16_r:
+; SVE2: // %bb.0:
+; SVE2-NEXT: xar z0.h, z0.h, z1.h, #4
+; SVE2-NEXT: ret
+ %a = xor <vscale x 8 x i16> %x, %y
+ %b = call <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 4, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer))
+ ret <vscale x 8 x i16> %b
+}
+
+define <vscale x 16 x i8> @xar_nxv16i8_l(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
+; SVE-LABEL: xar_nxv16i8_l:
+; SVE: // %bb.0:
+; SVE-NEXT: eor z0.d, z0.d, z1.d
+; SVE-NEXT: lsr z1.b, z0.b, #4
+; SVE-NEXT: lsl z0.b, z0.b, #4
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv16i8_l:
+; SVE2: // %bb.0:
+; SVE2-NEXT: xar z0.b, z0.b, z1.b, #4
+; SVE2-NEXT: ret
+ %a = xor <vscale x 16 x i8> %x, %y
+ %b = call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 4, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer))
+ ret <vscale x 16 x i8> %b
+}
+
+define <vscale x 16 x i8> @xar_nxv16i8_r(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
+; SVE-LABEL: xar_nxv16i8_r:
+; SVE: // %bb.0:
+; SVE-NEXT: eor z0.d, z0.d, z1.d
+; SVE-NEXT: lsl z1.b, z0.b, #4
+; SVE-NEXT: lsr z0.b, z0.b, #4
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv16i8_r:
+; SVE2: // %bb.0:
+; SVE2-NEXT: xar z0.b, z0.b, z1.b, #4
+; SVE2-NEXT: ret
+ %a = xor <vscale x 16 x i8> %x, %y
+ %b = call <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 4, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer))
+ ret <vscale x 16 x i8> %b
+}
+
+
+define <vscale x 2 x i64> @xar_nxv2i64_l_neg1(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i64> %z) {
+; SVE-LABEL: xar_nxv2i64_l_neg1:
+; SVE: // %bb.0:
+; SVE-NEXT: mov z3.d, z2.d
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: subr z2.d, z2.d, #0 // =0x0
+; SVE-NEXT: eor z0.d, z0.d, z1.d
+; SVE-NEXT: and z2.d, z2.d, #0x3f
+; SVE-NEXT: and z3.d, z3.d, #0x3f
+; SVE-NEXT: movprfx z1, z0
+; SVE-NEXT: lsl z1.d, p0/m, z1.d, z3.d
+; SVE-NEXT: lsr z0.d, p0/m, z0.d, z2.d
+; SVE-NEXT: orr z0.d, z1.d, z0.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv2i64_l_neg1:
+; SVE2: // %bb.0:
+; SVE2-NEXT: mov z3.d, z2.d
+; SVE2-NEXT: ptrue p0.d
+; SVE2-NEXT: subr z2.d, z2.d, #0 // =0x0
+; SVE2-NEXT: eor z0.d, z0.d, z1.d
+; SVE2-NEXT: and z2.d, z2.d, #0x3f
+; SVE2-NEXT: and z3.d, z3.d, #0x3f
+; SVE2-NEXT: movprfx z1, z0
+; SVE2-NEXT: lsl z1.d, p0/m, z1.d, z3.d
+; SVE2-NEXT: lsr z0.d, p0/m, z0.d, z2.d
+; SVE2-NEXT: orr z0.d, z1.d, z0.d
+; SVE2-NEXT: ret
+ %a = xor <vscale x 2 x i64> %x, %y
+ %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> %z)
+ ret <vscale x 2 x i64> %b
+}
+
+; TODO: We could use the usra instruction here.
+define <vscale x 2 x i64> @xar_nxv2i64_l_neg2(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; SVE-LABEL: xar_nxv2i64_l_neg2:
+; SVE: // %bb.0:
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: lsr z1.d, z0.d, #4
+; SVE-NEXT: lsl z0.d, z0.d, #60
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv2i64_l_neg2:
+; SVE2: // %bb.0:
+; SVE2-NEXT: orr z0.d, z0.d, z1.d
+; SVE2-NEXT: lsr z1.d, z0.d, #4
+; SVE2-NEXT: lsl z0.d, z0.d, #60
+; SVE2-NEXT: orr z0.d, z0.d, z1.d
+; SVE2-NEXT: ret
+ %a = or <vscale x 2 x i64> %x, %y
+ %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 60, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
+ ret <vscale x 2 x i64> %b
+}
+
+declare <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
>From eea80c2d3fd9da3e11986a8ab2278029a2298032 Mon Sep 17 00:00:00 2001
From: "Nadeem, Usman" <mnadeem at quicinc.com>
Date: Mon, 8 Jan 2024 14:41:02 -0800
Subject: [PATCH 2/4] Revert "[AArch64][SVE2] Generate XAR"
This reverts commit b3e26b39271841a343e1f78b02ca17f69a3eb8ab.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 32 ---
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 4 -
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 11 +-
llvm/lib/Target/AArch64/AArch64Subtarget.h | 1 -
llvm/test/CodeGen/AArch64/sve2-xar.ll | 213 ------------------
5 files changed, 1 insertion(+), 260 deletions(-)
delete mode 100644 llvm/test/CodeGen/AArch64/sve2-xar.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index cd51ce01caee9f..102fd0c3dae2ab 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1648,11 +1648,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FLDEXP, MVT::f16, Custom);
}
- if (Subtarget->hasSVE2orSME()) {
- for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64})
- setOperationAction(ISD::ROTL, VT, Custom);
- }
-
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
IsStrictFPEnabled = true;
@@ -2650,7 +2645,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::MSRR)
MAKE_CASE(AArch64ISD::RSHRNB_I)
MAKE_CASE(AArch64ISD::CTTZ_ELTS)
- MAKE_CASE(AArch64ISD::XAR_I)
}
#undef MAKE_CASE
return nullptr;
@@ -3747,30 +3741,6 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
return std::make_pair(Value, Overflow);
}
-SDValue AArch64TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isScalableVector() && "Expected a scalable vector.");
- assert(Subtarget->hasSVE2orSME() && "Custom lowering only for SVE2.");
-
- // rotr (xor(x, y), imm) -> xar (x, y, imm)
- SDValue Xor = Op.getOperand(0);
- SDValue RotlValue = Op.getOperand(1);
-
- if (Xor.getOpcode() != ISD::XOR || RotlValue.getOpcode() != ISD::SPLAT_VECTOR)
- return SDValue();
- if (!isa<ConstantSDNode>(RotlValue.getOperand(0).getNode()))
- return SDValue();
-
- uint64_t RotrAmt =
- (VT.getScalarSizeInBits() - RotlValue->getConstantOperandVal(0)) %
- VT.getScalarSizeInBits();
-
- SDLoc DL(Op);
- SDValue Ops[] = {Xor.getOperand(0), Xor.getOperand(1),
- DAG.getTargetConstant(RotrAmt, DL, MVT::i32)};
- return DAG.getNode(AArch64ISD::XAR_I, DL, VT, Ops);
-}
-
SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
if (useSVEForFixedLengthVectorVT(Op.getValueType(),
!Subtarget->isNeonAvailable()))
@@ -6444,8 +6414,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerFunnelShift(Op, DAG);
case ISD::FLDEXP:
return LowerFLDEXP(Op, DAG);
- case ISD::ROTL:
- return LowerROTL(Op, DAG);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 0d9ebad4ada905..6ddbcd41dcb769 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -215,9 +215,6 @@ enum NodeType : unsigned {
// Vector narrowing shift by immediate (bottom)
RSHRNB_I,
- // Vector bitwise xor and rotate right by immediate
- XAR_I,
-
// Vector shift by constant and insert
VSLI,
VSRI,
@@ -1146,7 +1143,6 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 6e018afe18bd40..344a153890631e 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -454,15 +454,6 @@ def AArch64eor3 : PatFrags<(ops node:$op1, node:$op2, node:$op3),
[(int_aarch64_sve_eor3 node:$op1, node:$op2, node:$op3),
(xor node:$op1, (xor node:$op2, node:$op3))]>;
-def SDT_AArch64xar_Imm : SDTypeProfile<1, 3, [
- SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>,
- SDTCisSameAs<0,1>, SDTCisSameAs<1,2>]>;
-def AArch64xar_node : SDNode<"AArch64ISD::XAR_I", SDT_AArch64xar_Imm>;
-def AArch64xar : PatFrags<(ops node:$op1, node:$op2, node:$op3),
- [(int_aarch64_sve_xar node:$op1, node:$op2, node:$op3),
- (AArch64xar_node node:$op1, node:$op2, node:$op3)]>;
-
-
def AArch64fmla_m1 : PatFrags<(ops node:$pg, node:$za, node:$zn, node:$zm),
[(int_aarch64_sve_fmla node:$pg, node:$za, node:$zn, node:$zm),
(vselect node:$pg, (AArch64fadd_p_contract (SVEAllActive), node:$za, (AArch64fmul_p_oneuse (SVEAllActive), node:$zn, node:$zm)), node:$za),
@@ -3730,7 +3721,7 @@ let Predicates = [HasSVE2orSME] in {
defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", int_aarch64_sve_nbsl>;
// SVE2 bitwise xor and rotate right by immediate
- defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar", AArch64xar>;
+ defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar", int_aarch64_sve_xar>;
// SVE2 extract vector (immediate offset, constructive)
def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index a131cf8a6f5402..b17e215e200dea 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -394,7 +394,6 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
void mirFileLoaded(MachineFunction &MF) const override;
bool hasSVEorSME() const { return hasSVE() || hasSME(); }
- bool hasSVE2orSME() const { return hasSVE2() || hasSME(); }
// Return the known range for the bit length of SVE data registers. A value
of 0 means nothing is known about that particular limit beyond what's
diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll
deleted file mode 100644
index 4b032f74b3a244..00000000000000
--- a/llvm/test/CodeGen/AArch64/sve2-xar.ll
+++ /dev/null
@@ -1,213 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64 -mattr=+sve < %s -o - | FileCheck --check-prefix=SVE %s
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s -o - | FileCheck --check-prefix=SVE2 %s
-
-define <vscale x 2 x i64> @xar_nxv2i64_l(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
-; SVE-LABEL: xar_nxv2i64_l:
-; SVE: // %bb.0:
-; SVE-NEXT: eor z0.d, z0.d, z1.d
-; SVE-NEXT: lsr z1.d, z0.d, #4
-; SVE-NEXT: lsl z0.d, z0.d, #60
-; SVE-NEXT: orr z0.d, z0.d, z1.d
-; SVE-NEXT: ret
-;
-; SVE2-LABEL: xar_nxv2i64_l:
-; SVE2: // %bb.0:
-; SVE2-NEXT: xar z0.d, z0.d, z1.d, #4
-; SVE2-NEXT: ret
- %a = xor <vscale x 2 x i64> %x, %y
- %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 60, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
- ret <vscale x 2 x i64> %b
-}
-
-define <vscale x 2 x i64> @xar_nxv2i64_r(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
-; SVE-LABEL: xar_nxv2i64_r:
-; SVE: // %bb.0:
-; SVE-NEXT: eor z0.d, z0.d, z1.d
-; SVE-NEXT: lsl z1.d, z0.d, #60
-; SVE-NEXT: lsr z0.d, z0.d, #4
-; SVE-NEXT: orr z0.d, z0.d, z1.d
-; SVE-NEXT: ret
-;
-; SVE2-LABEL: xar_nxv2i64_r:
-; SVE2: // %bb.0:
-; SVE2-NEXT: xar z0.d, z0.d, z1.d, #4
-; SVE2-NEXT: ret
- %a = xor <vscale x 2 x i64> %x, %y
- %b = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 4, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
- ret <vscale x 2 x i64> %b
-}
-
-
-define <vscale x 4 x i32> @xar_nxv4i32_l(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
-; SVE-LABEL: xar_nxv4i32_l:
-; SVE: // %bb.0:
-; SVE-NEXT: eor z0.d, z0.d, z1.d
-; SVE-NEXT: lsr z1.s, z0.s, #4
-; SVE-NEXT: lsl z0.s, z0.s, #28
-; SVE-NEXT: orr z0.d, z0.d, z1.d
-; SVE-NEXT: ret
-;
-; SVE2-LABEL: xar_nxv4i32_l:
-; SVE2: // %bb.0:
-; SVE2-NEXT: xar z0.s, z0.s, z1.s, #4
-; SVE2-NEXT: ret
- %a = xor <vscale x 4 x i32> %x, %y
- %b = call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 28, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
- ret <vscale x 4 x i32> %b
-}
-
-define <vscale x 4 x i32> @xar_nxv4i32_r(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
-; SVE-LABEL: xar_nxv4i32_r:
-; SVE: // %bb.0:
-; SVE-NEXT: eor z0.d, z0.d, z1.d
-; SVE-NEXT: lsl z1.s, z0.s, #28
-; SVE-NEXT: lsr z0.s, z0.s, #4
-; SVE-NEXT: orr z0.d, z0.d, z1.d
-; SVE-NEXT: ret
-;
-; SVE2-LABEL: xar_nxv4i32_r:
-; SVE2: // %bb.0:
-; SVE2-NEXT: xar z0.s, z0.s, z1.s, #4
-; SVE2-NEXT: ret
- %a = xor <vscale x 4 x i32> %x, %y
- %b = call <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 4, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
- ret <vscale x 4 x i32> %b
-}
-
-define <vscale x 8 x i16> @xar_nxv8i16_l(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
-; SVE-LABEL: xar_nxv8i16_l:
-; SVE: // %bb.0:
-; SVE-NEXT: eor z0.d, z0.d, z1.d
-; SVE-NEXT: lsr z1.h, z0.h, #4
-; SVE-NEXT: lsl z0.h, z0.h, #12
-; SVE-NEXT: orr z0.d, z0.d, z1.d
-; SVE-NEXT: ret
-;
-; SVE2-LABEL: xar_nxv8i16_l:
-; SVE2: // %bb.0:
-; SVE2-NEXT: xar z0.h, z0.h, z1.h, #4
-; SVE2-NEXT: ret
- %a = xor <vscale x 8 x i16> %x, %y
- %b = call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 12, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer))
- ret <vscale x 8 x i16> %b
-}
-
-define <vscale x 8 x i16> @xar_nxv8i16_r(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
-; SVE-LABEL: xar_nxv8i16_r:
-; SVE: // %bb.0:
-; SVE-NEXT: eor z0.d, z0.d, z1.d
-; SVE-NEXT: lsl z1.h, z0.h, #12
-; SVE-NEXT: lsr z0.h, z0.h, #4
-; SVE-NEXT: orr z0.d, z0.d, z1.d
-; SVE-NEXT: ret
-;
-; SVE2-LABEL: xar_nxv8i16_r:
-; SVE2: // %bb.0:
-; SVE2-NEXT: xar z0.h, z0.h, z1.h, #4
-; SVE2-NEXT: ret
- %a = xor <vscale x 8 x i16> %x, %y
- %b = call <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 4, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer))
- ret <vscale x 8 x i16> %b
-}
-
-define <vscale x 16 x i8> @xar_nxv16i8_l(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
-; SVE-LABEL: xar_nxv16i8_l:
-; SVE: // %bb.0:
-; SVE-NEXT: eor z0.d, z0.d, z1.d
-; SVE-NEXT: lsr z1.b, z0.b, #4
-; SVE-NEXT: lsl z0.b, z0.b, #4
-; SVE-NEXT: orr z0.d, z0.d, z1.d
-; SVE-NEXT: ret
-;
-; SVE2-LABEL: xar_nxv16i8_l:
-; SVE2: // %bb.0:
-; SVE2-NEXT: xar z0.b, z0.b, z1.b, #4
-; SVE2-NEXT: ret
- %a = xor <vscale x 16 x i8> %x, %y
- %b = call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 4, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer))
- ret <vscale x 16 x i8> %b
-}
-
-define <vscale x 16 x i8> @xar_nxv16i8_r(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
-; SVE-LABEL: xar_nxv16i8_r:
-; SVE: // %bb.0:
-; SVE-NEXT: eor z0.d, z0.d, z1.d
-; SVE-NEXT: lsl z1.b, z0.b, #4
-; SVE-NEXT: lsr z0.b, z0.b, #4
-; SVE-NEXT: orr z0.d, z0.d, z1.d
-; SVE-NEXT: ret
-;
-; SVE2-LABEL: xar_nxv16i8_r:
-; SVE2: // %bb.0:
-; SVE2-NEXT: xar z0.b, z0.b, z1.b, #4
-; SVE2-NEXT: ret
- %a = xor <vscale x 16 x i8> %x, %y
- %b = call <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 4, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer))
- ret <vscale x 16 x i8> %b
-}
-
-
-define <vscale x 2 x i64> @xar_nxv2i64_l_neg1(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i64> %z) {
-; SVE-LABEL: xar_nxv2i64_l_neg1:
-; SVE: // %bb.0:
-; SVE-NEXT: mov z3.d, z2.d
-; SVE-NEXT: ptrue p0.d
-; SVE-NEXT: subr z2.d, z2.d, #0 // =0x0
-; SVE-NEXT: eor z0.d, z0.d, z1.d
-; SVE-NEXT: and z2.d, z2.d, #0x3f
-; SVE-NEXT: and z3.d, z3.d, #0x3f
-; SVE-NEXT: movprfx z1, z0
-; SVE-NEXT: lsl z1.d, p0/m, z1.d, z3.d
-; SVE-NEXT: lsr z0.d, p0/m, z0.d, z2.d
-; SVE-NEXT: orr z0.d, z1.d, z0.d
-; SVE-NEXT: ret
-;
-; SVE2-LABEL: xar_nxv2i64_l_neg1:
-; SVE2: // %bb.0:
-; SVE2-NEXT: mov z3.d, z2.d
-; SVE2-NEXT: ptrue p0.d
-; SVE2-NEXT: subr z2.d, z2.d, #0 // =0x0
-; SVE2-NEXT: eor z0.d, z0.d, z1.d
-; SVE2-NEXT: and z2.d, z2.d, #0x3f
-; SVE2-NEXT: and z3.d, z3.d, #0x3f
-; SVE2-NEXT: movprfx z1, z0
-; SVE2-NEXT: lsl z1.d, p0/m, z1.d, z3.d
-; SVE2-NEXT: lsr z0.d, p0/m, z0.d, z2.d
-; SVE2-NEXT: orr z0.d, z1.d, z0.d
-; SVE2-NEXT: ret
- %a = xor <vscale x 2 x i64> %x, %y
- %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> %z)
- ret <vscale x 2 x i64> %b
-}
-
-; TODO: We could use the usra instruction here.
-define <vscale x 2 x i64> @xar_nxv2i64_l_neg2(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
-; SVE-LABEL: xar_nxv2i64_l_neg2:
-; SVE: // %bb.0:
-; SVE-NEXT: orr z0.d, z0.d, z1.d
-; SVE-NEXT: lsr z1.d, z0.d, #4
-; SVE-NEXT: lsl z0.d, z0.d, #60
-; SVE-NEXT: orr z0.d, z0.d, z1.d
-; SVE-NEXT: ret
-;
-; SVE2-LABEL: xar_nxv2i64_l_neg2:
-; SVE2: // %bb.0:
-; SVE2-NEXT: orr z0.d, z0.d, z1.d
-; SVE2-NEXT: lsr z1.d, z0.d, #4
-; SVE2-NEXT: lsl z0.d, z0.d, #60
-; SVE2-NEXT: orr z0.d, z0.d, z1.d
-; SVE2-NEXT: ret
- %a = or <vscale x 2 x i64> %x, %y
- %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 60, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
- ret <vscale x 2 x i64> %b
-}
-
-declare <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
-declare <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
-declare <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
-declare <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
-declare <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
-declare <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
-declare <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
-declare <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
>From 6cdb4c4f70dad8027c93b3b1952d0d7d15f320d9 Mon Sep 17 00:00:00 2001
From: "Nadeem, Usman" <mnadeem at quicinc.com>
Date: Mon, 8 Jan 2024 18:24:27 -0800
Subject: [PATCH 3/4] [AArch64][SVE2] Generate XAR
Bitwise exclusive OR and rotate right by immediate
Select xar (x, y, imm) for the following pattern:
or (shl (xor x, y), nBits-imm), (shr (xor x, y), imm)
This is essentially:
rotr (xor(x, y), imm)
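
As a minimal standalone sanity check of the precondition the selection
code enforces (illustrative names, not from the patch): the shl/shr pair
forms a rotate right by imm exactly when the two shift amounts sum to
the element width, which is why trySelectXAR requires
ShlAmt + ShrAmt == VT.getScalarSizeInBits():

    #include <cstdint>
    #include <cassert>

    // Reference model of or(shl(v, B - N), srl(v, N)) for B = 64.
    static uint64_t rotr_via_shifts(uint64_t v, unsigned n) {
      assert(n > 0 && n < 64 && "shift amounts must sum to the width");
      return (v << (64 - n)) | (v >> n); // == rotr(v, n)
    }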
Change-Id: I55eac358745085e4f37c159ad3008113ac80a78b
---
.../Target/AArch64/AArch64ISelDAGToDAG.cpp | 60 ++++-
llvm/lib/Target/AArch64/AArch64Subtarget.h | 1 +
llvm/test/CodeGen/AArch64/sve2-xar.ll | 230 ++++++++++++++++++
3 files changed, 290 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AArch64/sve2-xar.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 476d99c2a7e045..77d2736f2579cb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -4275,6 +4275,64 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ // Essentially: rotr (xor(x, y), imm) -> xar (x, y, imm)
+ // A rotate by a constant is a funnel shift in IR, which is expanded to
+ // an OR with shifted operands.
+ // We do the following transform:
+ // OR N0, N1 -> xar (x, y, imm)
+ // Where:
+ // N1 = SRL_PRED true, V, splat(imm) --> rotr amount
+ // N0 = SHL_PRED true, V, splat(bits-imm)
+ // V = (xor x, y)
+ if (VT.isScalableVector() && Subtarget->hasSVE2orSME()) {
+ if (N0.getOpcode() != AArch64ISD::SHL_PRED ||
+ N1.getOpcode() != AArch64ISD::SRL_PRED)
+ std::swap(N0, N1);
+ if (N0.getOpcode() != AArch64ISD::SHL_PRED ||
+ N1.getOpcode() != AArch64ISD::SRL_PRED)
+ return false;
+
+ auto *TLI = static_cast<const AArch64TargetLowering *>(getTargetLowering());
+ if (!TLI->isAllActivePredicate(*CurDAG, N0.getOperand(0)) ||
+ !TLI->isAllActivePredicate(*CurDAG, N1.getOperand(0)))
+ return false;
+
+ SDValue XOR = N0.getOperand(1);
+ if (XOR.getOpcode() != ISD::XOR || XOR != N1.getOperand(1))
+ return false;
+
+ SDValue LConst = N0.getOperand(2);
+ SDValue RConst = N1.getOperand(2);
+ if (RConst.getOpcode() != ISD::SPLAT_VECTOR ||
+ LConst.getOpcode() != ISD::SPLAT_VECTOR)
+ return false;
+ if (!isa<ConstantSDNode>(RConst.getOperand(0).getNode()) ||
+ !isa<ConstantSDNode>(LConst.getOperand(0).getNode()))
+ return false;
+
+ uint64_t ShlAmt = LConst->getConstantOperandVal(0);
+ uint64_t ShrAmt = RConst->getConstantOperandVal(0);
+
+ if (ShlAmt + ShrAmt != VT.getScalarSizeInBits())
+ return false;
+
+ SDLoc DL(N);
+ SDValue Imm = CurDAG->getTargetConstant(ShrAmt, DL, MVT::i32);
+
+ SDValue Ops[] = {XOR.getOperand(0), XOR.getOperand(1), Imm};
+ if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::Int>(
+ VT, {AArch64::XAR_ZZZI_B, AArch64::XAR_ZZZI_H, AArch64::XAR_ZZZI_S,
+ AArch64::XAR_ZZZI_D})) {
+ CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+ return true;
+ }
+ return false;
+ }
+
+ if (!Subtarget->hasSHA3())
+ return false;
if (N0->getOpcode() != AArch64ISD::VSHL ||
N1->getOpcode() != AArch64ISD::VLSHR)
@@ -4367,7 +4425,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
case ISD::OR:
if (tryBitfieldInsertOp(Node))
return;
- if (Subtarget->hasSHA3() && trySelectXAR(Node))
+ if (trySelectXAR(Node))
return;
break;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index b17e215e200dea..a131cf8a6f5402 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -394,6 +394,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
void mirFileLoaded(MachineFunction &MF) const override;
bool hasSVEorSME() const { return hasSVE() || hasSME(); }
+ bool hasSVE2orSME() const { return hasSVE2() || hasSME(); }
// Return the known range for the bit length of SVE data registers. A value
of 0 means nothing is known about that particular limit beyond what's
diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll
new file mode 100644
index 00000000000000..18bb607a4133a9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-xar.ll
@@ -0,0 +1,230 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s -o - | FileCheck --check-prefix=SVE %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s -o - | FileCheck --check-prefix=SVE2 %s
+
+define <vscale x 2 x i64> @xar_nxv2i64_l(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; SVE-LABEL: xar_nxv2i64_l:
+; SVE: // %bb.0:
+; SVE-NEXT: eor z0.d, z0.d, z1.d
+; SVE-NEXT: lsr z1.d, z0.d, #4
+; SVE-NEXT: lsl z0.d, z0.d, #60
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv2i64_l:
+; SVE2: // %bb.0:
+; SVE2-NEXT: xar z0.d, z0.d, z1.d, #4
+; SVE2-NEXT: ret
+ %a = xor <vscale x 2 x i64> %x, %y
+ %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 60))
+ ret <vscale x 2 x i64> %b
+}
+
+define <vscale x 2 x i64> @xar_nxv2i64_r(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; SVE-LABEL: xar_nxv2i64_r:
+; SVE: // %bb.0:
+; SVE-NEXT: eor z0.d, z0.d, z1.d
+; SVE-NEXT: lsl z1.d, z0.d, #60
+; SVE-NEXT: lsr z0.d, z0.d, #4
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv2i64_r:
+; SVE2: // %bb.0:
+; SVE2-NEXT: xar z0.d, z0.d, z1.d, #4
+; SVE2-NEXT: ret
+ %a = xor <vscale x 2 x i64> %x, %y
+ %b = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 4))
+ ret <vscale x 2 x i64> %b
+}
+
+
+define <vscale x 4 x i32> @xar_nxv4i32_l(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
+; SVE-LABEL: xar_nxv4i32_l:
+; SVE: // %bb.0:
+; SVE-NEXT: eor z0.d, z0.d, z1.d
+; SVE-NEXT: lsr z1.s, z0.s, #4
+; SVE-NEXT: lsl z0.s, z0.s, #28
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv4i32_l:
+; SVE2: // %bb.0:
+; SVE2-NEXT: xar z0.s, z0.s, z1.s, #4
+; SVE2-NEXT: ret
+ %a = xor <vscale x 4 x i32> %x, %y
+ %b = call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 28))
+ ret <vscale x 4 x i32> %b
+}
+
+define <vscale x 4 x i32> @xar_nxv4i32_r(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
+; SVE-LABEL: xar_nxv4i32_r:
+; SVE: // %bb.0:
+; SVE-NEXT: eor z0.d, z0.d, z1.d
+; SVE-NEXT: lsl z1.s, z0.s, #28
+; SVE-NEXT: lsr z0.s, z0.s, #4
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv4i32_r:
+; SVE2: // %bb.0:
+; SVE2-NEXT: xar z0.s, z0.s, z1.s, #4
+; SVE2-NEXT: ret
+ %a = xor <vscale x 4 x i32> %x, %y
+ %b = call <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 4))
+ ret <vscale x 4 x i32> %b
+}
+
+define <vscale x 8 x i16> @xar_nxv8i16_l(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
+; SVE-LABEL: xar_nxv8i16_l:
+; SVE: // %bb.0:
+; SVE-NEXT: eor z0.d, z0.d, z1.d
+; SVE-NEXT: lsr z1.h, z0.h, #4
+; SVE-NEXT: lsl z0.h, z0.h, #12
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv8i16_l:
+; SVE2: // %bb.0:
+; SVE2-NEXT: xar z0.h, z0.h, z1.h, #4
+; SVE2-NEXT: ret
+ %a = xor <vscale x 8 x i16> %x, %y
+ %b = call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 12))
+ ret <vscale x 8 x i16> %b
+}
+
+define <vscale x 8 x i16> @xar_nxv8i16_r(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
+; SVE-LABEL: xar_nxv8i16_r:
+; SVE: // %bb.0:
+; SVE-NEXT: eor z0.d, z0.d, z1.d
+; SVE-NEXT: lsl z1.h, z0.h, #12
+; SVE-NEXT: lsr z0.h, z0.h, #4
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv8i16_r:
+; SVE2: // %bb.0:
+; SVE2-NEXT: xar z0.h, z0.h, z1.h, #4
+; SVE2-NEXT: ret
+ %a = xor <vscale x 8 x i16> %x, %y
+ %b = call <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 4))
+ ret <vscale x 8 x i16> %b
+}
+
+define <vscale x 16 x i8> @xar_nxv16i8_l(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
+; SVE-LABEL: xar_nxv16i8_l:
+; SVE: // %bb.0:
+; SVE-NEXT: eor z0.d, z0.d, z1.d
+; SVE-NEXT: lsr z1.b, z0.b, #4
+; SVE-NEXT: lsl z0.b, z0.b, #4
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv16i8_l:
+; SVE2: // %bb.0:
+; SVE2-NEXT: xar z0.b, z0.b, z1.b, #4
+; SVE2-NEXT: ret
+ %a = xor <vscale x 16 x i8> %x, %y
+ %b = call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> splat (i8 4))
+ ret <vscale x 16 x i8> %b
+}
+
+define <vscale x 16 x i8> @xar_nxv16i8_r(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
+; SVE-LABEL: xar_nxv16i8_r:
+; SVE: // %bb.0:
+; SVE-NEXT: eor z0.d, z0.d, z1.d
+; SVE-NEXT: lsl z1.b, z0.b, #4
+; SVE-NEXT: lsr z0.b, z0.b, #4
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv16i8_r:
+; SVE2: // %bb.0:
+; SVE2-NEXT: xar z0.b, z0.b, z1.b, #4
+; SVE2-NEXT: ret
+ %a = xor <vscale x 16 x i8> %x, %y
+ %b = call <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> splat (i8 4))
+ ret <vscale x 16 x i8> %b
+}
+
+; Shift is not a constant.
+define <vscale x 2 x i64> @xar_nxv2i64_l_neg1(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i64> %z) {
+; SVE-LABEL: xar_nxv2i64_l_neg1:
+; SVE: // %bb.0:
+; SVE-NEXT: mov z3.d, z2.d
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: subr z2.d, z2.d, #0 // =0x0
+; SVE-NEXT: eor z0.d, z0.d, z1.d
+; SVE-NEXT: and z2.d, z2.d, #0x3f
+; SVE-NEXT: and z3.d, z3.d, #0x3f
+; SVE-NEXT: movprfx z1, z0
+; SVE-NEXT: lsl z1.d, p0/m, z1.d, z3.d
+; SVE-NEXT: lsr z0.d, p0/m, z0.d, z2.d
+; SVE-NEXT: orr z0.d, z1.d, z0.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv2i64_l_neg1:
+; SVE2: // %bb.0:
+; SVE2-NEXT: mov z3.d, z2.d
+; SVE2-NEXT: ptrue p0.d
+; SVE2-NEXT: subr z2.d, z2.d, #0 // =0x0
+; SVE2-NEXT: eor z0.d, z0.d, z1.d
+; SVE2-NEXT: and z2.d, z2.d, #0x3f
+; SVE2-NEXT: and z3.d, z3.d, #0x3f
+; SVE2-NEXT: movprfx z1, z0
+; SVE2-NEXT: lsl z1.d, p0/m, z1.d, z3.d
+; SVE2-NEXT: lsr z0.d, p0/m, z0.d, z2.d
+; SVE2-NEXT: orr z0.d, z1.d, z0.d
+; SVE2-NEXT: ret
+ %a = xor <vscale x 2 x i64> %x, %y
+ %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> %z)
+ ret <vscale x 2 x i64> %b
+}
+
+; OR instead of an XOR.
+; TODO: We could use the usra instruction here for SVE2.
+define <vscale x 2 x i64> @xar_nxv2i64_l_neg2(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; SVE-LABEL: xar_nxv2i64_l_neg2:
+; SVE: // %bb.0:
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: lsr z1.d, z0.d, #4
+; SVE-NEXT: lsl z0.d, z0.d, #60
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv2i64_l_neg2:
+; SVE2: // %bb.0:
+; SVE2-NEXT: orr z0.d, z0.d, z1.d
+; SVE2-NEXT: lsr z1.d, z0.d, #4
+; SVE2-NEXT: lsl z0.d, z0.d, #60
+; SVE2-NEXT: orr z0.d, z0.d, z1.d
+; SVE2-NEXT: ret
+ %a = or <vscale x 2 x i64> %x, %y
+ %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 60))
+ ret <vscale x 2 x i64> %b
+}
+
+; Rotate amount is 0.
+define <vscale x 2 x i64> @xar_nxv2i64_l_neg3(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; SVE-LABEL: xar_nxv2i64_l_neg3:
+; SVE: // %bb.0:
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: xar_nxv2i64_l_neg3:
+; SVE2: // %bb.0:
+; SVE2-NEXT: orr z0.d, z0.d, z1.d
+; SVE2-NEXT: ret
+ %a = or <vscale x 2 x i64> %x, %y
+ %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 64))
+ ret <vscale x 2 x i64> %b
+}
+
+declare <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
>From 9ee96d841aea8754720ab907983b84760240a367 Mon Sep 17 00:00:00 2001
From: "Nadeem, Usman" <mnadeem at quicinc.com>
Date: Mon, 8 Jan 2024 18:45:49 -0800
Subject: [PATCH 4/4] fixup! [AArch64][SVE2] Generate XAR
---
llvm/test/CodeGen/AArch64/sve2-xar.ll | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll
index 18bb607a4133a9..6b10732b2f7dba 100644
--- a/llvm/test/CodeGen/AArch64/sve2-xar.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-xar.ll
@@ -208,14 +208,14 @@ define <vscale x 2 x i64> @xar_nxv2i64_l_neg2(<vscale x 2 x i64> %x, <vscale x 2
define <vscale x 2 x i64> @xar_nxv2i64_l_neg3(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
; SVE-LABEL: xar_nxv2i64_l_neg3:
; SVE: // %bb.0:
-; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: eor z0.d, z0.d, z1.d
; SVE-NEXT: ret
;
; SVE2-LABEL: xar_nxv2i64_l_neg3:
; SVE2: // %bb.0:
-; SVE2-NEXT: orr z0.d, z0.d, z1.d
+; SVE2-NEXT: eor z0.d, z0.d, z1.d
; SVE2-NEXT: ret
- %a = or <vscale x 2 x i64> %x, %y
+ %a = xor <vscale x 2 x i64> %x, %y
%b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 64))
ret <vscale x 2 x i64> %b
}