[llvm] 18af853 - [AArch64] Remove 64bit->128bit vector insert lowering
David Green via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 1 01:39:57 PST 2023
Author: David Green
Date: 2023-03-01T09:39:51Z
New Revision: 18af85302200580ca4fc54bfee2a8a0ec2c8d7b9
URL: https://github.com/llvm/llvm-project/commit/18af85302200580ca4fc54bfee2a8a0ec2c8d7b9
DIFF: https://github.com/llvm/llvm-project/commit/18af85302200580ca4fc54bfee2a8a0ec2c8d7b9.diff
LOG: [AArch64] Remove 64bit->128bit vector insert lowering
The AArch64 backend, during lowering, will convert a 64bit vector insert into an
insert on a 128bit vector:
vector_insert %dreg, %v, %idx
=>
%qreg = insert_subvector undef, %dreg, 0
%ins = vector_insert %qreg, %v, %idx
EXTRACT_SUBREG %ins, dsub
This creates a bit of a mess in the DAG, and the EXTRACT_SUBREG, being a machine
node, makes it difficult to simplify. This patch removes that lowering, treating
64bit vector inserts as legal and handling them with extra tablegen patterns.
The end result is a simpler DAG that is easier to write tablegen patterns for.
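As a minimal illustrative example (hypothetical, not taken from the patch or its
tests), a 64bit insert such as the following now stays as a v4i16 vector_insert
in the DAG and is matched directly by the new patterns:

; hypothetical example, not from the patch's tests
define <4 x i16> @insert_v4i16(<4 x i16> %v, i16 %s) {
  %r = insertelement <4 x i16> %v, i16 %s, i64 2
  ret <4 x i16> %r
}

This still selects to an INS on the widened register, but the widening is now
expressed as INSERT_SUBREG/EXTRACT_SUBREG in the pattern output at instruction
selection time rather than as extra nodes in the DAG.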
Differential Revision: https://reviews.llvm.org/D144550
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d8b692639507e..ca77cbc471790 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10327,18 +10327,6 @@ static unsigned getExtFactor(SDValue &V) {
return EltType.getSizeInBits() / 8;
}
-/// NarrowVector - Given a value in the V128 register class, produce the
-/// equivalent value in the V64 register class.
-static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
- EVT VT = V128Reg.getValueType();
- unsigned WideSize = VT.getVectorNumElements();
- MVT EltTy = VT.getVectorElementType().getSimpleVT();
- MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
- SDLoc DL(V128Reg);
-
- return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
-}
-
// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
@@ -12594,7 +12582,6 @@ SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
Subtarget->forceStreamingCompatibleSVE()))
return LowerFixedLengthInsertVectorElt(Op, DAG);
- // Check for non-constant or out of range lane.
EVT VT = Op.getOperand(0).getValueType();
if (VT.getScalarType() == MVT::i1) {
@@ -12613,31 +12600,12 @@ SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
}
+ // Check for non-constant or out of range lane.
ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
return SDValue();
- // Insertion/extraction are legal for V128 types.
- if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
- VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
- VT == MVT::v8f16 || VT == MVT::v8bf16)
- return Op;
-
- if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
- VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
- VT != MVT::v4bf16)
- return SDValue();
-
- // For V64 types, we perform insertion by expanding the value
- // to a V128 type and perform the insertion on that.
- SDLoc DL(Op);
- SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
- EVT WideTy = WideVec.getValueType();
-
- SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
- Op.getOperand(1), Op.getOperand(2));
- // Re-narrow the resultant vector.
- return NarrowVector(Node, DAG);
+ return Op;
}
SDValue
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 1226164fc54a2..d5e194a9ddc95 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5932,14 +5932,15 @@ def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
(i64 0)),
dsub)>;
-def : Pat<(vector_insert (v8f16 v8f16:$Rn), (f16 fpimm0),
- (i64 VectorIndexH:$imm)),
+def : Pat<(vector_insert (v8f16 V128:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
(INSvi16gpr V128:$Rn, VectorIndexH:$imm, WZR)>;
-def : Pat<(vector_insert v4f32:$Rn, (f32 fpimm0),
- (i64 VectorIndexS:$imm)),
+def : Pat<(vector_insert (v4f16 V64:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
+ (EXTRACT_SUBREG (INSvi16gpr (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexH:$imm, WZR), dsub)>;
+def : Pat<(vector_insert (v4f32 V128:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm)),
(INSvi32gpr V128:$Rn, VectorIndexS:$imm, WZR)>;
-def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0),
- (i64 VectorIndexD:$imm)),
+def : Pat<(vector_insert (v2f32 V64:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm)),
+ (EXTRACT_SUBREG (INSvi32gpr (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexS:$imm, WZR), dsub)>;
+def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0), (i64 VectorIndexD:$imm)),
(INSvi64gpr V128:$Rn, VectorIndexS:$imm, XZR)>;
def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
@@ -5988,6 +5989,22 @@ def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn),
(v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)),
(i64 0))>;
+def : Pat<(v2i32 (vector_insert (v2i32 V64:$Rn), (i32 GPR32:$Rm), (i64 VectorIndexS:$imm))),
+ (EXTRACT_SUBREG
+ (INSvi32gpr (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+ VectorIndexS:$imm, GPR32:$Rm),
+ dsub)>;
+def : Pat<(v4i16 (vector_insert (v4i16 V64:$Rn), (i32 GPR32:$Rm), (i64 VectorIndexH:$imm))),
+ (EXTRACT_SUBREG
+ (INSvi16gpr (v8i16 (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+ VectorIndexH:$imm, GPR32:$Rm),
+ dsub)>;
+def : Pat<(v8i8 (vector_insert (v8i8 V64:$Rn), (i32 GPR32:$Rm), (i64 VectorIndexB:$imm))),
+ (EXTRACT_SUBREG
+ (INSvi8gpr (v16i8 (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+ VectorIndexB:$imm, GPR32:$Rm),
+ dsub)>;
+
// Copy an element at a constant index in one vector into a constant indexed
// element of another.
// FIXME refactor to a shared class/dev parameterized on vector type, vector
@@ -6051,10 +6068,20 @@ defm : Neon_INS_elt_pattern<v8bf16, v4bf16, bf16, INSvi16lane>;
defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;
+defm : Neon_INS_elt_pattern<v16i8, v8i8, i32, INSvi8lane>;
+defm : Neon_INS_elt_pattern<v8i16, v4i16, i32, INSvi16lane>;
+defm : Neon_INS_elt_pattern<v4i32, v2i32, i32, INSvi32lane>;
+defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, INSvi64lane>;
+
// Insert from bitcast
// vector_insert(bitcast(f32 src), n, lane) -> INSvi32lane(src, lane, INSERT_SUBREG(-, n), 0)
def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), imm:$Immd)),
(INSvi32lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$Sn, ssub), 0)>;
+def : Pat<(v2i32 (vector_insert v2i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), imm:$Immd)),
+ (EXTRACT_SUBREG
+ (INSvi32lane (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$src, dsub)),
+ imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$Sn, ssub), 0),
+ dsub)>;
def : Pat<(v2i64 (vector_insert v2i64:$src, (i64 (bitconvert (f64 FPR64:$Sn))), imm:$Immd)),
(INSvi64lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$Sn, dsub), 0)>;
@@ -7283,12 +7310,22 @@ def : Ld1Lane128Pat<load, VectorIndexH, v8bf16, bf16, LD1i16>;
// In this case, the index must be adjusted to match LD1 type.
//
class Ld1Lane128IdxOpPat<SDPatternOperator scalar_load, Operand
- VecIndex, ValueType VTy, ValueType STy,
- Instruction LD1, SDNodeXForm IdxOp>
+ VecIndex, ValueType VTy, ValueType STy,
+ Instruction LD1, SDNodeXForm IdxOp>
: Pat<(vector_insert (VTy VecListOne128:$Rd),
(STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
(LD1 VecListOne128:$Rd, (IdxOp VecIndex:$idx), GPR64sp:$Rn)>;
+class Ld1Lane64IdxOpPat<SDPatternOperator scalar_load, Operand VecIndex,
+ ValueType VTy, ValueType STy, Instruction LD1,
+ SDNodeXForm IdxOp>
+ : Pat<(vector_insert (VTy VecListOne64:$Rd),
+ (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
+ (EXTRACT_SUBREG
+ (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub),
+ (IdxOp VecIndex:$idx), GPR64sp:$Rn),
+ dsub)>;
+
def VectorIndexStoH : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
}]>;
@@ -7303,6 +7340,10 @@ def : Ld1Lane128IdxOpPat<extloadi16, VectorIndexS, v4i32, i32, LD1i16, VectorInd
def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexS, v4i32, i32, LD1i8, VectorIndexStoB>;
def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexH, v8i16, i32, LD1i8, VectorIndexHtoB>;
+def : Ld1Lane64IdxOpPat<extloadi16, VectorIndexS, v2i32, i32, LD1i16, VectorIndexStoH>;
+def : Ld1Lane64IdxOpPat<extloadi8, VectorIndexS, v2i32, i32, LD1i8, VectorIndexStoB>;
+def : Ld1Lane64IdxOpPat<extloadi8, VectorIndexH, v4i16, i32, LD1i8, VectorIndexHtoB>;
+
// Same as above, but the first element is populated using
// scalar_to_vector + insert_subvector instead of insert_vector_elt.
let Predicates = [NotInStreamingSVEMode] in {
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
index 213c59d9526ab..66333e9b6a2fd 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
@@ -5,12 +5,12 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
; CHECK-LABEL: vector_deinterleave_v2f16_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: dup v1.2s, v0.s[1]
-; CHECK-NEXT: mov v2.16b, v0.16b
-; CHECK-NEXT: mov v2.h[1], v1.h[0]
+; CHECK-NEXT: dup v2.2s, v0.s[1]
+; CHECK-NEXT: mov v1.16b, v2.16b
; CHECK-NEXT: mov v1.h[0], v0.h[1]
+; CHECK-NEXT: mov v0.h[1], v2.h[0]
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT: fmov d0, d2
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec)
ret {<2 x half>, <2 x half>} %retval
diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index 13143041101c7..4bd190bf751ed 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -1021,8 +1021,8 @@ define <4 x i16> @vselect_equivalent_shuffle_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.h[2], v1.h[1]
; CHECK-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-NEXT: mov v0.h[2], v1.h[1]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%c = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 3>