[llvm] 23d5e93 - [AArch64] Optimize instruction selection for certain vector shuffles

Mikhail Maltsev via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 27 03:07:26 PDT 2020


Author: Mikhail Maltsev
Date: 2020-08-27T11:06:49+01:00
New Revision: 23d5e93f342e168b59838476abc0e03853609617

URL: https://github.com/llvm/llvm-project/commit/23d5e93f342e168b59838476abc0e03853609617
DIFF: https://github.com/llvm/llvm-project/commit/23d5e93f342e168b59838476abc0e03853609617.diff

LOG: [AArch64] Optimize instruction selection for certain vector shuffles

This patch adds code to recognize vector shuffles that can be
represented as a VDUP (splat) of a vector lane of a different (wider)
type than the original vector lane type.

For example:
    shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
is essentially:
    shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 0, i32 0>

Such patterns are generated by the SelectionDAG machinery in some cases
(see DAGCombiner::visitBITCAST in DAGCombiner.cpp, the "Remove double
bitcasts from shuffles" part).
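
A minimal illustration of the resulting codegen (the function name is
hypothetical; the expected instruction follows the shuffle1/shuffle4 cases
in the test added by this patch):

    define <4 x i16> @wide_splat(<4 x i16> %v) {
      ; with this patch the shuffle lowers to a single wide-lane splat:
      ;   dup v0.2s, v0.s[0]
      %res = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
      ret <4 x i16> %res
    }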

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D86225

Added: 
    llvm/test/CodeGen/AArch64/neon-wide-splat.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
    llvm/test/CodeGen/AArch64/arm64-vext_reverse.ll
    llvm/test/CodeGen/AArch64/neon-extract.ll
    llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4d8c4de8ed1f..b04d245ac7a0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7381,6 +7381,81 @@ static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
   return true;
 }
 
+/// Check if a vector shuffle corresponds to a DUP instruction with a larger
+/// element width than the vector lane type. If that is the case, the function
+/// returns true and writes the value of the DUP instruction lane operand into
+/// DupLaneOp.
+static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
+                          unsigned &DupLaneOp) {
+  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
+         "Only possible block sizes for wide DUP are: 16, 32, 64");
+
+  if (BlockSize <= VT.getScalarSizeInBits())
+    return false;
+  if (BlockSize % VT.getScalarSizeInBits() != 0)
+    return false;
+  if (VT.getSizeInBits() % BlockSize != 0)
+    return false;
+
+  size_t SingleVecNumElements = VT.getVectorNumElements();
+  size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
+  size_t NumBlocks = VT.getSizeInBits() / BlockSize;
+
+  // We are looking for masks like
+  // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
+  // might be replaced by 'undefined'. BlockElts will eventually contain
+  // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
+  // for the above examples)
+  SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
+  for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
+    for (size_t I = 0; I < NumEltsPerBlock; I++) {
+      int Elt = M[BlockIndex * NumEltsPerBlock + I];
+      if (Elt < 0)
+        continue;
+      // For now we don't support shuffles that use the second operand
+      if ((unsigned)Elt >= SingleVecNumElements)
+        return false;
+      if (BlockElts[I] < 0)
+        BlockElts[I] = Elt;
+      else if (BlockElts[I] != Elt)
+        return false;
+    }
+
+  // We found a candidate block (possibly with some undefs). It must be a
+  // sequence of consecutive integers starting with a value divisible by
+  // NumEltsPerBlock with some values possibly replaced by undef-s.
+
+  // Find first non-undef element
+  auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
+  assert(FirstRealEltIter != BlockElts.end() &&
+         "Shuffle with all-undefs must have been caught by previous cases, "
+         "e.g. isSplat()");
+  if (FirstRealEltIter == BlockElts.end()) {
+    DupLaneOp = 0;
+    return true;
+  }
+
+  // Index of FirstRealElt in BlockElts
+  size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
+
+  if ((unsigned)*FirstRealEltIter < FirstRealIndex)
+    return false;
+  // BlockElts[0] must have the following value if it isn't undef:
+  size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
+
+  // Check the first element
+  if (Elt0 % NumEltsPerBlock != 0)
+    return false;
+  // Check that the sequence indeed consists of consecutive integers (modulo
+  // undefs)
+  for (size_t I = 0; I < NumEltsPerBlock; I++)
+    if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
+      return false;
+
+  DupLaneOp = Elt0 / NumEltsPerBlock;
+  return true;
+}
+
 // check if an EXT instruction can handle the shuffle mask when the
 //  vector sources of the shuffle are different.
 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
@@ -7814,6 +7889,60 @@ static unsigned getDUPLANEOp(EVT EltType) {
   llvm_unreachable("Invalid vector element type?");
 }
 
+static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
+                            unsigned Opcode, SelectionDAG &DAG) {
+  // Try to eliminate a bitcasted extract subvector before a DUPLANE.
+  auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
+    // Match: dup (bitcast (extract_subv X, C)), LaneC
+    if (BitCast.getOpcode() != ISD::BITCAST ||
+        BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
+      return false;
+
+    // The extract index must align in the destination type. That may not
+    // happen if the bitcast is from narrow to wide type.
+    SDValue Extract = BitCast.getOperand(0);
+    unsigned ExtIdx = Extract.getConstantOperandVal(1);
+    unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
+    unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
+    unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
+    if (ExtIdxInBits % CastedEltBitWidth != 0)
+      return false;
+
+    // Update the lane value by offsetting with the scaled extract index.
+    LaneC += ExtIdxInBits / CastedEltBitWidth;
+
+    // Determine the casted vector type of the wide vector input.
+    // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
+    // Examples:
+    // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
+    // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
+    unsigned SrcVecNumElts =
+        Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
+    CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
+                              SrcVecNumElts);
+    return true;
+  };
+  MVT CastVT;
+  if (getScaledOffsetDup(V, Lane, CastVT)) {
+    V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
+  } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+    // The lane is incremented by the index of the extract.
+    // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
+    Lane += V.getConstantOperandVal(1);
+    V = V.getOperand(0);
+  } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
+    // The lane is decremented if we are splatting from the 2nd operand.
+    // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
+    unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
+    Lane -= Idx * VT.getVectorNumElements() / 2;
+    V = WidenVector(V.getOperand(Idx), DAG);
+  } else if (VT.getSizeInBits() == 64) {
+    // Widen the operand to 128-bit register with undef.
+    V = WidenVector(V, DAG);
+  }
+  return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
+}
+
 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                    SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -7847,57 +7976,26 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
 
     // Otherwise, duplicate from the lane of the input vector.
     unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
-
-    // Try to eliminate a bitcasted extract subvector before a DUPLANE.
-    auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
-      // Match: dup (bitcast (extract_subv X, C)), LaneC
-      if (BitCast.getOpcode() != ISD::BITCAST ||
-          BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
-        return false;
-
-      // The extract index must align in the destination type. That may not
-      // happen if the bitcast is from narrow to wide type.
-      SDValue Extract = BitCast.getOperand(0);
-      unsigned ExtIdx = Extract.getConstantOperandVal(1);
-      unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
-      unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
-      unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
-      if (ExtIdxInBits % CastedEltBitWidth != 0)
-        return false;
-
-      // Update the lane value by offsetting with the scaled extract index.
-      LaneC += ExtIdxInBits / CastedEltBitWidth;
-
-      // Determine the casted vector type of the wide vector input.
-      // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
-      // Examples:
-      // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
-      // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
-      unsigned SrcVecNumElts =
-          Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
-      CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
-                                SrcVecNumElts);
-      return true;
-    };
-    MVT CastVT;
-    if (getScaledOffsetDup(V1, Lane, CastVT)) {
-      V1 = DAG.getBitcast(CastVT, V1.getOperand(0).getOperand(0));
-    } else if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
-      // The lane is incremented by the index of the extract.
-      // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
-      Lane += V1.getConstantOperandVal(1);
-      V1 = V1.getOperand(0);
-    } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
-      // The lane is decremented if we are splatting from the 2nd operand.
-      // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
-      unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
-      Lane -= Idx * VT.getVectorNumElements() / 2;
-      V1 = WidenVector(V1.getOperand(Idx), DAG);
-    } else if (VT.getSizeInBits() == 64) {
-      // Widen the operand to 128-bit register with undef.
-      V1 = WidenVector(V1, DAG);
-    }
-    return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
+    return constructDup(V1, Lane, dl, VT, Opcode, DAG);
+  }
+
+  // Check if the mask matches a DUP for a wider element
+  for (unsigned LaneSize : {64U, 32U, 16U}) {
+    unsigned Lane = 0;
+    if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
+      unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
+                                       : LaneSize == 32 ? AArch64ISD::DUPLANE32
+                                                        : AArch64ISD::DUPLANE16;
+      // Cast V1 to an integer vector with the required lane size
+      MVT NewEltTy = MVT::getIntegerVT(LaneSize);
+      unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
+      MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
+      V1 = DAG.getBitcast(NewVecTy, V1);
+      // Construct the DUP instruction
+      V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
+      // Cast back to the original type
+      return DAG.getBitcast(VT, V1);
+    }
   }
 
   if (isREVMask(ShuffleMask, VT, 64))

diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
index eee0d77d98eb..b38b91e9d705 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -1966,7 +1966,7 @@ define <4 x i16> @test_vadd_laneq5_i16_bitcast(<4 x i16> %a, <2 x double> %v) {
 define <4 x i16> @test_vadd_lane2_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x i8> %v) {
 ; CHECK-LABEL: test_vadd_lane2_i16_bitcast_bigger_aligned:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.8b, v1.8b, v0.8b, #2
+; CHECK-NEXT:    dup v1.4h, v1.h[2]
 ; CHECK-NEXT:    dup v1.4h, v1.h[1]
 ; CHECK-NEXT:    add v0.4h, v1.4h, v0.4h
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/arm64-vext_reverse.ll b/llvm/test/CodeGen/AArch64/arm64-vext_reverse.ll
index c45e55edeca5..c51ea172232a 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vext_reverse.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vext_reverse.ll
@@ -14,7 +14,7 @@ entry:
 define <4 x i16> @vext_6701_12(<4 x i16> %a1, <4 x i16> %a2) {
 entry:
 ; CHECK-LABEL: vext_6701_12:
-; CHECK: ext	v0.8b, v0.8b, v0.8b, #4
+; CHECK: dup v0.2s, v0.s[0]
   %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
   ret <4 x i16> %x
 }
@@ -54,7 +54,7 @@ entry:
 define <4 x i16> @vext_6701_34(<4 x i16> %a1, <4 x i16> %a2) {
 entry:
 ; CHECK-LABEL: vext_6701_34:
-; CHECK: ext	v0.8b, v1.8b, v0.8b, #4
+; CHECK: dup  v0.2s, v1.s[1]
   %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 6, i32 7, i32 undef, i32 undef>
   ret <4 x i16> %x
 }

diff --git a/llvm/test/CodeGen/AArch64/neon-extract.ll b/llvm/test/CodeGen/AArch64/neon-extract.ll
index c159da1e9d18..0cac89424642 100644
--- a/llvm/test/CodeGen/AArch64/neon-extract.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extract.ll
@@ -209,7 +209,7 @@ entry:
 
 define <4 x i16> @test_undef_vext_s16(<4 x i16> %a) {
 ; CHECK-LABEL: test_undef_vext_s16:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x4|4}}
+; CHECK: dup v{{[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 entry:
   %vext = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   ret <4 x i16> %vext

diff --git a/llvm/test/CodeGen/AArch64/neon-wide-splat.ll b/llvm/test/CodeGen/AArch64/neon-wide-splat.ll
new file mode 100644
index 000000000000..6290f85dc1ce
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-wide-splat.ll
@@ -0,0 +1,122 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <4 x i16> @shuffle1(<4 x i16> %v) {
+; CHECK-LABEL: shuffle1:
+; CHECK:         dup v0.2s, v0.s[0]
+; CHECK-NEXT:    ret
+entry:
+  %res = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 1>
+  ret <4 x i16> %res
+}
+
+define <4 x i16> @shuffle2(<4 x i16> %v) {
+; CHECK-LABEL: shuffle2:
+; CHECK:         dup v0.2s, v0.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %res = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 3>
+  ret <4 x i16> %res
+}
+
+define <8 x i16> @shuffle3(<8 x i16> %v) {
+; CHECK-LABEL: shuffle3:
+; CHECK:         dup v0.2d, v0.d[0]
+; CHECK-NEXT:    ret
+entry:
+  %res = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 2, i32 3,
+                                                                 i32 undef, i32 1, i32 undef, i32 3>
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @shuffle4(<4 x i32> %v) {
+; CHECK-LABEL: shuffle4:
+; CHECK:         dup v0.2d, v0.d[0]
+; CHECK-NEXT:    ret
+entry:
+  %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  ret <4 x i32> %res
+}
+
+define <16 x i8> @shuffle5(<16 x i8> %v) {
+; CHECK-LABEL: shuffle5:
+; CHECK:         dup v0.4s, v0.s[2]
+; CHECK-NEXT:    ret
+entry:
+  %res = shufflevector <16 x i8> %v, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11,
+                                                                  i32 8, i32 9, i32 10, i32 11,
+                                                                  i32 8, i32 9, i32 10, i32 11,
+                                                                  i32 8, i32 9, i32 10, i32 11>
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @shuffle6(<16 x i8> %v) {
+; CHECK-LABEL: shuffle6:
+; CHECK:         dup v0.2d, v0.d[1]
+; CHECK-NEXT:    ret
+entry:
+  %res = shufflevector <16 x i8> %v, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11,
+                                                                  i32 12, i32 13, i32 14, i32 15,
+                                                                  i32 8, i32 9, i32 10, i32 11,
+                                                                  i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %res
+}
+
+define <8 x i8> @shuffle7(<8 x i8> %v) {
+; CHECK-LABEL: shuffle7:
+; CHECK:         dup v0.2s, v0.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %res = shufflevector <8 x i8> %v, <8 x i8> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 undef,
+                                                               i32 undef, i32 5, i32 6, i32 undef>
+  ret <8 x i8> %res
+}
+
+define <8 x i8> @shuffle8(<8 x i8> %v) {
+; CHECK-LABEL: shuffle8:
+; CHECK:         dup v0.4h, v0.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %res = shufflevector <8 x i8> %v, <8 x i8> undef, <8 x i32> <i32 6, i32 7, i32 6, i32 undef,
+                                                               i32 undef, i32 7, i32 6, i32 undef>
+  ret <8 x i8> %res
+}
+
+; No blocks
+define <8 x i8> @shuffle_not1(<16 x i8> %v) {
+; CHECK-LABEL: shuffle_not1:
+; CHECK:         ext v0.16b, v0.16b, v0.16b, #2
+  %res = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+  ret <8 x i8> %res
+}
+
+; Block is not a proper lane
+define <4 x i32> @shuffle_not2(<4 x i32> %v) {
+; CHECK-LABEL: shuffle_not2:
+; CHECK-NOT:     dup
+; CHECK:         ext
+; CHECK:         ret
+entry:
+  %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 2>
+  ret <4 x i32> %res
+}
+
+; Block size is equal to vector size
+define <4 x i16> @shuffle_not3(<4 x i16> %v) {
+; CHECK-LABEL: shuffle_not3:
+; CHECK-NOT:     dup
+; CHECK:         ret
+entry:
+  %res = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i16> %res
+}
+
+; Blocks mismatch
+define <8 x i8> @shuffle_not4(<8 x i8> %v) {
+; CHECK-LABEL: shuffle_not4:
+; CHECK-NOT:     dup
+; CHECK:         ret
+entry:
+  %res = shufflevector <8 x i8> %v, <8 x i8> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 undef,
+                                                               i32 undef, i32 5, i32 5, i32 undef>
+  ret <8 x i8> %res
+}

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
index c669a55519d8..4d888317b343 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
@@ -77,7 +77,7 @@ define float @test_v16f32(<16 x float> %a) nounwind {
 ; CHECK-NEXT:    fmaxnm v1.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    fmaxnm v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    dup v1.2d, v0.d[1]
 ; CHECK-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    dup v1.4s, v0.s[1]
 ; CHECK-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s


        

