[llvm] a255931 - [ARM] Supporting lowering of half-precision FP arguments and returns in AArch32's backend

Lucas Prates via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 18 05:15:20 PDT 2020


Author: Lucas Prates
Date: 2020-06-18T13:15:13+01:00
New Revision: a255931c40558edf87994c2a8ed9b274c3fbda30

URL: https://github.com/llvm/llvm-project/commit/a255931c40558edf87994c2a8ed9b274c3fbda30
DIFF: https://github.com/llvm/llvm-project/commit/a255931c40558edf87994c2a8ed9b274c3fbda30.diff

LOG: [ARM] Supporting lowering of half-precision FP arguments and returns in AArch32's backend

Summary:
Half-precision floating-point arguments and returns are currently
promoted to either float or int32 in clang's CodeGen, and there is no
existing support for lowering `half` arguments and returns from IR in
the AArch32 backend.

These frontend coercions, implemented as coercion through memory in
clang, can cause a series of issues in argument lowering, such as
arguments being stored in the wrong bits on big-endian architectures
and missed overflow detection in the returns of certain functions.
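
For illustration, the coerced IR clang currently emits for a function
adding two `half` values looks roughly like this (a sketch following
the pattern removed from the fp16-args.ll test below; names are
illustrative):

    define float @add_coerced(float %a.coerce, float %b.coerce) {
    entry:
      ; Unpack the half values from the low bits of the coerced floats.
      %0 = bitcast float %a.coerce to i32
      %a.trunc = trunc i32 %0 to i16
      %a = bitcast i16 %a.trunc to half
      %1 = bitcast float %b.coerce to i32
      %b.trunc = trunc i32 %1 to i16
      %b = bitcast i16 %b.trunc to half
      %res = fadd half %a, %b
      ; Repack the result into the low bits of a float for the return.
      %2 = bitcast half %res to i16
      %res.ext = zext i16 %2 to i32
      %3 = bitcast i32 %res.ext to float
      ret float %3
    }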

This patch introduces handling of half-precision arguments and returns
in the backend using the actual "half" type in the IR. Using the "half"
type, the backend is able to properly enforce the AAPCS rules for those
arguments, making sure they are stored in the proper bits of the
registers and performing the necessary floating-point conversions.
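
With this change the same function can be expressed directly on the
`half` type, leaving the AAPCS details to the backend, as in the
updated fp16-args.ll test:

    define half @add(half %a, half %b) {
    entry:
      %res = fadd half %a, %b
      ret half %res
    }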

Reviewers: rjmccall, olista01, asl, efriedma, ostannard, SjoerdMeijer

Reviewed By: ostannard

Subscribers: stuij, hiraditya, dmgreen, llvm-commits, chill, dnsampaio, danielkiss, kristof.beyls, cfe-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D75169

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
    llvm/lib/Target/ARM/ARMCallLowering.cpp
    llvm/lib/Target/ARM/ARMCallingConv.cpp
    llvm/lib/Target/ARM/ARMCallingConv.td
    llvm/lib/Target/ARM/ARMISelLowering.cpp
    llvm/lib/Target/ARM/ARMISelLowering.h
    llvm/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
    llvm/test/CodeGen/ARM/fp16-args.ll
    llvm/test/CodeGen/ARM/fp16-bitcast.ll
    llvm/test/CodeGen/ARM/fp16-promote.ll
    llvm/test/CodeGen/ARM/fp16-vminmaxnm-safe.ll
    llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll
    llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll
    llvm/test/CodeGen/Thumb2/mve-shuffle.ll
    llvm/test/CodeGen/Thumb2/mve-vdup.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 199ed0bc4501..a85a24f9405c 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3600,6 +3600,24 @@ class TargetLowering : public TargetLoweringBase {
   // the SelectionDAGBuilder code knows how to lower these.
   //
 
+  /// Target-specific splitting of values into parts that fit a register
+  /// storing a legal type
+  virtual bool splitValueIntoRegisterParts(SelectionDAG &DAG, const SDLoc &DL,
+                                           SDValue Val, SDValue *Parts,
+                                           unsigned NumParts, MVT PartVT,
+                                           Optional<CallingConv::ID> CC) const {
+    return false;
+  }
+
+  /// Target-specific combining of register parts into its original value
+  virtual SDValue
+  joinRegisterPartsIntoValue(SelectionDAG &DAG, const SDLoc &DL,
+                             const SDValue *Parts, unsigned NumParts,
+                             MVT PartVT, EVT ValueVT,
+                             Optional<CallingConv::ID> CC) const {
+    return SDValue();
+  }
+
   /// This hook must be implemented to lower the incoming (formal) arguments,
   /// described by the Ins array, into the specified DAG. The implementation
   /// should fill in the InVals array with legal-type argument values, and

diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 6ff869833b08..5bcc242d3528 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -298,7 +298,11 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
     assert(VA.getValNo() == i && "Location doesn't correspond to current arg");
 
     if (VA.needsCustom()) {
-      j += Handler.assignCustomValue(Args[i], makeArrayRef(ArgLocs).slice(j));
+      unsigned NumArgRegs =
+          Handler.assignCustomValue(Args[i], makeArrayRef(ArgLocs).slice(j));
+      if (!NumArgRegs)
+        return false;
+      j += NumArgRegs;
       continue;
     }
 

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 51b03aee92ec..1c6843489b7d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -205,12 +205,17 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL,
                                 MVT PartVT, EVT ValueVT, const Value *V,
                                 Optional<CallingConv::ID> CC = None,
                                 Optional<ISD::NodeType> AssertOp = None) {
+  // Let the target assemble the parts if it wants to
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (SDValue Val = TLI.joinRegisterPartsIntoValue(DAG, DL, Parts, NumParts,
+                                                   PartVT, ValueVT, CC))
+    return Val;
+
   if (ValueVT.isVector())
     return getCopyFromPartsVector(DAG, DL, Parts, NumParts, PartVT, ValueVT, V,
                                   CC);
 
   assert(NumParts > 0 && "No parts to assemble!");
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue Val = Parts[0];
 
   if (NumParts > 1) {
@@ -512,6 +517,11 @@ static void getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
                            const Value *V,
                            Optional<CallingConv::ID> CallConv = None,
                            ISD::NodeType ExtendKind = ISD::ANY_EXTEND) {
+  // Let the target split the parts if it wants to
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.splitValueIntoRegisterParts(DAG, DL, Val, Parts, NumParts, PartVT,
+                                      CallConv))
+    return;
   EVT ValueVT = Val.getValueType();
 
   // Handle the vector case separately.

diff --git a/llvm/lib/Target/ARM/ARMCallLowering.cpp b/llvm/lib/Target/ARM/ARMCallLowering.cpp
index bc03154a679d..4fbb3b6993e4 100644
--- a/llvm/lib/Target/ARM/ARMCallLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMCallLowering.cpp
@@ -140,7 +140,10 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
 
     CCValAssign VA = VAs[0];
     assert(VA.needsCustom() && "Value doesn't need custom handling");
-    assert(VA.getValVT() == MVT::f64 && "Unsupported type");
+
+    // Custom lowering for other types, such as f16, is currently not supported
+    if (VA.getValVT() != MVT::f64)
+      return 0;
 
     CCValAssign NextVA = VAs[1];
     assert(NextVA.needsCustom() && "Value doesn't need custom handling");
@@ -360,7 +363,10 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
 
     CCValAssign VA = VAs[0];
     assert(VA.needsCustom() && "Value doesn't need custom handling");
-    assert(VA.getValVT() == MVT::f64 && "Unsupported type");
+
+    // Custom lowering for other types, such as f16, is currently not supported
+    if (VA.getValVT() != MVT::f64)
+      return 0;
 
     CCValAssign NextVA = VAs[1];
     assert(NextVA.needsCustom() && "Value doesn't need custom handling");

diff --git a/llvm/lib/Target/ARM/ARMCallingConv.cpp b/llvm/lib/Target/ARM/ARMCallingConv.cpp
index 106c9161cb94..9868ce4b099b 100644
--- a/llvm/lib/Target/ARM/ARMCallingConv.cpp
+++ b/llvm/lib/Target/ARM/ARMCallingConv.cpp
@@ -278,5 +278,33 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT,
   return true;
 }
 
+static bool CustomAssignInRegList(unsigned ValNo, MVT ValVT, MVT LocVT,
+                                  CCValAssign::LocInfo LocInfo, CCState &State,
+                                  ArrayRef<MCPhysReg> RegList) {
+  unsigned Reg = State.AllocateReg(RegList);
+  if (Reg) {
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return true;
+  }
+  return false;
+}
+
+static bool CC_ARM_AAPCS_Custom_f16(unsigned ValNo, MVT ValVT, MVT LocVT,
+                                    CCValAssign::LocInfo LocInfo,
+                                    ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  // f16 arguments are extended to i32 and assigned to a register in [r0, r3]
+  return CustomAssignInRegList(ValNo, ValVT, MVT::i32, LocInfo, State,
+                               RRegList);
+}
+
+static bool CC_ARM_AAPCS_VFP_Custom_f16(unsigned ValNo, MVT ValVT, MVT LocVT,
+                                        CCValAssign::LocInfo LocInfo,
+                                        ISD::ArgFlagsTy ArgFlags,
+                                        CCState &State) {
+  // f16 arguments are extended to f32 and assigned to a register in [s0, s15]
+  return CustomAssignInRegList(ValNo, ValVT, MVT::f32, LocInfo, State,
+                               SRegList);
+}
+
 // Include the table generated calling convention implementations.
 #include "ARMGenCallingConv.inc"

diff --git a/llvm/lib/Target/ARM/ARMCallingConv.td b/llvm/lib/Target/ARM/ARMCallingConv.td
index 5df5b56f5afa..b7a52b0781fd 100644
--- a/llvm/lib/Target/ARM/ARMCallingConv.td
+++ b/llvm/lib/Target/ARM/ARMCallingConv.td
@@ -139,7 +139,7 @@ def CC_ARM_AAPCS_Common : CallingConv<[
 
   CCIfType<[i32], CCIfAlign<"8", CCAssignToStackWithShadow<4, 8, [R0, R1, R2, R3]>>>,
   CCIfType<[i32], CCAssignToStackWithShadow<4, 4, [R0, R1, R2, R3]>>,
-  CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f16, f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>,
   CCIfType<[f64], CCAssignToStackWithShadow<8, 8, [Q0, Q1, Q2, Q3]>>,
   CCIfType<[v2f64], CCIfAlign<"16",
            CCAssignToStackWithShadow<16, 16, [Q0, Q1, Q2, Q3]>>>,
@@ -176,6 +176,7 @@ def CC_ARM_AAPCS : CallingConv<[
 
   CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
   CCIfType<[f32], CCBitConvertToType<i32>>,
+  CCIfType<[f16], CCCustom<"CC_ARM_AAPCS_Custom_f16">>,
   CCDelegateTo<CC_ARM_AAPCS_Common>
 ]>;
 
@@ -193,6 +194,7 @@ def RetCC_ARM_AAPCS : CallingConv<[
 
   CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
   CCIfType<[f32], CCBitConvertToType<i32>>,
+  CCIfType<[f16], CCCustom<"CC_ARM_AAPCS_Custom_f16">>,
 
   CCDelegateTo<RetCC_ARM_AAPCS_Common>
 ]>;
@@ -224,6 +226,7 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
   CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
                                  S9, S10, S11, S12, S13, S14, S15]>>,
+  CCIfType<[f16], CCCustom<"CC_ARM_AAPCS_VFP_Custom_f16">>,
   CCDelegateTo<CC_ARM_AAPCS_Common>
 ]>;
 
@@ -242,7 +245,8 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[
   CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
   CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
-                                      S9, S10, S11, S12, S13, S14, S15]>>,
+                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCIfType<[f16], CCCustom<"CC_ARM_AAPCS_VFP_Custom_f16">>,
   CCDelegateTo<RetCC_ARM_AAPCS_Common>
 ]>;
 

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 6ef887071faf..5e6c9b3c829e 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2024,7 +2024,8 @@ SDValue ARMTargetLowering::LowerCallResult(
     }
 
     SDValue Val;
-    if (VA.needsCustom()) {
+    if (VA.needsCustom() &&
+        (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
       // Handle f64 or half of a v2f64.
       SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                       InFlag);
@@ -2073,6 +2074,17 @@ SDValue ARMTargetLowering::LowerCallResult(
       break;
     }
 
+    // f16 arguments have their size extended to 4 bytes and passed as if they
+    // had been copied to the LSBs of a 32-bit register.
+    // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
+    if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
+      assert(Subtarget->hasFullFP16() &&
+             "Lowering f16 type return without full fp16 support");
+      Val = DAG.getNode(ISD::BITCAST, dl,
+                        MVT::getIntegerVT(VA.getLocVT().getSizeInBits()), Val);
+      Val = DAG.getNode(ARMISD::VMOVhr, dl, VA.getValVT(), Val);
+    }
+
     InVals.push_back(Val);
   }
 
@@ -2241,31 +2253,40 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       break;
     }
 
-    // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
-    if (VA.needsCustom()) {
-      if (VA.getLocVT() == MVT::v2f64) {
-        SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
-                                  DAG.getConstant(0, dl, MVT::i32));
-        SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
-                                  DAG.getConstant(1, dl, MVT::i32));
-
-        PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
-                         VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
-
-        VA = ArgLocs[++i]; // skip ahead to next loc
-        if (VA.isRegLoc()) {
-          PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
-                           VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
-        } else {
-          assert(VA.isMemLoc());
+    // f16 arguments have their size extended to 4 bytes and passed as if they
+    // had been copied to the LSBs of a 32-bit register.
+    // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
+    if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
+      assert(Subtarget->hasFullFP16() &&
+             "Lowering f16 type argument without full fp16 support");
+      Arg = DAG.getNode(ARMISD::VMOVrh, dl,
+                        MVT::getIntegerVT(VA.getLocVT().getSizeInBits()), Arg);
+      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+    }
 
-          MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
-                                                 dl, DAG, VA, Flags));
-        }
-      } else {
-        PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
+    // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
+    if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
+      SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                                DAG.getConstant(0, dl, MVT::i32));
+      SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                                DAG.getConstant(1, dl, MVT::i32));
+
+      PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
+                       StackPtr, MemOpChains, Flags);
+
+      VA = ArgLocs[++i]; // skip ahead to next loc
+      if (VA.isRegLoc()) {
+        PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
                          StackPtr, MemOpChains, Flags);
+      } else {
+        assert(VA.isMemLoc());
+
+        MemOpChains.push_back(
+            LowerMemOpCallTo(Chain, StackPtr, Op1, dl, DAG, VA, Flags));
       }
+    } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
+      PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
+                       StackPtr, MemOpChains, Flags);
     } else if (VA.isRegLoc()) {
       if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
           Outs[0].VT == MVT::i32) {
@@ -2755,7 +2776,7 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
         ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
         if (VA.getLocInfo() == CCValAssign::Indirect)
           return false;
-        if (VA.needsCustom()) {
+        if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
           // f64 and vector types are split into multiple registers or
           // register/stack-slot combinations.  The types will not match
           // the registers; give up on memory f64 refs until we figure
@@ -2907,7 +2928,8 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
       break;
     }
 
-    if (VA.needsCustom()) {
+    if (VA.needsCustom() &&
+        (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
       if (VA.getLocVT() == MVT::v2f64) {
         // Extract the first half and return it in two registers.
         SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
@@ -2915,15 +2937,15 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
         SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
                                        DAG.getVTList(MVT::i32, MVT::i32), Half);
 
-        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
-                                 HalfGPRs.getValue(isLittleEndian ? 0 : 1),
-                                 Flag);
+        Chain =
+            DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                             HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag);
         Flag = Chain.getValue(1);
         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
         VA = RVLocs[++i]; // skip ahead to next loc
-        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
-                                 HalfGPRs.getValue(isLittleEndian ? 1 : 0),
-                                 Flag);
+        Chain =
+            DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                             HalfGPRs.getValue(isLittleEndian ? 1 : 0), Flag);
         Flag = Chain.getValue(1);
         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
         VA = RVLocs[++i]; // skip ahead to next loc
@@ -2937,14 +2959,12 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
       SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                                   DAG.getVTList(MVT::i32, MVT::i32), Arg);
       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
-                               fmrrd.getValue(isLittleEndian ? 0 : 1),
-                               Flag);
+                               fmrrd.getValue(isLittleEndian ? 0 : 1), Flag);
       Flag = Chain.getValue(1);
       RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
       VA = RVLocs[++i]; // skip ahead to next loc
       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
-                               fmrrd.getValue(isLittleEndian ? 1 : 0),
-                               Flag);
+                               fmrrd.getValue(isLittleEndian ? 1 : 0), Flag);
     } else
       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
 
@@ -4080,6 +4100,40 @@ void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
   AFI->setVarArgsFrameIndex(FrameIndex);
 }
 
+bool ARMTargetLowering::splitValueIntoRegisterParts(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
+    unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const {
+  bool IsABIRegCopy = CC.hasValue();
+  EVT ValueVT = Val.getValueType();
+  if (IsABIRegCopy && ValueVT == MVT::f16 && PartVT == MVT::f32) {
+    unsigned ValueBits = ValueVT.getSizeInBits();
+    unsigned PartBits = PartVT.getSizeInBits();
+    Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
+    Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
+    Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
+    Parts[0] = Val;
+    return true;
+  }
+  return false;
+}
+
+SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
+    SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
+    MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const {
+  bool IsABIRegCopy = CC.hasValue();
+  if (IsABIRegCopy && ValueVT == MVT::f16 && PartVT == MVT::f32) {
+    unsigned ValueBits = ValueVT.getSizeInBits();
+    unsigned PartBits = PartVT.getSizeInBits();
+    SDValue Val = Parts[0];
+
+    Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
+    Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
+    Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+    return Val;
+  }
+  return SDValue();
+}
+
 SDValue ARMTargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
@@ -4152,33 +4206,29 @@ SDValue ARMTargetLowering::LowerFormalArguments(
     if (VA.isRegLoc()) {
       EVT RegVT = VA.getLocVT();
 
-      if (VA.needsCustom()) {
+      if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
         // f64 and vector types are split up into multiple registers or
         // combinations of registers and stack slots.
-        if (VA.getLocVT() == MVT::v2f64) {
-          SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
-                                                   Chain, DAG, dl);
-          VA = ArgLocs[++i]; // skip ahead to next loc
-          SDValue ArgValue2;
-          if (VA.isMemLoc()) {
-            int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
-            SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
-            ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
-                                    MachinePointerInfo::getFixedStack(
-                                        DAG.getMachineFunction(), FI));
-          } else {
-            ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
-                                             Chain, DAG, dl);
-          }
-          ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
-          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
-                                 ArgValue, ArgValue1,
-                                 DAG.getIntPtrConstant(0, dl));
-          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
-                                 ArgValue, ArgValue2,
-                                 DAG.getIntPtrConstant(1, dl));
-        } else
-          ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
+        SDValue ArgValue1 =
+            GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
+        VA = ArgLocs[++i]; // skip ahead to next loc
+        SDValue ArgValue2;
+        if (VA.isMemLoc()) {
+          int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
+          SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+          ArgValue2 = DAG.getLoad(
+              MVT::f64, dl, Chain, FIN,
+              MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+        } else {
+          ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
+        }
+        ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
+        ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
+                               ArgValue1, DAG.getIntPtrConstant(0, dl));
+        ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
+                               ArgValue2, DAG.getIntPtrConstant(1, dl));
+      } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
+        ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
       } else {
         const TargetRegisterClass *RC;
 
@@ -4229,6 +4279,18 @@ SDValue ARMTargetLowering::LowerFormalArguments(
         break;
       }
 
+      // f16 arguments have their size extended to 4 bytes and passed as if they
+      // had been copied to the LSBs of a 32-bit register.
+      // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
+      if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
+        assert(Subtarget->hasFullFP16() &&
+               "Lowering f16 type argument without full fp16 support");
+        ArgValue = DAG.getNode(ISD::BITCAST, dl,
+                               MVT::getIntegerVT(VA.getLocVT().getSizeInBits()),
+                               ArgValue);
+        ArgValue = DAG.getNode(ARMISD::VMOVhr, dl, VA.getValVT(), ArgValue);
+      }
+
       InVals.push_back(ArgValue);
     } else { // VA.isRegLoc()
       // sanity check

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 8fb5cb0ff2aa..3bf50b1da129 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -806,6 +806,17 @@ class VectorType;
       MachineBasicBlock *Entry,
       const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
 
+    bool
+    splitValueIntoRegisterParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
+                                SDValue *Parts, unsigned NumParts, MVT PartVT,
+                                Optional<CallingConv::ID> CC) const override;
+
+    SDValue
+    joinRegisterPartsIntoValue(SelectionDAG &DAG, const SDLoc &DL,
+                               const SDValue *Parts, unsigned NumParts,
+                               MVT PartVT, EVT ValueVT,
+                               Optional<CallingConv::ID> CC) const override;
+
     SDValue
     LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                          const SmallVectorImpl<ISD::InputArg> &Ins,

diff --git a/llvm/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll b/llvm/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
index 045491097bbc..78cdd59894f2 100644
--- a/llvm/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
+++ b/llvm/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
@@ -49,7 +49,7 @@ define i17 @test_funny_ints(i17 %a, i17 %b) {
 }
 
 define half @test_half(half %a, half %b) {
-; CHECK: remark: {{.*}} unable to translate instruction: ret: '  ret half %res' (in function: test_half)
+; CHECK: remark: {{.*}} unable to lower arguments: half (half, half)* (in function: test_half)
 ; CHECK-LABEL: warning: Instruction selection used fallback path for test_half
   %res = fadd half %a, %b
   ret half %res

diff --git a/llvm/test/CodeGen/ARM/fp16-args.ll b/llvm/test/CodeGen/ARM/fp16-args.ll
index 708fae7f9ffa..e858661d32fb 100644
--- a/llvm/test/CodeGen/ARM/fp16-args.ll
+++ b/llvm/test/CodeGen/ARM/fp16-args.ll
@@ -1,41 +1,46 @@
-; RUN: llc -float-abi soft -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT
-; RUN: llc -float-abi hard -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=HARD
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=armv7a--none-eabi -float-abi soft -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT
+; RUN: llc -mtriple=armv7a--none-eabi -float-abi hard -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=HARD
+; RUN: llc -mtriple=armv7a--none-eabi -float-abi soft -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FULL-SOFT
+; RUN: llc -mtriple=armv7a--none-eabi -float-abi hard -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FULL-HARD
+; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi soft -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT
+; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi hard -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=HARD
+; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi soft -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FULL-SOFT
+; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi hard -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FULL-HARD
 
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-target triple = "armv7a--none-eabi"
-
-define float @foo(float %a.coerce, float %b.coerce) {
+define half @foo(half %a, half %b) {
+; SOFT-LABEL: foo:
+; SOFT:       @ %bb.0: @ %entry
+; SOFT-NEXT:    vmov s2, r1
+; SOFT-NEXT:    vmov s0, r0
+; SOFT-NEXT:    vcvtb.f32.f16 s2, s2
+; SOFT-NEXT:    vcvtb.f32.f16 s0, s0
+; SOFT-NEXT:    vadd.f32 s0, s0, s2
+; SOFT-NEXT:    vcvtb.f16.f32 s0, s0
+; SOFT-NEXT:    vmov r0, s0
+; SOFT-NEXT:    bx lr
+;
+; HARD-LABEL: foo:
+; HARD:       @ %bb.0: @ %entry
+; HARD-NEXT:    vcvtb.f32.f16 s2, s1
+; HARD-NEXT:    vcvtb.f32.f16 s0, s0
+; HARD-NEXT:    vadd.f32 s0, s0, s2
+; HARD-NEXT:    vcvtb.f16.f32 s0, s0
+; HARD-NEXT:    bx lr
+;
+; FULL-SOFT-LABEL: foo:
+; FULL-SOFT:       @ %bb.0: @ %entry
+; FULL-SOFT-NEXT:    vmov.f16 s0, r1
+; FULL-SOFT-NEXT:    vmov.f16 s2, r0
+; FULL-SOFT-NEXT:    vadd.f16 s0, s2, s0
+; FULL-SOFT-NEXT:    vmov r0, s0
+; FULL-SOFT-NEXT:    bx lr
+;
+; FULL-HARD-LABEL: foo:
+; FULL-HARD:       @ %bb.0: @ %entry
+; FULL-HARD-NEXT:    vadd.f16 s0, s0, s1
+; FULL-HARD-NEXT:    bx lr
 entry:
-  %0 = bitcast float %a.coerce to i32
-  %tmp.0.extract.trunc = trunc i32 %0 to i16
-  %1 = bitcast i16 %tmp.0.extract.trunc to half
-  %2 = bitcast float %b.coerce to i32
-  %tmp1.0.extract.trunc = trunc i32 %2 to i16
-  %3 = bitcast i16 %tmp1.0.extract.trunc to half
-  %4 = fadd half %1, %3
-  %5 = bitcast half %4 to i16
-  %tmp5.0.insert.ext = zext i16 %5 to i32
-  %6 = bitcast i32 %tmp5.0.insert.ext to float
-  ret float %6
-; CHECK: foo:
-
-; SOFT: vmov    {{s[0-9]+}}, r1
-; SOFT: vmov    {{s[0-9]+}}, r0
-; SOFT: vcvtb.f32.f16   {{s[0-9]+}}, {{s[0-9]+}}
-; SOFT: vcvtb.f32.f16   {{s[0-9]+}}, {{s[0-9]+}}
-; SOFT: vadd.f32        {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; SOFT: vcvtb.f16.f32   {{s[0-9]+}}, {{s[0-9]+}}
-; SOFT: vmov    r0, {{s[0-9]+}}
-
-; HARD-NOT: vmov
-; HARD-NOT: uxth
-; HARD: vcvtb.f32.f16   {{s[0-9]+}}, s1
-; HARD: vcvtb.f32.f16   {{s[0-9]+}}, s0
-; HARD: vadd.f32        {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; HARD: vcvtb.f16.f32   [[SREG:s[0-9]+]], {{s[0-9]+}}
-; HARD-NEXT: vmov            [[REG0:r[0-9]+]], [[SREG]]
-; HARD-NEXT: uxth            [[REG1:r[0-9]+]], [[REG0]]
-; HARD-NEXT: vmov            s0, [[REG1]]
-
-; CHECK: bx lr
+  %0 = fadd half %a, %b
+  ret half %0
 }

diff --git a/llvm/test/CodeGen/ARM/fp16-bitcast.ll b/llvm/test/CodeGen/ARM/fp16-bitcast.ll
index e1fdf88856a6..4254f2ecef44 100644
--- a/llvm/test/CodeGen/ARM/fp16-bitcast.ll
+++ b/llvm/test/CodeGen/ARM/fp16-bitcast.ll
@@ -1,71 +1,115 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple thumbv8m.main-arm-unknown-eabi -mattr=+vfp4d16sp < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-VFPV4
-; RUN: llc -mtriple thumbv8.1m.main-arm-unknown-eabi -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP16
+; RUN: llc -mtriple thumbv8m.main-arm-unknown-eabi --float-abi=soft -mattr=+vfp4d16sp < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-VFPV4-SOFT
+; RUN: llc -mtriple thumbv8.1m.main-arm-unknown-eabi --float-abi=soft -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP16-SOFT
+; RUN: llc -mtriple thumbv8m.main-arm-unknown-eabi --float-abi=hard -mattr=+vfp4d16sp < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-VFPV4-HARD
+; RUN: llc -mtriple thumbv8.1m.main-arm-unknown-eabi --float-abi=hard -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP16-HARD
 
 target triple = "thumbv8.1m.main-arm-unknown-eabi"
 
 define float @add(float %a, float %b) {
-; CHECK-LABEL: add:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov s0, r1
-; CHECK-NEXT:    vmov s2, r0
-; CHECK-NEXT:    vadd.f32 s0, s2, s0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    bx lr
+; CHECK-VFPV4-SOFT-LABEL: add:
+; CHECK-VFPV4-SOFT:       @ %bb.0: @ %entry
+; CHECK-VFPV4-SOFT-NEXT:    vmov s0, r1
+; CHECK-VFPV4-SOFT-NEXT:    vmov s2, r0
+; CHECK-VFPV4-SOFT-NEXT:    vadd.f32 s0, s2, s0
+; CHECK-VFPV4-SOFT-NEXT:    vmov r0, s0
+; CHECK-VFPV4-SOFT-NEXT:    bx lr
+;
+; CHECK-FP16-SOFT-LABEL: add:
+; CHECK-FP16-SOFT:       @ %bb.0: @ %entry
+; CHECK-FP16-SOFT-NEXT:    vmov s0, r1
+; CHECK-FP16-SOFT-NEXT:    vmov s2, r0
+; CHECK-FP16-SOFT-NEXT:    vadd.f32 s0, s2, s0
+; CHECK-FP16-SOFT-NEXT:    vmov r0, s0
+; CHECK-FP16-SOFT-NEXT:    bx lr
+;
+; CHECK-VFPV4-HARD-LABEL: add:
+; CHECK-VFPV4-HARD:       @ %bb.0: @ %entry
+; CHECK-VFPV4-HARD-NEXT:    vadd.f32 s0, s0, s1
+; CHECK-VFPV4-HARD-NEXT:    bx lr
+;
+; CHECK-FP16-HARD-LABEL: add:
+; CHECK-FP16-HARD:       @ %bb.0: @ %entry
+; CHECK-FP16-HARD-NEXT:    vadd.f32 s0, s0, s1
+; CHECK-FP16-HARD-NEXT:    bx lr
 entry:
   %add = fadd float %a, %b
   ret float %add
 }
 
-define i32 @addf16(i32 %a.coerce, i32 %b.coerce) {
-; CHECK-VFPV4-LABEL: addf16:
-; CHECK-VFPV4:       @ %bb.0: @ %entry
-; CHECK-VFPV4-NEXT:    vmov s2, r1
-; CHECK-VFPV4-NEXT:    vmov s0, r0
-; CHECK-VFPV4-NEXT:    vcvtb.f32.f16 s2, s2
-; CHECK-VFPV4-NEXT:    vcvtb.f32.f16 s0, s0
-; CHECK-VFPV4-NEXT:    vadd.f32 s0, s0, s2
-; CHECK-VFPV4-NEXT:    vcvtb.f16.f32 s0, s0
-; CHECK-VFPV4-NEXT:    vmov r0, s0
-; CHECK-VFPV4-NEXT:    uxth r0, r0
-; CHECK-VFPV4-NEXT:    bx lr
+define half @addf16(half %a, half %b) {
+; CHECK-VFPV4-SOFT-LABEL: addf16:
+; CHECK-VFPV4-SOFT:       @ %bb.0: @ %entry
+; CHECK-VFPV4-SOFT-NEXT:    vmov s2, r1
+; CHECK-VFPV4-SOFT-NEXT:    vmov s0, r0
+; CHECK-VFPV4-SOFT-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-VFPV4-SOFT-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-VFPV4-SOFT-NEXT:    vadd.f32 s0, s0, s2
+; CHECK-VFPV4-SOFT-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-VFPV4-SOFT-NEXT:    vmov r0, s0
+; CHECK-VFPV4-SOFT-NEXT:    bx lr
 ;
-; CHECK-FP16-LABEL: addf16:
-; CHECK-FP16:       @ %bb.0: @ %entry
-; CHECK-FP16-NEXT:    vmov.f16 s0, r1
-; CHECK-FP16-NEXT:    vmov.f16 s2, r0
-; CHECK-FP16-NEXT:    vadd.f16 s0, s2, s0
-; CHECK-FP16-NEXT:    vmov.f16 r0, s0
-; CHECK-FP16-NEXT:    bx lr
+; CHECK-FP16-SOFT-LABEL: addf16:
+; CHECK-FP16-SOFT:       @ %bb.0: @ %entry
+; CHECK-FP16-SOFT-NEXT:    vmov.f16 s0, r1
+; CHECK-FP16-SOFT-NEXT:    vmov.f16 s2, r0
+; CHECK-FP16-SOFT-NEXT:    vadd.f16 s0, s2, s0
+; CHECK-FP16-SOFT-NEXT:    vmov r0, s0
+; CHECK-FP16-SOFT-NEXT:    bx lr
+;
+; CHECK-VFPV4-HARD-LABEL: addf16:
+; CHECK-VFPV4-HARD:       @ %bb.0: @ %entry
+; CHECK-VFPV4-HARD-NEXT:    vcvtb.f32.f16 s2, s1
+; CHECK-VFPV4-HARD-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-VFPV4-HARD-NEXT:    vadd.f32 s0, s0, s2
+; CHECK-VFPV4-HARD-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-VFPV4-HARD-NEXT:    bx lr
+;
+; CHECK-FP16-HARD-LABEL: addf16:
+; CHECK-FP16-HARD:       @ %bb.0: @ %entry
+; CHECK-FP16-HARD-NEXT:    vadd.f16 s0, s0, s1
+; CHECK-FP16-HARD-NEXT:    bx lr
 entry:
-  %tmp.0.extract.trunc = trunc i32 %a.coerce to i16
-  %0 = bitcast i16 %tmp.0.extract.trunc to half
-  %tmp1.0.extract.trunc = trunc i32 %b.coerce to i16
-  %1 = bitcast i16 %tmp1.0.extract.trunc to half
-  %add = fadd half %0, %1
-  %2 = bitcast half %add to i16
-  %tmp4.0.insert.ext = zext i16 %2 to i32
-  ret i32 %tmp4.0.insert.ext
+  %add = fadd half %a, %b
+  ret half %add
 }
 
 define half @load_i16(i16 *%hp) {
-; CHECK-VFPV4-LABEL: load_i16:
-; CHECK-VFPV4:       @ %bb.0: @ %entry
-; CHECK-VFPV4-NEXT:    vmov.f32 s0, #1.000000e+00
-; CHECK-VFPV4-NEXT:    ldrh r0, [r0]
-; CHECK-VFPV4-NEXT:    vmov s2, r0
-; CHECK-VFPV4-NEXT:    vcvtb.f32.f16 s2, s2
-; CHECK-VFPV4-NEXT:    vadd.f32 s0, s2, s0
-; CHECK-VFPV4-NEXT:    vmov r0, s0
-; CHECK-VFPV4-NEXT:    bx lr
+; CHECK-VFPV4-SOFT-LABEL: load_i16:
+; CHECK-VFPV4-SOFT:       @ %bb.0: @ %entry
+; CHECK-VFPV4-SOFT-NEXT:    vmov.f32 s0, #1.000000e+00
+; CHECK-VFPV4-SOFT-NEXT:    ldrh r0, [r0]
+; CHECK-VFPV4-SOFT-NEXT:    vmov s2, r0
+; CHECK-VFPV4-SOFT-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-VFPV4-SOFT-NEXT:    vadd.f32 s0, s2, s0
+; CHECK-VFPV4-SOFT-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-VFPV4-SOFT-NEXT:    vmov r0, s0
+; CHECK-VFPV4-SOFT-NEXT:    bx lr
+;
+; CHECK-FP16-SOFT-LABEL: load_i16:
+; CHECK-FP16-SOFT:       @ %bb.0: @ %entry
+; CHECK-FP16-SOFT-NEXT:    vldr.16 s2, [r0]
+; CHECK-FP16-SOFT-NEXT:    vmov.f16 s0, #1.000000e+00
+; CHECK-FP16-SOFT-NEXT:    vadd.f16 s0, s2, s0
+; CHECK-FP16-SOFT-NEXT:    vmov r0, s0
+; CHECK-FP16-SOFT-NEXT:    bx lr
+;
+; CHECK-VFPV4-HARD-LABEL: load_i16:
+; CHECK-VFPV4-HARD:       @ %bb.0: @ %entry
+; CHECK-VFPV4-HARD-NEXT:    vmov.f32 s0, #1.000000e+00
+; CHECK-VFPV4-HARD-NEXT:    ldrh r0, [r0]
+; CHECK-VFPV4-HARD-NEXT:    vmov s2, r0
+; CHECK-VFPV4-HARD-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-VFPV4-HARD-NEXT:    vadd.f32 s0, s2, s0
+; CHECK-VFPV4-HARD-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-VFPV4-HARD-NEXT:    bx lr
 ;
-; CHECK-FP16-LABEL: load_i16:
-; CHECK-FP16:       @ %bb.0: @ %entry
-; CHECK-FP16-NEXT:    vldr.16 s2, [r1]
-; CHECK-FP16-NEXT:    vmov.f16 s0, #1.000000e+00
-; CHECK-FP16-NEXT:    vadd.f16 s0, s2, s0
-; CHECK-FP16-NEXT:    vstr.16 s0, [r0]
-; CHECK-FP16-NEXT:    bx lr
+; CHECK-FP16-HARD-LABEL: load_i16:
+; CHECK-FP16-HARD:       @ %bb.0: @ %entry
+; CHECK-FP16-HARD-NEXT:    vldr.16 s2, [r0]
+; CHECK-FP16-HARD-NEXT:    vmov.f16 s0, #1.000000e+00
+; CHECK-FP16-HARD-NEXT:    vadd.f16 s0, s2, s0
+; CHECK-FP16-HARD-NEXT:    bx lr
 entry:
   %h = load i16, i16 *%hp, align 2
   %hc = bitcast i16 %h to half

diff --git a/llvm/test/CodeGen/ARM/fp16-promote.ll b/llvm/test/CodeGen/ARM/fp16-promote.ll
index 15cc6ceeafee..3cd07df671b9 100644
--- a/llvm/test/CodeGen/ARM/fp16-promote.ll
+++ b/llvm/test/CodeGen/ARM/fp16-promote.ll
@@ -933,7 +933,6 @@ define void @test_extractvalue(%struct.dummy* %p, half* %q) {
 }
 
 ; CHECK-ALL-LABEL: test_struct_return:
-; CHECK-FP16: vcvtb.f32.f16
 ; CHECK-VFP-LIBCALL: bl __aeabi_h2f
 ; CHECK-NOVFP-DAG: ldr
 ; CHECK-NOVFP-DAG: ldrh

diff --git a/llvm/test/CodeGen/ARM/fp16-vminmaxnm-safe.ll b/llvm/test/CodeGen/ARM/fp16-vminmaxnm-safe.ll
index bc458ef23561..56e734c44043 100644
--- a/llvm/test/CodeGen/ARM/fp16-vminmaxnm-safe.ll
+++ b/llvm/test/CodeGen/ARM/fp16-vminmaxnm-safe.ll
@@ -2,266 +2,235 @@
 ; RUN: llc < %s -mtriple=armv8-eabi -mattr=+fullfp16 | FileCheck %s
 ; RUN: llc < %s -mtriple thumbv7a -mattr=+fullfp16 | FileCheck %s
 
-; TODO: we can't pass half-precision arguments as "half" types yet. We do
-; that for the time being by passing "float %f.coerce" and the necessary
-; bitconverts/truncates. In these tests we pass i16 and use 1 bitconvert, which
-; is the shortest way to get a half type. But when we can pass half types, we
-; want to use that here.
-
-define half @fp16_vminnm_o(i16 signext %a, i16 signext %b) {
+define half @fp16_vminnm_o(half %a, half %b) {
 ; CHECK-LABEL: fp16_vminnm_o:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r2
-; CHECK-NEXT:    vmov.f16 s2, r1
+; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s2, r0
 ; CHECK-NEXT:    vcmp.f16 s0, s2
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselgt.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast i16 %a to half
-  %1 = bitcast i16 %b to half
-  %cmp = fcmp olt half %0, %1
-  %cond = select i1 %cmp, half %0, half %1
+  %cmp = fcmp olt half %a, %b
+  %cond = select i1 %cmp, half %a, half %b
   ret half %cond
 }
 
-define half @fp16_vminnm_o_rev(i16 signext %a, i16 signext %b) {
+define half @fp16_vminnm_o_rev(half %a, half %b) {
 ; CHECK-LABEL: fp16_vminnm_o_rev:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r2
-; CHECK-NEXT:    vmov.f16 s2, r1
+; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s2, r0
 ; CHECK-NEXT:    vcmp.f16 s2, s0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselgt.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast i16 %a to half
-  %1 = bitcast i16 %b to half
-  %cmp = fcmp ogt half %0, %1
-  %cond = select i1 %cmp, half %0, half %1
+  %cmp = fcmp ogt half %a, %b
+  %cond = select i1 %cmp, half %a, half %b
   ret half %cond
 }
 
-define half @fp16_vminnm_u(i16 signext %a, i16 signext %b) {
+define half @fp16_vminnm_u(half %a, half %b) {
 ; CHECK-LABEL: fp16_vminnm_u:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r1
-; CHECK-NEXT:    vmov.f16 s2, r2
+; CHECK-NEXT:    vmov.f16 s0, r0
+; CHECK-NEXT:    vmov.f16 s2, r1
 ; CHECK-NEXT:    vcmp.f16 s0, s2
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselge.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast i16 %a to half
-  %1 = bitcast i16 %b to half
-  %cmp = fcmp ult half %0, %1
-  %cond = select i1 %cmp, half %0, half %1
+  %cmp = fcmp ult half %a, %b
+  %cond = select i1 %cmp, half %a, half %b
   ret half %cond
 }
 
-define half @fp16_vminnm_ule(i16 signext %a, i16 signext %b) {
+define half @fp16_vminnm_ule(half %a, half %b) {
 ; CHECK-LABEL: fp16_vminnm_ule:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r1
-; CHECK-NEXT:    vmov.f16 s2, r2
+; CHECK-NEXT:    vmov.f16 s0, r0
+; CHECK-NEXT:    vmov.f16 s2, r1
 ; CHECK-NEXT:    vcmp.f16 s0, s2
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselgt.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast i16 %a to half
-  %1 = bitcast i16 %b to half
-  %cmp = fcmp ule half %0, %1
-  %cond = select i1 %cmp, half %0, half %1
+  %cmp = fcmp ule half %a, %b
+  %cond = select i1 %cmp, half %a, half %b
   ret half %cond
 }
 
-define half @fp16_vminnm_u_rev(i16 signext %a, i16 signext %b) {
+define half @fp16_vminnm_u_rev(half %a, half %b) {
 ; CHECK-LABEL: fp16_vminnm_u_rev:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r2
-; CHECK-NEXT:    vmov.f16 s2, r1
+; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s2, r0
 ; CHECK-NEXT:    vcmp.f16 s0, s2
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselge.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast i16 %a to half
-  %1 = bitcast i16 %b to half
-  %cmp = fcmp ugt half %0, %1
-  %cond = select i1 %cmp, half %1, half %0
+  %cmp = fcmp ugt half %a, %b
+  %cond = select i1 %cmp, half %b, half %a
   ret half %cond
 }
 
-define half @fp16_vmaxnm_o(i16 signext %a, i16 signext %b) {
+define half @fp16_vmaxnm_o(half %a, half %b) {
 ; CHECK-LABEL: fp16_vmaxnm_o:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r2
-; CHECK-NEXT:    vmov.f16 s2, r1
+; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s2, r0
 ; CHECK-NEXT:    vcmp.f16 s2, s0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselgt.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast i16 %a to half
-  %1 = bitcast i16 %b to half
-  %cmp = fcmp ogt half %0, %1
-  %cond = select i1 %cmp, half %0, half %1
+  %cmp = fcmp ogt half %a, %b
+  %cond = select i1 %cmp, half %a, half %b
   ret half %cond
 }
 
-define half @fp16_vmaxnm_oge(i16 signext %a, i16 signext %b) {
+define half @fp16_vmaxnm_oge(half %a, half %b) {
 ; CHECK-LABEL: fp16_vmaxnm_oge:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r2
-; CHECK-NEXT:    vmov.f16 s2, r1
+; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s2, r0
 ; CHECK-NEXT:    vcmp.f16 s2, s0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselge.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast i16 %a to half
-  %1 = bitcast i16 %b to half
-  %cmp = fcmp oge half %0, %1
-  %cond = select i1 %cmp, half %0, half %1
+  %cmp = fcmp oge half %a, %b
+  %cond = select i1 %cmp, half %a, half %b
   ret half %cond
 }
 
-define half @fp16_vmaxnm_o_rev(i16 signext %a, i16 signext %b) {
+define half @fp16_vmaxnm_o_rev(half %a, half %b) {
 ; CHECK-LABEL: fp16_vmaxnm_o_rev:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r1
-; CHECK-NEXT:    vmov.f16 s2, r2
+; CHECK-NEXT:    vmov.f16 s0, r0
+; CHECK-NEXT:    vmov.f16 s2, r1
 ; CHECK-NEXT:    vcmp.f16 s2, s0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselgt.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast i16 %a to half
-  %1 = bitcast i16 %b to half
-  %cmp = fcmp olt half %0, %1
-  %cond = select i1 %cmp, half %1, half %0
+  %cmp = fcmp olt half %a, %b
+  %cond = select i1 %cmp, half %b, half %a
   ret half %cond
 }
 
-define half @fp16_vmaxnm_ole_rev(i16 signext %a, i16 signext %b) {
+define half @fp16_vmaxnm_ole_rev(half %a, half %b) {
 ; CHECK-LABEL: fp16_vmaxnm_ole_rev:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r1
-; CHECK-NEXT:    vmov.f16 s2, r2
+; CHECK-NEXT:    vmov.f16 s0, r0
+; CHECK-NEXT:    vmov.f16 s2, r1
 ; CHECK-NEXT:    vcmp.f16 s2, s0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselge.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast i16 %a to half
-  %1 = bitcast i16 %b to half
-  %cmp = fcmp ole half %0, %1
-  %cond = select i1 %cmp, half %1, half %0
+  %cmp = fcmp ole half %a, %b
+  %cond = select i1 %cmp, half %b, half %a
   ret half %cond
 }
 
-define half @fp16_vmaxnm_u(i16 signext %a, i16 signext %b) {
+define half @fp16_vmaxnm_u(half %a, half %b) {
 ; CHECK-LABEL: fp16_vmaxnm_u:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r1
-; CHECK-NEXT:    vmov.f16 s2, r2
+; CHECK-NEXT:    vmov.f16 s0, r0
+; CHECK-NEXT:    vmov.f16 s2, r1
 ; CHECK-NEXT:    vcmp.f16 s2, s0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselge.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast i16 %a to half
-  %1 = bitcast i16 %b to half
-  %cmp = fcmp ugt half %0, %1
-  %cond = select i1 %cmp, half %0, half %1
+  %cmp = fcmp ugt half %a, %b
+  %cond = select i1 %cmp, half %a, half %b
   ret half %cond
 }
 
-define half @fp16_vmaxnm_uge(i16 signext %a, i16 signext %b) {
+define half @fp16_vmaxnm_uge(half %a, half %b) {
 ; CHECK-LABEL: fp16_vmaxnm_uge:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r1
-; CHECK-NEXT:    vmov.f16 s2, r2
+; CHECK-NEXT:    vmov.f16 s0, r0
+; CHECK-NEXT:    vmov.f16 s2, r1
 ; CHECK-NEXT:    vcmp.f16 s2, s0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselgt.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast i16 %a to half
-  %1 = bitcast i16 %b to half
-  %cmp = fcmp uge half %0, %1
-  %cond = select i1 %cmp, half %0, half %1
+  %cmp = fcmp uge half %a, %b
+  %cond = select i1 %cmp, half %a, half %b
   ret half %cond
 }
 
-define half @fp16_vmaxnm_u_rev(i16 signext %a, i16 signext %b) {
+define half @fp16_vmaxnm_u_rev(half %a, half %b) {
 ; CHECK-LABEL: fp16_vmaxnm_u_rev:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r2
-; CHECK-NEXT:    vmov.f16 s2, r1
+; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s2, r0
 ; CHECK-NEXT:    vcmp.f16 s2, s0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselge.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast i16 %a to half
-  %1 = bitcast i16 %b to half
-  %cmp = fcmp ult half %0, %1
-  %cond = select i1 %cmp, half %1, half %0
+  %cmp = fcmp ult half %a, %b
+  %cond = select i1 %cmp, half %b, half %a
   ret half %cond
 }
 
 ; known non-NaNs
 
-define half @fp16_vminnm_NNNo(i16 signext %a) {
+define half @fp16_vminnm_NNNo(half %a) {
 ; CHECK-LABEL: fp16_vminnm_NNNo:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s0, r0
 ; CHECK-NEXT:    vmov.f16 s2, #1.200000e+01
 ; CHECK-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NEXT:    vldr.16 s2, .LCPI12_0
 ; CHECK-NEXT:    vcmp.f16 s0, s2
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselgt.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI12_0:
 ; CHECK-NEXT:    .short 0x5040 @ half 34
 entry:
-  %0 = bitcast i16 %a to half
-  %cmp1 = fcmp olt half %0, 12.
-  %cond1 = select i1 %cmp1, half %0, half 12.
+  %cmp1 = fcmp olt half %a, 12.
+  %cond1 = select i1 %cmp1, half %a, half 12.
   %cmp2 = fcmp olt half 34., %cond1
   %cond2 = select i1 %cmp2, half 34., half %cond1
   ret half %cond2
 }
 
-define half @fp16_vminnm_NNNo_rev(i16 signext %a) {
+define half @fp16_vminnm_NNNo_rev(half %a) {
 ; CHECK-LABEL: fp16_vminnm_NNNo_rev:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldr.16 s2, .LCPI13_0
-; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s0, r0
 ; CHECK-NEXT:    vcmp.f16 s0, s2
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselgt.f16 s0, s2, s0
 ; CHECK-NEXT:    vldr.16 s2, .LCPI13_1
 ; CHECK-NEXT:    vminnm.f16 s0, s0, s2
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  @ %bb.1:
@@ -270,50 +239,48 @@ define half @fp16_vminnm_NNNo_rev(i16 signext %a) {
 ; CHECK-NEXT:  .LCPI13_1:
 ; CHECK-NEXT:    .short 0x54e0 @ half 78
 entry:
-  %0 = bitcast i16 %a to half
-  %cmp1 = fcmp ogt half %0, 56.
-  %cond1 = select i1 %cmp1, half 56., half %0
+  %cmp1 = fcmp ogt half %a, 56.
+  %cond1 = select i1 %cmp1, half 56., half %a
   %cmp2 = fcmp ogt half 78., %cond1
   %cond2 = select i1 %cmp2, half %cond1, half 78.
   ret half %cond2
 }
 
-define half @fp16_vminnm_NNNu(i16 signext %b) {
+define half @fp16_vminnm_NNNu(half %b) {
 ; CHECK-LABEL: fp16_vminnm_NNNu:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s0, r0
 ; CHECK-NEXT:    vmov.f16 s2, #1.200000e+01
 ; CHECK-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NEXT:    vldr.16 s2, .LCPI14_0
 ; CHECK-NEXT:    vcmp.f16 s0, s2
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselge.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI14_0:
 ; CHECK-NEXT:    .short 0x5040 @ half 34
 entry:
-  %0 = bitcast i16 %b to half
-  %cmp1 = fcmp ult half 12., %0
-  %cond1 = select i1 %cmp1, half 12., half %0
+  %cmp1 = fcmp ult half 12., %b
+  %cond1 = select i1 %cmp1, half 12., half %b
   %cmp2 = fcmp ult half %cond1, 34.
   %cond2 = select i1 %cmp2, half %cond1, half 34.
   ret half %cond2
 }
 
-define half @fp16_vminnm_NNNule(i16 signext %b) {
+define half @fp16_vminnm_NNNule(half %b) {
 ; CHECK-LABEL: fp16_vminnm_NNNule:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldr.16 s2, .LCPI15_0
-; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s0, r0
 ; CHECK-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NEXT:    vldr.16 s2, .LCPI15_1
 ; CHECK-NEXT:    vcmp.f16 s0, s2
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselgt.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  @ %bb.1:
@@ -323,25 +290,24 @@ define half @fp16_vminnm_NNNule(i16 signext %b) {
 ; CHECK-NEXT:    .short 0x5300 @ half 56
 
 entry:
-  %0 = bitcast i16 %b to half
-  %cmp1 = fcmp ule half 34., %0
-  %cond1 = select i1 %cmp1, half 34., half %0
+  %cmp1 = fcmp ule half 34., %b
+  %cond1 = select i1 %cmp1, half 34., half %b
   %cmp2 = fcmp ule half %cond1, 56.
   %cond2 = select i1 %cmp2, half %cond1, half 56.
   ret half %cond2
 }
 
-define half @fp16_vminnm_NNNu_rev(i16 signext %b) {
+define half @fp16_vminnm_NNNu_rev(half %b) {
 ; CHECK-LABEL: fp16_vminnm_NNNu_rev:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldr.16 s2, .LCPI16_0
-; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s0, r0
 ; CHECK-NEXT:    vcmp.f16 s0, s2
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselge.f16 s0, s2, s0
 ; CHECK-NEXT:    vldr.16 s2, .LCPI16_1
 ; CHECK-NEXT:    vminnm.f16 s0, s0, s2
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  @ %bb.1:
@@ -352,50 +318,48 @@ define half @fp16_vminnm_NNNu_rev(i16 signext %b) {
 
 
 entry:
-  %0 = bitcast i16 %b to half
-  %cmp1 = fcmp ugt half 56., %0
-  %cond1 = select i1 %cmp1, half %0, half 56.
+  %cmp1 = fcmp ugt half 56., %b
+  %cond1 = select i1 %cmp1, half %b, half 56.
   %cmp2 = fcmp ugt half %cond1, 78.
   %cond2 = select i1 %cmp2, half 78., half %cond1
   ret half %cond2
 }
 
-define half @fp16_vmaxnm_NNNo(i16 signext %a) {
+define half @fp16_vmaxnm_NNNo(half %a) {
 ; CHECK-LABEL: fp16_vmaxnm_NNNo:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s0, r0
 ; CHECK-NEXT:    vmov.f16 s2, #1.200000e+01
 ; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NEXT:    vldr.16 s2, .LCPI17_0
 ; CHECK-NEXT:    vcmp.f16 s2, s0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselgt.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI17_0:
 ; CHECK-NEXT:    .short 0x5040 @ half 34
 entry:
-  %0 = bitcast i16 %a to half
-  %cmp1 = fcmp ogt half %0, 12.
-  %cond1 = select i1 %cmp1, half %0, half 12.
+  %cmp1 = fcmp ogt half %a, 12.
+  %cond1 = select i1 %cmp1, half %a, half 12.
   %cmp2 = fcmp ogt half 34., %cond1
   %cond2 = select i1 %cmp2, half 34., half %cond1
   ret half %cond2
 }
 
-define half @fp16_vmaxnm_NNNoge(i16 signext %a) {
+define half @fp16_vmaxnm_NNNoge(half %a) {
 ; CHECK-LABEL: fp16_vmaxnm_NNNoge:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldr.16 s2, .LCPI18_0
-; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s0, r0
 ; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NEXT:    vldr.16 s2, .LCPI18_1
 ; CHECK-NEXT:    vcmp.f16 s2, s0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselge.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  @ %bb.1:
@@ -404,25 +368,24 @@ define half @fp16_vmaxnm_NNNoge(i16 signext %a) {
 ; CHECK-NEXT:  .LCPI18_1:
 ; CHECK-NEXT:    .short 0x5300 @ half 56
 entry:
-  %0 = bitcast i16 %a to half
-  %cmp1 = fcmp oge half %0, 34.
-  %cond1 = select i1 %cmp1, half %0, half 34.
+  %cmp1 = fcmp oge half %a, 34.
+  %cond1 = select i1 %cmp1, half %a, half 34.
   %cmp2 = fcmp oge half 56., %cond1
   %cond2 = select i1 %cmp2, half 56., half %cond1
   ret half %cond2
 }
 
-define half @fp16_vmaxnm_NNNo_rev(i16 signext %a) {
+define half @fp16_vmaxnm_NNNo_rev(half %a) {
 ; CHECK-LABEL: fp16_vmaxnm_NNNo_rev:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldr.16 s2, .LCPI19_0
-; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s0, r0
 ; CHECK-NEXT:    vcmp.f16 s2, s0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselgt.f16 s0, s2, s0
 ; CHECK-NEXT:    vldr.16 s2, .LCPI19_1
 ; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  @ %bb.1:
@@ -431,25 +394,24 @@ define half @fp16_vmaxnm_NNNo_rev(i16 signext %a) {
 ; CHECK-NEXT:  .LCPI19_1:
 ; CHECK-NEXT:    .short 0x54e0 @ half 78
 entry:
-  %0 = bitcast i16 %a to half
-  %cmp1 = fcmp olt half %0, 56.
-  %cond1 = select i1 %cmp1, half 56., half %0
+  %cmp1 = fcmp olt half %a, 56.
+  %cond1 = select i1 %cmp1, half 56., half %a
   %cmp2 = fcmp olt half 78., %cond1
   %cond2 = select i1 %cmp2, half %cond1, half 78.
   ret half %cond2
 }
 
-define half @fp16_vmaxnm_NNNole_rev(i16 signext %a) {
+define half @fp16_vmaxnm_NNNole_rev(half %a) {
 ; CHECK-LABEL: fp16_vmaxnm_NNNole_rev:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldr.16 s2, .LCPI20_0
-; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s0, r0
 ; CHECK-NEXT:    vcmp.f16 s2, s0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselge.f16 s0, s2, s0
 ; CHECK-NEXT:    vldr.16 s2, .LCPI20_1
 ; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  @ %bb.1:
@@ -458,50 +420,48 @@ define half @fp16_vmaxnm_NNNole_rev(i16 signext %a) {
 ; CHECK-NEXT:  .LCPI20_1:
 ; CHECK-NEXT:    .short 0x55a0 @ half 90
 entry:
-  %0 = bitcast i16 %a to half
-  %cmp1 = fcmp ole half %0, 78.
-  %cond1 = select i1 %cmp1, half 78., half %0
+  %cmp1 = fcmp ole half %a, 78.
+  %cond1 = select i1 %cmp1, half 78., half %a
   %cmp2 = fcmp ole half 90., %cond1
   %cond2 = select i1 %cmp2, half %cond1, half 90.
   ret half %cond2
 }
 
-define half @fp16_vmaxnm_NNNu(i16 signext %b) {
+define half @fp16_vmaxnm_NNNu(half %b) {
 ; CHECK-LABEL: fp16_vmaxnm_NNNu:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s0, r0
 ; CHECK-NEXT:    vmov.f16 s2, #1.200000e+01
 ; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NEXT:    vldr.16 s2, .LCPI21_0
 ; CHECK-NEXT:    vcmp.f16 s2, s0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselge.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI21_0:
 ; CHECK-NEXT:    .short 0x5040 @ half 34
 entry:
-  %0 = bitcast i16 %b to half
-  %cmp1 = fcmp ugt half 12., %0
-  %cond1 = select i1 %cmp1, half 12., half %0
+  %cmp1 = fcmp ugt half 12., %b
+  %cond1 = select i1 %cmp1, half 12., half %b
   %cmp2 = fcmp ugt half %cond1, 34.
   %cond2 = select i1 %cmp2, half %cond1, half 34.
   ret half %cond2
 }
 
-define half @fp16_vmaxnm_NNNuge(i16 signext %b) {
+define half @fp16_vmaxnm_NNNuge(half %b) {
 ; CHECK-LABEL: fp16_vmaxnm_NNNuge:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldr.16 s2, .LCPI22_0
-; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s0, r0
 ; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NEXT:    vldr.16 s2, .LCPI22_1
 ; CHECK-NEXT:    vcmp.f16 s2, s0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselgt.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  @ %bb.1:
@@ -510,81 +470,77 @@ define half @fp16_vmaxnm_NNNuge(i16 signext %b) {
 ; CHECK-NEXT:  .LCPI22_1:
 ; CHECK-NEXT:    .short 0x5300 @ half 56
 entry:
-  %0 = bitcast i16 %b to half
-  %cmp1 = fcmp uge half 34., %0
-  %cond1 = select i1 %cmp1, half 34., half %0
+  %cmp1 = fcmp uge half 34., %b
+  %cond1 = select i1 %cmp1, half 34., half %b
   %cmp2 = fcmp uge half %cond1, 56.
   %cond2 = select i1 %cmp2, half %cond1, half 56.
   ret half %cond2
 }
 
-define half @fp16_vminmaxnm_neg0(i16 signext %a) {
+define half @fp16_vminmaxnm_neg0(half %a) {
 ; CHECK-LABEL: fp16_vminmaxnm_neg0:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldr.16 s0, .LCPI23_0
-; CHECK-NEXT:    vmov.f16 s2, r1
+; CHECK-NEXT:    vmov.f16 s2, r0
 ; CHECK-NEXT:    vminnm.f16 s2, s2, s0
 ; CHECK-NEXT:    vcmp.f16 s0, s2
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselge.f16 s0, s0, s2
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI23_0:
 ; CHECK-NEXT:    .short 0x8000 @ half -0
 entry:
-  %0 = bitcast i16 %a to half
-  %cmp1 = fcmp olt half %0, -0.
-  %cond1 = select i1 %cmp1, half %0, half -0.
+  %cmp1 = fcmp olt half %a, -0.
+  %cond1 = select i1 %cmp1, half %a, half -0.
   %cmp2 = fcmp ugt half %cond1, -0.
   %cond2 = select i1 %cmp2, half %cond1, half -0.
   ret half %cond2
 }
 
-define half @fp16_vminmaxnm_e_0(i16 signext %a) {
+define half @fp16_vminmaxnm_e_0(half %a) {
 ; CHECK-LABEL: fp16_vminmaxnm_e_0:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s0, r0
 ; CHECK-NEXT:    vldr.16 s2, .LCPI24_0
 ; CHECK-NEXT:    vcmp.f16 s0, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselge.f16 s0, s2, s0
 ; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI24_0:
 ; CHECK-NEXT:    .short 0x0000 @ half 0
 entry:
-  %0 = bitcast i16 %a to half
-  %cmp1 = fcmp nsz ole half 0., %0
-  %cond1 = select i1 %cmp1, half 0., half %0
+  %cmp1 = fcmp nsz ole half 0., %a
+  %cond1 = select i1 %cmp1, half 0., half %a
   %cmp2 = fcmp nsz uge half 0., %cond1
   %cond2 = select i1 %cmp2, half 0., half %cond1
   ret half %cond2
 }
 
-define half @fp16_vminmaxnm_e_neg0(i16 signext %a) {
+define half @fp16_vminmaxnm_e_neg0(half %a) {
 ; CHECK-LABEL: fp16_vminmaxnm_e_neg0:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldr.16 s0, .LCPI25_0
-; CHECK-NEXT:    vmov.f16 s2, r1
+; CHECK-NEXT:    vmov.f16 s2, r0
 ; CHECK-NEXT:    vminnm.f16 s2, s2, s0
 ; CHECK-NEXT:    vcmp.f16 s0, s2
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselge.f16 s0, s0, s2
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI25_0:
 ; CHECK-NEXT:    .short 0x8000 @ half -0
 entry:
-  %0 = bitcast i16 %a to half
-  %cmp1 = fcmp nsz ule half -0., %0
-  %cond1 = select i1 %cmp1, half -0., half %0
+  %cmp1 = fcmp nsz ule half -0., %a
+  %cond1 = select i1 %cmp1, half -0., half %a
   %cmp2 = fcmp nsz oge half -0., %cond1
   %cond2 = select i1 %cmp2, half -0., half %cond1
   ret half %cond2

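The fp16-vminmaxnm-safe.ll updates above all follow one pattern: with this patch a half argument arrives directly in the low 16 bits of r0 and the half result travels back in r0 (hence the vmov.f16 s0, r0 / vmov r0, s0 pairs), replacing the old arrangement where the caller passed an i16 in r1 and a pointer in r0 through which the result was stored. A minimal sketch of the signature change, using a hypothetical identity function:

  ; Before: the test smuggled the half through an i16, and the
  ; backend returned the result by storing through the pointer in r0.
  define half @identity(i16 signext %a) {
    %h = bitcast i16 %a to half
    ret half %h
  }

  ; After: the half type is used directly and the value is returned in r0.
  define half @identity(half %a) {
    ret half %a
  }
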
diff --git a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll
index 28a6fef7d7b0..39673bb2d786 100644
--- a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll
@@ -21,6 +21,7 @@ define half @test_v1f16(<1 x half> %a) nounwind {
 ; CHECK-NEXT:    vmov s2, r0
 ; CHECK-NEXT:    vadd.f32 s0, s2, s0
 ; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bl __aeabi_f2h
 ; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
 ; CHECK-NEXT:    .p2align 2

diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll
index 9d595b9a296e..003b64be09a6 100644
--- a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll
@@ -21,6 +21,7 @@ define half @test_v1f16(<1 x half> %a) nounwind {
 ; CHECK-NEXT:    vmov s2, r0
 ; CHECK-NEXT:    vmul.f32 s0, s2, s0
 ; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bl __aeabi_f2h
 ; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
 ; CHECK-NEXT:    .p2align 2

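In both vecreduce tests the single <1 x half> element is widened to float for the strict reduction, so the scalar result now has to be narrowed back to half before it can be returned in r0; on targets without native fp16 conversion instructions that narrowing lowers to the AEABI runtime helper, which is the newly expected bl __aeabi_f2h. Conceptually the return path now performs (a sketch, not IR taken from the patch; %sum is hypothetical):

  %h = fptrunc float %sum to half   ; lowered as: bl __aeabi_f2h, result in r0
  ret half %h
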
diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
index ce9abf80b9de..5425c0d2318f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -682,7 +682,7 @@ entry:
 define arm_aapcs_vfpcc half @extract_f16_0(<8 x half> %a) {
 ; CHECK-LABEL: extract_f16_0:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    @ kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    bx lr
 entry:
   %res = extractelement <8 x half> %a, i32 0
@@ -693,7 +693,6 @@ define arm_aapcs_vfpcc half @extract_f16_3(<8 x half> %a) {
 ; CHECK-LABEL: extract_f16_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmovx.f16 s0, s1
-; CHECK-NEXT:    vstr.16 s0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %res = extractelement <8 x half> %a, i32 3

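These MVE tests use the AAPCS VFP calling convention, so the returned half now simply stays in s0, which is already the low lane of the incoming q0. Extracting element 0 therefore needs no instruction at all: the "@ kill:" line is a register-allocator annotation recording that only s0 of q0 remains live, not emitted code, and extracting element 3 reduces to the single vmovx.f16.
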
diff --git a/llvm/test/CodeGen/Thumb2/mve-vdup.ll b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
index 86e1090dfc66..bce76f037a78 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vdup.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
@@ -253,10 +253,9 @@ entry:
 define arm_aapcs_vfpcc half @vdup_f16_extract(half* %src1, half* %src2) {
 ; CHECK-LABEL: vdup_f16_extract:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldr.16 s0, [r2]
-; CHECK-NEXT:    vldr.16 s2, [r1]
+; CHECK-NEXT:    vldr.16 s0, [r1]
+; CHECK-NEXT:    vldr.16 s2, [r0]
 ; CHECK-NEXT:    vadd.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load half, half *%src1, align 2

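With the old result pointer in r0 gone, the two half* source arguments shift down from r1/r2 to r0/r1, and the final vstr.16 disappears: the vadd.f16 result is simply left in s0 for the VFP return.
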
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
index 98f2f28e8eb0..72636ad037db 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
@@ -78,7 +78,6 @@ define arm_aapcs_vfpcc half @fmin_v4f16(<4 x half> %x) {
 ; CHECK-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NEXT:    vminnm.f16 s0, s0, s2
-; CHECK-NEXT:    vstr.16 s0, [r0]
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  @ %bb.1:
@@ -103,7 +102,6 @@ define arm_aapcs_vfpcc half @fmin_v8f16(<8 x half> %x) {
 ; CHECK-NEXT:    vminnm.f16 s4, s4, s6
 ; CHECK-NEXT:    vminnm.f16 s4, s4, s3
 ; CHECK-NEXT:    vminnm.f16 s0, s4, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call fast half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x)
@@ -125,7 +123,6 @@ define arm_aapcs_vfpcc half @fmin_v16f16(<16 x half> %x) {
 ; CHECK-FP-NEXT:    vminnm.f16 s4, s4, s6
 ; CHECK-FP-NEXT:    vminnm.f16 s4, s4, s3
 ; CHECK-FP-NEXT:    vminnm.f16 s0, s4, s0
-; CHECK-FP-NEXT:    vstr.16 s0, [r0]
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v16f16:
@@ -169,7 +166,6 @@ define arm_aapcs_vfpcc half @fmin_v16f16(<16 x half> %x) {
 ; CHECK-NOFP-NEXT:    vminnm.f16 s8, s8, s10
 ; CHECK-NOFP-NEXT:    vselgt.f16 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vminnm.f16 s0, s8, s0
-; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x)
@@ -309,20 +305,20 @@ entry:
 define arm_aapcs_vfpcc half @fmin_v4f16_nofast(<4 x half> %x) {
 ; CHECK-FP-LABEL: fmin_v4f16_nofast:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmov r1, s1
-; CHECK-FP-NEXT:    vdup.32 q1, r1
+; CHECK-FP-NEXT:    vmov r0, s1
+; CHECK-FP-NEXT:    vdup.32 q1, r0
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-FP-NEXT:    vdup.16 q1, r1
+; CHECK-FP-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-FP-NEXT:    vdup.16 q1, r0
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vstr.16 s0, [r0]
+; CHECK-FP-NEXT:    @ kill: def $s0 killed $s0 killed $q0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v4f16_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmov r1, s1
+; CHECK-NOFP-NEXT:    vmov r0, s1
 ; CHECK-NOFP-NEXT:    vmovx.f16 s10, s0
-; CHECK-NOFP-NEXT:    vdup.32 q1, r1
+; CHECK-NOFP-NEXT:    vdup.32 q1, r0
 ; CHECK-NOFP-NEXT:    vmovx.f16 s8, s4
 ; CHECK-NOFP-NEXT:    vcmp.f16 s8, s10
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
@@ -333,7 +329,6 @@ define arm_aapcs_vfpcc half @fmin_v4f16_nofast(<4 x half> %x) {
 ; CHECK-NOFP-NEXT:    vcmp.f16 s8, s0
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vselgt.f16 s0, s0, s8
-; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x)
@@ -346,13 +341,13 @@ define arm_aapcs_vfpcc half @fmin_v8f16_nofast(<8 x half> %x) {
 ; CHECK-FP-NEXT:    vmov.f64 d2, d1
 ; CHECK-FP-NEXT:    vmov.f32 s5, s3
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmov r1, s1
-; CHECK-FP-NEXT:    vdup.32 q1, r1
+; CHECK-FP-NEXT:    vmov r0, s1
+; CHECK-FP-NEXT:    vdup.32 q1, r0
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-FP-NEXT:    vdup.16 q1, r1
+; CHECK-FP-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-FP-NEXT:    vdup.16 q1, r0
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vstr.16 s0, [r0]
+; CHECK-FP-NEXT:    @ kill: def $s0 killed $s0 killed $q0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v8f16_nofast:
@@ -384,7 +379,6 @@ define arm_aapcs_vfpcc half @fmin_v8f16_nofast(<8 x half> %x) {
 ; CHECK-NOFP-NEXT:    vcmp.f16 s8, s0
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vselgt.f16 s0, s0, s8
-; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x)
@@ -398,13 +392,13 @@ define arm_aapcs_vfpcc half @fmin_v16f16_nofast(<16 x half> %x) {
 ; CHECK-FP-NEXT:    vmov.f64 d2, d1
 ; CHECK-FP-NEXT:    vmov.f32 s5, s3
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmov r1, s1
-; CHECK-FP-NEXT:    vdup.32 q1, r1
+; CHECK-FP-NEXT:    vmov r0, s1
+; CHECK-FP-NEXT:    vdup.32 q1, r0
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-FP-NEXT:    vdup.16 q1, r1
+; CHECK-FP-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-FP-NEXT:    vdup.16 q1, r0
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vstr.16 s0, [r0]
+; CHECK-FP-NEXT:    @ kill: def $s0 killed $s0 killed $q0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v16f16_nofast:
@@ -462,7 +456,6 @@ define arm_aapcs_vfpcc half @fmin_v16f16_nofast(<16 x half> %x) {
 ; CHECK-NOFP-NEXT:    vcmp.f16 s8, s0
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vselgt.f16 s0, s0, s8
-; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x)
@@ -1195,7 +1188,6 @@ define arm_aapcs_vfpcc half @fmax_v4f16(<4 x half> %x) {
 ; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT:    vstr.16 s0, [r0]
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  @ %bb.1:
@@ -1220,7 +1212,6 @@ define arm_aapcs_vfpcc half @fmax_v8f16(<8 x half> %x) {
 ; CHECK-NEXT:    vmaxnm.f16 s4, s4, s6
 ; CHECK-NEXT:    vmaxnm.f16 s4, s4, s3
 ; CHECK-NEXT:    vmaxnm.f16 s0, s4, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call fast half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x)
@@ -1242,7 +1233,6 @@ define arm_aapcs_vfpcc half @fmax_v16f16(<16 x half> %x) {
 ; CHECK-FP-NEXT:    vmaxnm.f16 s4, s4, s6
 ; CHECK-FP-NEXT:    vmaxnm.f16 s4, s4, s3
 ; CHECK-FP-NEXT:    vmaxnm.f16 s0, s4, s0
-; CHECK-FP-NEXT:    vstr.16 s0, [r0]
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v16f16:
@@ -1286,7 +1276,6 @@ define arm_aapcs_vfpcc half @fmax_v16f16(<16 x half> %x) {
 ; CHECK-NOFP-NEXT:    vmaxnm.f16 s8, s8, s10
 ; CHECK-NOFP-NEXT:    vselgt.f16 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s8, s0
-; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x)
@@ -1424,20 +1413,20 @@ entry:
 define arm_aapcs_vfpcc half @fmax_v4f16_nofast(<4 x half> %x) {
 ; CHECK-FP-LABEL: fmax_v4f16_nofast:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmov r1, s1
-; CHECK-FP-NEXT:    vdup.32 q1, r1
+; CHECK-FP-NEXT:    vmov r0, s1
+; CHECK-FP-NEXT:    vdup.32 q1, r0
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-FP-NEXT:    vdup.16 q1, r1
+; CHECK-FP-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-FP-NEXT:    vdup.16 q1, r0
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vstr.16 s0, [r0]
+; CHECK-FP-NEXT:    @ kill: def $s0 killed $s0 killed $q0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v4f16_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmov r1, s1
+; CHECK-NOFP-NEXT:    vmov r0, s1
 ; CHECK-NOFP-NEXT:    vmovx.f16 s10, s0
-; CHECK-NOFP-NEXT:    vdup.32 q1, r1
+; CHECK-NOFP-NEXT:    vdup.32 q1, r0
 ; CHECK-NOFP-NEXT:    vmovx.f16 s8, s4
 ; CHECK-NOFP-NEXT:    vcmp.f16 s10, s8
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
@@ -1448,7 +1437,6 @@ define arm_aapcs_vfpcc half @fmax_v4f16_nofast(<4 x half> %x) {
 ; CHECK-NOFP-NEXT:    vcmp.f16 s0, s8
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vselgt.f16 s0, s0, s8
-; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x)
@@ -1461,13 +1449,13 @@ define arm_aapcs_vfpcc half @fmax_v8f16_nofast(<8 x half> %x) {
 ; CHECK-FP-NEXT:    vmov.f64 d2, d1
 ; CHECK-FP-NEXT:    vmov.f32 s5, s3
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmov r1, s1
-; CHECK-FP-NEXT:    vdup.32 q1, r1
+; CHECK-FP-NEXT:    vmov r0, s1
+; CHECK-FP-NEXT:    vdup.32 q1, r0
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-FP-NEXT:    vdup.16 q1, r1
+; CHECK-FP-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-FP-NEXT:    vdup.16 q1, r0
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vstr.16 s0, [r0]
+; CHECK-FP-NEXT:    @ kill: def $s0 killed $s0 killed $q0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v8f16_nofast:
@@ -1499,7 +1487,6 @@ define arm_aapcs_vfpcc half @fmax_v8f16_nofast(<8 x half> %x) {
 ; CHECK-NOFP-NEXT:    vcmp.f16 s0, s8
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vselgt.f16 s0, s0, s8
-; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x)
@@ -1513,13 +1500,13 @@ define arm_aapcs_vfpcc half @fmax_v16f16_nofast(<16 x half> %x) {
 ; CHECK-FP-NEXT:    vmov.f64 d2, d1
 ; CHECK-FP-NEXT:    vmov.f32 s5, s3
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmov r1, s1
-; CHECK-FP-NEXT:    vdup.32 q1, r1
+; CHECK-FP-NEXT:    vmov r0, s1
+; CHECK-FP-NEXT:    vdup.32 q1, r0
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-FP-NEXT:    vdup.16 q1, r1
+; CHECK-FP-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-FP-NEXT:    vdup.16 q1, r0
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vstr.16 s0, [r0]
+; CHECK-FP-NEXT:    @ kill: def $s0 killed $s0 killed $q0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v16f16_nofast:
@@ -1577,7 +1564,6 @@ define arm_aapcs_vfpcc half @fmax_v16f16_nofast(<16 x half> %x) {
 ; CHECK-NOFP-NEXT:    vcmp.f16 s0, s8
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vselgt.f16 s0, s0, s8
-; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x)

