[PATCH][AArch64] Fix bug around passing VPR stack parameters

Mon Jun 2 09:19:45 PDT 2014

Hi Jiangning,

> Yes, I can reproduce the failure exposed by "llc -debug", and it seems this
> issue is caused by the promotion from i8 to i32. Assert[SZ]Ext is
> implicating this breakage so we should fix the generic code accordingly,
> right?

I think the generic code is probably doing the right thing. AArch64 is
using the CC stuff in some rather odd ways, and we shouldn't really
expect generic code to be lax in its checks to accommodate us. We
should be handing it an object of the type it's asked for.

I've got a patch ready that fixes this in AArch64 code, but I was
waiting for you to finish with this one so you don't have to deal with
any conflicts from it. I've attached it here for you to have a look,
though it's still unpoloshed. I *think* all that's needed in addition
to handle your particular problem is a "case CCValAssign::BCvt:
DAG.getNode(ISD::BITCAST, ...); break;".

Cheers.

Tim.
-------------- next part --------------
commit f6efa724fd73f04ecc28bfabe6918dfb0726d466
Author: Tim Northover <T.P.Northover at gmail.com>
Date:   Fri May 30 08:53:34 2014 +0100

    AArch64: mark small types (i1, i8, i16) as promoted
    
    This means the output of LowerFormalArguments returns a lowered
    SDValue with the correct type (expected in SelectionDAGBuilder).
    Without this, an assertion under a DEBUG macro triggers when those
    types are passed on the stack.

diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index ded2e17..8e8bd3d 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -18,9 +18,6 @@ class CCIfAlign<string Align, CCAction A> :
 class CCIfBigEndian<CCAction A> :
   CCIf<"State.getTarget().getDataLayout()->isBigEndian()", A>;
 
-class CCIfUnallocated<string Reg, CCAction A> :
-  CCIf<"!State.isAllocated(AArch64::" # Reg # ")", A>;
-
 //===----------------------------------------------------------------------===//
 // ARM AAPCS64 Calling Convention
 //===----------------------------------------------------------------------===//
@@ -45,7 +42,7 @@ def CC_AArch64_AAPCS : CallingConv<[
 
   // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
   // up to eight each of GPR and FPR.
-  CCIfType<[i1, i8, i16], CCIfUnallocated<"X7", CCPromoteToType<i32>>>,
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
   CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
                                           [X0, X1, X2, X3, X4, X5, X6, X7]>>,
   // i128 is split to two i64s, we can't fit half to register X7.
@@ -120,7 +117,7 @@ def CC_AArch64_DarwinPCS : CallingConv<[
 
   // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
   // up to eight each of GPR and FPR.
-  CCIfType<[i1, i8, i16], CCIfUnallocated<"X7", CCPromoteToType<i32>>>,
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
   CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
                                           [X0, X1, X2, X3, X4, X5, X6, X7]>>,
   // i128 is split to two i64s, we can't fit half to register X7.
@@ -143,8 +140,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[
            CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
 
   // If more than will fit in registers, pass them on the stack instead.
-  CCIfType<[i1, i8], CCAssignToStack<1, 1>>,
-  CCIfType<[i16], CCAssignToStack<2, 2>>,
+  CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>,
+  CCIf<"ValVT == MVT::i16", CCAssignToStack<2, 2>>,
   CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
   CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
            CCAssignToStack<8, 8>>,
@@ -172,12 +169,11 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
 // 32bit quantity as undef.
 def CC_AArch64_WebKit_JS : CallingConv<[
   // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0).
-  CCIfType<[i1, i8, i16], CCIfUnallocated<"X0", CCPromoteToType<i32>>>,
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
   CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>,
   CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>,
 
   // Pass the remaining arguments on the stack instead.
-  CCIfType<[i1, i8, i16], CCAssignToStack<4, 4>>,
   CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
   CCIfType<[i64, f64], CCAssignToStack<8, 8>>
 ]>;
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index f97cfb9..c84889e 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -1218,7 +1218,6 @@ bool AArch64FastISel::ProcessCallArgs(
       Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ false);
       if (Arg == 0)
         return false;
-      ArgVT = DestVT;
       break;
     }
     case CCValAssign::AExt:
@@ -1229,7 +1228,6 @@ bool AArch64FastISel::ProcessCallArgs(
       Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ true);
       if (Arg == 0)
         return false;
-      ArgVT = DestVT;
       break;
     }
     default:
@@ -1248,7 +1246,7 @@ bool AArch64FastISel::ProcessCallArgs(
       assert(VA.isMemLoc() && "Assuming store on stack.");
 
       // Need to store on the stack.
-      unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
+      unsigned ArgSize = (ArgVT.getSizeInBits() + 7) / 8;
 
       unsigned BEAlign = 0;
       if (ArgSize < 8 && !Subtarget->isLittleEndian())
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index f77a21a..4132f13 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1787,9 +1787,9 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
         break;
       }
 
-      ArgValue = DAG.getExtLoad(ExtType, DL, VA.getValVT(), Chain, FIN,
+      ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
                                 MachinePointerInfo::getFixedStack(FI),
-                                VA.getLocVT(),
+                                VA.getValVT(),
                                 false, false, false, 0);
 
       InVals.push_back(ArgValue);
@@ -2339,11 +2339,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
         // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
         // promoted to a legal register type i32, we should truncate Arg back to
         // i1/i8/i16.
-        if (Arg.getValueType().isSimple() &&
-            Arg.getValueType().getSimpleVT() == MVT::i32 &&
-            (VA.getLocVT() == MVT::i1 || VA.getLocVT() == MVT::i8 ||
-             VA.getLocVT() == MVT::i16))
-          Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg);
+        if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
+            VA.getValVT() == MVT::i16)
+          Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
 
         SDValue Store =
             DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
diff --git a/test/CodeGen/AArch64/arm64-abi.ll b/test/CodeGen/AArch64/arm64-abi.ll
index e2de434..0ec4956 100644
--- a/test/CodeGen/AArch64/arm64-abi.ll
+++ b/test/CodeGen/AArch64/arm64-abi.ll
@@ -8,15 +8,15 @@ entry:
 ; CHECK-LABEL: i8i16callee:
 ; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5.
 ; They are i8, i16, i8 and i8.
-; CHECK: ldrsb	{{w[0-9]+}}, [sp, #5]
-; CHECK: ldrsh	{{w[0-9]+}}, [sp, #2]
-; CHECK: ldrsb	{{w[0-9]+}}, [sp]
-; CHECK: ldrsb	{{w[0-9]+}}, [sp, #4]
+; CHECK-DAG: ldrsb {{w[0-9]+}}, [sp, #5]
+; CHECK-DAG: ldrsb {{w[0-9]+}}, [sp, #4]
+; CHECK-DAG: ldrsh {{w[0-9]+}}, [sp, #2]
+; CHECK-DAG: ldrsb {{w[0-9]+}}, [sp]
 ; FAST-LABEL: i8i16callee:
-; FAST: ldrb  {{w[0-9]+}}, [sp, #5]
-; FAST: ldrb  {{w[0-9]+}}, [sp, #4]
-; FAST: ldrh  {{w[0-9]+}}, [sp, #2]
-; FAST: ldrb  {{w[0-9]+}}, [sp]
+; FAST-DAG: ldrsb  {{w[0-9]+}}, [sp, #5]
+; FAST-DAG: ldrsb  {{w[0-9]+}}, [sp, #4]
+; FAST-DAG: ldrsh  {{w[0-9]+}}, [sp, #2]
+; FAST-DAG: ldrsb  {{w[0-9]+}}, [sp]
   %conv = sext i8 %a4 to i64
   %conv3 = sext i16 %a5 to i64
   %conv8 = sext i8 %b1 to i64
@@ -44,10 +44,10 @@ entry:
 ; CHECK: i8i16caller
 ; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5.
 ; They are i8, i16, i8 and i8.
-; CHECK: strb {{w[0-9]+}}, [sp, #5]
-; CHECK: strb {{w[0-9]+}}, [sp, #4]
-; CHECK: strh {{w[0-9]+}}, [sp, #2]
-; CHECK: strb {{w[0-9]+}}, [sp]
+; CHECK-DAG: strb {{w[0-9]+}}, [sp, #5]
+; CHECK-DAG: strb {{w[0-9]+}}, [sp, #4]
+; CHECK-DAG: strh {{w[0-9]+}}, [sp, #2]
+; CHECK-DAG: strb {{w[0-9]+}}, [sp]
 ; CHECK: bl
 ; FAST: i8i16caller
 ; FAST: strb {{w[0-9]+}}, [sp]