[clang] 7d7022f - [PowerPC] Fix EmitPPCBuiltinExpr to emit arguments once

Tue Apr 12 13:33:25 PDT 2022

Author: Quinn Pham
Date: 2022-04-12T15:33:20-05:00
New Revision: 7d7022fb0ce42a59417b6402cbdd8397287c4ffb

URL: https://github.com/llvm/llvm-project/commit/7d7022fb0ce42a59417b6402cbdd8397287c4ffb
DIFF: https://github.com/llvm/llvm-project/commit/7d7022fb0ce42a59417b6402cbdd8397287c4ffb.diff

LOG: [PowerPC] Fix EmitPPCBuiltinExpr to emit arguments once

This patch changes `EmitPPCBuiltinExpr` in `CGBuiltin.cpp` to remove
the loop at the beginning of the function that emits the arguments and
to delay emitting the arguments until inside the switch statement. These
changes will put `EmitPPCBuiltinExpr` in line with the strategy of the
target-independent function `EmitBuiltinExpr`. Also, this patch
ensures that arguments are only emitted once.

Tests that included builtins affected by these changes have been
modified to match expected behaviour.

Reviewed By: #powerpc, nemanjai, amyk

Differential Revision: https://reviews.llvm.org/D121637

Added: 
    clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma-types.c
    clang/test/CodeGen/PowerPC/builtins-ppc-stmtexpr-argument.c

Modified: 
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/test/CodeGen/PowerPC/builtins-ppc-fastmath.c
    clang/test/CodeGen/PowerPC/builtins-ppc-vsx.c
    clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-cas.c
    clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-fetch.c
    clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-fp.c
    clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-math.c
    clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-sync.c
    clang/test/CodeGen/PowerPC/ppc-mma-types.c
    clang/test/Sema/ppc-pair-mma-types.c

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 661c0a105f427..4ad4acc0be6b1 100644

--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -15281,14 +15281,17 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
 
 Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
                                            const CallExpr *E) {
-  SmallVector<Value*, 4> Ops;
-
-  for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
-    if (E->getArg(i)->getType()->isArrayType())
-      Ops.push_back(EmitArrayToPointerDecay(E->getArg(i)).getPointer());
-    else
-      Ops.push_back(EmitScalarExpr(E->getArg(i)));
-  }
+  // Do not emit the builtin arguments in the arguments of a function call,
+  // because the evaluation order of function arguments is not specified in C++.
+  // This is important when testing to ensure the arguments are emitted in the
+  // same order every time. Eg:
+  // Instead of:
+  //   return Builder.CreateFDiv(EmitScalarExpr(E->getArg(0)),
+  //                             EmitScalarExpr(E->getArg(1)), "swdiv");
+  // Use:
+  //   Value *Op0 = EmitScalarExpr(E->getArg(0));
+  //   Value *Op1 = EmitScalarExpr(E->getArg(1));
+  //   return Builder.CreateFDiv(Op0, Op1, "swdiv")
 
   Intrinsic::ID ID = Intrinsic::not_intrinsic;
 
@@ -15315,6 +15318,9 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
   case PPC::BI__builtin_vsx_lxvl:
   case PPC::BI__builtin_vsx_lxvll:
   {
+    SmallVector<Value *, 2> Ops;
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
     if(BuiltinID == PPC::BI__builtin_vsx_lxvl ||
        BuiltinID == PPC::BI__builtin_vsx_lxvll){
       Ops[0] = Builder.CreateBitCast(Ops[0], Int8PtrTy);
@@ -15383,6 +15389,10 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
   case PPC::BI__builtin_vsx_stxvl:
   case PPC::BI__builtin_vsx_stxvll:
   {
+    SmallVector<Value *, 3> Ops;
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    Ops.push_back(EmitScalarExpr(E->getArg(2)));
     if(BuiltinID == PPC::BI__builtin_vsx_stxvl ||
       BuiltinID == PPC::BI__builtin_vsx_stxvll ){
       Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy);
@@ -15435,13 +15445,15 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     // Essentially boils down to performing an unaligned VMX load sequence so
     // as to avoid crossing a page boundary and then shuffling the elements
     // into the right side of the vector register.
-    int64_t NumBytes = cast<ConstantInt>(Ops[1])->getZExtValue();
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    int64_t NumBytes = cast<ConstantInt>(Op1)->getZExtValue();
     llvm::Type *ResTy = ConvertType(E->getType());
     bool IsLE = getTarget().isLittleEndian();
 
     // If the user wants the entire vector, just load the entire vector.
     if (NumBytes == 16) {
-      Value *BC = Builder.CreateBitCast(Ops[0], ResTy->getPointerTo());
+      Value *BC = Builder.CreateBitCast(Op0, ResTy->getPointerTo());
       Value *LD =
           Builder.CreateLoad(Address(BC, ResTy, CharUnits::fromQuantity(1)));
       if (!IsLE)
@@ -15459,16 +15471,14 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
                                                 : Intrinsic::ppc_altivec_lvsl);
     llvm::Function *Vperm = CGM.getIntrinsic(Intrinsic::ppc_altivec_vperm);
     Value *HiMem = Builder.CreateGEP(
-        Int8Ty, Ops[0], ConstantInt::get(Ops[1]->getType(), NumBytes - 1));
-    Value *LoLd = Builder.CreateCall(Lvx, Ops[0], "ld.lo");
+        Int8Ty, Op0, ConstantInt::get(Op1->getType(), NumBytes - 1));
+    Value *LoLd = Builder.CreateCall(Lvx, Op0, "ld.lo");
     Value *HiLd = Builder.CreateCall(Lvx, HiMem, "ld.hi");
-    Value *Mask1 = Builder.CreateCall(Lvs, Ops[0], "mask1");
+    Value *Mask1 = Builder.CreateCall(Lvs, Op0, "mask1");
 
-    Ops.clear();
-    Ops.push_back(IsLE ? HiLd : LoLd);
-    Ops.push_back(IsLE ? LoLd : HiLd);
-    Ops.push_back(Mask1);
-    Value *AllElts = Builder.CreateCall(Vperm, Ops, "shuffle1");
+    Op0 = IsLE ? HiLd : LoLd;
+    Op1 = IsLE ? LoLd : HiLd;
+    Value *AllElts = Builder.CreateCall(Vperm, {Op0, Op1, Mask1}, "shuffle1");
     Constant *Zero = llvm::Constant::getNullValue(IsLE ? ResTy : AllElts->getType());
 
     if (IsLE) {
@@ -15489,23 +15499,25 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
         Builder.CreateCall(Vperm, {Zero, AllElts, Mask2}, "shuffle2"), ResTy);
   }
   case PPC::BI__builtin_vsx_strmb: {
-    int64_t NumBytes = cast<ConstantInt>(Ops[1])->getZExtValue();
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    Value *Op2 = EmitScalarExpr(E->getArg(2));
+    int64_t NumBytes = cast<ConstantInt>(Op1)->getZExtValue();
     bool IsLE = getTarget().isLittleEndian();
     auto StoreSubVec = [&](unsigned Width, unsigned Offset, unsigned EltNo) {
       // Storing the whole vector, simply store it on BE and reverse bytes and
       // store on LE.
       if (Width == 16) {
-        Value *BC =
-            Builder.CreateBitCast(Ops[0], Ops[2]->getType()->getPointerTo());
-        Value *StVec = Ops[2];
+        Value *BC = Builder.CreateBitCast(Op0, Op2->getType()->getPointerTo());
+        Value *StVec = Op2;
         if (IsLE) {
           SmallVector<int, 16> RevMask;
           for (int Idx = 0; Idx < 16; Idx++)
             RevMask.push_back(15 - Idx);
-          StVec = Builder.CreateShuffleVector(Ops[2], Ops[2], RevMask);
+          StVec = Builder.CreateShuffleVector(Op2, Op2, RevMask);
         }
         return Builder.CreateStore(
-            StVec, Address(BC, Ops[2]->getType(), CharUnits::fromQuantity(1)));
+            StVec, Address(BC, Op2->getType(), CharUnits::fromQuantity(1)));
       }
       auto *ConvTy = Int64Ty;
       unsigned NumElts = 0;
@@ -15530,9 +15542,9 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
         break;
       }
       Value *Vec = Builder.CreateBitCast(
-          Ops[2], llvm::FixedVectorType::get(ConvTy, NumElts));
-      Value *Ptr = Builder.CreateGEP(Int8Ty, Ops[0],
-                                     ConstantInt::get(Int64Ty, Offset));
+          Op2, llvm::FixedVectorType::get(ConvTy, NumElts));
+      Value *Ptr =
+          Builder.CreateGEP(Int8Ty, Op0, ConstantInt::get(Int64Ty, Offset));
       Value *PtrBC = Builder.CreateBitCast(Ptr, ConvTy->getPointerTo());
       Value *Elt = Builder.CreateExtractElement(Vec, EltNo);
       if (IsLE && Width > 1) {
@@ -15606,17 +15618,20 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
   }
   case PPC::BI__builtin_altivec_vec_replace_elt:
   case PPC::BI__builtin_altivec_vec_replace_unaligned: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    Value *Op2 = EmitScalarExpr(E->getArg(2));
     // The third argument of vec_replace_elt and vec_replace_unaligned must
     // be a compile time constant and will be emitted either to the vinsw
     // or vinsd instruction.
-    ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[2]);
+    ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op2);
     assert(ArgCI &&
            "Third Arg to vinsw/vinsd intrinsic must be a constant integer!");
     llvm::Type *ResultType = ConvertType(E->getType());
     llvm::Function *F = nullptr;
     Value *Call = nullptr;
     int64_t ConstArg = ArgCI->getSExtValue();
-    unsigned ArgWidth = Ops[1]->getType()->getPrimitiveSizeInBits();
+    unsigned ArgWidth = Op1->getType()->getPrimitiveSizeInBits();
     bool Is32Bit = false;
     assert((ArgWidth == 32 || ArgWidth == 64) && "Invalid argument width");
     // The input to vec_replace_elt is an element index, not a byte index.
@@ -15638,24 +15653,24 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
       if (getTarget().isLittleEndian())
         ConstArg = 8 - ConstArg;
     }
-    Ops[2] = ConstantInt::getSigned(Int32Ty, ConstArg);
+    Op2 = ConstantInt::getSigned(Int32Ty, ConstArg);
     // Depending on ArgWidth, the input vector could be a float or a double.
     // If the input vector is a float type, bitcast the inputs to integers. Or,
     // if the input vector is a double, bitcast the inputs to 64-bit integers.
-    if (!Ops[1]->getType()->isIntegerTy(ArgWidth)) {
-      Ops[0] = Builder.CreateBitCast(
-          Ops[0], Is32Bit ? llvm::FixedVectorType::get(Int32Ty, 4)
-                          : llvm::FixedVectorType::get(Int64Ty, 2));
-      Ops[1] = Builder.CreateBitCast(Ops[1], Is32Bit ? Int32Ty : Int64Ty);
+    if (!Op1->getType()->isIntegerTy(ArgWidth)) {
+      Op0 = Builder.CreateBitCast(
+          Op0, Is32Bit ? llvm::FixedVectorType::get(Int32Ty, 4)
+                       : llvm::FixedVectorType::get(Int64Ty, 2));
+      Op1 = Builder.CreateBitCast(Op1, Is32Bit ? Int32Ty : Int64Ty);
     }
     // Emit the call to vinsw or vinsd.
-    Call = Builder.CreateCall(F, Ops);
+    Call = Builder.CreateCall(F, {Op0, Op1, Op2});
     // Depending on the builtin, bitcast to the approriate result type.
     if (BuiltinID == PPC::BI__builtin_altivec_vec_replace_elt &&
-        !Ops[1]->getType()->isIntegerTy())
+        !Op1->getType()->isIntegerTy())
       return Builder.CreateBitCast(Call, ResultType);
     else if (BuiltinID == PPC::BI__builtin_altivec_vec_replace_elt &&
-             Ops[1]->getType()->isIntegerTy())
+             Op1->getType()->isIntegerTy())
       return Call;
     else
       return Builder.CreateBitCast(Call,
@@ -15672,15 +15687,15 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
   }
   case PPC::BI__builtin_altivec_vadduqm:
   case PPC::BI__builtin_altivec_vsubuqm: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
-    Ops[0] =
-        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int128Ty, 1));
-    Ops[1] =
-        Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(Int128Ty, 1));
+    Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int128Ty, 1));
+    Op1 = Builder.CreateBitCast(Op1, llvm::FixedVectorType::get(Int128Ty, 1));
     if (BuiltinID == PPC::BI__builtin_altivec_vadduqm)
-      return Builder.CreateAdd(Ops[0], Ops[1], "vadduqm");
+      return Builder.CreateAdd(Op0, Op1, "vadduqm");
     else
-      return Builder.CreateSub(Ops[0], Ops[1], "vsubuqm");
+      return Builder.CreateSub(Op0, Op1, "vsubuqm");
   }
   // Rotate and insert under mask operation.
   // __rldimi(rs, is, shift, mask)
@@ -15689,29 +15704,37 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
   // (rotl(rs, shift) & mask) | (is & ~mask)
   case PPC::BI__builtin_ppc_rldimi:
   case PPC::BI__builtin_ppc_rlwimi: {
-    llvm::Type *Ty = Ops[0]->getType();
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    Value *Op2 = EmitScalarExpr(E->getArg(2));
+    Value *Op3 = EmitScalarExpr(E->getArg(3));
+    llvm::Type *Ty = Op0->getType();
     Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
     if (BuiltinID == PPC::BI__builtin_ppc_rldimi)
-      Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
-    Value *Shift = Builder.CreateCall(F, {Ops[0], Ops[0], Ops[2]});
-    Value *X = Builder.CreateAnd(Shift, Ops[3]);
-    Value *Y = Builder.CreateAnd(Ops[1], Builder.CreateNot(Ops[3]));
+      Op2 = Builder.CreateZExt(Op2, Int64Ty);
+    Value *Shift = Builder.CreateCall(F, {Op0, Op0, Op2});
+    Value *X = Builder.CreateAnd(Shift, Op3);
+    Value *Y = Builder.CreateAnd(Op1, Builder.CreateNot(Op3));
     return Builder.CreateOr(X, Y);
   }
   // Rotate and insert under mask operation.
   // __rlwnm(rs, shift, mask)
   // rotl(rs, shift) & mask
   case PPC::BI__builtin_ppc_rlwnm: {
-    llvm::Type *Ty = Ops[0]->getType();
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    Value *Op2 = EmitScalarExpr(E->getArg(2));
+    llvm::Type *Ty = Op0->getType();
     Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
-    Value *Shift = Builder.CreateCall(F, {Ops[0], Ops[0], Ops[1]});
-    return Builder.CreateAnd(Shift, Ops[2]);
+    Value *Shift = Builder.CreateCall(F, {Op0, Op0, Op1});
+    return Builder.CreateAnd(Shift, Op2);
   }
   case PPC::BI__builtin_ppc_poppar4:
   case PPC::BI__builtin_ppc_poppar8: {
-    llvm::Type *ArgType = Ops[0]->getType();
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    llvm::Type *ArgType = Op0->getType();
     Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
-    Value *Tmp = Builder.CreateCall(F, Ops[0]);
+    Value *Tmp = Builder.CreateCall(F, Op0);
 
     llvm::Type *ResultType = ConvertType(E->getType());
     Value *Result = Builder.CreateAnd(Tmp, llvm::ConstantInt::get(ArgType, 1));
@@ -15721,10 +15744,12 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     return Result;
   }
   case PPC::BI__builtin_ppc_cmpb: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
     if (getTarget().getTriple().isPPC64()) {
       Function *F =
           CGM.getIntrinsic(Intrinsic::ppc_cmpb, {Int64Ty, Int64Ty, Int64Ty});
-      return Builder.CreateCall(F, Ops, "cmpb");
+      return Builder.CreateCall(F, {Op0, Op1}, "cmpb");
     }
     // For 32 bit, emit the code as below:
     // %conv = trunc i64 %a to i32
@@ -15742,13 +15767,13 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     // ret i64 %or
     Function *F =
         CGM.getIntrinsic(Intrinsic::ppc_cmpb, {Int32Ty, Int32Ty, Int32Ty});
-    Value *ArgOneLo = Builder.CreateTrunc(Ops[0], Int32Ty);
-    Value *ArgTwoLo = Builder.CreateTrunc(Ops[1], Int32Ty);
+    Value *ArgOneLo = Builder.CreateTrunc(Op0, Int32Ty);
+    Value *ArgTwoLo = Builder.CreateTrunc(Op1, Int32Ty);
     Constant *ShiftAmt = ConstantInt::get(Int64Ty, 32);
     Value *ArgOneHi =
-        Builder.CreateTrunc(Builder.CreateLShr(Ops[0], ShiftAmt), Int32Ty);
+        Builder.CreateTrunc(Builder.CreateLShr(Op0, ShiftAmt), Int32Ty);
     Value *ArgTwoHi =
-        Builder.CreateTrunc(Builder.CreateLShr(Ops[1], ShiftAmt), Int32Ty);
+        Builder.CreateTrunc(Builder.CreateLShr(Op1, ShiftAmt), Int32Ty);
     Value *ResLo = Builder.CreateZExt(
         Builder.CreateCall(F, {ArgOneLo, ArgTwoLo}, "cmpb"), Int64Ty);
     Value *ResHiShift = Builder.CreateZExt(
@@ -15842,27 +15867,32 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     return FDiv;
   }
   case PPC::BI__builtin_ppc_alignx: {
-    ConstantInt *AlignmentCI = cast<ConstantInt>(Ops[0]);
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    ConstantInt *AlignmentCI = cast<ConstantInt>(Op0);
     if (AlignmentCI->getValue().ugt(llvm::Value::MaximumAlignment))
       AlignmentCI = ConstantInt::get(AlignmentCI->getType(),
                                      llvm::Value::MaximumAlignment);
 
-    emitAlignmentAssumption(Ops[1], E->getArg(1),
+    emitAlignmentAssumption(Op1, E->getArg(1),
                             /*The expr loc is sufficient.*/ SourceLocation(),
                             AlignmentCI, nullptr);
-    return Ops[1];
+    return Op1;
   }
   case PPC::BI__builtin_ppc_rdlam: {
-    llvm::Type *Ty = Ops[0]->getType();
-    Value *ShiftAmt = Builder.CreateIntCast(Ops[1], Ty, false);
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    Value *Op2 = EmitScalarExpr(E->getArg(2));
+    llvm::Type *Ty = Op0->getType();
+    Value *ShiftAmt = Builder.CreateIntCast(Op1, Ty, false);
     Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
-    Value *Rotate = Builder.CreateCall(F, {Ops[0], Ops[0], ShiftAmt});
-    return Builder.CreateAnd(Rotate, Ops[2]);
+    Value *Rotate = Builder.CreateCall(F, {Op0, Op0, ShiftAmt});
+    return Builder.CreateAnd(Rotate, Op2);
   }
   case PPC::BI__builtin_ppc_load2r: {
     Function *F = CGM.getIntrinsic(Intrinsic::ppc_load2r);
-    Ops[0] = Builder.CreateBitCast(Ops[0], Int8PtrTy);
-    Value *LoadIntrinsic = Builder.CreateCall(F, Ops);
+    Value *Op0 = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), Int8PtrTy);
+    Value *LoadIntrinsic = Builder.CreateCall(F, {Op0});
     return Builder.CreateTrunc(LoadIntrinsic, Int16Ty);
   }
   // FMA variations
@@ -15924,11 +15954,14 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
   }
 
   case PPC::BI__builtin_vsx_insertword: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    Value *Op2 = EmitScalarExpr(E->getArg(2));
     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxinsertw);
 
     // Third argument is a compile time constant int. It must be clamped to
     // to the range [0, 12].
-    ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[2]);
+    ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op2);
     assert(ArgCI &&
            "Third arg to xxinsertw intrinsic must be constant integer");
     const int64_t MaxIndex = 12;
@@ -15939,40 +15972,38 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     // word from the first argument, and inserts it in the second argument. The
     // instruction extracts the word from its second input register and inserts
     // it into its first input register, so swap the first and second arguments.
-    std::swap(Ops[0], Ops[1]);
+    std::swap(Op0, Op1);
 
     // Need to cast the second argument from a vector of unsigned int to a
     // vector of long long.
-    Ops[1] =
-        Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(Int64Ty, 2));
+    Op1 = Builder.CreateBitCast(Op1, llvm::FixedVectorType::get(Int64Ty, 2));
 
     if (getTarget().isLittleEndian()) {
       // Reverse the double words in the vector we will extract from.
-      Ops[0] =
-          Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 2));
-      Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], ArrayRef<int>{1, 0});
+      Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int64Ty, 2));
+      Op0 = Builder.CreateShuffleVector(Op0, Op0, ArrayRef<int>{1, 0});
 
       // Reverse the index.
       Index = MaxIndex - Index;
     }
 
     // Intrinsic expects the first arg to be a vector of int.
-    Ops[0] =
-        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 4));
-    Ops[2] = ConstantInt::getSigned(Int32Ty, Index);
-    return Builder.CreateCall(F, Ops);
+    Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int32Ty, 4));
+    Op2 = ConstantInt::getSigned(Int32Ty, Index);
+    return Builder.CreateCall(F, {Op0, Op1, Op2});
   }
 
   case PPC::BI__builtin_vsx_extractuword: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxextractuw);
 
     // Intrinsic expects the first argument to be a vector of doublewords.
-    Ops[0] =
-        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 2));
+    Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int64Ty, 2));
 
     // The second argument is a compile time constant int that needs to
     // be clamped to the range [0, 12].
-    ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[1]);
+    ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op1);
     assert(ArgCI &&
            "Second Arg to xxextractuw intrinsic must be a constant integer!");
     const int64_t MaxIndex = 12;
@@ -15981,29 +16012,30 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     if (getTarget().isLittleEndian()) {
       // Reverse the index.
       Index = MaxIndex - Index;
-      Ops[1] = ConstantInt::getSigned(Int32Ty, Index);
+      Op1 = ConstantInt::getSigned(Int32Ty, Index);
 
       // Emit the call, then reverse the double words of the results vector.
-      Value *Call = Builder.CreateCall(F, Ops);
+      Value *Call = Builder.CreateCall(F, {Op0, Op1});
 
       Value *ShuffleCall =
           Builder.CreateShuffleVector(Call, Call, ArrayRef<int>{1, 0});
       return ShuffleCall;
     } else {
-      Ops[1] = ConstantInt::getSigned(Int32Ty, Index);
-      return Builder.CreateCall(F, Ops);
+      Op1 = ConstantInt::getSigned(Int32Ty, Index);
+      return Builder.CreateCall(F, {Op0, Op1});
     }
   }
 
   case PPC::BI__builtin_vsx_xxpermdi: {
-    ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[2]);
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    Value *Op2 = EmitScalarExpr(E->getArg(2));
+    ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op2);
     assert(ArgCI && "Third arg must be constant integer!");
 
     unsigned Index = ArgCI->getZExtValue();
-    Ops[0] =
-        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 2));
-    Ops[1] =
-        Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(Int64Ty, 2));
+    Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int64Ty, 2));
+    Op1 = Builder.CreateBitCast(Op1, llvm::FixedVectorType::get(Int64Ty, 2));
 
     // Account for endianness by treating this as just a shuffle. So we use the
     // same indices for both LE and BE in order to produce expected results in
@@ -16012,21 +16044,21 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     int ElemIdx1 = 2 + (Index & 1);
 
     int ShuffleElts[2] = {ElemIdx0, ElemIdx1};
-    Value *ShuffleCall =
-        Builder.CreateShuffleVector(Ops[0], Ops[1], ShuffleElts);
+    Value *ShuffleCall = Builder.CreateShuffleVector(Op0, Op1, ShuffleElts);
     QualType BIRetType = E->getType();
     auto RetTy = ConvertType(BIRetType);
     return Builder.CreateBitCast(ShuffleCall, RetTy);
   }
 
   case PPC::BI__builtin_vsx_xxsldwi: {
-    ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[2]);
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    Value *Op2 = EmitScalarExpr(E->getArg(2));
+    ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op2);
     assert(ArgCI && "Third argument must be a compile time constant");
     unsigned Index = ArgCI->getZExtValue() & 0x3;
-    Ops[0] =
-        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 4));
-    Ops[1] =
-        Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(Int32Ty, 4));
+    Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int32Ty, 4));
+    Op1 = Builder.CreateBitCast(Op1, llvm::FixedVectorType::get(Int32Ty, 4));
 
     // Create a shuffle mask
     int ElemIdx0;
@@ -16050,28 +16082,31 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     }
 
     int ShuffleElts[4] = {ElemIdx0, ElemIdx1, ElemIdx2, ElemIdx3};
-    Value *ShuffleCall =
-        Builder.CreateShuffleVector(Ops[0], Ops[1], ShuffleElts);
+    Value *ShuffleCall = Builder.CreateShuffleVector(Op0, Op1, ShuffleElts);
     QualType BIRetType = E->getType();
     auto RetTy = ConvertType(BIRetType);
     return Builder.CreateBitCast(ShuffleCall, RetTy);
   }
 
   case PPC::BI__builtin_pack_vector_int128: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
     bool isLittleEndian = getTarget().isLittleEndian();
     Value *UndefValue =
-        llvm::UndefValue::get(llvm::FixedVectorType::get(Ops[0]->getType(), 2));
+        llvm::UndefValue::get(llvm::FixedVectorType::get(Op0->getType(), 2));
     Value *Res = Builder.CreateInsertElement(
-        UndefValue, Ops[0], (uint64_t)(isLittleEndian ? 1 : 0));
-    Res = Builder.CreateInsertElement(Res, Ops[1],
+        UndefValue, Op0, (uint64_t)(isLittleEndian ? 1 : 0));
+    Res = Builder.CreateInsertElement(Res, Op1,
                                       (uint64_t)(isLittleEndian ? 0 : 1));
     return Builder.CreateBitCast(Res, ConvertType(E->getType()));
   }
 
   case PPC::BI__builtin_unpack_vector_int128: {
-    ConstantInt *Index = cast<ConstantInt>(Ops[1]);
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    ConstantInt *Index = cast<ConstantInt>(Op1);
     Value *Unpacked = Builder.CreateBitCast(
-        Ops[0], llvm::FixedVectorType::get(ConvertType(E->getType()), 2));
+        Op0, llvm::FixedVectorType::get(ConvertType(E->getType()), 2));
 
     if (getTarget().isLittleEndian())
       Index = ConstantInt::get(Index->getType(), 1 - Index->getZExtValue());
@@ -16081,9 +16116,9 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
 
   case PPC::BI__builtin_ppc_sthcx: {
     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_sthcx);
-    Ops[0] = Builder.CreateBitCast(Ops[0], Int8PtrTy);
-    Ops[1] = Builder.CreateSExt(Ops[1], Int32Ty);
-    return Builder.CreateCall(F, Ops);
+    Value *Op0 = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), Int8PtrTy);
+    Value *Op1 = Builder.CreateSExt(EmitScalarExpr(E->getArg(1)), Int32Ty);
+    return Builder.CreateCall(F, {Op0, Op1});
   }
 
   // The PPC MMA builtins take a pointer to a __vector_quad as an argument.
@@ -16096,6 +16131,12 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
   case PPC::BI__builtin_##Name:
 #include "clang/Basic/BuiltinsPPC.def"
   {
+    SmallVector<Value *, 4> Ops;
+    for (unsigned i = 0, e = E->getNumArgs(); i != e; i++)
+      if (E->getArg(i)->getType()->isArrayType())
+        Ops.push_back(EmitArrayToPointerDecay(E->getArg(i)).getPointer());
+      else
+        Ops.push_back(EmitScalarExpr(E->getArg(i)));
     // The first argument of these two builtins is a pointer used to store their
     // result. However, the llvm intrinsics return their result in multiple
     // return values. So, here we emit code extracting these values from the
@@ -16179,8 +16220,9 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     Value *OldVal = Builder.CreateLoad(OldValAddr);
     QualType AtomicTy = E->getArg(0)->getType()->getPointeeType();
     LValue LV = MakeAddrLValue(Addr, AtomicTy);
+    Value *Op2 = EmitScalarExpr(E->getArg(2));
     auto Pair = EmitAtomicCompareExchange(
-        LV, RValue::get(OldVal), RValue::get(Ops[2]), E->getExprLoc(),
+        LV, RValue::get(OldVal), RValue::get(Op2), E->getExprLoc(),
         llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Monotonic, true);
     // Unlike c11's atomic_compare_exchange, accroding to
     // https://www.ibm.com/docs/en/xl-c-and-cpp-aix/16.1?topic=functions-compare-swap-compare-swaplp
@@ -16220,38 +16262,45 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
   case PPC::BI__builtin_ppc_lbarx:
     return emitPPCLoadReserveIntrinsic(*this, BuiltinID, E);
   case PPC::BI__builtin_ppc_mfspr: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
     llvm::Type *RetType = CGM.getDataLayout().getTypeSizeInBits(VoidPtrTy) == 32
                               ? Int32Ty
                               : Int64Ty;
     Function *F = CGM.getIntrinsic(Intrinsic::ppc_mfspr, RetType);
-    return Builder.CreateCall(F, Ops);
+    return Builder.CreateCall(F, {Op0});
   }
   case PPC::BI__builtin_ppc_mtspr: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
     llvm::Type *RetType = CGM.getDataLayout().getTypeSizeInBits(VoidPtrTy) == 32
                               ? Int32Ty
                               : Int64Ty;
     Function *F = CGM.getIntrinsic(Intrinsic::ppc_mtspr, RetType);
-    return Builder.CreateCall(F, Ops);
+    return Builder.CreateCall(F, {Op0, Op1});
   }
   case PPC::BI__builtin_ppc_popcntb: {
     Value *ArgValue = EmitScalarExpr(E->getArg(0));
     llvm::Type *ArgType = ArgValue->getType();
     Function *F = CGM.getIntrinsic(Intrinsic::ppc_popcntb, {ArgType, ArgType});
-    return Builder.CreateCall(F, Ops, "popcntb");
+    return Builder.CreateCall(F, {ArgValue}, "popcntb");
   }
   case PPC::BI__builtin_ppc_mtfsf: {
     // The builtin takes a uint32 that needs to be cast to an
     // f64 to be passed to the intrinsic.
-    Value *Cast = Builder.CreateUIToFP(Ops[1], DoubleTy);
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    Value *Cast = Builder.CreateUIToFP(Op1, DoubleTy);
     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_mtfsf);
-    return Builder.CreateCall(F, {Ops[0], Cast}, "");
+    return Builder.CreateCall(F, {Op0, Cast}, "");
   }
 
   case PPC::BI__builtin_ppc_swdiv_nochk:
   case PPC::BI__builtin_ppc_swdivs_nochk: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
     FastMathFlags FMF = Builder.getFastMathFlags();
     Builder.getFastMathFlags().setFast();
-    Value *FDiv = Builder.CreateFDiv(Ops[0], Ops[1], "swdiv_nochk");
+    Value *FDiv = Builder.CreateFDiv(Op0, Op1, "swdiv_nochk");
     Builder.getFastMathFlags() &= (FMF);
     return FDiv;
   }
@@ -16291,7 +16340,9 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
                            Intrinsic::experimental_constrained_sqrt))
         .getScalarVal();
   case PPC::BI__builtin_ppc_test_data_class: {
-    llvm::Type *ArgType = EmitScalarExpr(E->getArg(0))->getType();
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    llvm::Type *ArgType = Op0->getType();
     unsigned IntrinsicID;
     if (ArgType->isDoubleTy())
       IntrinsicID = Intrinsic::ppc_test_data_class_d;
@@ -16299,24 +16350,63 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
       IntrinsicID = Intrinsic::ppc_test_data_class_f;
     else
       llvm_unreachable("Invalid Argument Type");
-    return Builder.CreateCall(CGM.getIntrinsic(IntrinsicID), Ops,
+    return Builder.CreateCall(CGM.getIntrinsic(IntrinsicID), {Op0, Op1},
                               "test_data_class");
   }
-  case PPC::BI__builtin_ppc_maxfe:
-    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_maxfe), Ops);
-  case PPC::BI__builtin_ppc_maxfl:
-    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_maxfl), Ops);
-  case PPC::BI__builtin_ppc_maxfs:
-    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_maxfs), Ops);
-  case PPC::BI__builtin_ppc_minfe:
-    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_minfe), Ops);
-  case PPC::BI__builtin_ppc_minfl:
-    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_minfl), Ops);
-  case PPC::BI__builtin_ppc_minfs:
-    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_minfs), Ops);
+  case PPC::BI__builtin_ppc_maxfe: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    Value *Op2 = EmitScalarExpr(E->getArg(2));
+    Value *Op3 = EmitScalarExpr(E->getArg(3));
+    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_maxfe),
+                              {Op0, Op1, Op2, Op3});
+  }
+  case PPC::BI__builtin_ppc_maxfl: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    Value *Op2 = EmitScalarExpr(E->getArg(2));
+    Value *Op3 = EmitScalarExpr(E->getArg(3));
+    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_maxfl),
+                              {Op0, Op1, Op2, Op3});
+  }
+  case PPC::BI__builtin_ppc_maxfs: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    Value *Op2 = EmitScalarExpr(E->getArg(2));
+    Value *Op3 = EmitScalarExpr(E->getArg(3));
+    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_maxfs),
+                              {Op0, Op1, Op2, Op3});
+  }
+  case PPC::BI__builtin_ppc_minfe: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    Value *Op2 = EmitScalarExpr(E->getArg(2));
+    Value *Op3 = EmitScalarExpr(E->getArg(3));
+    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_minfe),
+                              {Op0, Op1, Op2, Op3});
+  }
+  case PPC::BI__builtin_ppc_minfl: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    Value *Op2 = EmitScalarExpr(E->getArg(2));
+    Value *Op3 = EmitScalarExpr(E->getArg(3));
+    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_minfl),
+                              {Op0, Op1, Op2, Op3});
+  }
+  case PPC::BI__builtin_ppc_minfs: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    Value *Op2 = EmitScalarExpr(E->getArg(2));
+    Value *Op3 = EmitScalarExpr(E->getArg(3));
+    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_minfs),
+                              {Op0, Op1, Op2, Op3});
+  }
   case PPC::BI__builtin_ppc_swdiv:
-  case PPC::BI__builtin_ppc_swdivs:
-    return Builder.CreateFDiv(Ops[0], Ops[1], "swdiv");
+  case PPC::BI__builtin_ppc_swdivs: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    return Builder.CreateFDiv(Op0, Op1, "swdiv");
+  }
   }
 }
 

diff  --git a/clang/test/CodeGen/PowerPC/builtins-ppc-fastmath.c b/clang/test/CodeGen/PowerPC/builtins-ppc-fastmath.c
index 1dc0f43cf4dd4..c2fa0c5f73ce8 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-fastmath.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-fastmath.c
@@ -18,11 +18,9 @@ extern vector double f;
 // CHECK-LABEL: @test_flags_recipdivf(
 // CHECK:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* @a, align 16
 // CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* @b, align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* @a, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* @b, align 16
-// CHECK-NEXT:    [[RECIPDIV:%.*]] = fdiv fast <4 x float> [[TMP2]], [[TMP3]]
-// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* @c, align 16
-// CHECK-NEXT:    [[ADD:%.*]] = fadd <4 x float> [[RECIPDIV]], [[TMP4]]
+// CHECK-NEXT:    [[RECIPDIV:%.*]] = fdiv fast <4 x float> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* @c, align 16
+// CHECK-NEXT:    [[ADD:%.*]] = fadd <4 x float> [[RECIPDIV]], [[TMP2]]
 // CHECK-NEXT:    ret <4 x float> [[ADD]]
 //
 vector float test_flags_recipdivf() {
@@ -32,11 +30,9 @@ vector float test_flags_recipdivf() {
 // CHECK-LABEL: @test_flags_recipdivd(
 // CHECK:    [[TMP0:%.*]] = load <2 x double>, <2 x double>* @d, align 16
 // CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* @e, align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* @d, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* @e, align 16
-// CHECK-NEXT:    [[RECIPDIV:%.*]] = fdiv fast <2 x double> [[TMP2]], [[TMP3]]
-// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* @f, align 16
-// CHECK-NEXT:    [[ADD:%.*]] = fadd <2 x double> [[RECIPDIV]], [[TMP4]]
+// CHECK-NEXT:    [[RECIPDIV:%.*]] = fdiv fast <2 x double> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* @f, align 16
+// CHECK-NEXT:    [[ADD:%.*]] = fadd <2 x double> [[RECIPDIV]], [[TMP2]]
 // CHECK-NEXT:    ret <2 x double> [[ADD]]
 //
 vector double test_flags_recipdivd() {
@@ -45,11 +41,10 @@ vector double test_flags_recipdivd() {
 
 // CHECK-LABEL: @test_flags_rsqrtf(
 // CHECK:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* @a, align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* @a, align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]])
-// CHECK-NEXT:    [[RSQRT:%.*]] = fdiv fast <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
-// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* @b, align 16
-// CHECK-NEXT:    [[ADD:%.*]] = fadd <4 x float> [[RSQRT]], [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
+// CHECK-NEXT:    [[RSQRT:%.*]] = fdiv fast <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP1]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* @b, align 16
+// CHECK-NEXT:    [[ADD:%.*]] = fadd <4 x float> [[RSQRT]], [[TMP2]]
 // CHECK-NEXT:    ret <4 x float> [[ADD]]
 //
 vector float test_flags_rsqrtf() {
@@ -58,11 +53,10 @@ vector float test_flags_rsqrtf() {
 
 // CHECK-LABEL: @test_flags_rsqrtd(
 // CHECK:    [[TMP0:%.*]] = load <2 x double>, <2 x double>* @d, align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* @d, align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])
-// CHECK-NEXT:    [[RSQRT:%.*]] = fdiv fast <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP2]]
-// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* @e, align 16
-// CHECK-NEXT:    [[ADD:%.*]] = fadd <2 x double> [[RSQRT]], [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP0]])
+// CHECK-NEXT:    [[RSQRT:%.*]] = fdiv fast <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP1]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* @e, align 16
+// CHECK-NEXT:    [[ADD:%.*]] = fadd <2 x double> [[RSQRT]], [[TMP2]]
 // CHECK-NEXT:    ret <2 x double> [[ADD]]
 //
 vector double test_flags_rsqrtd() {

diff  --git a/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma-types.c b/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma-types.c
new file mode 100644
index 0000000000000..79cb65ebc7873
--- /dev/null
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma-types.c
@@ -0,0 +1,259 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple powerpc64le-unknown-unknown -target-cpu pwr10 \
+// RUN:   -no-opaque-pointers -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple powerpc64-unknown-unknown -target-cpu pwr10 \
+// RUN:   -no-opaque-pointers -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK-BE
+
+// CHECK-LABEL: @testVQLocal(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VC_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT:    [[VQP:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    [[VQ1:%.*]] = alloca <512 x i1>, align 64
+// CHECK-NEXT:    [[VQ2:%.*]] = alloca <512 x i1>, align 64
+// CHECK-NEXT:    [[VQ3:%.*]] = alloca <512 x i1>, align 64
+// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    store <16 x i8> [[VC:%.*]], <16 x i8>* [[VC_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1>* [[TMP1]], <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <512 x i1>*, <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load <512 x i1>, <512 x i1>* [[TMP2]], align 64
+// CHECK-NEXT:    store <512 x i1> [[TMP3]], <512 x i1>* [[VQ1]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
+// CHECK-NEXT:    store <512 x i1> [[TMP4]], <512 x i1>* [[VQ2]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[VC_ADDR]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[VC_ADDR]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]])
+// CHECK-NEXT:    store <512 x i1> [[TMP7]], <512 x i1>* [[VQ3]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = load <512 x i1>, <512 x i1>* [[VQ3]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = load <512 x i1>*, <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    store <512 x i1> [[TMP8]], <512 x i1>* [[TMP9]], align 64
+// CHECK-NEXT:    ret void
+//
+// CHECK-BE-LABEL: @testVQLocal(
+// CHECK-BE-NEXT:  entry:
+// CHECK-BE-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-BE-NEXT:    [[VC_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-BE-NEXT:    [[VQP:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-BE-NEXT:    [[VQ1:%.*]] = alloca <512 x i1>, align 64
+// CHECK-BE-NEXT:    [[VQ2:%.*]] = alloca <512 x i1>, align 64
+// CHECK-BE-NEXT:    [[VQ3:%.*]] = alloca <512 x i1>, align 64
+// CHECK-BE-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-BE-NEXT:    store <16 x i8> [[VC:%.*]], <16 x i8>* [[VC_ADDR]], align 16
+// CHECK-BE-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-BE-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <512 x i1>*
+// CHECK-BE-NEXT:    store <512 x i1>* [[TMP1]], <512 x i1>** [[VQP]], align 8
+// CHECK-BE-NEXT:    [[TMP2:%.*]] = load <512 x i1>*, <512 x i1>** [[VQP]], align 8
+// CHECK-BE-NEXT:    [[TMP3:%.*]] = load <512 x i1>, <512 x i1>* [[TMP2]], align 64
+// CHECK-BE-NEXT:    store <512 x i1> [[TMP3]], <512 x i1>* [[VQ1]], align 64
+// CHECK-BE-NEXT:    [[TMP4:%.*]] = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
+// CHECK-BE-NEXT:    store <512 x i1> [[TMP4]], <512 x i1>* [[VQ2]], align 64
+// CHECK-BE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[VC_ADDR]], align 16
+// CHECK-BE-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[VC_ADDR]], align 16
+// CHECK-BE-NEXT:    [[TMP7:%.*]] = call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]])
+// CHECK-BE-NEXT:    store <512 x i1> [[TMP7]], <512 x i1>* [[VQ3]], align 64
+// CHECK-BE-NEXT:    [[TMP8:%.*]] = load <512 x i1>, <512 x i1>* [[VQ3]], align 64
+// CHECK-BE-NEXT:    [[TMP9:%.*]] = load <512 x i1>*, <512 x i1>** [[VQP]], align 8
+// CHECK-BE-NEXT:    store <512 x i1> [[TMP8]], <512 x i1>* [[TMP9]], align 64
+// CHECK-BE-NEXT:    ret void
+//
+void testVQLocal(int *ptr, vector unsigned char vc) {
+  __vector_quad *vqp = (__vector_quad *)ptr;
+  __vector_quad vq1 = *vqp;
+  __vector_quad vq2;
+  __builtin_mma_xxsetaccz(&vq2);
+  __vector_quad vq3;
+  __builtin_mma_xvi4ger8(&vq3, vc, vc);
+  *vqp = vq3;
+}
+
+// CHECK-LABEL: @testVPLocal(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VC_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT:    [[VPP:%.*]] = alloca <256 x i1>*, align 8
+// CHECK-NEXT:    [[VP1:%.*]] = alloca <256 x i1>, align 32
+// CHECK-NEXT:    [[VP2:%.*]] = alloca <256 x i1>, align 32
+// CHECK-NEXT:    [[VP3:%.*]] = alloca <256 x i1>, align 32
+// CHECK-NEXT:    [[VQ:%.*]] = alloca <512 x i1>, align 64
+// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    store <16 x i8> [[VC:%.*]], <16 x i8>* [[VC_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <256 x i1>*
+// CHECK-NEXT:    store <256 x i1>* [[TMP1]], <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i1>*, <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32
+// CHECK-NEXT:    store <256 x i1> [[TMP3]], <256 x i1>* [[VP1]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[VC_ADDR]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[VC_ADDR]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
+// CHECK-NEXT:    store <256 x i1> [[TMP6]], <256 x i1>* [[VP2]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[VC_ADDR]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[VC_ADDR]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP8]], <16 x i8> [[TMP7]])
+// CHECK-NEXT:    store <256 x i1> [[TMP9]], <256 x i1>* [[VP2]], align 64
+// CHECK-NEXT:    [[TMP10:%.*]] = load <256 x i1>, <256 x i1>* [[VP3]], align 32
+// CHECK-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* [[VC_ADDR]], align 16
+// CHECK-NEXT:    [[TMP12:%.*]] = call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> [[TMP10]], <16 x i8> [[TMP11]])
+// CHECK-NEXT:    store <512 x i1> [[TMP12]], <512 x i1>* [[VQ]], align 64
+// CHECK-NEXT:    [[TMP13:%.*]] = load <256 x i1>, <256 x i1>* [[VP3]], align 32
+// CHECK-NEXT:    [[TMP14:%.*]] = load <256 x i1>*, <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    store <256 x i1> [[TMP13]], <256 x i1>* [[TMP14]], align 32
+// CHECK-NEXT:    ret void
+//
+// CHECK-BE-LABEL: @testVPLocal(
+// CHECK-BE-NEXT:  entry:
+// CHECK-BE-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-BE-NEXT:    [[VC_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-BE-NEXT:    [[VPP:%.*]] = alloca <256 x i1>*, align 8
+// CHECK-BE-NEXT:    [[VP1:%.*]] = alloca <256 x i1>, align 32
+// CHECK-BE-NEXT:    [[VP2:%.*]] = alloca <256 x i1>, align 32
+// CHECK-BE-NEXT:    [[VP3:%.*]] = alloca <256 x i1>, align 32
+// CHECK-BE-NEXT:    [[VQ:%.*]] = alloca <512 x i1>, align 64
+// CHECK-BE-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-BE-NEXT:    store <16 x i8> [[VC:%.*]], <16 x i8>* [[VC_ADDR]], align 16
+// CHECK-BE-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-BE-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <256 x i1>*
+// CHECK-BE-NEXT:    store <256 x i1>* [[TMP1]], <256 x i1>** [[VPP]], align 8
+// CHECK-BE-NEXT:    [[TMP2:%.*]] = load <256 x i1>*, <256 x i1>** [[VPP]], align 8
+// CHECK-BE-NEXT:    [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32
+// CHECK-BE-NEXT:    store <256 x i1> [[TMP3]], <256 x i1>* [[VP1]], align 32
+// CHECK-BE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[VC_ADDR]], align 16
+// CHECK-BE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[VC_ADDR]], align 16
+// CHECK-BE-NEXT:    [[TMP6:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
+// CHECK-BE-NEXT:    store <256 x i1> [[TMP6]], <256 x i1>* [[VP2]], align 64
+// CHECK-BE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[VC_ADDR]], align 16
+// CHECK-BE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[VC_ADDR]], align 16
+// CHECK-BE-NEXT:    [[TMP9:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+// CHECK-BE-NEXT:    store <256 x i1> [[TMP9]], <256 x i1>* [[VP2]], align 64
+// CHECK-BE-NEXT:    [[TMP10:%.*]] = load <256 x i1>, <256 x i1>* [[VP3]], align 32
+// CHECK-BE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* [[VC_ADDR]], align 16
+// CHECK-BE-NEXT:    [[TMP12:%.*]] = call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> [[TMP10]], <16 x i8> [[TMP11]])
+// CHECK-BE-NEXT:    store <512 x i1> [[TMP12]], <512 x i1>* [[VQ]], align 64
+// CHECK-BE-NEXT:    [[TMP13:%.*]] = load <256 x i1>, <256 x i1>* [[VP3]], align 32
+// CHECK-BE-NEXT:    [[TMP14:%.*]] = load <256 x i1>*, <256 x i1>** [[VPP]], align 8
+// CHECK-BE-NEXT:    store <256 x i1> [[TMP13]], <256 x i1>* [[TMP14]], align 32
+// CHECK-BE-NEXT:    ret void
+//
+void testVPLocal(int *ptr, vector unsigned char vc) {
+  __vector_pair *vpp = (__vector_pair *)ptr;
+  __vector_pair vp1 = *vpp;
+  __vector_pair vp2;
+  __builtin_vsx_assemble_pair(&vp2, vc, vc);
+  __builtin_vsx_build_pair(&vp2, vc, vc);
+  __vector_pair vp3;
+  __vector_quad vq;
+  __builtin_mma_xvf64ger(&vq, vp3, vc);
+  *vpp = vp3;
+}
+
+// CHECK-LABEL: @testRestrictQualifiedPointer2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[ACC_ADDR:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    [[ARR:%.*]] = alloca [4 x <4 x float>], align 16
+// CHECK-NEXT:    store <512 x i1>* [[ACC:%.*]], <512 x i1>** [[ACC_ADDR]], align 8
+// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[ARR]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load <512 x i1>*, <512 x i1>** [[ACC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <512 x i1>*, <512 x i1>** [[ACC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <512 x i1>, <512 x i1>* [[TMP1]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float>* [[ARRAYDECAY]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP4]], i32 0
+// CHECK-NEXT:    store <16 x i8> [[TMP5]], <16 x i8>* [[TMP6]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP4]], i32 1
+// CHECK-NEXT:    store <16 x i8> [[TMP7]], <16 x i8>* [[TMP8]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP4]], i32 2
+// CHECK-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* [[TMP10]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP4]], i32 3
+// CHECK-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[TMP12]], align 16
+// CHECK-NEXT:    ret void
+//
+// CHECK-BE-LABEL: @testRestrictQualifiedPointer2(
+// CHECK-BE-NEXT:  entry:
+// CHECK-BE-NEXT:    [[ACC_ADDR:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-BE-NEXT:    [[ARR:%.*]] = alloca [4 x <4 x float>], align 16
+// CHECK-BE-NEXT:    store <512 x i1>* [[ACC:%.*]], <512 x i1>** [[ACC_ADDR]], align 8
+// CHECK-BE-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[ARR]], i64 0, i64 0
+// CHECK-BE-NEXT:    [[TMP0:%.*]] = load <512 x i1>*, <512 x i1>** [[ACC_ADDR]], align 8
+// CHECK-BE-NEXT:    [[TMP1:%.*]] = load <512 x i1>*, <512 x i1>** [[ACC_ADDR]], align 8
+// CHECK-BE-NEXT:    [[TMP2:%.*]] = load <512 x i1>, <512 x i1>* [[TMP1]], align 64
+// CHECK-BE-NEXT:    [[TMP3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> [[TMP2]])
+// CHECK-BE-NEXT:    [[TMP4:%.*]] = bitcast <4 x float>* [[ARRAYDECAY]] to <16 x i8>*
+// CHECK-BE-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0
+// CHECK-BE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP4]], i32 0
+// CHECK-BE-NEXT:    store <16 x i8> [[TMP5]], <16 x i8>* [[TMP6]], align 16
+// CHECK-BE-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1
+// CHECK-BE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP4]], i32 1
+// CHECK-BE-NEXT:    store <16 x i8> [[TMP7]], <16 x i8>* [[TMP8]], align 16
+// CHECK-BE-NEXT:    [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2
+// CHECK-BE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP4]], i32 2
+// CHECK-BE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* [[TMP10]], align 16
+// CHECK-BE-NEXT:    [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3
+// CHECK-BE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP4]], i32 3
+// CHECK-BE-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[TMP12]], align 16
+// CHECK-BE-NEXT:    ret void
+//
+void testRestrictQualifiedPointer2(__vector_quad *__restrict acc) {
+  vector float arr[4];
+  __builtin_mma_disassemble_acc(arr, acc);
+}
+
+// CHECK-LABEL: @testVolatileQualifiedPointer2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[ACC_ADDR:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    [[ARR:%.*]] = alloca [4 x <4 x float>], align 16
+// CHECK-NEXT:    store volatile <512 x i1>* [[ACC:%.*]], <512 x i1>** [[ACC_ADDR]], align 8
+// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[ARR]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load volatile <512 x i1>*, <512 x i1>** [[ACC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load volatile <512 x i1>*, <512 x i1>** [[ACC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <512 x i1>, <512 x i1>* [[TMP1]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float>* [[ARRAYDECAY]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP4]], i32 0
+// CHECK-NEXT:    store <16 x i8> [[TMP5]], <16 x i8>* [[TMP6]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP4]], i32 1
+// CHECK-NEXT:    store <16 x i8> [[TMP7]], <16 x i8>* [[TMP8]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP4]], i32 2
+// CHECK-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* [[TMP10]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP4]], i32 3
+// CHECK-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[TMP12]], align 16
+// CHECK-NEXT:    ret void
+//
+// CHECK-BE-LABEL: @testVolatileQualifiedPointer2(
+// CHECK-BE-NEXT:  entry:
+// CHECK-BE-NEXT:    [[ACC_ADDR:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-BE-NEXT:    [[ARR:%.*]] = alloca [4 x <4 x float>], align 16
+// CHECK-BE-NEXT:    store volatile <512 x i1>* [[ACC:%.*]], <512 x i1>** [[ACC_ADDR]], align 8
+// CHECK-BE-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[ARR]], i64 0, i64 0
+// CHECK-BE-NEXT:    [[TMP0:%.*]] = load volatile <512 x i1>*, <512 x i1>** [[ACC_ADDR]], align 8
+// CHECK-BE-NEXT:    [[TMP1:%.*]] = load volatile <512 x i1>*, <512 x i1>** [[ACC_ADDR]], align 8
+// CHECK-BE-NEXT:    [[TMP2:%.*]] = load <512 x i1>, <512 x i1>* [[TMP1]], align 64
+// CHECK-BE-NEXT:    [[TMP3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> [[TMP2]])
+// CHECK-BE-NEXT:    [[TMP4:%.*]] = bitcast <4 x float>* [[ARRAYDECAY]] to <16 x i8>*
+// CHECK-BE-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0
+// CHECK-BE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP4]], i32 0
+// CHECK-BE-NEXT:    store <16 x i8> [[TMP5]], <16 x i8>* [[TMP6]], align 16
+// CHECK-BE-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1
+// CHECK-BE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP4]], i32 1
+// CHECK-BE-NEXT:    store <16 x i8> [[TMP7]], <16 x i8>* [[TMP8]], align 16
+// CHECK-BE-NEXT:    [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2
+// CHECK-BE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP4]], i32 2
+// CHECK-BE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* [[TMP10]], align 16
+// CHECK-BE-NEXT:    [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3
+// CHECK-BE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP4]], i32 3
+// CHECK-BE-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[TMP12]], align 16
+// CHECK-BE-NEXT:    ret void
+//
+void testVolatileQualifiedPointer2(__vector_quad *__volatile acc) {
+  vector float arr[4];
+  __builtin_mma_disassemble_acc(arr, acc);
+}

diff  --git a/clang/test/CodeGen/PowerPC/builtins-ppc-stmtexpr-argument.c b/clang/test/CodeGen/PowerPC/builtins-ppc-stmtexpr-argument.c
new file mode 100644
index 0000000000000..38c9cbc367e8b
--- /dev/null
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-stmtexpr-argument.c
@@ -0,0 +1,22 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: powerpc-registered-target
+// RUN: %clang_cc1 -no-opaque-pointers -triple powerpc64-unknown-linux-gnu \
+// RUN:   -emit-llvm %s -o - -target-cpu pwr7 | FileCheck %s
+// RUN: %clang_cc1 -no-opaque-pointers -triple powerpc64le-unknown-linux-gnu \
+// RUN:   -emit-llvm %s -o - -target-cpu pwr8 | FileCheck %s
+
+// The argument expression must not be emitted multiple times
+
+// CHECK-LABEL: @test_fric(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[D:%.*]] = alloca double, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca double, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load double, double* [[D]], align 8
+// CHECK-NEXT:    store double [[TMP0]], double* [[TMP]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load double, double* [[TMP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = call double @llvm.rint.f64(double [[TMP1]])
+// CHECK-NEXT:    ret void
+//
+void test_fric() {
+  __fric(({double d; d;}));
+}

diff  --git a/clang/test/CodeGen/PowerPC/builtins-ppc-vsx.c b/clang/test/CodeGen/PowerPC/builtins-ppc-vsx.c
index 944976cafc91c..6190a4e9dfddd 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-vsx.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-vsx.c
@@ -2213,8 +2213,6 @@ vector double xxsldwi_should_not_assert(vector double a, vector double b) {
 
 void test_vector_cpsgn_float(vector float a, vector float b) {
 // CHECK-LABEL: test_vector_cpsgn_float
-// CHECK-DAG: load{{.*}}%__a
-// CHECK-DAG: load{{.*}}%__b
 // CHECK-NOT: SEPARATOR
 // CHECK-DAG: [[RA:%[0-9]+]] = load <4 x float>, <4 x float>* %__a.addr
 // CHECK-DAG: [[RB:%[0-9]+]] = load <4 x float>, <4 x float>* %__b.addr
@@ -2224,8 +2222,6 @@ void test_vector_cpsgn_float(vector float a, vector float b) {
 
 void test_vector_cpsgn_double(vector double a, vector double b) {
 // CHECK-LABEL: test_vector_cpsgn_double
-// CHECK-DAG: load{{.*}}%__a
-// CHECK-DAG: load{{.*}}%__b
 // CHECK-NOT: SEPARATOR
 // CHECK-DAG: [[RA:%[0-9]+]] = load <2 x double>, <2 x double>* %__a.addr
 // CHECK-DAG: [[RB:%[0-9]+]] = load <2 x double>, <2 x double>* %__b.addr
@@ -2235,8 +2231,6 @@ void test_vector_cpsgn_double(vector double a, vector double b) {
 
 void test_builtin_xvcpsgnsp(vector float a, vector float b) {
 // CHECK-LABEL: test_builtin_xvcpsgnsp
-// CHECK-DAG: load{{.*}}%a
-// CHECK-DAG: load{{.*}}%b
 // CHECK-NOT: SEPARATOR
 // CHECK-DAG: [[RA:%[0-9]+]] = load <4 x float>, <4 x float>* %a.addr
 // CHECK-DAG: [[RB:%[0-9]+]] = load <4 x float>, <4 x float>* %b.addr
@@ -2246,8 +2240,6 @@ void test_builtin_xvcpsgnsp(vector float a, vector float b) {
 
 void test_builtin_xvcpsgndp(vector double a, vector double b) {
 // CHECK-LABEL: test_builtin_xvcpsgndp
-// CHECK-DAG: load{{.*}}%a
-// CHECK-DAG: load{{.*}}%b
 // CHECK-NOT: SEPARATOR
 // CHECK-DAG: [[RA:%[0-9]+]] = load <2 x double>, <2 x double>* %a.addr
 // CHECK-DAG: [[RB:%[0-9]+]] = load <2 x double>, <2 x double>* %b.addr

diff  --git a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-cas.c b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-cas.c
index 307f8a930147e..df46c9171353a 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-cas.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-cas.c
@@ -14,9 +14,9 @@
 // CHECK-NEXT:    store i32 [[A:%.*]], i32* [[A_ADDR]], align 4
 // CHECK-NEXT:    store i32 [[B:%.*]], i32* [[B_ADDR]], align 4
 // CHECK-NEXT:    store i32 [[C:%.*]], i32* [[C_ADDR]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[C_ADDR]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg weak volatile i32* [[A_ADDR]], i32 [[TMP1]], i32 [[TMP0]] monotonic monotonic, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[C_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg weak volatile i32* [[A_ADDR]], i32 [[TMP0]], i32 [[TMP1]] monotonic monotonic, align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0
 // CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1
 // CHECK-NEXT:    store i32 [[TMP3]], i32* [[B_ADDR]], align 4
@@ -36,9 +36,9 @@ int test_builtin_ppc_compare_and_swap(int a, int b, int c) {
 // CHECK-NEXT:    store i64 [[A:%.*]], i64* [[A_ADDR]], align 8
 // CHECK-NEXT:    store i64 [[B:%.*]], i64* [[B_ADDR]], align 8
 // CHECK-NEXT:    store i64 [[C:%.*]], i64* [[C_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[C_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg weak volatile i64* [[A_ADDR]], i64 [[TMP1]], i64 [[TMP0]] monotonic monotonic, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[C_ADDR]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg weak volatile i64* [[A_ADDR]], i64 [[TMP0]], i64 [[TMP1]] monotonic monotonic, align 8
 // CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0
 // CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
 // CHECK-NEXT:    store i64 [[TMP3]], i64* [[B_ADDR]], align 8

diff  --git a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-fetch.c b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-fetch.c
index 7612834e674ca..1b4004d665ae4 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-fetch.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-fetch.c
@@ -12,8 +12,7 @@
 // CHECK-NEXT:    store i32 [[A:%.*]], i32* [[A_ADDR]], align 4
 // CHECK-NEXT:    store i32 [[B:%.*]], i32* [[B_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw add i32* [[A_ADDR]], i32 [[TMP1]] monotonic, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw add i32* [[A_ADDR]], i32 [[TMP0]] monotonic, align 4
 // CHECK-NEXT:    ret void
 //
 void test_builtin_ppc_fetch_and_add(int a, int b) {
@@ -27,8 +26,7 @@ void test_builtin_ppc_fetch_and_add(int a, int b) {
 // CHECK-NEXT:    store i64 [[A:%.*]], i64* [[A_ADDR]], align 8
 // CHECK-NEXT:    store i64 [[B:%.*]], i64* [[B_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw add i64* [[A_ADDR]], i64 [[TMP1]] monotonic, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw add i64* [[A_ADDR]], i64 [[TMP0]] monotonic, align 8
 // CHECK-NEXT:    ret void
 //
 void test_builtin_ppc_fetch_and_addlp(long a, long b) {
@@ -41,8 +39,7 @@ void test_builtin_ppc_fetch_and_addlp(long a, long b) {
 // CHECK-NEXT:    store i32 [[A:%.*]], i32* [[A_ADDR]], align 4
 // CHECK-NEXT:    store i32 [[B:%.*]], i32* [[B_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw and i32* [[A_ADDR]], i32 [[TMP1]] monotonic, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw and i32* [[A_ADDR]], i32 [[TMP0]] monotonic, align 4
 // CHECK-NEXT:    ret void
 //
 void test_builtin_ppc_fetch_and_and(unsigned int a, unsigned int b) {
@@ -55,8 +52,7 @@ void test_builtin_ppc_fetch_and_and(unsigned int a, unsigned int b) {
 // CHECK-NEXT:    store i64 [[A:%.*]], i64* [[A_ADDR]], align 8
 // CHECK-NEXT:    store i64 [[B:%.*]], i64* [[B_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw and i64* [[A_ADDR]], i64 [[TMP1]] monotonic, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw and i64* [[A_ADDR]], i64 [[TMP0]] monotonic, align 8
 // CHECK-NEXT:    ret void
 //
 void test_builtin_ppc_fetch_and_andlp(unsigned long a, unsigned long b) {
@@ -69,8 +65,7 @@ void test_builtin_ppc_fetch_and_andlp(unsigned long a, unsigned long b) {
 // CHECK-NEXT:    store i32 [[A:%.*]], i32* [[A_ADDR]], align 4
 // CHECK-NEXT:    store i32 [[B:%.*]], i32* [[B_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw or i32* [[A_ADDR]], i32 [[TMP1]] monotonic, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw or i32* [[A_ADDR]], i32 [[TMP0]] monotonic, align 4
 // CHECK-NEXT:    ret void
 //
 void test_builtin_ppc_fetch_and_or(unsigned int a, unsigned int b) {
@@ -83,8 +78,7 @@ void test_builtin_ppc_fetch_and_or(unsigned int a, unsigned int b) {
 // CHECK-NEXT:    store i64 [[A:%.*]], i64* [[A_ADDR]], align 8
 // CHECK-NEXT:    store i64 [[B:%.*]], i64* [[B_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw or i64* [[A_ADDR]], i64 [[TMP1]] monotonic, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw or i64* [[A_ADDR]], i64 [[TMP0]] monotonic, align 8
 // CHECK-NEXT:    ret void
 //
 void test_builtin_ppc_fetch_and_orlp(unsigned long a, unsigned long b) {
@@ -97,8 +91,7 @@ void test_builtin_ppc_fetch_and_orlp(unsigned long a, unsigned long b) {
 // CHECK-NEXT:    store i32 [[A:%.*]], i32* [[A_ADDR]], align 4
 // CHECK-NEXT:    store i32 [[B:%.*]], i32* [[B_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw xchg i32* [[A_ADDR]], i32 [[TMP1]] monotonic, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw xchg i32* [[A_ADDR]], i32 [[TMP0]] monotonic, align 4
 // CHECK-NEXT:    ret void
 //
 void test_builtin_ppc_fetch_and_swap(unsigned int a, unsigned int b) {
@@ -111,8 +104,7 @@ void test_builtin_ppc_fetch_and_swap(unsigned int a, unsigned int b) {
 // CHECK-NEXT:    store i64 [[A:%.*]], i64* [[A_ADDR]], align 8
 // CHECK-NEXT:    store i64 [[B:%.*]], i64* [[B_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw xchg i64* [[A_ADDR]], i64 [[TMP1]] monotonic, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw xchg i64* [[A_ADDR]], i64 [[TMP0]] monotonic, align 8
 // CHECK-NEXT:    ret void
 //
 void test_builtin_ppc_fetch_and_swaplp(unsigned long a, unsigned long b) {

diff  --git a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-fp.c b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-fp.c
index 94aa15c00df39..46b2a74a77e66 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-fp.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-fp.c
@@ -15,9 +15,8 @@ extern float f;
 
 // CHECK-LABEL: @test_fric(
 // CHECK:    [[TMP0:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = call double @llvm.rint.f64(double [[TMP1]])
-// CHECK-NEXT:    ret double [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.rint.f64(double [[TMP0]])
+// CHECK-NEXT:    ret double [[TMP1]]
 //
 double test_fric() {
   return __fric(a);
@@ -25,9 +24,8 @@ double test_fric() {
 
 // CHECK-LABEL: @test_frim(
 // CHECK:    [[TMP0:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = call double @llvm.floor.f64(double [[TMP1]])
-// CHECK-NEXT:    ret double [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.floor.f64(double [[TMP0]])
+// CHECK-NEXT:    ret double [[TMP1]]
 //
 double test_frim() {
   return __frim(a);
@@ -35,9 +33,8 @@ double test_frim() {
 
 // CHECK-LABEL: @test_frims(
 // CHECK:    [[TMP0:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.floor.f32(float [[TMP1]])
-// CHECK-NEXT:    ret float [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.floor.f32(float [[TMP0]])
+// CHECK-NEXT:    ret float [[TMP1]]
 //
 float test_frims() {
   return __frims(d);
@@ -45,9 +42,8 @@ float test_frims() {
 
 // CHECK-LABEL: @test_frin(
 // CHECK:    [[TMP0:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = call double @llvm.round.f64(double [[TMP1]])
-// CHECK-NEXT:    ret double [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.round.f64(double [[TMP0]])
+// CHECK-NEXT:    ret double [[TMP1]]
 //
 double test_frin() {
   return __frin(a);
@@ -55,9 +51,8 @@ double test_frin() {
 
 // CHECK-LABEL: @test_frins(
 // CHECK:    [[TMP0:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.round.f32(float [[TMP1]])
-// CHECK-NEXT:    ret float [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.round.f32(float [[TMP0]])
+// CHECK-NEXT:    ret float [[TMP1]]
 //
 float test_frins() {
   return __frins(d);
@@ -65,9 +60,8 @@ float test_frins() {
 
 // CHECK-LABEL: @test_frip(
 // CHECK:    [[TMP0:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = call double @llvm.ceil.f64(double [[TMP1]])
-// CHECK-NEXT:    ret double [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.ceil.f64(double [[TMP0]])
+// CHECK-NEXT:    ret double [[TMP1]]
 //
 double test_frip() {
   return __frip(a);
@@ -75,9 +69,8 @@ double test_frip() {
 
 // CHECK-LABEL: @test_frips(
 // CHECK:    [[TMP0:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.ceil.f32(float [[TMP1]])
-// CHECK-NEXT:    ret float [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.ceil.f32(float [[TMP0]])
+// CHECK-NEXT:    ret float [[TMP1]]
 //
 float test_frips() {
   return __frips(d);
@@ -85,9 +78,8 @@ float test_frips() {
 
 // CHECK-LABEL: @test_friz(
 // CHECK:    [[TMP0:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = call double @llvm.trunc.f64(double [[TMP1]])
-// CHECK-NEXT:    ret double [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.trunc.f64(double [[TMP0]])
+// CHECK-NEXT:    ret double [[TMP1]]
 //
 double test_friz() {
   return __friz(a);
@@ -95,9 +87,8 @@ double test_friz() {
 
 // CHECK-LABEL: @test_frizs(
 // CHECK:    [[TMP0:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.trunc.f32(float [[TMP1]])
-// CHECK-NEXT:    ret float [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.trunc.f32(float [[TMP0]])
+// CHECK-NEXT:    ret float [[TMP1]]
 //
 float test_frizs() {
   return __frizs(d);
@@ -145,9 +136,8 @@ float test_frsqrtes() {
 
 // CHECK-LABEL: @test_fsqrt(
 // CHECK:    [[TMP0:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = call double @llvm.sqrt.f64(double [[TMP1]])
-// CHECK-NEXT:    ret double [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.sqrt.f64(double [[TMP0]])
+// CHECK-NEXT:    ret double [[TMP1]]
 //
 double test_fsqrt() {
   return __fsqrt(a);
@@ -155,9 +145,8 @@ double test_fsqrt() {
 
 // CHECK-LABEL: @test_fsqrts(
 // CHECK:    [[TMP0:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.sqrt.f32(float [[TMP1]])
-// CHECK-NEXT:    ret float [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.sqrt.f32(float [[TMP0]])
+// CHECK-NEXT:    ret float [[TMP1]]
 //
 float test_fsqrts() {
   return __fsqrts(d);
@@ -165,9 +154,8 @@ float test_fsqrts() {
 
 // CHECK-LABEL: @test_builtin_ppc_fric(
 // CHECK:    [[TMP0:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = call double @llvm.rint.f64(double [[TMP1]])
-// CHECK-NEXT:    ret double [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.rint.f64(double [[TMP0]])
+// CHECK-NEXT:    ret double [[TMP1]]
 //
 double test_builtin_ppc_fric() {
   return __builtin_ppc_fric(a);
@@ -175,9 +163,8 @@ double test_builtin_ppc_fric() {
 
 // CHECK-LABEL: @test_builtin_ppc_frim(
 // CHECK:    [[TMP0:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = call double @llvm.floor.f64(double [[TMP1]])
-// CHECK-NEXT:    ret double [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.floor.f64(double [[TMP0]])
+// CHECK-NEXT:    ret double [[TMP1]]
 //
 double test_builtin_ppc_frim() {
   return __builtin_ppc_frim(a);
@@ -185,9 +172,8 @@ double test_builtin_ppc_frim() {
 
 // CHECK-LABEL: @test_builtin_ppc_frims(
 // CHECK:    [[TMP0:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.floor.f32(float [[TMP1]])
-// CHECK-NEXT:    ret float [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.floor.f32(float [[TMP0]])
+// CHECK-NEXT:    ret float [[TMP1]]
 //
 float test_builtin_ppc_frims() {
   return __builtin_ppc_frims(d);
@@ -195,9 +181,8 @@ float test_builtin_ppc_frims() {
 
 // CHECK-LABEL: @test_builtin_ppc_frin(
 // CHECK:    [[TMP0:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = call double @llvm.round.f64(double [[TMP1]])
-// CHECK-NEXT:    ret double [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.round.f64(double [[TMP0]])
+// CHECK-NEXT:    ret double [[TMP1]]
 //
 double test_builtin_ppc_frin() {
   return __builtin_ppc_frin(a);
@@ -205,9 +190,8 @@ double test_builtin_ppc_frin() {
 
 // CHECK-LABEL: @test_builtin_ppc_frins(
 // CHECK:    [[TMP0:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.round.f32(float [[TMP1]])
-// CHECK-NEXT:    ret float [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.round.f32(float [[TMP0]])
+// CHECK-NEXT:    ret float [[TMP1]]
 //
 float test_builtin_ppc_frins() {
   return __builtin_ppc_frins(d);
@@ -215,9 +199,8 @@ float test_builtin_ppc_frins() {
 
 // CHECK-LABEL: @test_builtin_ppc_frip(
 // CHECK:    [[TMP0:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = call double @llvm.ceil.f64(double [[TMP1]])
-// CHECK-NEXT:    ret double [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.ceil.f64(double [[TMP0]])
+// CHECK-NEXT:    ret double [[TMP1]]
 //
 double test_builtin_ppc_frip() {
   return __builtin_ppc_frip(a);
@@ -225,9 +208,8 @@ double test_builtin_ppc_frip() {
 
 // CHECK-LABEL: @test_builtin_ppc_frips(
 // CHECK:    [[TMP0:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.ceil.f32(float [[TMP1]])
-// CHECK-NEXT:    ret float [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.ceil.f32(float [[TMP0]])
+// CHECK-NEXT:    ret float [[TMP1]]
 //
 float test_builtin_ppc_frips() {
   return __builtin_ppc_frips(d);
@@ -235,9 +217,8 @@ float test_builtin_ppc_frips() {
 
 // CHECK-LABEL: @test_builtin_ppc_friz(
 // CHECK:    [[TMP0:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = call double @llvm.trunc.f64(double [[TMP1]])
-// CHECK-NEXT:    ret double [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.trunc.f64(double [[TMP0]])
+// CHECK-NEXT:    ret double [[TMP1]]
 //
 double test_builtin_ppc_friz() {
   return __builtin_ppc_friz(a);
@@ -245,9 +226,8 @@ double test_builtin_ppc_friz() {
 
 // CHECK-LABEL: @test_builtin_ppc_frizs(
 // CHECK:    [[TMP0:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.trunc.f32(float [[TMP1]])
-// CHECK-NEXT:    ret float [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.trunc.f32(float [[TMP0]])
+// CHECK-NEXT:    ret float [[TMP1]]
 //
 float test_builtin_ppc_frizs() {
   return __builtin_ppc_frizs(d);
@@ -295,9 +275,8 @@ float test_builtin_ppc_frsqrtes() {
 
 // CHECK-LABEL: @test_builtin_ppc_fsqrt(
 // CHECK:    [[TMP0:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load double, double* @a, align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = call double @llvm.sqrt.f64(double [[TMP1]])
-// CHECK-NEXT:    ret double [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.sqrt.f64(double [[TMP0]])
+// CHECK-NEXT:    ret double [[TMP1]]
 //
 double test_builtin_ppc_fsqrt() {
   return __builtin_ppc_fsqrt(a);
@@ -305,9 +284,8 @@ double test_builtin_ppc_fsqrt() {
 
 // CHECK-LABEL: @test_builtin_ppc_fsqrts(
 // CHECK:    [[TMP0:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, float* @d, align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.sqrt.f32(float [[TMP1]])
-// CHECK-NEXT:    ret float [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.sqrt.f32(float [[TMP0]])
+// CHECK-NEXT:    ret float [[TMP1]]
 //
 float test_builtin_ppc_fsqrts() {
   return __builtin_ppc_fsqrts(d);

diff  --git a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-math.c b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-math.c
index a6b2e1903445e..200540c6e79fd 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-math.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-math.c
@@ -95,7 +95,6 @@ float fnmadds (float f) {
 // CHECK-LABEL: @fnmsub(
 // CHECK:         [[D_ADDR:%.*]] = alloca double, align 8
 // CHECK-NEXT:    store double [[D:%.*]], double* [[D_ADDR]], align 8
-// CHECK-COUNT-3:    load double, double* [[D_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load double, double* [[D_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load double, double* [[D_ADDR]], align 8
 // CHECK-NEXT:    [[TMP2:%.*]] = load double, double* [[D_ADDR]], align 8
@@ -109,7 +108,6 @@ double fnmsub (double d) {
 // CHECK-LABEL: @fnmsubs(
 // CHECK:         [[F_ADDR:%.*]] = alloca float, align 4
 // CHECK-NEXT:    store float [[F:%.*]], float* [[F_ADDR]], align 4
-// CHECK-COUNT-3:    load float, float* [[F_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[F_ADDR]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[F_ADDR]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[F_ADDR]], align 4

diff  --git a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-sync.c b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-sync.c
index 24cd794e6e006..7a87811da336d 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-sync.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-sync.c
@@ -14,13 +14,11 @@ extern void *c;
 
 // CHECK-LABEL: @test_popcntb(
 // CHECK:    [[TMP0:%.*]] = load i64, i64* @a, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* @a, align 8
 // CHECK-NEXT:    [[POPCNTB:%.*]] = call i64 @llvm.ppc.popcntb.i64.i64(i64 [[TMP0]])
 // CHECK-NEXT:    ret i64 [[POPCNTB]]
 //
 // CHECK-32-LABEL: @test_popcntb(
 // CHECK-32:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* @a, align 4
 // CHECK-32-NEXT:    [[POPCNTB:%.*]] = call i32 @llvm.ppc.popcntb.i32.i32(i32 [[TMP0]])
 // CHECK-32-NEXT:    ret i32 [[POPCNTB]]
 //
@@ -198,13 +196,11 @@ void test_dcbz() {
 
 // CHECK-LABEL: @test_builtin_ppc_popcntb(
 // CHECK:    [[TMP0:%.*]] = load i64, i64* @a, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* @a, align 8
 // CHECK-NEXT:    [[POPCNTB:%.*]] = call i64 @llvm.ppc.popcntb.i64.i64(i64 [[TMP0]])
 // CHECK-NEXT:    ret i64 [[POPCNTB]]
 //
 // CHECK-32-LABEL: @test_builtin_ppc_popcntb(
 // CHECK-32:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* @a, align 4
 // CHECK-32-NEXT:    [[POPCNTB:%.*]] = call i32 @llvm.ppc.popcntb.i32.i32(i32 [[TMP0]])
 // CHECK-32-NEXT:    ret i32 [[POPCNTB]]
 //

diff  --git a/clang/test/CodeGen/PowerPC/ppc-mma-types.c b/clang/test/CodeGen/PowerPC/ppc-mma-types.c
index ad8a9c592906b..66cbb79061624 100644
--- a/clang/test/CodeGen/PowerPC/ppc-mma-types.c
+++ b/clang/test/CodeGen/PowerPC/ppc-mma-types.c
@@ -1,17 +1,23 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // RUN: %clang_cc1 -no-opaque-pointers -triple powerpc64le-linux-unknown -target-cpu pwr10 \
-// RUN:   -emit-llvm -O3 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -no-opaque-pointers -triple powerpc64le-linux-unknown -target-cpu pwr9 \
-// RUN:   -emit-llvm -O3 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -no-opaque-pointers -triple powerpc64le-linux-unknown -target-cpu pwr8 \
-// RUN:   -emit-llvm -O3 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | FileCheck %s
 
 // CHECK-LABEL: @test1(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds <512 x i1>, <512 x i1>* [[PTR1:%.*]], i64 2
-// CHECK-NEXT:    [[TMP0:%.*]] = load <512 x i1>, <512 x i1>* [[ADD_PTR]], align 64, [[TBAA2:!tbaa !.*]]
-// CHECK-NEXT:    [[ADD_PTR1:%.*]] = getelementptr inbounds <512 x i1>, <512 x i1>* [[PTR2:%.*]], i64 1
-// CHECK-NEXT:    store <512 x i1> [[TMP0]], <512 x i1>* [[ADD_PTR1]], align 64, [[TBAA2]]
+// CHECK-NEXT:    [[PTR1_ADDR:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    [[PTR2_ADDR:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    store <512 x i1>* [[PTR1:%.*]], <512 x i1>** [[PTR1_ADDR]], align 8
+// CHECK-NEXT:    store <512 x i1>* [[PTR2:%.*]], <512 x i1>** [[PTR2_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <512 x i1>*, <512 x i1>** [[PTR1_ADDR]], align 8
+// CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds <512 x i1>, <512 x i1>* [[TMP0]], i64 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[ADD_PTR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <512 x i1>*, <512 x i1>** [[PTR2_ADDR]], align 8
+// CHECK-NEXT:    [[ADD_PTR1:%.*]] = getelementptr inbounds <512 x i1>, <512 x i1>* [[TMP2]], i64 1
+// CHECK-NEXT:    store <512 x i1> [[TMP1]], <512 x i1>* [[ADD_PTR1]], align 64
 // CHECK-NEXT:    ret void
 //
 void test1(__vector_quad *ptr1, __vector_quad *ptr2) {
@@ -20,12 +26,422 @@ void test1(__vector_quad *ptr1, __vector_quad *ptr2) {
 
 // CHECK-LABEL: @test2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds <256 x i1>, <256 x i1>* [[PTR1:%.*]], i64 2
-// CHECK-NEXT:    [[TMP0:%.*]] = load <256 x i1>, <256 x i1>* [[ADD_PTR]], align 32, [[TBAA6:!tbaa !.*]]
-// CHECK-NEXT:    [[ADD_PTR1:%.*]] = getelementptr inbounds <256 x i1>, <256 x i1>* [[PTR2:%.*]], i64 1
-// CHECK-NEXT:    store <256 x i1> [[TMP0]], <256 x i1>* [[ADD_PTR1]], align 32, [[TBAA6]]
+// CHECK-NEXT:    [[PTR1_ADDR:%.*]] = alloca <256 x i1>*, align 8
+// CHECK-NEXT:    [[PTR2_ADDR:%.*]] = alloca <256 x i1>*, align 8
+// CHECK-NEXT:    store <256 x i1>* [[PTR1:%.*]], <256 x i1>** [[PTR1_ADDR]], align 8
+// CHECK-NEXT:    store <256 x i1>* [[PTR2:%.*]], <256 x i1>** [[PTR2_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <256 x i1>*, <256 x i1>** [[PTR1_ADDR]], align 8
+// CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds <256 x i1>, <256 x i1>* [[TMP0]], i64 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <256 x i1>, <256 x i1>* [[ADD_PTR]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i1>*, <256 x i1>** [[PTR2_ADDR]], align 8
+// CHECK-NEXT:    [[ADD_PTR1:%.*]] = getelementptr inbounds <256 x i1>, <256 x i1>* [[TMP2]], i64 1
+// CHECK-NEXT:    store <256 x i1> [[TMP1]], <256 x i1>* [[ADD_PTR1]], align 32
 // CHECK-NEXT:    ret void
 //
 void test2(__vector_pair *ptr1, __vector_pair *ptr2) {
   *(ptr2 + 1) = *(ptr1 + 2);
 }
+
+typedef __vector_quad vq_t;
+// CHECK-LABEL: @testVQTypedef(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[INP_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[OUTP_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VQIN:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    [[VQOUT:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    store i32* [[INP:%.*]], i32** [[INP_ADDR]], align 8
+// CHECK-NEXT:    store i32* [[OUTP:%.*]], i32** [[OUTP_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[INP_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1>* [[TMP1]], <512 x i1>** [[VQIN]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[OUTP_ADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1>* [[TMP3]], <512 x i1>** [[VQOUT]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load <512 x i1>*, <512 x i1>** [[VQIN]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = load <512 x i1>, <512 x i1>* [[TMP4]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <512 x i1>*, <512 x i1>** [[VQOUT]], align 8
+// CHECK-NEXT:    store <512 x i1> [[TMP5]], <512 x i1>* [[TMP6]], align 64
+// CHECK-NEXT:    ret void
+//
+void testVQTypedef(int *inp, int *outp) {
+  vq_t *vqin = (vq_t *)inp;
+  vq_t *vqout = (vq_t *)outp;
+  *vqout = *vqin;
+}
+
+// CHECK-LABEL: @testVQArg3(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VQ_ADDR:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VQP:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    store <512 x i1>* [[VQ:%.*]], <512 x i1>** [[VQ_ADDR]], align 8
+// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1>* [[TMP1]], <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <512 x i1>*, <512 x i1>** [[VQ_ADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load <512 x i1>, <512 x i1>* [[TMP2]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <512 x i1>*, <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    store <512 x i1> [[TMP3]], <512 x i1>* [[TMP4]], align 64
+// CHECK-NEXT:    ret void
+//
+void testVQArg3(__vector_quad *vq, int *ptr) {
+  __vector_quad *vqp = (__vector_quad *)ptr;
+  *vqp = *vq;
+}
+
+// CHECK-LABEL: @testVQArg4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VQ_ADDR:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VQP:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    store <512 x i1>* [[VQ:%.*]], <512 x i1>** [[VQ_ADDR]], align 8
+// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1>* [[TMP1]], <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <512 x i1>*, <512 x i1>** [[VQ_ADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load <512 x i1>, <512 x i1>* [[TMP2]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <512 x i1>*, <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    store <512 x i1> [[TMP3]], <512 x i1>* [[TMP4]], align 64
+// CHECK-NEXT:    ret void
+//
+void testVQArg4(const __vector_quad *const vq, int *ptr) {
+  __vector_quad *vqp = (__vector_quad *)ptr;
+  *vqp = *vq;
+}
+
+// CHECK-LABEL: @testVQArg5(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VQA_ADDR:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VQP:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    store <512 x i1>* [[VQA:%.*]], <512 x i1>** [[VQA_ADDR]], align 8
+// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1>* [[TMP1]], <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <512 x i1>*, <512 x i1>** [[VQA_ADDR]], align 8
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <512 x i1>, <512 x i1>* [[TMP2]], i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <512 x i1>, <512 x i1>* [[ARRAYIDX]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <512 x i1>*, <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    store <512 x i1> [[TMP3]], <512 x i1>* [[TMP4]], align 64
+// CHECK-NEXT:    ret void
+//
+void testVQArg5(__vector_quad vqa[], int *ptr) {
+  __vector_quad *vqp = (__vector_quad *)ptr;
+  *vqp = vqa[0];
+}
+
+// CHECK-LABEL: @testVQArg7(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VQ_ADDR:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VQP:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    store <512 x i1>* [[VQ:%.*]], <512 x i1>** [[VQ_ADDR]], align 8
+// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1>* [[TMP1]], <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <512 x i1>*, <512 x i1>** [[VQ_ADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load <512 x i1>, <512 x i1>* [[TMP2]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <512 x i1>*, <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    store <512 x i1> [[TMP3]], <512 x i1>* [[TMP4]], align 64
+// CHECK-NEXT:    ret void
+//
+void testVQArg7(const vq_t *vq, int *ptr) {
+  __vector_quad *vqp = (__vector_quad *)ptr;
+  *vqp = *vq;
+}
+
+// CHECK-LABEL: @testVQRet2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VQP:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1>* [[TMP1]], <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <512 x i1>*, <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds <512 x i1>, <512 x i1>* [[TMP2]], i64 2
+// CHECK-NEXT:    ret <512 x i1>* [[ADD_PTR]]
+//
+__vector_quad *testVQRet2(int *ptr) {
+  __vector_quad *vqp = (__vector_quad *)ptr;
+  return vqp + 2;
+}
+
+// CHECK-LABEL: @testVQRet3(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VQP:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1>* [[TMP1]], <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <512 x i1>*, <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds <512 x i1>, <512 x i1>* [[TMP2]], i64 2
+// CHECK-NEXT:    ret <512 x i1>* [[ADD_PTR]]
+//
+const __vector_quad *testVQRet3(int *ptr) {
+  __vector_quad *vqp = (__vector_quad *)ptr;
+  return vqp + 2;
+}
+
+// CHECK-LABEL: @testVQRet5(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VQP:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1>* [[TMP1]], <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <512 x i1>*, <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds <512 x i1>, <512 x i1>* [[TMP2]], i64 2
+// CHECK-NEXT:    ret <512 x i1>* [[ADD_PTR]]
+//
+const vq_t *testVQRet5(int *ptr) {
+  __vector_quad *vqp = (__vector_quad *)ptr;
+  return vqp + 2;
+}
+
+// CHECK-LABEL: @testVQSizeofAlignof(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VQP:%.*]] = alloca <512 x i1>*, align 8
+// CHECK-NEXT:    [[VQ:%.*]] = alloca <512 x i1>, align 64
+// CHECK-NEXT:    [[SIZET:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ALIGNT:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[SIZEV:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ALIGNV:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1>* [[TMP1]], <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <512 x i1>*, <512 x i1>** [[VQP]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load <512 x i1>, <512 x i1>* [[TMP2]], align 64
+// CHECK-NEXT:    store <512 x i1> [[TMP3]], <512 x i1>* [[VQ]], align 64
+// CHECK-NEXT:    store i32 64, i32* [[SIZET]], align 4
+// CHECK-NEXT:    store i32 64, i32* [[ALIGNT]], align 4
+// CHECK-NEXT:    store i32 64, i32* [[SIZEV]], align 4
+// CHECK-NEXT:    store i32 64, i32* [[ALIGNV]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[SIZET]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ALIGNT]], align 4
+// CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP4]], [[TMP5]]
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[SIZEV]], align 4
+// CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[ADD]], [[TMP6]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ALIGNV]], align 4
+// CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP7]]
+// CHECK-NEXT:    ret i32 [[ADD2]]
+//
+int testVQSizeofAlignof(int *ptr) {
+  __vector_quad *vqp = (__vector_quad *)ptr;
+  __vector_quad vq = *vqp;
+  unsigned sizet = sizeof(__vector_quad);
+  unsigned alignt = __alignof__(__vector_quad);
+  unsigned sizev = sizeof(vq);
+  unsigned alignv = __alignof__(vq);
+  return sizet + alignt + sizev + alignv;
+}
+
+typedef __vector_pair vp_t;
+// CHECK-LABEL: @testVPTypedef(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[INP_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[OUTP_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VPIN:%.*]] = alloca <256 x i1>*, align 8
+// CHECK-NEXT:    [[VPOUT:%.*]] = alloca <256 x i1>*, align 8
+// CHECK-NEXT:    store i32* [[INP:%.*]], i32** [[INP_ADDR]], align 8
+// CHECK-NEXT:    store i32* [[OUTP:%.*]], i32** [[OUTP_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[INP_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <256 x i1>*
+// CHECK-NEXT:    store <256 x i1>* [[TMP1]], <256 x i1>** [[VPIN]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[OUTP_ADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <256 x i1>*
+// CHECK-NEXT:    store <256 x i1>* [[TMP3]], <256 x i1>** [[VPOUT]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load <256 x i1>*, <256 x i1>** [[VPIN]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = load <256 x i1>, <256 x i1>* [[TMP4]], align 32
+// CHECK-NEXT:    [[TMP6:%.*]] = load <256 x i1>*, <256 x i1>** [[VPOUT]], align 8
+// CHECK-NEXT:    store <256 x i1> [[TMP5]], <256 x i1>* [[TMP6]], align 32
+// CHECK-NEXT:    ret void
+//
+void testVPTypedef(int *inp, int *outp) {
+  vp_t *vpin = (vp_t *)inp;
+  vp_t *vpout = (vp_t *)outp;
+  *vpout = *vpin;
+}
+
+// CHECK-LABEL: @testVPArg3(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VP_ADDR:%.*]] = alloca <256 x i1>*, align 8
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VPP:%.*]] = alloca <256 x i1>*, align 8
+// CHECK-NEXT:    store <256 x i1>* [[VP:%.*]], <256 x i1>** [[VP_ADDR]], align 8
+// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <256 x i1>*
+// CHECK-NEXT:    store <256 x i1>* [[TMP1]], <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i1>*, <256 x i1>** [[VP_ADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <256 x i1>*, <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    store <256 x i1> [[TMP3]], <256 x i1>* [[TMP4]], align 32
+// CHECK-NEXT:    ret void
+//
+void testVPArg3(__vector_pair *vp, int *ptr) {
+  __vector_pair *vpp = (__vector_pair *)ptr;
+  *vpp = *vp;
+}
+
+// CHECK-LABEL: @testVPArg4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VP_ADDR:%.*]] = alloca <256 x i1>*, align 8
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VPP:%.*]] = alloca <256 x i1>*, align 8
+// CHECK-NEXT:    store <256 x i1>* [[VP:%.*]], <256 x i1>** [[VP_ADDR]], align 8
+// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <256 x i1>*
+// CHECK-NEXT:    store <256 x i1>* [[TMP1]], <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i1>*, <256 x i1>** [[VP_ADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <256 x i1>*, <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    store <256 x i1> [[TMP3]], <256 x i1>* [[TMP4]], align 32
+// CHECK-NEXT:    ret void
+//
+void testVPArg4(const __vector_pair *const vp, int *ptr) {
+  __vector_pair *vpp = (__vector_pair *)ptr;
+  *vpp = *vp;
+}
+
+// CHECK-LABEL: @testVPArg5(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VPA_ADDR:%.*]] = alloca <256 x i1>*, align 8
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VPP:%.*]] = alloca <256 x i1>*, align 8
+// CHECK-NEXT:    store <256 x i1>* [[VPA:%.*]], <256 x i1>** [[VPA_ADDR]], align 8
+// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <256 x i1>*
+// CHECK-NEXT:    store <256 x i1>* [[TMP1]], <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i1>*, <256 x i1>** [[VPA_ADDR]], align 8
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <256 x i1>, <256 x i1>* [[TMP2]], i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[ARRAYIDX]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <256 x i1>*, <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    store <256 x i1> [[TMP3]], <256 x i1>* [[TMP4]], align 32
+// CHECK-NEXT:    ret void
+//
+void testVPArg5(__vector_pair vpa[], int *ptr) {
+  __vector_pair *vpp = (__vector_pair *)ptr;
+  *vpp = vpa[0];
+}
+
+// CHECK-LABEL: @testVPArg7(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VP_ADDR:%.*]] = alloca <256 x i1>*, align 8
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VPP:%.*]] = alloca <256 x i1>*, align 8
+// CHECK-NEXT:    store <256 x i1>* [[VP:%.*]], <256 x i1>** [[VP_ADDR]], align 8
+// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <256 x i1>*
+// CHECK-NEXT:    store <256 x i1>* [[TMP1]], <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i1>*, <256 x i1>** [[VP_ADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <256 x i1>*, <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    store <256 x i1> [[TMP3]], <256 x i1>* [[TMP4]], align 32
+// CHECK-NEXT:    ret void
+//
+void testVPArg7(const vp_t *vp, int *ptr) {
+  __vector_pair *vpp = (__vector_pair *)ptr;
+  *vpp = *vp;
+}
+
+// CHECK-LABEL: @testVPRet2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VPP:%.*]] = alloca <256 x i1>*, align 8
+// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <256 x i1>*
+// CHECK-NEXT:    store <256 x i1>* [[TMP1]], <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i1>*, <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds <256 x i1>, <256 x i1>* [[TMP2]], i64 2
+// CHECK-NEXT:    ret <256 x i1>* [[ADD_PTR]]
+//
+__vector_pair *testVPRet2(int *ptr) {
+  __vector_pair *vpp = (__vector_pair *)ptr;
+  return vpp + 2;
+}
+
+// CHECK-LABEL: @testVPRet3(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VPP:%.*]] = alloca <256 x i1>*, align 8
+// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <256 x i1>*
+// CHECK-NEXT:    store <256 x i1>* [[TMP1]], <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i1>*, <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds <256 x i1>, <256 x i1>* [[TMP2]], i64 2
+// CHECK-NEXT:    ret <256 x i1>* [[ADD_PTR]]
+//
+const __vector_pair *testVPRet3(int *ptr) {
+  __vector_pair *vpp = (__vector_pair *)ptr;
+  return vpp + 2;
+}
+
+// CHECK-LABEL: @testVPRet5(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VPP:%.*]] = alloca <256 x i1>*, align 8
+// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <256 x i1>*
+// CHECK-NEXT:    store <256 x i1>* [[TMP1]], <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i1>*, <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds <256 x i1>, <256 x i1>* [[TMP2]], i64 2
+// CHECK-NEXT:    ret <256 x i1>* [[ADD_PTR]]
+//
+const vp_t *testVPRet5(int *ptr) {
+  __vector_pair *vpp = (__vector_pair *)ptr;
+  return vpp + 2;
+}
+
+// CHECK-LABEL: @testVPSizeofAlignof(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[VPP:%.*]] = alloca <256 x i1>*, align 8
+// CHECK-NEXT:    [[VP:%.*]] = alloca <256 x i1>, align 32
+// CHECK-NEXT:    [[SIZET:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ALIGNT:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[SIZEV:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ALIGNV:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <256 x i1>*
+// CHECK-NEXT:    store <256 x i1>* [[TMP1]], <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i1>*, <256 x i1>** [[VPP]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load <256 x i1>, <256 x i1>* [[TMP2]], align 32
+// CHECK-NEXT:    store <256 x i1> [[TMP3]], <256 x i1>* [[VP]], align 32
+// CHECK-NEXT:    store i32 32, i32* [[SIZET]], align 4
+// CHECK-NEXT:    store i32 32, i32* [[ALIGNT]], align 4
+// CHECK-NEXT:    store i32 32, i32* [[SIZEV]], align 4
+// CHECK-NEXT:    store i32 32, i32* [[ALIGNV]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[SIZET]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ALIGNT]], align 4
+// CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP4]], [[TMP5]]
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[SIZEV]], align 4
+// CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[ADD]], [[TMP6]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ALIGNV]], align 4
+// CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP7]]
+// CHECK-NEXT:    ret i32 [[ADD2]]
+//
+int testVPSizeofAlignof(int *ptr) {
+  __vector_pair *vpp = (__vector_pair *)ptr;
+  __vector_pair vp = *vpp;
+  unsigned sizet = sizeof(__vector_pair);
+  unsigned alignt = __alignof__(__vector_pair);
+  unsigned sizev = sizeof(vp);
+  unsigned alignv = __alignof__(vp);
+  return sizet + alignt + sizev + alignv;
+}

diff  --git a/clang/test/Sema/ppc-pair-mma-types.c b/clang/test/Sema/ppc-pair-mma-types.c
index 2ad1079bd966b..293688d49813c 100644
--- a/clang/test/Sema/ppc-pair-mma-types.c
+++ b/clang/test/Sema/ppc-pair-mma-types.c
@@ -12,11 +12,6 @@
 
 // typedef
 typedef __vector_quad vq_t;
-void testVQTypedef(int *inp, int *outp) {
-  vq_t *vqin = (vq_t *)inp;
-  vq_t *vqout = (vq_t *)outp;
-  *vqout = *vqin;
-}
 
 // function argument
 void testVQArg1(__vector_quad vq, int *ptr) { // expected-error {{invalid use of PPC MMA type}}
@@ -29,57 +24,22 @@ void testVQArg2(const __vector_quad vq, int *ptr) { // expected-error {{invalid
   *vqp = vq;
 }
 
-void testVQArg3(__vector_quad *vq, int *ptr) {
-  __vector_quad *vqp = (__vector_quad *)ptr;
-  *vqp = *vq;
-}
-
-void testVQArg4(const __vector_quad *const vq, int *ptr) {
-  __vector_quad *vqp = (__vector_quad *)ptr;
-  *vqp = *vq;
-}
-
-void testVQArg5(__vector_quad vqa[], int *ptr) {
-  __vector_quad *vqp = (__vector_quad *)ptr;
-  *vqp = vqa[0];
-}
-
 void testVQArg6(const vq_t vq, int *ptr) { // expected-error {{invalid use of PPC MMA type}}
   __vector_quad *vqp = (__vector_quad *)ptr;
   *vqp = vq;
 }
 
-void testVQArg7(const vq_t *vq, int *ptr) {
-  __vector_quad *vqp = (__vector_quad *)ptr;
-  *vqp = *vq;
-}
-
 // function return
 __vector_quad testVQRet1(int *ptr) { // expected-error {{invalid use of PPC MMA type}}
   __vector_quad *vqp = (__vector_quad *)ptr;
   return *vqp; // expected-error {{invalid use of PPC MMA type}}
 }
 
-__vector_quad *testVQRet2(int *ptr) {
-  __vector_quad *vqp = (__vector_quad *)ptr;
-  return vqp + 2;
-}
-
-const __vector_quad *testVQRet3(int *ptr) {
-  __vector_quad *vqp = (__vector_quad *)ptr;
-  return vqp + 2;
-}
-
 const vq_t testVQRet4(int *ptr) { // expected-error {{invalid use of PPC MMA type}}
   __vector_quad *vqp = (__vector_quad *)ptr;
   return *vqp; // expected-error {{invalid use of PPC MMA type}}
 }
 
-const vq_t *testVQRet5(int *ptr) {
-  __vector_quad *vqp = (__vector_quad *)ptr;
-  return vqp + 2;
-}
-
 // global
 __vector_quad globalvq;        // expected-error {{invalid use of PPC MMA type}}
 const __vector_quad globalvq2; // expected-error {{invalid use of PPC MMA type}}
@@ -87,16 +47,6 @@ __vector_quad *globalvqp;
 const __vector_quad *const globalvqp2;
 vq_t globalvq_t; // expected-error {{invalid use of PPC MMA type}}
 
-// local
-void testVQLocal(int *ptr, vector unsigned char vc) {
-  __vector_quad *vqp = (__vector_quad *)ptr;
-  __vector_quad vq1 = *vqp;
-  __vector_quad vq2;
-  __builtin_mma_xxsetaccz(&vq2);
-  __vector_quad vq3;
-  __builtin_mma_xvi4ger8(&vq3, vc, vc);
-  *vqp = vq3;
-}
 
 // struct field
 struct TestVQStruct {
@@ -106,17 +56,6 @@ struct TestVQStruct {
   __vector_quad *vq;
 };
 
-// sizeof / alignof
-int testVQSizeofAlignof(int *ptr) {
-  __vector_quad *vqp = (__vector_quad *)ptr;
-  __vector_quad vq = *vqp;
-  unsigned sizet = sizeof(__vector_quad);
-  unsigned alignt = __alignof__(__vector_quad);
-  unsigned sizev = sizeof(vq);
-  unsigned alignv = __alignof__(vq);
-  return sizet + alignt + sizev + alignv;
-}
-
 // operators
 int testVQOperators1(int *ptr) {
   __vector_quad *vqp = (__vector_quad *)ptr;
@@ -168,11 +107,6 @@ void testVQOperators4(int v, void *ptr) {
 
 // typedef
 typedef __vector_pair vp_t;
-void testVPTypedef(int *inp, int *outp) {
-  vp_t *vpin = (vp_t *)inp;
-  vp_t *vpout = (vp_t *)outp;
-  *vpout = *vpin;
-}
 
 // function argument
 void testVPArg1(__vector_pair vp, int *ptr) { // expected-error {{invalid use of PPC MMA type}}
@@ -185,57 +119,22 @@ void testVPArg2(const __vector_pair vp, int *ptr) { // expected-error {{invalid
   *vpp = vp;
 }
 
-void testVPArg3(__vector_pair *vp, int *ptr) {
-  __vector_pair *vpp = (__vector_pair *)ptr;
-  *vpp = *vp;
-}
-
-void testVPArg4(const __vector_pair *const vp, int *ptr) {
-  __vector_pair *vpp = (__vector_pair *)ptr;
-  *vpp = *vp;
-}
-
-void testVPArg5(__vector_pair vpa[], int *ptr) {
-  __vector_pair *vpp = (__vector_pair *)ptr;
-  *vpp = vpa[0];
-}
-
 void testVPArg6(const vp_t vp, int *ptr) { // expected-error {{invalid use of PPC MMA type}}
   __vector_pair *vpp = (__vector_pair *)ptr;
   *vpp = vp;
 }
 
-void testVPArg7(const vp_t *vp, int *ptr) {
-  __vector_pair *vpp = (__vector_pair *)ptr;
-  *vpp = *vp;
-}
-
 // function return
 __vector_pair testVPRet1(int *ptr) { // expected-error {{invalid use of PPC MMA type}}
   __vector_pair *vpp = (__vector_pair *)ptr;
   return *vpp; // expected-error {{invalid use of PPC MMA type}}
 }
 
-__vector_pair *testVPRet2(int *ptr) {
-  __vector_pair *vpp = (__vector_pair *)ptr;
-  return vpp + 2;
-}
-
-const __vector_pair *testVPRet3(int *ptr) {
-  __vector_pair *vpp = (__vector_pair *)ptr;
-  return vpp + 2;
-}
-
 const vp_t testVPRet4(int *ptr) { // expected-error {{invalid use of PPC MMA type}}
   __vector_pair *vpp = (__vector_pair *)ptr;
   return *vpp; // expected-error {{invalid use of PPC MMA type}}
 }
 
-const vp_t *testVPRet5(int *ptr) {
-  __vector_pair *vpp = (__vector_pair *)ptr;
-  return vpp + 2;
-}
-
 // global
 __vector_pair globalvp;        // expected-error {{invalid use of PPC MMA type}}
 const __vector_pair globalvp2; // expected-error {{invalid use of PPC MMA type}}
@@ -243,19 +142,6 @@ __vector_pair *globalvpp;
 const __vector_pair *const globalvpp2;
 vp_t globalvp_t; // expected-error {{invalid use of PPC MMA type}}
 
-// local
-void testVPLocal(int *ptr, vector unsigned char vc) {
-  __vector_pair *vpp = (__vector_pair *)ptr;
-  __vector_pair vp1 = *vpp;
-  __vector_pair vp2;
-  __builtin_vsx_assemble_pair(&vp2, vc, vc);
-  __builtin_vsx_build_pair(&vp2, vc, vc);
-  __vector_pair vp3;
-  __vector_quad vq;
-  __builtin_mma_xvf64ger(&vq, vp3, vc);
-  *vpp = vp3;
-}
-
 // struct field
 struct TestVPStruct {
   int a;
@@ -264,17 +150,6 @@ struct TestVPStruct {
   __vector_pair *vp;
 };
 
-// sizeof / alignof
-int testVPSizeofAlignof(int *ptr) {
-  __vector_pair *vpp = (__vector_pair *)ptr;
-  __vector_pair vp = *vpp;
-  unsigned sizet = sizeof(__vector_pair);
-  unsigned alignt = __alignof__(__vector_pair);
-  unsigned sizev = sizeof(vp);
-  unsigned alignv = __alignof__(vp);
-  return sizet + alignt + sizev + alignv;
-}
-
 // operators
 int testVPOperators1(int *ptr) {
   __vector_pair *vpp = (__vector_pair *)ptr;
@@ -342,17 +217,7 @@ void testRestrictQualifiedPointer1(int *__restrict acc) {
   __builtin_mma_disassemble_acc(arr, acc); // expected-error {{passing 'int *restrict' to parameter of incompatible type '__vector_quad *'}}
 }
 
-void testRestrictQualifiedPointer2(__vector_quad *__restrict acc) {
-  vector float arr[4];
-  __builtin_mma_disassemble_acc(arr, acc);
-}
-
 void testVolatileQualifiedPointer1(int *__volatile acc) {
   vector float arr[4];
   __builtin_mma_disassemble_acc(arr, acc); // expected-error {{passing 'int *volatile' to parameter of incompatible type '__vector_quad *'}}
 }
-
-void testVolatileQualifiedPointer2(__vector_quad *__volatile acc) {
-  vector float arr[4];
-  __builtin_mma_disassemble_acc(arr, acc);
-}