[llvm] r339432 - [ARM] Disallow zexts in ARMCodeGenPrepare

Sam Parker via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 10 06:57:13 PDT 2018


Author: sam_parker
Date: Fri Aug 10 06:57:13 2018
New Revision: 339432

URL: http://llvm.org/viewvc/llvm-project?rev=339432&view=rev
Log:
[ARM] Disallow zexts in ARMCodeGenPrepare
    
Enabling ARMCodeGenPrepare by default caused a whole load of
failures. This is due to zexts and truncs not being handled properly.
ZExts are messy so it's just easier to disable for now and truncs
are allowed only as 'sinks'. I still need to figure out why allowing
them as 'sources' causes so many failures. The other main changes are
that we are now explicit about the types that we are converting to; it's
now always 'TypeSize'. Type support is also now performed while checking
for valid opcodes, as it was unnecessarily complicated to have the checks
at different stages.
    
I've moved the tests around too, so we have the zexts and truncs in
their own file, as well as the overflowing opcode tests.

Differential Revision: https://reviews.llvm.org/D50518

Added:
    llvm/trunk/test/CodeGen/ARM/arm-cgp-overflow.ll
    llvm/trunk/test/CodeGen/ARM/arm-cgp-zext-truncs.ll
Modified:
    llvm/trunk/lib/Target/ARM/ARMCodeGenPrepare.cpp
    llvm/trunk/test/CodeGen/ARM/arm-cgp-icmps.ll
    llvm/trunk/test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll

Modified: llvm/trunk/lib/Target/ARM/ARMCodeGenPrepare.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMCodeGenPrepare.cpp?rev=339432&r1=339431&r2=339432&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMCodeGenPrepare.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMCodeGenPrepare.cpp Fri Aug 10 06:57:13 2018
@@ -85,16 +85,15 @@ class ARMCodeGenPrepare : public Functio
   const ARMSubtarget *ST = nullptr;
   IRPromoter *Promoter = nullptr;
   std::set<Value*> AllVisited;
-  Type *OrigTy = nullptr;
-  unsigned TypeSize = 0;
 
-  bool isNarrowInstSupported(Instruction *I);
   bool isSupportedValue(Value *V);
   bool isLegalToPromote(Value *V);
   bool TryToPromote(Value *V);
 
 public:
   static char ID;
+  static unsigned TypeSize;
+  Type *OrigTy = nullptr;
 
   ARMCodeGenPrepare() : FunctionPass(ID) {}
 
@@ -126,65 +125,66 @@ static bool isSigned(Value *V) {
 /// dealing with icmps but allow any other integer that is <= 16 bits. Void
 /// types are accepted so we can handle switches.
 static bool isSupportedType(Value *V) {
-  if (V->getType()->isVoidTy())
+  LLVM_DEBUG(dbgs() << "ARM CGP: isSupportedType: " << *V << "\n");
+  Type *Ty = V->getType();
+  if (Ty->isVoidTy())
     return true;
 
-  const IntegerType *IntTy = dyn_cast<IntegerType>(V->getType());
-  if (!IntTy)
-    return false;
+  if (auto *Ld = dyn_cast<LoadInst>(V))
+    Ty = cast<PointerType>(Ld->getPointerOperandType())->getElementType();
 
-  // Don't try to promote boolean values.
-  if (IntTy->getBitWidth() == 1)
+  const IntegerType *IntTy = dyn_cast<IntegerType>(Ty);
+  if (!IntTy) {
+    LLVM_DEBUG(dbgs() << "ARM CGP: No, not an integer.\n");
     return false;
+  }
 
-  if (auto *ZExt = dyn_cast<ZExtInst>(V))
-    return isSupportedType(ZExt->getOperand(0));
+  return IntTy->getBitWidth() == ARMCodeGenPrepare::TypeSize;
+}
 
-  return IntTy->getBitWidth() <= 16;
+/// Return true if the given value is a leaf in the use-def chain, producing
+/// a narrow (i8, i16) value. These values will be zext to start the promotion
+/// of the tree to i32. We guarantee that these won't populate the upper bits
+/// of the register. ZExt on the loads will be free, and the same for call
+/// return values because we only accept ones that guarantee a zeroext ret val.
+/// Many arguments will have the zeroext attribute too, so those would be free
+/// too.
+static bool isSource(Value *V) {
+  // TODO Allow truncs and zext to be sources.
+  if (isa<Argument>(V))
+    return true;
+  else if (isa<LoadInst>(V))
+    return true;
+  else if (auto *Call = dyn_cast<CallInst>(V))
+    return Call->hasRetAttr(Attribute::AttrKind::ZExt);
+  return false;
 }
 
 /// Return true if V will require any promoted values to be truncated for the
-/// use to be valid.
+/// IR to remain valid. We can't mutate the value type of these
+/// instructions.
 static bool isSink(Value *V) {
+  // TODO The truncate also isn't actually necessary because we would
+  // have already proven that the data value is kept within the range of
+  // the original data type.
   auto UsesNarrowValue = [](Value *V) {
-    return V->getType()->getScalarSizeInBits() <= 32;
+    return V->getType()->getScalarSizeInBits() == ARMCodeGenPrepare::TypeSize;
   };
 
   if (auto *Store = dyn_cast<StoreInst>(V))
     return UsesNarrowValue(Store->getValueOperand());
   if (auto *Return = dyn_cast<ReturnInst>(V))
     return UsesNarrowValue(Return->getReturnValue());
+  if (auto *Trunc = dyn_cast<TruncInst>(V))
+    return UsesNarrowValue(Trunc->getOperand(0));
 
   return isa<CallInst>(V);
 }
 
-/// Return true if the given value is a leaf that will need to be zext'd.
-static bool isSource(Value *V) {
-  if (isa<Argument>(V) && isSupportedType(V))
-    return true;
-  else if (isa<TruncInst>(V))
-    return true;
-  else if (auto *ZExt = dyn_cast<ZExtInst>(V))
-    // ZExt can be a leaf if its the only user of a load.
-    return isa<LoadInst>(ZExt->getOperand(0)) &&
-                         ZExt->getOperand(0)->hasOneUse();
-  else if (auto *Call = dyn_cast<CallInst>(V))
-    return Call->hasRetAttr(Attribute::AttrKind::ZExt);
-  else if (auto *Load = dyn_cast<LoadInst>(V)) {
-    if (!isa<IntegerType>(Load->getType()))
-      return false;
-    // A load is a leaf, unless its already just being zext'd.
-    if (Load->hasOneUse() && isa<ZExtInst>(*Load->use_begin()))
-      return false;
-
-    return true;
-  }
-  return false;
-}
-
 /// Return whether the instruction can be promoted within any modifications to
 /// it's operands or result.
 static bool isSafeOverflow(Instruction *I) {
+  // FIXME Do we need NSW too?
   if (isa<OverflowingBinaryOperator>(I) && I->hasNoUnsignedWrap())
     return true;
 
@@ -222,19 +222,18 @@ static bool isSafeOverflow(Instruction *
 }
 
 static bool shouldPromote(Value *V) {
-  auto *I = dyn_cast<Instruction>(V);
-  if (!I)
+  if (!isa<IntegerType>(V->getType()) || isSink(V))
     return false;
 
-  if (!isa<IntegerType>(V->getType()))
-    return false;
+  if (isSource(V))
+    return true;
 
-  if (isa<StoreInst>(I) || isa<TerminatorInst>(I) || isa<TruncInst>(I) ||
-      isa<ICmpInst>(I))
+  auto *I = dyn_cast<Instruction>(V);
+  if (!I)
     return false;
 
-  if (auto *ZExt = dyn_cast<ZExtInst>(I))
-    return !ZExt->getDestTy()->isIntegerTy(32);
+  if (isa<ICmpInst>(I))
+    return false;
 
   return true;
 }
@@ -262,7 +261,7 @@ static bool isPromotedResultSafe(Value *
 /// Return the intrinsic for the instruction that can perform the same
 /// operation but on a narrow type. This is using the parallel dsp intrinsics
 /// on scalar values.
-static Intrinsic::ID getNarrowIntrinsic(Instruction *I, unsigned TypeSize) {
+static Intrinsic::ID getNarrowIntrinsic(Instruction *I) {
   // Whether we use the signed or unsigned versions of these intrinsics
   // doesn't matter because we're not using the GE bits that they set in
   // the APSR.
@@ -270,10 +269,10 @@ static Intrinsic::ID getNarrowIntrinsic(
   default:
     break;
   case Instruction::Add:
-    return TypeSize == 16 ? Intrinsic::arm_uadd16 :
+    return ARMCodeGenPrepare::TypeSize == 16 ? Intrinsic::arm_uadd16 :
       Intrinsic::arm_uadd8;
   case Instruction::Sub:
-    return TypeSize == 16 ? Intrinsic::arm_usub16 :
+    return ARMCodeGenPrepare::TypeSize == 16 ? Intrinsic::arm_usub16 :
       Intrinsic::arm_usub8;
   }
   llvm_unreachable("unhandled opcode for narrow intrinsic");
@@ -285,10 +284,9 @@ void IRPromoter::Mutate(Type *OrigTy,
                         SmallPtrSetImpl<Instruction*> &Roots) {
   IRBuilder<> Builder{Ctx};
   Type *ExtTy = Type::getInt32Ty(M->getContext());
-  unsigned TypeSize = OrigTy->getPrimitiveSizeInBits();
   SmallPtrSet<Value*, 8> Promoted;
-  LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from " << TypeSize
-        << " to 32-bits\n");
+  LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from "
+             << ARMCodeGenPrepare::TypeSize << " to 32-bits\n");
 
   auto ReplaceAllUsersOfWith = [&](Value *From, Value *To) {
     SmallVector<Instruction*, 4> Users;
@@ -325,7 +323,7 @@ void IRPromoter::Mutate(Type *OrigTy,
     LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for "
                << *I << "\n");
     Function *DSPInst =
-      Intrinsic::getDeclaration(M, getNarrowIntrinsic(I, TypeSize));
+      Intrinsic::getDeclaration(M, getNarrowIntrinsic(I));
     Builder.SetInsertPoint(I);
     Builder.SetCurrentDebugLocation(I->getDebugLoc());
     Value *Args[] = { I->getOperand(0), I->getOperand(1) };
@@ -353,9 +351,7 @@ void IRPromoter::Mutate(Type *OrigTy,
   LLVM_DEBUG(dbgs() << "ARM CGP: Promoting leaves:\n");
   for (auto V : Leaves) {
     LLVM_DEBUG(dbgs() << " - " << *V << "\n");
-    if (auto *ZExt = dyn_cast<ZExtInst>(V))
-      ZExt->mutateType(ExtTy);
-    else if (auto *I = dyn_cast<Instruction>(V))
+    if (auto *I = dyn_cast<Instruction>(V))
       InsertZExt(I, I);
     else if (auto *Arg = dyn_cast<Argument>(V)) {
       BasicBlock &BB = Arg->getParent()->front();
@@ -401,17 +397,9 @@ void IRPromoter::Mutate(Type *OrigTy,
   for (auto *V : Visited) {
     if (Leaves.count(V))
       continue;
-    if (auto *ZExt = dyn_cast<ZExtInst>(V)) {
-      if (ZExt->getDestTy() != ExtTy) {
-        ZExt->mutateType(ExtTy);
-        Promoted.insert(ZExt);
-      }
-      else if (ZExt->getSrcTy() == ExtTy) {
-        ReplaceAllUsersOfWith(V, ZExt->getOperand(0));
-        InstsToRemove.push_back(ZExt);
-      }
+
+    if (!isa<Instruction>(V))
       continue;
-    }
 
     if (!shouldPromote(V) || isPromotedResultSafe(V))
       continue;
@@ -459,30 +447,6 @@ void IRPromoter::Mutate(Type *OrigTy,
   LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete.\n");
 }
 
-bool ARMCodeGenPrepare::isNarrowInstSupported(Instruction *I) {
-  if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I))
-    return false;
-
-  if (ST->isThumb() && !ST->hasThumb2())
-    return false;
-
-  if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
-    return false;
-
-  // TODO
-  // Would it be profitable? For Thumb code, these parallel DSP instructions
-  // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For
-  // Cortex-A, specifically Cortex-A72, the latency is double and throughput is
-  // halved. They also do not take immediates as operands.
-  for (auto &Op : I->operands()) {
-    if (isa<Constant>(Op)) {
-      if (!EnableDSPWithImms)
-        return false;
-    }
-  }
-  return true;
-}
-
 /// We accept most instructions, as well as Arguments and ConstantInsts. We
 /// Disallow casts other than zext and truncs and only allow calls if their
 /// return value is zeroext. We don't allow opcodes that can introduce sign
@@ -490,42 +454,42 @@ bool ARMCodeGenPrepare::isNarrowInstSupp
 bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
   LLVM_DEBUG(dbgs() << "ARM CGP: Is " << *V << " supported?\n");
 
-  // Non-instruction values that we can handle.
-  if (isa<ConstantInt>(V) || isa<Argument>(V))
-    return true;
+  if (auto *ICmp = dyn_cast<ICmpInst>(V))
+    return ICmp->isEquality() || !ICmp->isSigned();
 
   // Memory instructions
-  if (isa<StoreInst>(V) || isa<LoadInst>(V) || isa<GetElementPtrInst>(V))
+  if (isa<StoreInst>(V) || isa<GetElementPtrInst>(V))
     return true;
 
   // Branches and targets.
-  if (auto *ICmp = dyn_cast<ICmpInst>(V))
-    return ICmp->isEquality() || !ICmp->isSigned();
-
   if( isa<BranchInst>(V) || isa<SwitchInst>(V) || isa<BasicBlock>(V))
     return true;
 
-  if (isa<PHINode>(V) || isa<SelectInst>(V) || isa<ReturnInst>(V))
-    return true;
+  // Non-instruction values that we can handle.
+  if (isa<ConstantInt>(V) || isa<Argument>(V))
+    return isSupportedType(V);
+
+  if (isa<PHINode>(V) || isa<SelectInst>(V) || isa<ReturnInst>(V) ||
+      isa<LoadInst>(V))
+    return isSupportedType(V);
+
+  // Currently, Trunc is the only cast we support.
+  if (auto *Trunc = dyn_cast<TruncInst>(V))
+    return isSupportedType(Trunc->getOperand(0));
 
   // Special cases for calls as we need to check for zeroext
   // TODO We should accept calls even if they don't have zeroext, as they can
   // still be roots.
   if (auto *Call = dyn_cast<CallInst>(V))
-    return Call->hasRetAttr(Attribute::AttrKind::ZExt);
-  else if (auto *Cast = dyn_cast<CastInst>(V)) {
-    if (isa<ZExtInst>(Cast))
-      return Cast->getDestTy()->getScalarSizeInBits() <= 32;
-    else if (auto *Trunc = dyn_cast<TruncInst>(V))
-      return Trunc->getDestTy()->getScalarSizeInBits() <= TypeSize;
-    else {
-      LLVM_DEBUG(dbgs() << "ARM CGP: No, unsupported cast.\n");
-      return false;
-    }
-  } else if (!isa<BinaryOperator>(V)) {
+    return isSupportedType(Call) &&
+           Call->hasRetAttr(Attribute::AttrKind::ZExt);
+
+  if (!isa<BinaryOperator>(V)) {
     LLVM_DEBUG(dbgs() << "ARM CGP: No, not a binary operator.\n");
     return false;
   }
+  if (!isSupportedType(V))
+    return false;
 
   bool res = !isSigned(V);
   if (!res)
@@ -537,39 +501,49 @@ bool ARMCodeGenPrepare::isSupportedValue
 /// smaller than the targeted promoted type. Check that we're not trying to
 /// promote something larger than our base 'TypeSize' type.
 bool ARMCodeGenPrepare::isLegalToPromote(Value *V) {
-  if (!isSupportedType(V))
-    return false;
+  if (isPromotedResultSafe(V))
+    return true;
 
-  unsigned VSize = 0;
-  if (auto *Ld = dyn_cast<LoadInst>(V)) {
-    auto *PtrTy = cast<PointerType>(Ld->getPointerOperandType());
-    VSize = PtrTy->getElementType()->getPrimitiveSizeInBits();
-  } else if (auto *ZExt = dyn_cast<ZExtInst>(V)) {
-    VSize = ZExt->getOperand(0)->getType()->getPrimitiveSizeInBits();
-  } else {
-    VSize = V->getType()->getPrimitiveSizeInBits();
-  }
+  auto *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
 
-  if (VSize > TypeSize)
+  // If promotion is not safe, can we use a DSP instruction to natively
+  // handle the narrow type?
+  if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I))
     return false;
 
-  if (isPromotedResultSafe(V))
-    return true;
+  if (ST->isThumb() && !ST->hasThumb2())
+    return false;
 
-  if (auto *I = dyn_cast<Instruction>(V))
-    return isNarrowInstSupported(I);
+  if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
+    return false;
 
-  return false;
+  // TODO
+  // Would it be profitable? For Thumb code, these parallel DSP instructions
+  // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For
+  // Cortex-A, specifically Cortex-A72, the latency is double and throughput is
+  // halved. They also do not take immediates as operands.
+  for (auto &Op : I->operands()) {
+    if (isa<Constant>(Op)) {
+      if (!EnableDSPWithImms)
+        return false;
+    }
+  }
+  return true;
 }
 
 bool ARMCodeGenPrepare::TryToPromote(Value *V) {
   OrigTy = V->getType();
   TypeSize = OrigTy->getPrimitiveSizeInBits();
+  if (TypeSize > 16)
+    return false;
 
   if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V))
     return false;
 
-  LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << "\n");
+  LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << ", TypeSize = "
+             << TypeSize << "\n");
 
   SetVector<Value*> WorkList;
   SmallPtrSet<Value*, 8> Leaves;
@@ -584,6 +558,10 @@ bool ARMCodeGenPrepare::TryToPromote(Val
     if (CurrentVisited.count(V))
       return true;
 
+    // Ignore pointer values that aren't instructions.
+    if (!isa<Instruction>(V) && isa<PointerType>(V->getType()))
+      return true;
+
     if (!isSupportedValue(V) || (shouldPromote(V) && !isLegalToPromote(V))) {
       LLVM_DEBUG(dbgs() << "ARM CGP: Can't handle: " << *V << "\n");
       return false;
@@ -638,41 +616,10 @@ bool ARMCodeGenPrepare::TryToPromote(Val
     }
   }
 
-  unsigned NumToPromote = 0;
-  unsigned Cost = 0;
-  for (auto *V : CurrentVisited) {
-    // Truncs will cause a uxt and no zeroext arguments will often require
-    // a uxt somewhere.
-    if (isa<TruncInst>(V))
-      ++Cost;
-    else if (auto *Arg = dyn_cast<Argument>(V)) {
-      if (!Arg->hasZExtAttr())
-        ++Cost;
-    }
-
-    // Mem ops can automatically be extended/truncated and non-instructions
-    // don't need anything done.
-    if (Leaves.count(V) || isa<StoreInst>(V) || !isa<Instruction>(V))
-      continue;
-
-    // Will need to truncate calls args and returns.
-    if (Roots.count(cast<Instruction>(V))) {
-      ++Cost;
-      continue;
-    }
-
-    if (shouldPromote(V))
-      ++NumToPromote;
-  }
-
   LLVM_DEBUG(dbgs() << "ARM CGP: Visited nodes:\n";
              for (auto *I : CurrentVisited)
                I->dump();
              );
-  LLVM_DEBUG(dbgs() << "ARM CGP: Cost of promoting " << NumToPromote
-             << " instructions = " << Cost << "\n");
-  if (Cost > NumToPromote || (NumToPromote == 0))
-    return false;
 
   Promoter->Mutate(OrigTy, CurrentVisited, Leaves, Roots);
   return true;
@@ -712,12 +659,8 @@ bool ARMCodeGenPrepare::runOnFunction(Fu
 
         LLVM_DEBUG(dbgs() << "ARM CGP: Searching from: " << CI << "\n");
         for (auto &Op : CI.operands()) {
-          if (auto *I = dyn_cast<Instruction>(Op)) {
-            if (isa<ZExtInst>(I))
-              MadeChange |= TryToPromote(I->getOperand(0));
-            else
-              MadeChange |= TryToPromote(I);
-          }
+          if (auto *I = dyn_cast<Instruction>(Op))
+            MadeChange |= TryToPromote(I);
         }
       }
     }
@@ -744,6 +687,7 @@ INITIALIZE_PASS_END(ARMCodeGenPrepare, D
                     false, false)
 
 char ARMCodeGenPrepare::ID = 0;
+unsigned ARMCodeGenPrepare::TypeSize = 0;
 
 FunctionPass *llvm::createARMCodeGenPreparePass() {
   return new ARMCodeGenPrepare();

Modified: llvm/trunk/test/CodeGen/ARM/arm-cgp-icmps.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/arm-cgp-icmps.ll?rev=339432&r1=339431&r2=339432&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/arm-cgp-icmps.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/arm-cgp-icmps.ll Fri Aug 10 06:57:13 2018
@@ -158,39 +158,6 @@ entry:
   ret i32 %res
 }
 
-; CHECK-COMMON-LABEL: dsp_imm2
-; CHECK-COMMON:   add   r0, r1
-; CHECK-DSP-NEXT: ldrh  r1, [r3]
-; CHECK-DSP-NEXT: ldrh  r2, [r2]
-; CHECK-DSP-NEXT: subs  r1, r1, r0
-; CHECK-DSP-NEXT: add   r0, r2
-; CHECK-DSP-NEXT: uxth  r3, r1
-; CHECK-DSP-NEXT: uxth  r2, r0
-; CHECK-DSP-NEXT: cmp   r2, r3
-
-; CHECK-DSP-IMM:      movs  r1, #0
-; CHECK-DSP-IMM-NEXT: uxth  r0, r0
-; CHECK-DSP-IMM-NEXT: usub16  r1, r1, r0
-; CHECK-DSP-IMM-NEXT: ldrh  r0, [r2]
-; CHECK-DSP-IMM-NEXT: ldrh  r3, [r3]
-; CHECK-DSP-IMM-NEXT: usub16  r0, r0, r1
-; CHECK-DSP-IMM-NEXT: uadd16  r1, r3, r1
-; CHECK-DSP-IMM-NEXT: cmp r0, r1
-
-define i16 @dsp_imm2(i32 %arg0, i32 %arg1, i16* %gep0, i16* %gep1) {
-entry:
-  %add0 = add i32 %arg0, %arg1
-  %conv0 = trunc i32 %add0 to i16
-  %sub0 = sub i16 0, %conv0
-  %load0 = load i16, i16* %gep0, align 2
-  %load1 = load i16, i16* %gep1, align 2
-  %sub1 = sub i16 %load0, %sub0
-  %add1 = add i16 %load1, %sub0
-  %cmp = icmp ult i16 %sub1, %add1
-  %res = select i1 %cmp, i16 %add1, i16 %sub1
-  ret i16 %res
-}
-
 ; CHECK-COMMON-LABEL: dsp_var:
 ; CHECK-COMMON:   eors    r1, r0
 ; CHECK-COMMON:   and     r2, r0, #7
@@ -267,109 +234,6 @@ entry:
   ret i32 %res
 }
 
-; CHECK-COMMON-LABEL: icmp_i32_zext:
-; CHECK-COMMON:     ldrb [[LD:r[^ ]+]], [r0]
-; CHECK-COMMON:     subs [[SUB:r[^ ]+]], [[LD]], #1
-; CHECK-COMMON-NOT: uxt
-; CHECK-COMMON:     cmp [[LD]], [[SUB]]
-; CHECK-COMMON-NOT: uxt
-define i8 @icmp_i32_zext(i8* %ptr) {
-entry:
-  %gep = getelementptr inbounds i8, i8* %ptr, i32 0
-  %0 = load i8, i8* %gep, align 1
-  %1 = sub nuw nsw i8 %0, 1
-  %conv44 = zext i8 %0 to i32
-  br label %preheader
-
-preheader:
-  br label %body
-
-body:
-  %2 = phi i8 [ %1, %preheader ], [ %3, %if.end ]
-  %si.0274 = phi i32 [ %conv44, %preheader ], [ %inc, %if.end ]
-  %conv51266 = zext i8 %2 to i32
-  %cmp52267 = icmp eq i32 %si.0274, %conv51266
-  br i1 %cmp52267, label %if.end, label %exit
-
-if.end:
-  %inc = add i32 %si.0274, 1
-  %gep1 = getelementptr inbounds i8, i8* %ptr, i32 %inc
-  %3 = load i8, i8* %gep1, align 1
-  br label %body
-
-exit:
-  ret i8 %2
-}
-
- at d_uch = hidden local_unnamed_addr global [16 x i8] zeroinitializer, align 1
- at sh1 = hidden local_unnamed_addr global i16 0, align 2
- at d_sh = hidden local_unnamed_addr global [16 x i16] zeroinitializer, align 2
-
-; CHECK-COMMON-LABEL: icmp_sext_zext_store_i8_i16
-; CHECK-NODSP: ldrb [[BYTE:r[^ ]+]],
-; CHECK-NODSP: strh [[BYTE]],
-; CHECK-NODSP: ldrsh.w
-define i32 @icmp_sext_zext_store_i8_i16() {
-entry:
-  %0 = load i8, i8* getelementptr inbounds ([16 x i8], [16 x i8]* @d_uch, i32 0, i32 2), align 1
-  %conv = zext i8 %0 to i16
-  store i16 %conv, i16* @sh1, align 2
-  %conv1 = zext i8 %0 to i32
-  %1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @d_sh, i32 0, i32 2), align 2
-  %conv2 = sext i16 %1 to i32
-  %cmp = icmp eq i32 %conv1, %conv2
-  %conv3 = zext i1 %cmp to i32
-  ret i32 %conv3
-}
-
-; CHECK-COMMON-LABEL: or_icmp_ugt:
-; CHECK-COMMON:     ldrb [[LD:r[^ ]+]], [r1]
-; CHECK-COMMON:     subs [[SUB:r[^ ]+]], #1
-; CHECK-COMMON-NOT: uxtb
-; CHECK-COMMON:     cmp [[SUB]], #3
-define i1 @or_icmp_ugt(i32 %arg, i8* %ptr) {
-entry:
-  %0 = load i8, i8* %ptr
-  %1 = zext i8 %0 to i32
-  %mul = shl nuw nsw i32 %1, 1
-  %add0 = add nuw nsw i32 %mul, 6
-  %cmp0 = icmp ne i32 %arg, %add0
-  %add1 = add i8 %0, -1
-  %cmp1 = icmp ugt i8 %add1, 3
-  %or = or i1 %cmp0, %cmp1
-  ret i1 %or
-}
-
-; CHECK-COMMON-LABEL: icmp_switch_trunc:
-; CHECK-COMMON-NOT: uxt
-define i16 @icmp_switch_trunc(i16 zeroext %arg) {
-entry:
-  %conv = add nuw i16 %arg, 15
-  %mul = mul nuw nsw i16 %conv, 3
-  %trunc = trunc i16 %arg to i3
-  switch i3 %trunc, label %default [
-    i3 0, label %sw.bb
-    i3 1, label %sw.bb.i
-  ]
-
-sw.bb:
-  %cmp0 = icmp ult i16 %mul, 127
-  %select = select i1 %cmp0, i16 %mul, i16 127
-  br label %exit
-
-sw.bb.i:
-  %cmp1 = icmp ugt i16 %mul, 34
-  %select.i = select i1 %cmp1, i16 %mul, i16 34
-  br label %exit
-
-default:
-  br label %exit
-
-exit:
-  %res = phi i16 [ %select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ]
-  ret i16 %res
-}
-
 ; CHECK-COMMON-LABEL: icmp_eq_minus_one
 ; CHECK-COMMON: cmp r0, #255
 define i32 @icmp_eq_minus_one(i8* %ptr) {
@@ -392,77 +256,3 @@ define i32 @icmp_not(i16 zeroext %arg0,
   ret i32 %res
 }
 
-; CHECK-COMMON-LABEL: mul_wrap
-; CHECK-COMMON: mul
-; CHECK-COMMON: uxth
-; CHECK-COMMON: cmp
-define i16 @mul_wrap(i16 %arg0, i16 %arg1) {
-  %mul = mul i16 %arg0, %arg1
-  %cmp = icmp eq i16 %mul, 1
-  %res = select i1 %cmp, i16 %arg0, i16 47
-  ret i16 %res
-}
-
-; CHECK-COMMON-LABEL: shl_wrap
-; CHECK-COMMON: lsl
-; CHECK-COMMON: uxth
-; CHECK-COMMON: cmp
-define i16 @shl_wrap(i16 %arg0) {
-  %mul = shl i16 %arg0, 4
-  %cmp = icmp eq i16 %mul, 1
-  %res = select i1 %cmp, i16 %arg0, i16 47
-  ret i16 %res
-}
-
-; CHECK-COMMON-LABEL: add_wrap
-; CHECK-COMMON: add
-; CHECK-COMMON: uxth
-; CHECK-COMMON: cmp
-define i16 @add_wrap(i16 %arg0, i16 %arg1) {
-  %add = add i16 %arg0, 128
-  %cmp = icmp eq i16 %add, %arg1
-  %res = select i1 %cmp, i16 %arg0, i16 1
-  ret i16 %res
-}
-
-; CHECK-COMMON-LABEL: sub_wrap
-; CHECK-COMMON: sub
-; CHECK-COMMON: uxth
-; CHECK-COMMON: cmp
-define i16 @sub_wrap(i16 %arg0, i16 %arg1, i16 %arg2) {
-  %sub = sub i16 %arg0, %arg2
-  %cmp = icmp eq i16 %sub, %arg1
-  %res = select i1 %cmp, i16 %arg0, i16 1
-  ret i16 %res
-}
-
-; CHECK-COMMON-LABEL: urem_trunc_icmps
-; CHECK-COMMON-NOT: uxt
-define void @urem_trunc_icmps(i16** %in, i32* %g, i32* %k) {
-entry:
-  %ptr = load i16*, i16** %in, align 4
-  %ld = load i16, i16* %ptr, align 2
-  %cmp.i = icmp eq i16 %ld, 0
-  br i1 %cmp.i, label %exit, label %cond.false.i
-
-cond.false.i:
-  %rem = urem i16 5, %ld
-  %extract.t = trunc i16 %rem to i8
-  br label %body
-
-body:
-  %cond.in.i.off0 = phi i8 [ %extract.t, %cond.false.i ], [ %add, %for.inc ]
-  %cmp = icmp ugt i8 %cond.in.i.off0, 7
-  %conv5 = zext i1 %cmp to i32
-  store i32 %conv5, i32* %g, align 4
-  %.pr = load i32, i32* %k, align 4
-  %tobool13150 = icmp eq i32 %.pr, 0
-  br i1 %tobool13150, label %for.inc, label %exit
-
-for.inc:
-  %add = add nuw i8 %cond.in.i.off0, 1
-  br label %body
-
-exit:
-  ret void
-}

Added: llvm/trunk/test/CodeGen/ARM/arm-cgp-overflow.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/arm-cgp-overflow.ll?rev=339432&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/arm-cgp-overflow.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/arm-cgp-overflow.ll Fri Aug 10 06:57:13 2018
@@ -0,0 +1,49 @@
+; RUN: llc -mtriple=thumbv8.main -mcpu=cortex-m33 %s -arm-disable-cgp=false -o - | FileCheck %s
+
+; CHECK: overflow_add
+; CHECK: add
+; CHECK: uxth
+; CHECK: cmp
+define zeroext i16 @overflow_add(i16 zeroext %a, i16 zeroext %b) {
+  %add = add i16 %a, %b
+  %or = or i16 %add, 1
+  %cmp = icmp ugt i16 %or, 1024
+  %res = select i1 %cmp, i16 2, i16 5
+  ret i16 %res
+}
+
+; CHECK-LABEL: overflow_sub
+; CHECK: sub
+; CHECK: uxth
+; CHECK: cmp
+define zeroext i16 @overflow_sub(i16 zeroext %a, i16 zeroext %b) {
+  %add = sub i16 %a, %b
+  %or = or i16 %add, 1
+  %cmp = icmp ugt i16 %or, 1024
+  %res = select i1 %cmp, i16 2, i16 5
+  ret i16 %res
+}
+
+; CHECK-LABEL: overflow_mul
+; CHECK: mul
+; CHECK: uxth
+; CHECK: cmp
+define zeroext i16 @overflow_mul(i16 zeroext %a, i16 zeroext %b) {
+  %add = mul i16 %a, %b
+  %or = or i16 %add, 1
+  %cmp = icmp ugt i16 %or, 1024
+  %res = select i1 %cmp, i16 2, i16 5
+  ret i16 %res
+}
+
+; CHECK-LABEL: overflow_shl
+; CHECK-COMMON: lsl
+; CHECK-COMMON: uxth
+; CHECK-COMMON: cmp
+define zeroext i16 @overflow_shl(i16 zeroext %a, i16 zeroext %b) {
+  %add = shl i16 %a, %b
+  %or = or i16 %add, 1
+  %cmp = icmp ugt i16 %or, 1024
+  %res = select i1 %cmp, i16 2, i16 5
+  ret i16 %res
+}

Modified: llvm/trunk/test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll?rev=339432&r1=339431&r2=339432&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll Fri Aug 10 06:57:13 2018
@@ -116,48 +116,6 @@ exit:
   ret void
 }
 
-; CHECK-COMMON-LABEL: phi_feeding_switch
-; CHECK-COMMON: ldrb
-; CHECK-COMMON: uxtb
-; CHECK-COMMON-NOT: uxt
-define void @phi_feeding_switch(i8* %memblock, i8* %store, i16 %arg) {
-entry:
-  %pre = load i8, i8* %memblock, align 1
-  %conv = trunc i16 %arg to i8
-  br label %header
-
-header:
-  %phi.0 = phi i8 [ %pre, %entry ], [ %count, %latch ]
-  %phi.1 = phi i8 [ %conv, %entry ], [ %phi.3, %latch ]
-  %phi.2 = phi i8 [ 0, %entry], [ %count, %latch ]
-  switch i8 %phi.0, label %default [
-    i8 43, label %for.inc.i
-    i8 45, label %for.inc.i.i
-  ]
-
-for.inc.i:
-  %xor = xor i8 %phi.1, 1
-  br label %latch
-
-for.inc.i.i:
-  %and = and i8 %phi.1, 3
-  br label %latch
-
-default:
-  %sub = sub i8 %phi.0, 1
-  %cmp2 = icmp ugt i8 %sub, 4
-  br i1 %cmp2, label %latch, label %exit
-
-latch:
-  %phi.3 = phi i8 [ %xor, %for.inc.i ], [ %and, %for.inc.i.i ], [ %phi.2, %default ]
-  %count = add nuw i8 %phi.2, 1
-  store i8 %count, i8* %store, align 1
-  br label %header
-
-exit:
-  ret void
-}
-
 ; CHECK-COMMON-LABEL: ret_i8
 ; CHECK-COMMON-NOT:   uxt
 define i8 @ret_i8() {
@@ -186,33 +144,6 @@ exit:
   ret i8 %inc2
 }
 
-; Check that %exp requires uxth in all cases, and will also be required to
-; promote %1 for the call - unless we can generate a uadd16.
-; CHECK-COMMON-LABEL: zext_load_sink_call:
-; CHECK-COMMON:       uxt
-; CHECK-DSP-IMM:      uadd16
-; CHECK-COMMON:       cmp
-; CHECK-DSP:          uxt
-; CHECK-DSP-IMM-NOT:  uxt
-define i32 @zext_load_sink_call(i16* %ptr, i16 %exp) {
-entry:
-  %0 = load i16, i16* %ptr, align 4
-  %1 = add i16 %exp, 3
-  %cmp = icmp eq i16 %0, %exp
-  br i1 %cmp, label %exit, label %if.then
-
-if.then:
-  %conv0 = zext i16 %0 to i32
-  %conv1 = zext i16 %1 to i32
-  %call = tail call arm_aapcs_vfpcc i32 @dummy(i32 %conv0, i32 %conv1)
-  br label %exit
-
-exit:
-  %exitval = phi i32 [ %call, %if.then ], [ 0, %entry  ]
-  ret i32 %exitval
-}
-
-
 ; Check that the pass doesn't try to promote the immediate parameters.
 ; CHECK-COMMON-LABEL: call_with_imms
 ; CHECK-COMMON-NOT:   uxt
@@ -301,9 +232,10 @@ entry:
   ret i32 undef
 }
 
+; Transform will bail because of the zext
 ; Check that d.sroa.0.0.be is promoted passed directly into the tail call.
 ; CHECK-COMMON-LABEL: check_zext_phi_call_arg
-; CHECK-COMMON-NOT: uxt
+; CHECK-COMMON: uxt
 define i32 @check_zext_phi_call_arg() {
 entry:
   br label %for.cond
@@ -385,7 +317,6 @@ declare dso_local fastcc zeroext i8 @saf
 declare dso_local i32 @e(...) local_unnamed_addr #1
 declare dso_local zeroext i16 @f(...) local_unnamed_addr #1
 
-declare i32 @dummy(i32, i32)
 declare i8 @dummy_i8(i8)
 declare i8 @dummy2(i8*, i8, i8)
 declare i16 @dummy3(i16)

Added: llvm/trunk/test/CodeGen/ARM/arm-cgp-zext-truncs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/arm-cgp-zext-truncs.ll?rev=339432&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/arm-cgp-zext-truncs.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/arm-cgp-zext-truncs.ll Fri Aug 10 06:57:13 2018
@@ -0,0 +1,292 @@
+; RUN: llc -mtriple=thumbv8.main -mcpu=cortex-m33 %s -arm-disable-cgp=false -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP
+; RUN: llc -mtriple=thumbv7-linux-android %s -arm-disable-cgp=false -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP
+; RUN: llc -mtriple=thumbv7em %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP
+; RUN: llc -mtriple=thumbv8 %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP-IMM
+
+; Transform will fail because the trunc is not a sink.
+; CHECK-COMMON-LABEL: dsp_trunc
+; CHECK-COMMON:   add   [[ADD:[^ ]+]],
+; CHECK-DSP-NEXT: ldrh  r1, [r3]
+; CHECK-DSP-NEXT: ldrh  r2, [r2]
+; CHECK-DSP-NEXT: subs  r1, r1, [[ADD]]
+; CHECK-DSP-NEXT: add   r0, r2
+; CHECK-DSP-NEXT: uxth  r3, r1
+; CHECK-DSP-NEXT: uxth  r2, r0
+; CHECK-DSP-NEXT: cmp   r2, r3
+
+; With DSP-IMM, we could have:
+; movs  r1, #0
+; uxth  r0, r0
+; usub16  r1, r1, r0
+; ldrh  r0, [r2]
+; ldrh  r3, [r3]
+; usub16  r0, r0, r1
+; uadd16  r1, r3, r1
+; cmp r0, r1
+define i16 @dsp_trunc(i32 %arg0, i32 %arg1, i16* %gep0, i16* %gep1) {
+entry:
+  %add0 = add i32 %arg0, %arg1
+  %conv0 = trunc i32 %add0 to i16
+  %sub0 = sub i16 0, %conv0
+  %load0 = load i16, i16* %gep0, align 2
+  %load1 = load i16, i16* %gep1, align 2
+  %sub1 = sub i16 %load0, %sub0
+  %add1 = add i16 %load1, %sub0
+  %cmp = icmp ult i16 %sub1, %add1
+  %res = select i1 %cmp, i16 %add1, i16 %sub1
+  ret i16 %res
+}
+
+; CHECK-COMMON-LABEL: trunc_i16_i8
+; CHECK-COMMON: ldrh
+; CHECK-COMMON: uxtb
+; CHECK-COMMON: cmp
+define i8 @trunc_i16_i8(i16* %ptr, i16 zeroext %arg0, i8 zeroext %arg1) {
+entry:
+  %0 = load i16, i16* %ptr
+  %1 = add i16 %0, %arg0
+  %2 = trunc i16 %1 to i8
+  %3 = icmp ugt i8 %2, %arg1
+  %4 = select i1 %3, i8 %2, i8 %arg1
+  ret i8 %4
+}
+
+; The pass will bail because of the zext, otherwise we'd want something like:
+; ldrb [[LD:r[^ ]+]], [r0]
+; subs [[SUB:r[^ ]+]], [[LD]], #1
+; cmp [[LD]], [[SUB]]
+; CHECK-COMMON-LABEL: icmp_i32_zext:
+; CHECK-COMMON: uxtb
+define i8 @icmp_i32_zext(i8* %ptr) {
+entry:
+  %gep = getelementptr inbounds i8, i8* %ptr, i32 0
+  %0 = load i8, i8* %gep, align 1
+  %1 = sub nuw nsw i8 %0, 1
+  %conv44 = zext i8 %0 to i32
+  br label %preheader
+
+preheader:
+  br label %body
+
+body:
+  %2 = phi i8 [ %1, %preheader ], [ %3, %if.end ]
+  %si.0274 = phi i32 [ %conv44, %preheader ], [ %inc, %if.end ]
+  %conv51266 = zext i8 %2 to i32
+  %cmp52267 = icmp eq i32 %si.0274, %conv51266
+  br i1 %cmp52267, label %if.end, label %exit
+
+if.end:
+  %inc = add i32 %si.0274, 1
+  %gep1 = getelementptr inbounds i8, i8* %ptr, i32 %inc
+  %3 = load i8, i8* %gep1, align 1
+  br label %body
+
+exit:
+  ret i8 %2
+}
+
+; Won't handle zext or sext
+; CHECK-COMMON-LABEL: icmp_sext_zext_store_i8_i16
+define i32 @icmp_sext_zext_store_i8_i16() {
+entry:
+  %0 = load i8, i8* getelementptr inbounds ([16 x i8], [16 x i8]* @d_uch, i32 0, i32 2), align 1
+  %conv = zext i8 %0 to i16
+  store i16 %conv, i16* @sh1, align 2
+  %conv1 = zext i8 %0 to i32
+  %1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @d_sh, i32 0, i32 2), align 2
+  %conv2 = sext i16 %1 to i32
+  %cmp = icmp eq i32 %conv1, %conv2
+  %conv3 = zext i1 %cmp to i32
+  ret i32 %conv3
+}
+
+; Pass will bail because of the zext, otherwise:
+; ldrb [[LD:r[^ ]+]], [r1]
+; subs [[SUB:r[^ ]+]], #1
+; cmp [[SUB]], #3
+; CHECK-COMMON-LABEL: or_icmp_ugt:
+; CHECK-COMMON: uxt
+define i1 @or_icmp_ugt(i32 %arg, i8* %ptr) {
+entry:
+  %0 = load i8, i8* %ptr
+  %1 = zext i8 %0 to i32
+  %mul = shl nuw nsw i32 %1, 1
+  %add0 = add nuw nsw i32 %mul, 6
+  %cmp0 = icmp ne i32 %arg, %add0
+  %add1 = add i8 %0, -1
+  %cmp1 = icmp ugt i8 %add1, 3
+  %or = or i1 %cmp0, %cmp1
+  ret i1 %or
+}
+
+; CHECK-COMMON-LABEL: icmp_switch_trunc:
+; CHECK-COMMON-NOT: uxt
+define i16 @icmp_switch_trunc(i16 zeroext %arg) {
+entry:
+  %conv = add nuw i16 %arg, 15
+  %mul = mul nuw nsw i16 %conv, 3
+  %trunc = trunc i16 %arg to i3
+  switch i3 %trunc, label %default [
+    i3 0, label %sw.bb
+    i3 1, label %sw.bb.i
+  ]
+
+sw.bb:
+  %cmp0 = icmp ult i16 %mul, 127
+  %select = select i1 %cmp0, i16 %mul, i16 127
+  br label %exit
+
+sw.bb.i:
+  %cmp1 = icmp ugt i16 %mul, 34
+  %select.i = select i1 %cmp1, i16 %mul, i16 34
+  br label %exit
+
+default:
+  br label %exit
+
+exit:
+  %res = phi i16 [ %select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ]
+  ret i16 %res
+}
+
+; Pass will bail because of the zext
+; CHECK-COMMON-LABEL: urem_trunc_icmps
+; CHECK-COMMON: uxt
+define void @urem_trunc_icmps(i16** %in, i32* %g, i32* %k) {
+entry:
+  %ptr = load i16*, i16** %in, align 4
+  %ld = load i16, i16* %ptr, align 2
+  %cmp.i = icmp eq i16 %ld, 0
+  br i1 %cmp.i, label %exit, label %cond.false.i
+
+cond.false.i:
+  %rem = urem i16 5, %ld
+  %extract.t = trunc i16 %rem to i8
+  br label %body
+
+body:
+  %cond.in.i.off0 = phi i8 [ %extract.t, %cond.false.i ], [ %add, %for.inc ]
+  %cmp = icmp ugt i8 %cond.in.i.off0, 7
+  %conv5 = zext i1 %cmp to i32
+  store i32 %conv5, i32* %g, align 4
+  %.pr = load i32, i32* %k, align 4
+  %tobool13150 = icmp eq i32 %.pr, 0
+  br i1 %tobool13150, label %for.inc, label %exit
+
+for.inc:
+  %add = add nuw i8 %cond.in.i.off0, 1
+  br label %body
+
+exit:
+  ret void
+}
+
+; CHECK-COMMON-LABEL: phi_feeding_switch
+; CHECK-COMMON: ldrb
+; CHECK-COMMON: uxtb
+; CHECK-COMMON: uxtb
+define void @phi_feeding_switch(i8* %memblock, i8* %store, i16 %arg) {
+entry:
+  %pre = load i8, i8* %memblock, align 1
+  %conv = trunc i16 %arg to i8
+  br label %header
+
+header:
+  %phi.0 = phi i8 [ %pre, %entry ], [ %count, %latch ]
+  %phi.1 = phi i8 [ %conv, %entry ], [ %phi.3, %latch ]
+  %phi.2 = phi i8 [ 0, %entry], [ %count, %latch ]
+  switch i8 %phi.0, label %default [
+    i8 43, label %for.inc.i
+    i8 45, label %for.inc.i.i
+  ]
+
+for.inc.i:
+  %xor = xor i8 %phi.1, 1
+  br label %latch
+
+for.inc.i.i:
+  %and = and i8 %phi.1, 3
+  br label %latch
+
+default:
+  %sub = sub i8 %phi.0, 1
+  %cmp2 = icmp ugt i8 %sub, 4
+  br i1 %cmp2, label %latch, label %exit
+
+latch:
+  %phi.3 = phi i8 [ %xor, %for.inc.i ], [ %and, %for.inc.i.i ], [ %phi.2, %default ]
+  %count = add nuw i8 %phi.2, 1
+  store i8 %count, i8* %store, align 1
+  br label %header
+
+exit:
+  ret void
+}
+
+; Again, zexts will prevent the transform.
+; Check that %exp requires uxth in all cases, and will also be required to
+; promote %1 for the call - unless we can generate a uadd16.
+; CHECK-COMMON-LABEL: zext_load_sink_call:
+; CHECK-COMMON: uxt
+; uadd16
+; cmp
+; CHECK-COMMON: uxt
+define i32 @zext_load_sink_call(i16* %ptr, i16 %exp) {
+entry:
+  %0 = load i16, i16* %ptr, align 4
+  %1 = add i16 %exp, 3
+  %cmp = icmp eq i16 %0, %exp
+  br i1 %cmp, label %exit, label %if.then
+
+if.then:
+  %conv0 = zext i16 %0 to i32
+  %conv1 = zext i16 %1 to i32
+  %call = tail call arm_aapcs_vfpcc i32 @dummy(i32 %conv0, i32 %conv1)
+  br label %exit
+
+exit:
+  %exitval = phi i32 [ %call, %if.then ], [ 0, %entry  ]
+  ret i32 %exitval
+}
+
+%class.ae = type { i8 }
+%class.x = type { i8 }
+%class.v = type { %class.q }
+%class.q = type { i16 }
+
+; CHECK-COMMON-LABEL: trunc_i16_i9_switch
+; CHECK-COMMON-NOT: uxt
+define i32 @trunc_i16_i9_switch(%class.ae* %this) {
+entry:
+  %call = tail call %class.x* @_ZNK2ae2afEv(%class.ae* %this)
+  %call2 = tail call %class.v* @_ZN1x2acEv(%class.x* %call)
+  %0 = getelementptr inbounds %class.v, %class.v* %call2, i32 0, i32 0, i32 0
+  %1 = load i16, i16* %0, align 2
+  %2 = trunc i16 %1 to i9
+  %trunc = and i9 %2, -64
+  switch i9 %trunc, label %cleanup.fold.split [
+    i9 0, label %cleanup
+    i9 -256, label %if.then7
+  ]
+
+if.then7:
+  %3 = and i16 %1, 7
+  %tobool = icmp eq i16 %3, 0
+  %cond = select i1 %tobool, i32 2, i32 1
+  br label %cleanup
+
+cleanup.fold.split:
+  br label %cleanup
+
+cleanup:
+  %retval.0 = phi i32 [ %cond, %if.then7 ], [ 0, %entry ], [ 2, %cleanup.fold.split ]
+  ret i32 %retval.0
+}
+
+declare %class.x* @_ZNK2ae2afEv(%class.ae*) local_unnamed_addr
+declare %class.v* @_ZN1x2acEv(%class.x*) local_unnamed_addr
+declare i32 @dummy(i32, i32)
+
+@d_uch = hidden local_unnamed_addr global [16 x i8] zeroinitializer, align 1
+@sh1 = hidden local_unnamed_addr global i16 0, align 2
+@d_sh = hidden local_unnamed_addr global [16 x i16] zeroinitializer, align 2




More information about the llvm-commits mailing list