[llvm] r223764 - Teach instcombine to canonicalize "element extraction" from a load of an

Chandler Carruth chandlerc at gmail.com
Tue Dec 9 00:55:32 PST 2014


Author: chandlerc
Date: Tue Dec  9 02:55:32 2014
New Revision: 223764

URL: http://llvm.org/viewvc/llvm-project?rev=223764&view=rev
Log:
Teach instcombine to canonicalize "element extraction" from a load of an
integer and "element insertion" into a store of an integer into actual
element extraction, element insertion, and vector loads and stores.

Previously various parts of LLVM (including instcombine itself) would
introduce integer loads and stores into the code as a way of opaquely
loading and storing "bits". In some cases (such as a memcpy of
a std::complex<float> object) we will eventually end up using those bits
in non-integer types. In order for SROA to effectively promote the
allocas involved, it splits these "store a bag of bits" integer loads
and stores up into the constituent parts. However, for non-alloca loads
and stores which remain, it uses integer math to recombine the values
into a large integer to load or store.
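
For illustration, the kind of IR this leaves behind for a copied
std::complex<float> looks roughly like the following (hypothetical value
names; see the test cases added below for the exact patterns):

  %p = bitcast { float, float }* %c to i64*
  %bits = load i64* %p, align 4
  ; "extract" the two floats with integer math
  %lo.i = trunc i64 %bits to i32
  %hi.s = lshr i64 %bits, 32
  %hi.i = trunc i64 %hi.s to i32
  %lo = bitcast i32 %lo.i to float
  %hi = bitcast i32 %hi.i to float
  ; ... floating point math producing %lo.new and %hi.new ...
  ; "insert" the results back with integer math
  %lo.b = bitcast float %lo.new to i32
  %hi.b = bitcast float %hi.new to i32
  %hi.e = zext i32 %hi.b to i64
  %hi.sh = shl nuw i64 %hi.e, 32
  %lo.e = zext i32 %lo.b to i64
  %out = or i64 %hi.sh, %lo.e
  store i64 %out, i64* %q, align 4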

All of this would be "fine", except that it forces LLVM to go through
integer math to combine and split up values. While this makes perfect
sense for integers (and in fact is critical for bitfields to end up
lowering efficiently) it is *terrible* for non-integer types, especially
floating point types. We have a much more canonical way of representing
the act of concatenating the bits of two SSA values in LLVM: a vector
and insertelement. This patch teaches InstCombine to use this
representation.
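
With this patch, the same access is instead canonicalized to roughly the
following (again hypothetical names; the added test file contains the
precise FileCheck patterns):

  %vp = bitcast { float, float }* %c to <2 x float>*
  %v = load <2 x float>* %vp, align 4
  %lo = extractelement <2 x float> %v, i32 0
  %hi = extractelement <2 x float> %v, i32 1
  ; ... floating point math producing %lo.new and %hi.new ...
  %v.0 = insertelement <2 x float> undef, float %lo.new, i32 0
  %v.1 = insertelement <2 x float> %v.0, float %hi.new, i32 1
  store <2 x float> %v.1, <2 x float>* %vq, align 4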

With this patch applied, LLVM will no longer introduce integer math into
the critical path of every loop over std::complex<float> operations such
as those that make up the hot path of ... oh, most HPC code, Eigen, and
any other heavy linear algebra library.

For the record, I looked *extensively* at fixing this in other parts of
the compiler, but it just doesn't work:
- We really do want to canonicalize memcpy and other bit-motion to
  integer loads and stores. SSA values are tremendously more powerful
  than "copy" intrinsics. Not doing this regresses massive amounts of
  LLVM's scalar optimizer.
- We really do need to split up integer loads and stores of this form in
  SROA or every memcpy of a trivially copyable struct will prevent SSA
  formation of the members of that struct. It essentially turns off
  SROA.
- The closest alternative is to actually split the loads and stores when
  partitioning with SROA, but this has all of the downsides historically
  discussed of splitting up loads and stores -- the wide-store
  information is fundamentally lost. We would also see performance
  regressions for bitfield-heavy code and other places where the
  integers aren't really intended to be split without seemingly
  arbitrary logic to treat integers totally differently.
- We *can* effectively fix this in instcombine, so it isn't that hard of
  a choice to make IMO.

Differential Revision: http://reviews.llvm.org/D6548

Added:
    llvm/trunk/test/Transforms/InstCombine/loadstore-vector.ll
Modified:
    llvm/trunk/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp

Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp?rev=223764&r1=223763&r2=223764&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp (original)
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp Tue Dec  9 02:55:32 2014
@@ -12,14 +12,17 @@
 //===----------------------------------------------------------------------===//
 
 #include "InstCombine.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
+using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "instcombine"
 
@@ -345,6 +348,142 @@ static LoadInst *combineLoadToNewType(In
   return NewLoad;
 }
 
+/// \brief Combine integer loads to vector loads when the integer's bits are
+/// just a concatenation of non-integer (and non-vector) types.
+///
+/// This specifically matches the pattern of loading an integer, right-shifting,
+/// truncating, and casting it to a non-integer type. When the shift is an exact
+/// multiple of the result non-integer type's size, this is more naturally
+/// expressed as a load of a vector and an extractelement. This shows up largely
+/// because large integers are sometimes used to represent a "generic" load or
+/// store, and only later optimization may uncover that there is a more natural
+/// type to represent the load with.
+static Instruction *combineIntegerLoadToVectorLoad(InstCombiner &IC,
+                                                   LoadInst &LI) {
+  // FIXME: This is probably a reasonable transform to make for atomic loads.
+  assert(LI.isSimple() && "Do not call for non-simple loads!");
+
+  const DataLayout &DL = *IC.getDataLayout();
+  unsigned BaseBits = LI.getType()->getIntegerBitWidth();
+  Type *ElementTy = nullptr;
+  int ElementSize;
+
+  // We match any number of element extractions from the loaded integer. Each of
+  // these should be RAUW'ed with an actual extract element instruction at the
+  // given index of a loaded vector.
+  struct ExtractedElement {
+    Instruction *Element;
+    int Index;
+  };
+  SmallVector<ExtractedElement, 2> Elements;
+
+  // Lambda to match the bit cast in the extracted element (which is the root
+  // pattern matched). Accepts the instruction and shifted bits, returns false
+  // if at any point we failed to match a suitable bitcast for element
+  // extraction.
+  auto MatchCast = [&](Instruction *I, unsigned ShiftBits) {
+    // The truncate must be cast to some element type. This cast can only be
+    // a bitcast or an inttoptr cast of the same size.
+    if (!isa<BitCastInst>(I)) {
+      if (auto *PC = dyn_cast<IntToPtrInst>(I)) {
+        // Ensure that the pointer and integer have the exact same size.
+        if (PC->getOperand(0)->getType()->getIntegerBitWidth() !=
+            DL.getTypeSizeInBits(PC->getType()))
+          return false;
+      } else {
+        // We only support bitcast and inttoptr.
+        return false;
+      }
+    }
+
+    // All of the elements extracted need to be the same type. Either capture the
+    // first element type or check this element type against the previous
+    // element types.
+    if (!ElementTy) {
+      ElementTy = I->getType();
+      // We don't handle integers, sub-vectors, or any aggregate types. We
+      // handle pointers and floating point types.
+      if (!ElementTy->isSingleValueType() || ElementTy->isIntegerTy() ||
+          ElementTy->isVectorTy())
+        return false;
+
+      ElementSize = DL.getTypeSizeInBits(ElementTy);
+      // The base integer size and the shift need to be multiples of the element
+      // size in bits.
+      if (BaseBits % ElementSize || ShiftBits % ElementSize)
+        return false;
+    } else if (ElementTy != I->getType()) {
+      return false;
+    }
+
+    // Compute the vector index and store the element with it.
+    int Index =
+        (DL.isLittleEndian() ? ShiftBits : BaseBits - ElementSize - ShiftBits) /
+        ElementSize;
+    ExtractedElement E = {I, Index};
+    Elements.push_back(std::move(E));
+    return true;
+  };
+
+  // Lambda to match the truncate in the extracted element. Accepts the
+  // instruction and shifted bits. Returns false if at any point we failed to
+  // match a suitable truncate for element extraction.
+  auto MatchTruncate = [&](Instruction *I, unsigned ShiftBits) {
+    // Handle the truncate to the bit size of the element.
+    auto *T = dyn_cast<TruncInst>(I);
+    if (!T)
+      return false;
+
+    // Walk all the users of the truncate, which must all be bitcasts.
+    for (User *TU : T->users())
+      if (!MatchCast(cast<Instruction>(TU), ShiftBits))
+        return false;
+    return true;
+  };
+
+  for (User *U : LI.users()) {
+    Instruction *I = cast<Instruction>(U);
+
+    // Strip off a logical shift right and retain the shifted amount.
+    ConstantInt *ShiftC;
+    if (!match(I, m_LShr(m_Value(), m_ConstantInt(ShiftC)))) {
+      // This must be a direct truncate.
+      if (!MatchTruncate(I, 0))
+        return nullptr;
+      continue;
+    }
+
+    unsigned ShiftBits = ShiftC->getLimitedValue(BaseBits);
+    // We can't handle shifts of more than the number of bits in the integer.
+    if (ShiftBits == BaseBits)
+      return nullptr;
+
+    // Match all the element extraction users of the shift.
+    for (User *IU : I->users())
+      if (!MatchTruncate(cast<Instruction>(IU), ShiftBits))
+        return nullptr;
+  }
+
+  // If we didn't find any extracted elements, there is nothing to do here.
+  if (Elements.empty())
+    return nullptr;
+
+  // Create a vector load and rewrite all of the elements extracted as
+  // extractelement instructions.
+  VectorType *VTy = VectorType::get(ElementTy, BaseBits / ElementSize);
+  LoadInst *NewLI = combineLoadToNewType(IC, LI, VTy);
+
+  for (const auto &E : Elements) {
+    IC.Builder->SetInsertPoint(E.Element);
+    E.Element->replaceAllUsesWith(
+        IC.Builder->CreateExtractElement(NewLI, IC.Builder->getInt32(E.Index)));
+    IC.EraseInstFromFunction(*E.Element);
+  }
+
+  // Return the original load to indicate it has been combined away.
+  return &LI;
+}
+
 /// \brief Combine loads to match the type of value their uses want after looking
 /// through intervening bitcasts.
 ///
@@ -373,6 +512,8 @@ static Instruction *combineLoadToOperati
 
 
   // Fold away bit casts of the loaded value by loading the desired type.
+  // FIXME: We should also canonicalize loads of vectors when their elements are
+  // cast to other types.
   if (LI.hasOneUse())
     if (auto *BC = dyn_cast<BitCastInst>(LI.user_back())) {
       LoadInst *NewLoad = combineLoadToNewType(IC, LI, BC->getDestTy());
@@ -381,8 +522,12 @@ static Instruction *combineLoadToOperati
       return &LI;
     }
 
-  // FIXME: We should also canonicalize loads of vectors when their elements are
-  // cast to other types.
+  // Try to combine integer loads into vector loads when the integer is just
+  // loading a bag of bits that are cast into vector element chunks.
+  if (LI.getType()->isIntegerTy())
+    if (Instruction *R = combineIntegerLoadToVectorLoad(IC, LI))
+      return R;
+
   return nullptr;
 }
 
@@ -491,6 +636,201 @@ Instruction *InstCombiner::visitLoadInst
   return nullptr;
 }
 
+/// \brief Helper to combine a store to use a new value.
+///
+/// This just does the work of combining a store to use a new value, potentially
+/// of a different type. It handles metadata, etc., and returns the new
+/// instruction. The new value is stored to a bitcast of the pointer argument to
+/// the original store.
+///
+/// Note that this will create the instructions with whatever insert point the
+/// \c InstCombiner currently is using.
+static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &OldSI,
+                                         Value *V) {
+  Value *Ptr = OldSI.getPointerOperand();
+  unsigned AS = OldSI.getPointerAddressSpace();
+  SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
+  OldSI.getAllMetadata(MD);
+
+  StoreInst *NewSI = IC.Builder->CreateAlignedStore(
+      V, IC.Builder->CreateBitCast(Ptr, V->getType()->getPointerTo(AS)),
+      OldSI.getAlignment());
+  for (const auto &MDPair : MD) {
+    unsigned ID = MDPair.first;
+    MDNode *N = MDPair.second;
+    // Note, essentially every kind of metadata should be preserved here! This
+    // routine is supposed to clone a store instruction changing *only its
+    // type*. The only metadata it makes sense to drop is metadata which is
+    // invalidated when the pointer type changes. This should essentially
+    // never be the case in LLVM, but we explicitly switch over only known
+    // metadata to be conservatively correct. If you are adding metadata to
+    // LLVM which pertains to stores, you almost certainly want to add it
+    // here.
+    switch (ID) {
+    case LLVMContext::MD_dbg:
+    case LLVMContext::MD_tbaa:
+    case LLVMContext::MD_prof:
+    case LLVMContext::MD_fpmath:
+    case LLVMContext::MD_tbaa_struct:
+    case LLVMContext::MD_alias_scope:
+    case LLVMContext::MD_noalias:
+    case LLVMContext::MD_nontemporal:
+    case LLVMContext::MD_mem_parallel_loop_access:
+    case LLVMContext::MD_nonnull:
+      // All of these directly apply.
+      NewSI->setMetadata(ID, N);
+      break;
+
+    case LLVMContext::MD_invariant_load:
+    case LLVMContext::MD_range:
+      break;
+    }
+  }
+  return NewSI;
+}
+
+/// \brief Combine integer stores to vector stores when the integer's bits are
+/// just a concatenation of non-integer (and non-vector) types.
+///
+/// This specifically matches the pattern of taking a sequence of non-integer
+/// types, casting them to integers, extending, shifting, and or-ing them
+/// together to make a concatenation, and then storing the result. This shows up
+/// because large integers are sometimes used to represent a "generic" load or
+/// store, and only later optimization may uncover that there is a more natural
+/// type to represent the store with.
+///
+/// \returns true if the store was successfully combined away. This indicates
+/// the caller must erase the store instruction. We have to let the caller erase
+/// the store instruction as otherwise there is no way to signal whether it was
+/// combined or not: IC.EraseInstFromFunction returns a null pointer.
+static bool combineIntegerStoreToVectorStore(InstCombiner &IC, StoreInst &SI) {
+  // FIXME: This is probably a reasonable transform to make for atomic stores.
+  assert(SI.isSimple() && "Do not call for non-simple stores!");
+
+  Instruction *OrigV = dyn_cast<Instruction>(SI.getValueOperand());
+  if (!OrigV)
+    return false;
+
+  // We only handle values which are used entirely to store to memory. If the
+  // value is used directly as an SSA value, then even if there are matching
+  // element insertion and element extraction, we rely on basic integer
+  // combining to forward the bits and delete the intermediate math. Here we
+  // just need to clean up the places where it actually reaches memory.
+  SmallVector<StoreInst *, 2> Stores;
+  for (User *U : OrigV->users())
+    if (auto *SIU = dyn_cast<StoreInst>(U))
+      Stores.push_back(SIU);
+    else
+      return false;
+
+  const DataLayout &DL = *IC.getDataLayout();
+  unsigned BaseBits = OrigV->getType()->getIntegerBitWidth();
+  Type *ElementTy = nullptr;
+  int ElementSize;
+
+  // We need to match some number of element insertions into an integer. Each
+  // insertion takes the form of an element value (and type), index (multiple of
+  // the bitwidth of the type) of insertion, and the base it was inserted into.
+  struct InsertedElement {
+    Value *Base;
+    Value *Element;
+    int Index;
+  };
+  auto MatchInsertedElement = [&](Value *V) -> Optional<InsertedElement> {
+    // Handle a null input to make it easy to loop over bases.
+    if (!V)
+      return Optional<InsertedElement>();
+
+    assert(!V->getType()->isVectorTy() && "Must not be a vector.");
+    assert(V->getType()->isIntegerTy() && "Must be an integer value.");
+
+    Value *Base = nullptr, *Cast;
+    ConstantInt *ShiftC = nullptr;
+    auto InsertPattern = m_CombineOr(
+        m_Shl(m_OneUse(m_ZExt(m_OneUse(m_Value(Cast)))), m_ConstantInt(ShiftC)),
+        m_ZExt(m_OneUse(m_Value(Cast))));
+    if (!match(V, m_CombineOr(m_CombineOr(m_Or(m_OneUse(m_Value(Base)),
+                                               m_OneUse(InsertPattern)),
+                                          m_Or(m_OneUse(InsertPattern),
+                                               m_OneUse(m_Value(Base)))),
+                              InsertPattern)))
+      return Optional<InsertedElement>();
+
+    Value *Element;
+    if (auto *BC = dyn_cast<BitCastInst>(Cast)) {
+      // Bit casts are trivially correct here.
+      Element = BC->getOperand(0);
+    } else if (auto *PC = dyn_cast<PtrToIntInst>(Cast)) {
+      Element = PC->getOperand(0);
+      // If this changes the bit width at all, reject it.
+      if (PC->getType()->getIntegerBitWidth() !=
+          DL.getTypeSizeInBits(Element->getType()))
+        return Optional<InsertedElement>();
+    } else {
+      // All other casts are rejected.
+      return Optional<InsertedElement>();
+    }
+
+    // We can't handle shifts wider than the number of bits in the integer.
+    unsigned ShiftBits = ShiftC ? ShiftC->getLimitedValue(BaseBits) : 0;
+    if (ShiftBits == BaseBits)
+      return Optional<InsertedElement>();
+
+    // All of the elements inserted need to be the same type. Either capture the
+    // first element type or check this element type against the previous
+    // element types.
+    if (!ElementTy) {
+      ElementTy = Element->getType();
+      // The base integer size and the shift need to be multiples of the element
+      // size in bits.
+      ElementSize = DL.getTypeSizeInBits(ElementTy);
+      if (BaseBits % ElementSize || ShiftBits % ElementSize)
+        return Optional<InsertedElement>();
+    } else if (ElementTy != Element->getType()) {
+      return Optional<InsertedElement>();
+    }
+
+    // We don't handle integers, sub-vectors, or any aggregate types. We
+    // handle pointers and floating point types.
+    if (!ElementTy->isSingleValueType() || ElementTy->isIntegerTy() ||
+        ElementTy->isVectorTy())
+      return Optional<InsertedElement>();
+
+    int Index =
+        (DL.isLittleEndian() ? ShiftBits : BaseBits - ElementSize - ShiftBits) /
+        ElementSize;
+    InsertedElement Result = {Base, Element, Index};
+    return Result;
+  };
+
+  SmallVector<InsertedElement, 2> Elements;
+  Value *V = OrigV;
+  while (Optional<InsertedElement> E = MatchInsertedElement(V)) {
+    V = E->Base;
+    Elements.push_back(std::move(*E));
+  }
+  // If searching for elements found none, or didn't terminate in either an
+  // undef or a direct zext, we can't form a vector.
+  if (Elements.empty() || (V && !isa<UndefValue>(V)))
+    return false;
+
+  // Build a storable vector by looping over the inserted elements.
+  VectorType *VTy = VectorType::get(ElementTy, BaseBits / ElementSize);
+  V = UndefValue::get(VTy);
+  IC.Builder->SetInsertPoint(OrigV);
+  for (const auto &E : Elements)
+    V = IC.Builder->CreateInsertElement(V, E.Element,
+                                        IC.Builder->getInt32(E.Index));
+
+  for (StoreInst *OldSI : Stores) {
+    IC.Builder->SetInsertPoint(OldSI);
+    combineStoreToNewValue(IC, *OldSI, V);
+    if (OldSI != &SI)
+      IC.EraseInstFromFunction(*OldSI);
+  }
+  return true;
+}
+
 /// \brief Combine stores to match the type of value being stored.
 ///
 /// The core idea here is that the memory does not have any intrinsic type and
@@ -517,52 +857,20 @@ static bool combineStoreToValueType(Inst
   if (!SI.isSimple())
     return false;
 
-  Value *Ptr = SI.getPointerOperand();
   Value *V = SI.getValueOperand();
-  unsigned AS = SI.getPointerAddressSpace();
-  SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
-  SI.getAllMetadata(MD);
 
   // Fold away bit casts of the stored value by storing the original type.
   if (auto *BC = dyn_cast<BitCastInst>(V)) {
-    V = BC->getOperand(0);
-    StoreInst *NewStore = IC.Builder->CreateAlignedStore(
-        V, IC.Builder->CreateBitCast(Ptr, V->getType()->getPointerTo(AS)),
-        SI.getAlignment());
-    for (const auto &MDPair : MD) {
-      unsigned ID = MDPair.first;
-      MDNode *N = MDPair.second;
-      // Note, essentially every kind of metadata should be preserved here! This
-      // routine is supposed to clone a store instruction changing *only its
-      // type*. The only metadata it makes sense to drop is metadata which is
-      // invalidated when the pointer type changes. This should essentially
-      // never be the case in LLVM, but we explicitly switch over only known
-      // metadata to be conservatively correct. If you are adding metadata to
-      // LLVM which pertains to stores, you almost certainly want to add it
-      // here.
-      switch (ID) {
-      case LLVMContext::MD_dbg:
-      case LLVMContext::MD_tbaa:
-      case LLVMContext::MD_prof:
-      case LLVMContext::MD_fpmath:
-      case LLVMContext::MD_tbaa_struct:
-      case LLVMContext::MD_alias_scope:
-      case LLVMContext::MD_noalias:
-      case LLVMContext::MD_nontemporal:
-      case LLVMContext::MD_mem_parallel_loop_access:
-      case LLVMContext::MD_nonnull:
-        // All of these directly apply.
-        NewStore->setMetadata(ID, N);
-        break;
-
-      case LLVMContext::MD_invariant_load:
-      case LLVMContext::MD_range:
-        break;
-      }
-    }
+    combineStoreToNewValue(IC, SI, BC->getOperand(0));
     return true;
   }
 
+  // If this is an integer store and we have data layout, look for a pattern of
+  // storing a vector as an integer (modeled as a bag of bits).
+  if (V->getType()->isIntegerTy() && IC.getDataLayout() &&
+      combineIntegerStoreToVectorStore(IC, SI))
+    return true;
+
   // FIXME: We should also canonicalize loads of vectors when their elements are
   // cast to other types.
   return false;

Added: llvm/trunk/test/Transforms/InstCombine/loadstore-vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/loadstore-vector.ll?rev=223764&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/loadstore-vector.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/loadstore-vector.ll Tue Dec  9 02:55:32 2014
@@ -0,0 +1,210 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Basic test for turning element extraction from integer loads and element
+; insertion into integer stores into extraction and insertion with vectors.
+define void @test1({ float, float }* %x, float %a, float %b, { float, float }* %out) {
+; CHECK-LABEL: @test1(
+entry:
+  %x.cast = bitcast { float, float }* %x to i64*
+  %x.load = load i64* %x.cast, align 4
+; CHECK-NOT: load i64*
+; CHECK: %[[LOAD:.*]] = load <2 x float>*
+
+  %lo.trunc = trunc i64 %x.load to i32
+  %hi.shift = lshr i64 %x.load, 32
+  %hi.trunc = trunc i64 %hi.shift to i32
+  %hi.cast = bitcast i32 %hi.trunc to float
+  %lo.cast = bitcast i32 %lo.trunc to float
+; CHECK-NOT: trunc
+; CHECK-NOT: lshr
+; CHECK: %[[HI:.*]] = extractelement <2 x float> %[[LOAD]], i32 1
+; CHECK: %[[LO:.*]] = extractelement <2 x float> %[[LOAD]], i32 0
+
+  %add.i.i = fadd float %lo.cast, %a
+  %add5.i.i = fadd float %hi.cast, %b
+; CHECK: %[[LO_SUM:.*]] = fadd float %[[LO]], %a
+; CHECK: %[[HI_SUM:.*]] = fadd float %[[HI]], %b
+
+  %add.lo.cast = bitcast float %add.i.i to i32
+  %add.hi.cast = bitcast float %add5.i.i to i32
+  %add.hi.ext = zext i32 %add.hi.cast to i64
+  %add.hi.shift = shl nuw i64 %add.hi.ext, 32
+  %add.lo.ext = zext i32 %add.lo.cast to i64
+  %add.lo.or = or i64 %add.hi.shift, %add.lo.ext
+; CHECK-NOT: zext i32
+; CHECK-NOT: shl {{.*}} i64
+; CHECK-NOT: or i64
+; CHECK: %[[INSERT1:.*]] = insertelement <2 x float> undef, float %[[LO_SUM]], i32 0
+; CHECK: %[[INSERT2:.*]] = insertelement <2 x float> %[[INSERT1]], float %[[HI_SUM]], i32 1
+
+  %out.cast = bitcast { float, float }* %out to i64*
+  store i64 %add.lo.or, i64* %out.cast, align 4
+; CHECK-NOT: store i64
+; CHECK: store <2 x float> %[[INSERT2]]
+
+  ret void
+}
+
+define void @test2({ float, float }* %x, float %a, float %b, { float, float }* %out1, { float, float }* %out2) {
+; CHECK-LABEL: @test2(
+entry:
+  %x.cast = bitcast { float, float }* %x to i64*
+  %x.load = load i64* %x.cast, align 4
+; CHECK-NOT: load i64*
+; CHECK: %[[LOAD:.*]] = load <2 x float>*
+
+  %lo.trunc = trunc i64 %x.load to i32
+  %hi.shift = lshr i64 %x.load, 32
+  %hi.trunc = trunc i64 %hi.shift to i32
+  %hi.cast = bitcast i32 %hi.trunc to float
+  %lo.cast = bitcast i32 %lo.trunc to float
+; CHECK-NOT: trunc
+; CHECK-NOT: lshr
+; CHECK: %[[HI:.*]] = extractelement <2 x float> %[[LOAD]], i32 1
+; CHECK: %[[LO:.*]] = extractelement <2 x float> %[[LOAD]], i32 0
+
+  %add.i.i = fadd float %lo.cast, %a
+  %add5.i.i = fadd float %hi.cast, %b
+; CHECK: %[[LO_SUM:.*]] = fadd float %[[LO]], %a
+; CHECK: %[[HI_SUM:.*]] = fadd float %[[HI]], %b
+
+  %add.lo.cast = bitcast float %add.i.i to i32
+  %add.hi.cast = bitcast float %add5.i.i to i32
+  %add.hi.ext = zext i32 %add.hi.cast to i64
+  %add.hi.shift = shl nuw i64 %add.hi.ext, 32
+  %add.lo.ext = zext i32 %add.lo.cast to i64
+  %add.lo.or = or i64 %add.hi.shift, %add.lo.ext
+; CHECK-NOT: zext i32
+; CHECK-NOT: shl {{.*}} i64
+; CHECK-NOT: or i64
+; CHECK: %[[INSERT1:.*]] = insertelement <2 x float> undef, float %[[LO_SUM]], i32 0
+; CHECK: %[[INSERT2:.*]] = insertelement <2 x float> %[[INSERT1]], float %[[HI_SUM]], i32 1
+
+  %out1.cast = bitcast { float, float }* %out1 to i64*
+  store i64 %add.lo.or, i64* %out1.cast, align 4
+  %out2.cast = bitcast { float, float }* %out2 to i64*
+  store i64 %add.lo.or, i64* %out2.cast, align 4
+; CHECK-NOT: store i64
+; CHECK: store <2 x float> %[[INSERT2]]
+; CHECK-NOT: store i64
+; CHECK: store <2 x float> %[[INSERT2]]
+
+  ret void
+}
+
+; We handle some cases where there is partial CSE but not complete CSE of
+; repeated insertion and extraction. Currently, we don't catch the store side
+; yet because it would require extreme heroics to match this reliably.
+define void @test3({ float, float, float }* %x, float %a, float %b, { float, float, float }* %out1, { float, float, float }* %out2) {
+; CHECK-LABEL: @test3(
+entry:
+  %x.cast = bitcast { float, float, float }* %x to i96*
+  %x.load = load i96* %x.cast, align 4
+; CHECK-NOT: load i96*
+; CHECK: %[[LOAD:.*]] = load <3 x float>*
+
+  %lo.trunc = trunc i96 %x.load to i32
+  %lo.cast = bitcast i32 %lo.trunc to float
+  %mid.shift = lshr i96 %x.load, 32
+  %mid.trunc = trunc i96 %mid.shift to i32
+  %mid.cast = bitcast i32 %mid.trunc to float
+  %mid.trunc2 = trunc i96 %mid.shift to i32
+  %mid.cast2 = bitcast i32 %mid.trunc2 to float
+  %hi.shift = lshr i96 %mid.shift, 32
+  %hi.trunc = trunc i96 %hi.shift to i32
+  %hi.cast = bitcast i32 %hi.trunc to float
+; CHECK-NOT: trunc
+; CHECK-NOT: lshr
+; CHECK: %[[LO:.*]] = extractelement <3 x float> %[[LOAD]], i32 0
+; CHECK: %[[MID1:.*]] = extractelement <3 x float> %[[LOAD]], i32 1
+; CHECK: %[[MID2:.*]] = extractelement <3 x float> %[[LOAD]], i32 1
+; CHECK: %[[HI:.*]] = extractelement <3 x float> %[[LOAD]], i32 2
+
+  %add.lo = fadd float %lo.cast, %a
+  %add.mid = fadd float %mid.cast, %b
+  %add.hi = fadd float %hi.cast, %mid.cast2
+; CHECK: %[[LO_SUM:.*]] = fadd float %[[LO]], %a
+; CHECK: %[[MID_SUM:.*]] = fadd float %[[MID1]], %b
+; CHECK: %[[HI_SUM:.*]] = fadd float %[[HI]], %[[MID2]]
+
+  %add.lo.cast = bitcast float %add.lo to i32
+  %add.mid.cast = bitcast float %add.mid to i32
+  %add.hi.cast = bitcast float %add.hi to i32
+  %result.hi.ext = zext i32 %add.hi.cast to i96
+  %result.hi.shift = shl nuw i96 %result.hi.ext, 32
+  %result.mid.ext = zext i32 %add.mid.cast to i96
+  %result.mid.or = or i96 %result.hi.shift, %result.mid.ext
+  %result.mid.shift = shl nuw i96 %result.mid.or, 32
+  %result.lo.ext = zext i32 %add.lo.cast to i96
+  %result.lo.or = or i96 %result.mid.shift, %result.lo.ext
+; FIXME-NOT: zext i32
+; FIXME-NOT: shl {{.*}} i64
+; FIXME-NOT: or i64
+; FIXME: %[[INSERT1:.*]] = insertelement <3 x float> undef, float %[[HI_SUM]], i32 2
+; FIXME: %[[INSERT2:.*]] = insertelement <3 x float> %[[INSERT1]], float %[[MID_SUM]], i32 1
+; FIXME: %[[INSERT3:.*]] = insertelement <3 x float> %[[INSERT2]], float %[[LO_SUM]], i32 0
+
+  %out1.cast = bitcast { float, float, float }* %out1 to i96*
+  store i96 %result.lo.or, i96* %out1.cast, align 4
+; FIXME-NOT: store i96
+; FIXME: store <3 x float> %[[INSERT3]]
+
+  %result2.lo.ext = zext i32 %add.lo.cast to i96
+  %result2.lo.or = or i96 %result.mid.shift, %result2.lo.ext
+; FIXME-NOT: zext i32
+; FIXME-NOT: shl {{.*}} i64
+; FIXME-NOT: or i64
+; FIXME: %[[INSERT4:.*]] = insertelement <3 x float> %[[INSERT2]], float %[[LO_SUM]], i32 0
+
+  %out2.cast = bitcast { float, float, float }* %out2 to i96*
+  store i96 %result2.lo.or, i96* %out2.cast, align 4
+; FIXME-NOT: store i96
+; FIXME: store <3 x float>
+
+  ret void
+}
+
+; Basic test that pointers work correctly as the element type.
+define void @test4({ i8*, i8* }* %x, i64 %a, i64 %b, { i8*, i8* }* %out) {
+; CHECK-LABEL: @test4(
+entry:
+  %x.cast = bitcast { i8*, i8* }* %x to i128*
+  %x.load = load i128* %x.cast, align 4
+; CHECK-NOT: load i128*
+; CHECK: %[[LOAD:.*]] = load <2 x i8*>* {{.*}}, align 4
+
+  %lo.trunc = trunc i128 %x.load to i64
+  %hi.shift = lshr i128 %x.load, 64
+  %hi.trunc = trunc i128 %hi.shift to i64
+  %hi.cast = inttoptr i64 %hi.trunc to i8*
+  %lo.cast = inttoptr i64 %lo.trunc to i8*
+; CHECK-NOT: trunc
+; CHECK-NOT: lshr
+; CHECK: %[[HI:.*]] = extractelement <2 x i8*> %[[LOAD]], i32 1
+; CHECK: %[[LO:.*]] = extractelement <2 x i8*> %[[LOAD]], i32 0
+
+  %gep.lo = getelementptr i8* %lo.cast, i64 %a
+  %gep.hi = getelementptr i8* %hi.cast, i64 %b
+; CHECK: %[[LO_GEP:.*]] = getelementptr i8* %[[LO]], i64 %a
+; CHECK: %[[HI_GEP:.*]] = getelementptr i8* %[[HI]], i64 %b
+
+  %gep.lo.cast = ptrtoint i8* %gep.lo to i64
+  %gep.hi.cast = ptrtoint i8* %gep.hi to i64
+  %gep.hi.ext = zext i64 %gep.hi.cast to i128
+  %gep.hi.shift = shl nuw i128 %gep.hi.ext, 64
+  %gep.lo.ext = zext i64 %gep.lo.cast to i128
+  %gep.lo.or = or i128 %gep.hi.shift, %gep.lo.ext
+; CHECK-NOT: zext i64
+; CHECK-NOT: shl {{.*}} i128
+; CHECK-NOT: or i128
+; CHECK: %[[INSERT1:.*]] = insertelement <2 x i8*> undef, i8* %[[LO_GEP]], i32 0
+; CHECK: %[[INSERT2:.*]] = insertelement <2 x i8*> %[[INSERT1]], i8* %[[HI_GEP]], i32 1
+
+  %out.cast = bitcast { i8*, i8* }* %out to i128*
+  store i128 %gep.lo.or, i128* %out.cast, align 4
+; CHECK-NOT: store i128
+; CHECK: store <2 x i8*> %[[INSERT2]], <2 x i8*>* {{.*}}, align 4
+
+  ret void
+}




