[llvm-commits] [llvm] r59399 - in /llvm/trunk: lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp test/CodeGen/X86/vec_shuffle-25.ll test/CodeGen/X86/vec_shuffle-26.ll test/CodeGen/X86/vec_shuffle-27.ll

Sat Nov 15 21:06:27 PST 2008

Author: wangmp
Date: Sat Nov 15 23:06:27 2008
New Revision: 59399

URL: http://llvm.org/viewvc/llvm-project?rev=59399&view=rev
Log:
Improved shuffle normalization to avoid using extract/build when we
can extract using different indexes for two vectors. Added a few tests
for vector shuffles.

Added:
    llvm/trunk/test/CodeGen/X86/vec_shuffle-25.ll
    llvm/trunk/test/CodeGen/X86/vec_shuffle-26.ll
    llvm/trunk/test/CodeGen/X86/vec_shuffle-27.ll
Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp?rev=59399&r1=59398&r2=59399&view=diff

==============================================================================

--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp Sat Nov 15 23:06:27 2008
@@ -2292,8 +2292,8 @@
 // Utility for visitShuffleVector - Returns true if the mask is mask starting
 // from SIndx and increasing to the element length (undefs are allowed).
 static bool SequentialMask(SDValue Mask, unsigned SIndx) {
-  unsigned NumElems = Mask.getNumOperands();
-  for (unsigned i = 0; i != NumElems; ++i) {
+  unsigned MaskNumElts = Mask.getNumOperands();
+  for (unsigned i = 0; i != MaskNumElts; ++i) {
     if (Mask.getOperand(i).getOpcode() != ISD::UNDEF) {
       unsigned Idx = cast<ConstantSDNode>(Mask.getOperand(i))->getZExtValue();
       if (Idx != i + SIndx)
@@ -2304,161 +2304,187 @@
 }
 
 void SelectionDAGLowering::visitShuffleVector(User &I) {
-  SDValue V1   = getValue(I.getOperand(0));
-  SDValue V2   = getValue(I.getOperand(1));
+  SDValue Srcs[2];
+  Srcs[0] = getValue(I.getOperand(0));
+  Srcs[1] = getValue(I.getOperand(1));
   SDValue Mask = getValue(I.getOperand(2));
 
   MVT VT = TLI.getValueType(I.getType());
-  MVT VT1 = V1.getValueType();
-  unsigned MaskNumElts = Mask.getNumOperands();
-  unsigned Src1NumElts = VT1.getVectorNumElements();
+  MVT SrcVT = Srcs[0].getValueType();
+  int MaskNumElts = Mask.getNumOperands();
+  int SrcNumElts = SrcVT.getVectorNumElements();
 
-  if (Src1NumElts == MaskNumElts) {
-    setValue(&I, DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask));
+  if (SrcNumElts == MaskNumElts) {
+    setValue(&I, DAG.getNode(ISD::VECTOR_SHUFFLE, VT, Srcs[0], Srcs[1], Mask));
     return;
   }
 
   // Normalize the shuffle vector since mask and vector length don't match.
-  if (Src1NumElts < MaskNumElts && MaskNumElts % Src1NumElts == 0) {
-    // We can concat vectors to make the mask and input vector match.
-    if (Src1NumElts*2 == MaskNumElts && SequentialMask(Mask, 0)) {
-      // The shuffle is concatenating two vectors.
-      setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, VT, V1, V2));
+  MVT MaskEltVT = Mask.getValueType().getVectorElementType();
+
+  if (SrcNumElts < MaskNumElts && MaskNumElts % SrcNumElts == 0) {
+    // Mask is longer than the source vectors and is a multiple of the source
+    // vectors.  We can use concatenate vector to make the mask and vectors
+    // length match.
+    if (SrcNumElts*2 == MaskNumElts && SequentialMask(Mask, 0)) {
+      // The shuffle is concatenating two vectors together.
+      setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, VT, Srcs[0], Srcs[1]));
       return;
     }
 
-    // Pad both vectors with undefs to the same size as the mask.
-    unsigned NumConcat = MaskNumElts / Src1NumElts;
-    std::vector<SDValue> UnOps(Src1NumElts,
-                               DAG.getNode(ISD::UNDEF, 
-                                           VT1.getVectorElementType()));
-    SDValue UndefVal = DAG.getNode(ISD::BUILD_VECTOR, VT1,
-                                   &UnOps[0], UnOps.size());
+    // Pad both vectors with undefs to make them the same length as the mask.
+    unsigned NumConcat = MaskNumElts / SrcNumElts;
+    SDValue UndefVal = DAG.getNode(ISD::UNDEF, SrcVT);
 
     SmallVector<SDValue, 8> MOps1, MOps2;
-    MOps1.push_back(V1);
-    MOps2.push_back(V2);
+    MOps1.push_back(Srcs[0]);
+    MOps2.push_back(Srcs[1]);
     for (unsigned i = 1; i != NumConcat; ++i) {
       MOps1.push_back(UndefVal);
       MOps2.push_back(UndefVal);
     }
-    V1 = DAG.getNode(ISD::CONCAT_VECTORS, VT, &MOps1[0], MOps1.size());
-    V2 = DAG.getNode(ISD::CONCAT_VECTORS, VT, &MOps2[0], MOps2.size());
+    Srcs[0] = DAG.getNode(ISD::CONCAT_VECTORS, VT, &MOps1[0], MOps1.size());
+    Srcs[1] = DAG.getNode(ISD::CONCAT_VECTORS, VT, &MOps2[0], MOps2.size());
     
     // Readjust mask for new input vector length.
     SmallVector<SDValue, 8> MappedOps;
-    for (unsigned i = 0; i != MaskNumElts; ++i) {
+    for (int i = 0; i != MaskNumElts; ++i) {
       if (Mask.getOperand(i).getOpcode() == ISD::UNDEF) {
         MappedOps.push_back(Mask.getOperand(i));
       } else {
-        unsigned Idx = cast<ConstantSDNode>(Mask.getOperand(i))->getZExtValue();
-        if (Idx < Src1NumElts) {
-          MappedOps.push_back(DAG.getConstant(Idx,
-                                           Mask.getOperand(i).getValueType()));
-        } else {
-          MappedOps.push_back(DAG.getConstant(Idx + MaskNumElts - Src1NumElts,
-                                           Mask.getOperand(i).getValueType()));
-        } 
+        int Idx = cast<ConstantSDNode>(Mask.getOperand(i))->getZExtValue();
+        if (Idx < SrcNumElts)
+          MappedOps.push_back(DAG.getConstant(Idx, MaskEltVT));
+        else
+          MappedOps.push_back(DAG.getConstant(Idx + MaskNumElts - SrcNumElts,
+                                              MaskEltVT));
       }
     }
     Mask = DAG.getNode(ISD::BUILD_VECTOR, Mask.getValueType(),
                        &MappedOps[0], MappedOps.size());
 
-    setValue(&I, DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask));
+    setValue(&I, DAG.getNode(ISD::VECTOR_SHUFFLE, VT, Srcs[0], Srcs[1], Mask));
     return;
   }
 
-  if (Src1NumElts > MaskNumElts) {
+  if (SrcNumElts > MaskNumElts) {
     // Resulting vector is shorter than the incoming vector.
-    if (Src1NumElts == MaskNumElts && SequentialMask(Mask,0)) {
+    if (SrcNumElts == MaskNumElts && SequentialMask(Mask,0)) {
       // Shuffle extracts 1st vector.
-      setValue(&I, V1);
+      setValue(&I, Srcs[0]);
       return;
     }
 
-    if (Src1NumElts == MaskNumElts && SequentialMask(Mask,MaskNumElts)) {
+    if (SrcNumElts == MaskNumElts && SequentialMask(Mask,MaskNumElts)) {
       // Shuffle extracts 2nd vector.
-      setValue(&I, V2);
+      setValue(&I, Srcs[1]);
       return;
     }
 
-    // Analyze the access pattern of the vector to see if we can extract each
-    // subvector and then do the shuffle. The analysis is done by calculating
-    // the range of elements the mask access on both vectors. If it is useful,
-    // we could do better by considering separate what elements are accessed
-    // in each vector (i.e., have min/max for each vector).
-    int MinRange = Src1NumElts+1;
-    int MaxRange = -1;
-    for (unsigned i = 0; i != MaskNumElts; ++i) {
+    // Analyze the access pattern of the vector to see if we can extract
+    // two subvectors and do the shuffle. The analysis is done by calculating
+    // the range of elements the mask access on both vectors.
+    int MinRange[2] = { SrcNumElts+1, SrcNumElts+1};
+    int MaxRange[2] = {-1, -1};
+
+    for (int i = 0; i != MaskNumElts; ++i) {
       SDValue Arg = Mask.getOperand(i);
       if (Arg.getOpcode() != ISD::UNDEF) {
         assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-        int Idx = cast<ConstantSDNode>(Mask.getOperand(i))->getZExtValue();
-        if (Idx > (int) Src1NumElts)
-          Idx -= Src1NumElts;
-        if (Idx > MaxRange)
-          MaxRange = Idx;
-        if (Idx < MinRange)
-          MinRange = Idx;
-      }
-    }
-    // Adjust MinRange to start at an even boundary since this give us
-    // better quality splits later.
-    if ((unsigned) MinRange < Src1NumElts && MinRange%2 != 0)
-      MinRange = MinRange - 1;
-    if (MaxRange - MinRange < (int) MaskNumElts) {
-      // Extract subvector because the range is less than the new vector length
-      unsigned StartIdx = (MinRange/MaskNumElts)*MaskNumElts;
-      if (MaxRange - StartIdx < MaskNumElts) {
-        V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, VT, V1,
-                         DAG.getIntPtrConstant(MinRange));
-        V2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, VT, V2,
-                         DAG.getIntPtrConstant(MinRange));
-        // Readjust mask for new input vector length.
-        SmallVector<SDValue, 8> MappedOps;
-        for (unsigned i = 0; i != MaskNumElts; ++i) {
-          if (Mask.getOperand(i).getOpcode() == ISD::UNDEF) {
-            MappedOps.push_back(Mask.getOperand(i));
-          } else {
-            unsigned Idx =
-              cast<ConstantSDNode>(Mask.getOperand(i))->getZExtValue();
-            if (Idx < Src1NumElts) {
-              MappedOps.push_back(DAG.getConstant(Idx - StartIdx,
-                                         Mask.getOperand(i).getValueType()));
-            } else {
-              Idx = Idx - Src1NumElts - StartIdx + MaskNumElts;
-              MappedOps.push_back(DAG.getConstant(Idx,
-                                        Mask.getOperand(i).getValueType()));
-            } 
-          }
+        int Idx = cast<ConstantSDNode>(Arg)->getZExtValue();
+        int Input = 0;
+        if (Idx >= SrcNumElts) {
+          Input = 1;
+          Idx -= SrcNumElts;
         }
-        Mask = DAG.getNode(ISD::BUILD_VECTOR, Mask.getValueType(),
-                           &MappedOps[0], MappedOps.size());
+        if (Idx > MaxRange[Input])
+          MaxRange[Input] = Idx;
+        if (Idx < MinRange[Input])
+          MinRange[Input] = Idx;
+      }
+    }
 
-        setValue(&I, DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask));
-        return;
+    // Check if the access is smaller than the vector size and can we find
+    // a reasonable extract index.
+    int RangeUse[2];  // 0 = Unused, 1 = Extract, 2 = Can not Extract.
+    int StartIdx[2];  // StartIdx to extract from
+    for (int Input=0; Input < 2; ++Input) {
+      if (MinRange[Input] == SrcNumElts+1 && MaxRange[Input] == -1) {
+        RangeUse[Input] = 0; // Unused
+        StartIdx[Input] = 0;
+      } else if (MaxRange[Input] - MinRange[Input] < MaskNumElts) {
+        // Fits within range but we should see if we can find a good
+        // start index that a multiple of the mask length.
+        if (MaxRange[Input] < MaskNumElts) {
+          RangeUse[Input] = 1; // Extract from beginning of the vector
+          StartIdx[Input] = 0;
+        } else {
+          StartIdx[Input] = (MinRange[Input]/MaskNumElts)*MaskNumElts;
+          if (MaxRange[Input] - StartIdx[Input] < MaskNumElts) 
+            RangeUse[Input] = 1; // Extract from a multiple of the mask length.
+          else
+            RangeUse[Input] = 2; // Can not extract
+        }
+      } else
+        RangeUse[Input] = 2;  // Access doesn't fit within range
+    }
+
+    if (RangeUse[0] == 0 && RangeUse[0] == 0) {
+      setValue(&I, DAG.getNode(ISD::UNDEF, VT));  // Vectors are not used.
+      return;
+    }
+    else if (RangeUse[0] < 2 && RangeUse[1] < 2) {
+      // Extract appropriate subvector and generate a vector shuffle
+      for (int Input=0; Input < 2; ++Input) {
+        if (RangeUse[Input] == 0) {
+          Srcs[Input] = DAG.getNode(ISD::UNDEF, VT);
+        } else {
+          Srcs[Input] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, VT, Srcs[Input],
+                                    DAG.getIntPtrConstant(StartIdx[Input]));
+        }
+      }
+      // Calculate new mask.
+      SmallVector<SDValue, 8> MappedOps;
+      for (int i = 0; i != MaskNumElts; ++i) {
+        SDValue Arg = Mask.getOperand(i);
+        if (Arg.getOpcode() == ISD::UNDEF) {
+          MappedOps.push_back(Arg);
+        } else {
+          int Idx = cast<ConstantSDNode>(Arg)->getZExtValue();
+          if (Idx < SrcNumElts)
+            MappedOps.push_back(DAG.getConstant(Idx - StartIdx[0], MaskEltVT));
+          else {
+            Idx = Idx - SrcNumElts - StartIdx[1] + MaskNumElts;
+            MappedOps.push_back(DAG.getConstant(Idx, MaskEltVT));
+          } 
+        }
       }
+      Mask = DAG.getNode(ISD::BUILD_VECTOR, Mask.getValueType(),
+                         &MappedOps[0], MappedOps.size());
+      setValue(&I, DAG.getNode(ISD::VECTOR_SHUFFLE, VT, Srcs[0], Srcs[1], Mask));
+      return;
     }
   }
 
-  // We can't use either concat vectors or extract subvectors so we fall back
-  // to insert and extracts.
+  // We can't use either concat vectors or extract subvectors so fall back to
+  // replacing the shuffle with extract and build vector.
+  // to insert and build vector.
   MVT EltVT = VT.getVectorElementType();
   MVT PtrVT = TLI.getPointerTy();
   SmallVector<SDValue,8> Ops;
-  for (unsigned i = 0; i != MaskNumElts; ++i) {
+  for (int i = 0; i != MaskNumElts; ++i) {
     SDValue Arg = Mask.getOperand(i);
     if (Arg.getOpcode() == ISD::UNDEF) {
       Ops.push_back(DAG.getNode(ISD::UNDEF, EltVT));
     } else {
       assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-      unsigned Idx = cast<ConstantSDNode>(Arg)->getZExtValue();
-      if (Idx < Src1NumElts)
-        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, EltVT, V1,
+      int Idx = cast<ConstantSDNode>(Arg)->getZExtValue();
+      if (Idx < SrcNumElts)
+        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, EltVT, Srcs[0],
                                   DAG.getConstant(Idx, PtrVT)));
       else
-        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, EltVT, V2,
-                                  DAG.getConstant(Idx - Src1NumElts, PtrVT)));
+        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, EltVT, Srcs[1],
+                                  DAG.getConstant(Idx - SrcNumElts, PtrVT)));
     }
   }
   setValue(&I, DAG.getNode(ISD::BUILD_VECTOR, VT, &Ops[0], Ops.size()));

Added: llvm/trunk/test/CodeGen/X86/vec_shuffle-25.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_shuffle-25.ll?rev=59399&view=auto

==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_shuffle-25.ll (added)
+++ llvm/trunk/test/CodeGen/X86/vec_shuffle-25.ll Sat Nov 15 23:06:27 2008
@@ -0,0 +1,34 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 -o %t -f
+; RUN: grep unpcklps %t | count 3
+; RUN: grep unpckhps %t | count 1
+ 
+; Transpose example using the more generic vector shuffle.  We return
+; float8 instead of float16 since x86 can return that in register.
+; ModuleID = 'transpose2_opt.bc'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
+target triple = "i386-apple-cl.1.0"
+ at r0 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
+ at r1 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
+ at r2 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
+ at r3 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
+
+define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind {
+entry:
+	%unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
+	%unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=2]
+	%unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
+	%unpckhps11 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=2]
+	%unpcklps14 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=1]
+	%unpcklps14a = shufflevector <4 x float> %unpcklps14,  <4 x float> undef,  <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>; 
+	%unpckhps17 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
+	%unpckhps17a = shufflevector <4 x float> %unpckhps17,  <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>; 	
+	%r1 = shufflevector <16 x float> %unpcklps14a,  <16 x float> %unpckhps17a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>; 
+	%unpcklps20 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=1]
+	%unpcklps20a = shufflevector <4 x float> %unpcklps20,  <4 x float> undef,  <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>; 
+	%r2 = shufflevector <16 x float> %r1,  <16 x float> %unpcklps20a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>; 
+	%unpckhps23 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
+	%unpckhps23a = shufflevector <4 x float> %unpckhps23,  <4 x float> undef,  <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>; 
+	%r3 = shufflevector <16 x float> %r2,  <16 x float> %unpckhps23a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>; 
+	%r4 = shufflevector <16 x float> %r3,  <16 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>; 
+	ret <8 x float> %r4;
+}

Added: llvm/trunk/test/CodeGen/X86/vec_shuffle-26.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_shuffle-26.ll?rev=59399&view=auto

==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_shuffle-26.ll (added)
+++ llvm/trunk/test/CodeGen/X86/vec_shuffle-26.ll Sat Nov 15 23:06:27 2008
@@ -0,0 +1,29 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 -o %t -f
+; RUN: grep unpcklps %t | count 1
+; RUN: grep unpckhps %t | count 3
+
+; Transpose example using the more generic vector shuffle. Return float8
+; instead of float16
+; ModuleID = 'transpose2_opt.bc'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
+target triple = "i386-apple-cl.1.0"
+ at r0 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
+ at r1 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
+ at r2 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
+ at r3 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
+
+define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind {
+entry:
+	%unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
+	%unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=2]
+	%unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
+	%unpckhps11 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=2]
+	%unpcklps14 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=1]
+	%unpckhps17 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
+        %r1 = shufflevector <4 x float> %unpcklps14,  <4 x float> %unpckhps17,  <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >; 
+	%unpcklps20 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=1]
+	%unpckhps23 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
+        %r2 = shufflevector <4 x float> %unpcklps20,  <4 x float> %unpckhps23,  <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >;
+;       %r3 = shufflevector <8 x float> %r1,  <8 x float> %r2,  <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15 >; 
+	ret <8 x float> %r2;
+}

Added: llvm/trunk/test/CodeGen/X86/vec_shuffle-27.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_shuffle-27.ll?rev=59399&view=auto

==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_shuffle-27.ll (added)
+++ llvm/trunk/test/CodeGen/X86/vec_shuffle-27.ll Sat Nov 15 23:06:27 2008
@@ -0,0 +1,17 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 -o %t -f
+; RUN: grep addps %t | count 2
+; RUN: grep mulps %t | count 2
+; RUN: grep subps %t | count 2
+
+; ModuleID = 'vec_shuffle-27.bc'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
+target triple = "i686-apple-cl.1.0"
+
+define <8 x float> @my2filter4_1d(<4 x float> %a, <8 x float> %T0, <8 x float> %T1) nounwind readnone {
+entry:
+	%tmp7 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3 >		; <<8 x float>> [#uses=1]
+	%sub = sub <8 x float> %T1, %T0		; <<8 x float>> [#uses=1]
+	%mul = mul <8 x float> %sub, %tmp7		; <<8 x float>> [#uses=1]
+	%add = add <8 x float> %mul, %T0		; <<8 x float>> [#uses=1]
+	ret <8 x float> %add
+}