[llvm-commits] CVS: llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Mon Apr 17 20:44:00 PDT 2006

Changes in directory llvm/lib/Target/PowerPC:

PPCISelLowering.cpp updated: 1.161 -> 1.162
---
Log message:

Lower v8i16 multiply into this code:

        li r5, lo16(LCPI1_0)
        lis r6, ha16(LCPI1_0)
        lvx v4, r6, r5
        vmulouh v5, v3, v2
        vmuleuh v2, v3, v2
        vperm v2, v2, v5, v4

where v4 is:
LCPI1_0:                                        ;  <16 x ubyte>
        .byte   2
        .byte   3
        .byte   18
        .byte   19
        .byte   6
        .byte   7
        .byte   22
        .byte   23
        .byte   10
        .byte   11
        .byte   26
        .byte   27
        .byte   14
        .byte   15
        .byte   30
        .byte   31

This is 5.07x faster on the G5 (measured) than lowering to scalar code + 
loads/stores.



---
Diffs of the changes:  (+51 -25)

 PPCISelLowering.cpp |   76 ++++++++++++++++++++++++++++++++++------------------
 1 files changed, 51 insertions(+), 25 deletions(-)


Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp
diff -u llvm/lib/Target/PowerPC/PPCISelLowering.cpp:1.161 llvm/lib/Target/PowerPC/PPCISelLowering.cpp:1.162

--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp:1.161	Mon Apr 17 22:24:30 2006
+++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp	Mon Apr 17 22:43:48 2006
@@ -228,6 +228,7 @@
     
     setOperationAction(ISD::MUL, MVT::v4f32, Legal);
     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
 
     setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
     setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
@@ -1573,31 +1574,56 @@
 }
 
 static SDOperand LowerMUL(SDOperand Op, SelectionDAG &DAG) {
-  assert(Op.getValueType() == MVT::v4i32 && "Unknown mul to lower!");
-  SDOperand LHS = Op.getOperand(0);
-  SDOperand RHS = Op.getOperand(1);
-  
-  SDOperand Zero  = BuildSplatI(  0, 1, MVT::v4i32, DAG);
-  SDOperand Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG);  // +16 as shift amt.
-  
-  SDOperand RHSSwap =   // = vrlw RHS, 16
-    BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG);
-  
-  // Shrinkify inputs to v8i16.
-  LHS = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, LHS);
-  RHS = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, RHS);
-  RHSSwap = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, RHSSwap);
-  
-  // Low parts multiplied together, generating 32-bit results (we ignore the top
-  // parts).
-  SDOperand LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
-                                      LHS, RHS, DAG, MVT::v4i32);
-  
-  SDOperand HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
-                                      LHS, RHSSwap, Zero, DAG, MVT::v4i32);
-  // Shift the high parts up 16 bits.
-  HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd, Neg16, DAG);
-  return DAG.getNode(ISD::ADD, MVT::v4i32, LoProd, HiProd);
+  if (Op.getValueType() == MVT::v4i32) {
+    SDOperand LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+    
+    SDOperand Zero  = BuildSplatI(  0, 1, MVT::v4i32, DAG);
+    SDOperand Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG); // +16 as shift amt.
+    
+    SDOperand RHSSwap =   // = vrlw RHS, 16
+      BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG);
+    
+    // Shrinkify inputs to v8i16.
+    LHS = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, LHS);
+    RHS = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, RHS);
+    RHSSwap = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, RHSSwap);
+    
+    // Low parts multiplied together, generating 32-bit results (we ignore the
+    // top parts).
+    SDOperand LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
+                                        LHS, RHS, DAG, MVT::v4i32);
+    
+    SDOperand HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
+                                        LHS, RHSSwap, Zero, DAG, MVT::v4i32);
+    // Shift the high parts up 16 bits.
+    HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd, Neg16, DAG);
+    return DAG.getNode(ISD::ADD, MVT::v4i32, LoProd, HiProd);
+  } else if (Op.getValueType() == MVT::v8i16) {
+    SDOperand LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+    
+    // Multiply the even 16-bit parts, producing 32-bit products.
+    SDOperand EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleuh,
+                                           LHS, RHS, DAG, MVT::v4i32);
+    EvenParts = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, EvenParts);
+    
+    // Multiply the odd 16-bit parts, producing 32-bit products.
+    SDOperand OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
+                                          LHS, RHS, DAG, MVT::v4i32);
+    OddParts = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, OddParts);
+
+    // Merge the results together.
+    std::vector<SDOperand> Ops;
+    for (unsigned i = 0; i != 4; ++i) {
+      Ops.push_back(DAG.getConstant(2*i+1, MVT::i16));
+      Ops.push_back(DAG.getConstant(2*i+1+8, MVT::i16));
+    }
+    
+    return DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, EvenParts, OddParts,
+                       DAG.getNode(ISD::BUILD_VECTOR, MVT::v8i16, Ops));
+  } else {
+    assert(0 && "Unknown mul to lower!");
+    abort();
+  }
 }
 
 /// LowerOperation - Provide custom lowering hooks for some operations.