[llvm] adccc0b - [X86] Add X86ISD opcodes for the Key Locker AESENC*KL and AESDEC*KL instructions

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sat Oct 3 16:56:02 PDT 2020


Author: Craig Topper
Date: 2020-10-03T16:55:19-07:00
New Revision: adccc0bfa301005367d6b89a3aacc07ef0166e64

URL: https://github.com/llvm/llvm-project/commit/adccc0bfa301005367d6b89a3aacc07ef0166e64
DIFF: https://github.com/llvm/llvm-project/commit/adccc0bfa301005367d6b89a3aacc07ef0166e64.diff

LOG: [X86] Add X86ISD opcodes for the Key Locker AESENC*KL and AESDEC*KL instructions

Instead of emitting MachineSDNodes during lowering, emit X86ISD
opcodes. These opcodes will either be selected by tablegen
patterns or custom selection code.

Emitting MachineSDNodes during lowering is uncommon so this makes
things more consistent. It also allows selectAddr to be called to
perform address matching during instruction selection.

I had trouble getting tablegen to accept XMM0-XMM7 as results in
an isel pattern for the WIDE instructions so I had to use custom
instruction selection.

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86ISelLowering.h
    llvm/lib/Target/X86/X86InstrInfo.td
    llvm/lib/Target/X86/X86InstrKL.td
    llvm/test/CodeGen/X86/keylocker-intrinsics.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 3b5a29ef31fc..0d80bde5f717 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2448,6 +2448,14 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
       Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
       Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
       Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
+      Parent->getOpcode() != X86ISD::AESENC128KL && // Fixme
+      Parent->getOpcode() != X86ISD::AESDEC128KL && // Fixme
+      Parent->getOpcode() != X86ISD::AESENC256KL && // Fixme
+      Parent->getOpcode() != X86ISD::AESDEC256KL && // Fixme
+      Parent->getOpcode() != X86ISD::AESENCWIDE128KL && // Fixme
+      Parent->getOpcode() != X86ISD::AESDECWIDE128KL && // Fixme
+      Parent->getOpcode() != X86ISD::AESENCWIDE256KL && // Fixme
+      Parent->getOpcode() != X86ISD::AESDECWIDE256KL && // Fixme
       Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
       Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
     unsigned AddrSpace =
@@ -5725,6 +5733,61 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     CurDAG->RemoveDeadNode(Node);
     return;
   }
+  case X86ISD::AESENCWIDE128KL:
+  case X86ISD::AESDECWIDE128KL:
+  case X86ISD::AESENCWIDE256KL:
+  case X86ISD::AESDECWIDE256KL: {
+    unsigned Opcode;
+    switch (Node->getOpcode()) {
+    default:
+      llvm_unreachable("Unexpected opcode!");
+    case X86ISD::AESENCWIDE128KL:
+      Opcode = X86::AESENCWIDE128KL;
+      break;
+    case X86ISD::AESDECWIDE128KL:
+      Opcode = X86::AESDECWIDE128KL;
+      break;
+    case X86ISD::AESENCWIDE256KL:
+      Opcode = X86::AESENCWIDE256KL;
+      break;
+    case X86ISD::AESDECWIDE256KL:
+      Opcode = X86::AESDECWIDE256KL;
+      break;
+    }
+
+    SDValue Chain = Node->getOperand(0);
+    SDValue Addr = Node->getOperand(1);
+
+    SDValue Base, Scale, Index, Disp, Segment;
+    if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
+      break;
+
+    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
+                                 SDValue());
+    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
+                                 Chain.getValue(1));
+    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
+                                 Chain.getValue(1));
+    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
+                                 Chain.getValue(1));
+    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
+                                 Chain.getValue(1));
+    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
+                                 Chain.getValue(1));
+    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
+                                 Chain.getValue(1));
+    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
+                                 Chain.getValue(1));
+
+    SDVTList VTs = CurDAG->getVTList(
+        {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
+         MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
+    SDNode *Res = CurDAG->getMachineNode(
+        Opcode, dl, VTs,
+        {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
+    ReplaceNode(Node, Res);
+    return;
+  }
   }
 
   SelectCode(Node);

diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 935fab44e7c1..e526a1dd58eb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -26032,118 +26032,73 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
     case Intrinsic::x86_aesenc256kl:
     case Intrinsic::x86_aesdec256kl: {
       SDLoc DL(Op);
-      SDVTList VTs = DAG.getVTList(MVT::v16i8, MVT::Other, MVT::Glue);
+      SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
       SDValue Chain = Op.getOperand(0);
       unsigned Opcode;
 
       switch (IntNo) {
       default: llvm_unreachable("Impossible intrinsic");
       case Intrinsic::x86_aesenc128kl:
-        Opcode = X86::AESENC128KL;
+        Opcode = X86ISD::AESENC128KL;
         break;
       case Intrinsic::x86_aesdec128kl:
-        Opcode = X86::AESDEC128KL;
+        Opcode = X86ISD::AESDEC128KL;
         break;
       case Intrinsic::x86_aesenc256kl:
-        Opcode = X86::AESENC256KL;
+        Opcode = X86ISD::AESENC256KL;
         break;
       case Intrinsic::x86_aesdec256kl:
-        Opcode = X86::AESDEC256KL;
+        Opcode = X86ISD::AESDEC256KL;
         break;
       }
 
-      SDValue XMM = Op.getOperand(2);
-      SDValue Base = Op.getOperand(3);
-      SDValue Index = DAG.getRegister(0, MVT::i32);
-      SDValue Scale = DAG.getTargetConstant(1, DL, MVT::i8);
-      SDValue Disp = DAG.getTargetConstant(0, DL, MVT::i32);
-      SDValue Segment = DAG.getRegister(0, MVT::i32);
-
-      SDNode *Res = DAG.getMachineNode(Opcode, DL, VTs, {XMM, Base, Scale, Index,
-                                                         Disp, Segment, Chain});
-      Chain = SDValue(Res, 1);
-      SDValue EFLAGS = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32,
-                                          SDValue(Res, 2));
-      SDValue ZF = getSETCC(X86::COND_E, EFLAGS.getValue(0), DL, DAG);
+      SDValue Operation = DAG.getNode(Opcode, DL, VTs, Chain, Op.getOperand(2),
+                                      Op.getOperand(3));
+      SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
 
       return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
-                         {ZF, SDValue(Res, 0), EFLAGS.getValue(1)});
+                         {ZF, Operation.getValue(0), Operation.getValue(2)});
     }
     case Intrinsic::x86_aesencwide128kl:
     case Intrinsic::x86_aesdecwide128kl:
     case Intrinsic::x86_aesencwide256kl:
     case Intrinsic::x86_aesdecwide256kl: {
       SDLoc DL(Op);
-      SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDVTList VTs = DAG.getVTList(
+          {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
+           MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
       SDValue Chain = Op.getOperand(0);
       unsigned Opcode;
 
       switch (IntNo) {
       default: llvm_unreachable("Impossible intrinsic");
       case Intrinsic::x86_aesencwide128kl:
-        Opcode = X86::AESENCWIDE128KL;
+        Opcode = X86ISD::AESENCWIDE128KL;
         break;
       case Intrinsic::x86_aesdecwide128kl:
-        Opcode = X86::AESDECWIDE128KL;
+        Opcode = X86ISD::AESDECWIDE128KL;
         break;
       case Intrinsic::x86_aesencwide256kl:
-        Opcode = X86::AESENCWIDE256KL;
+        Opcode = X86ISD::AESENCWIDE256KL;
         break;
       case Intrinsic::x86_aesdecwide256kl:
-        Opcode = X86::AESDECWIDE256KL;
+        Opcode = X86ISD::AESDECWIDE256KL;
         break;
       }
 
-      SDValue Base = Op.getOperand(2);
-      SDValue Index = DAG.getRegister(0, MVT::i32);
-      SDValue Scale = DAG.getTargetConstant(1, DL, MVT::i8);
-      SDValue Disp = DAG.getTargetConstant(0, DL, MVT::i32);
-      SDValue Segment = DAG.getRegister(0, MVT::i32);
+      SDValue Operation = DAG.getNode(
+          Opcode, DL, VTs,
+          {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
+           Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
+           Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)});
+      SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
 
-      Chain = DAG.getCopyToReg(Chain, DL, X86::XMM0, Op->getOperand(3),
-                               SDValue());
-      Chain = DAG.getCopyToReg(Chain.getValue(0), DL, X86::XMM1,
-                               Op->getOperand(4), Chain.getValue(1));
-      Chain = DAG.getCopyToReg(Chain.getValue(0), DL, X86::XMM2,
-                               Op->getOperand(5), Chain.getValue(1));
-      Chain = DAG.getCopyToReg(Chain.getValue(0), DL, X86::XMM3,
-                               Op->getOperand(6), Chain.getValue(1));
-      Chain = DAG.getCopyToReg(Chain.getValue(0), DL, X86::XMM4,
-                               Op->getOperand(7), Chain.getValue(1));
-      Chain = DAG.getCopyToReg(Chain.getValue(0), DL, X86::XMM5,
-                               Op->getOperand(8), Chain.getValue(1));
-      Chain = DAG.getCopyToReg(Chain.getValue(0), DL, X86::XMM6,
-                               Op->getOperand(9), Chain.getValue(1));
-      Chain = DAG.getCopyToReg(Chain.getValue(0), DL, X86::XMM7,
-                               Op->getOperand(10),Chain.getValue(1));
-
-      SDNode *Res = DAG.getMachineNode(Opcode, DL, VTs, {Base, Scale, Index,
-                                                         Disp, Segment, Chain,
-                                                         Chain.getValue(1)});
-
-      Chain = SDValue(Res, 0);
-      SDValue EFLAGS = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32,
-                                          SDValue(Res, 1));
-      SDValue ZF = getSETCC(X86::COND_E, EFLAGS.getValue(0), DL, DAG);
-      SDValue XMM0 = DAG.getCopyFromReg(EFLAGS.getValue(1), DL, X86::XMM0,
-                                        MVT::v16i8, EFLAGS.getValue(2));
-      SDValue XMM1 = DAG.getCopyFromReg(XMM0.getValue(1), DL, X86::XMM1,
-                                        MVT::v16i8, XMM0.getValue(2));
-      SDValue XMM2 = DAG.getCopyFromReg(XMM1.getValue(1), DL, X86::XMM2,
-                                        MVT::v16i8, XMM1.getValue(2));
-      SDValue XMM3 = DAG.getCopyFromReg(XMM2.getValue(1), DL, X86::XMM3,
-                                        MVT::v16i8, XMM2.getValue(2));
-      SDValue XMM4 = DAG.getCopyFromReg(XMM3.getValue(1), DL, X86::XMM4,
-                                        MVT::v16i8, XMM3.getValue(2));
-      SDValue XMM5 = DAG.getCopyFromReg(XMM4.getValue(1), DL, X86::XMM5,
-                                        MVT::v16i8, XMM4.getValue(2));
-      SDValue XMM6 = DAG.getCopyFromReg(XMM5.getValue(1), DL, X86::XMM6,
-                                        MVT::v16i8, XMM5.getValue(2));
-      SDValue XMM7 = DAG.getCopyFromReg(XMM6.getValue(1), DL, X86::XMM7,
-                                        MVT::v16i8, XMM6.getValue(2));
       return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
-                         {ZF, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
-                          XMM7.getValue(1)});
+                         {ZF, Operation.getValue(1), Operation.getValue(2),
+                          Operation.getValue(3), Operation.getValue(4),
+                          Operation.getValue(5), Operation.getValue(6),
+                          Operation.getValue(7), Operation.getValue(8),
+                          Operation.getValue(9)});
     }
     }
     return SDValue();
@@ -31167,6 +31122,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(ENQCMD)
   NODE_NAME_CASE(ENQCMDS)
   NODE_NAME_CASE(VP2INTERSECT)
+  NODE_NAME_CASE(AESENC128KL)
+  NODE_NAME_CASE(AESDEC128KL)
+  NODE_NAME_CASE(AESENC256KL)
+  NODE_NAME_CASE(AESDEC256KL)
+  NODE_NAME_CASE(AESENCWIDE128KL)
+  NODE_NAME_CASE(AESDECWIDE128KL)
+  NODE_NAME_CASE(AESENCWIDE256KL)
+  NODE_NAME_CASE(AESDECWIDE256KL)
   }
   return nullptr;
 #undef NODE_NAME_CASE

diff  --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index f8de2f7d0e79..9f231be78191 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -713,6 +713,16 @@ namespace llvm {
     // Mwaitx builtin is lowered to this if the base pointer needs saving.
     MWAITX_DAG,
 
+    // Key locker nodes that produce flags.
+    AESENC128KL,
+    AESDEC128KL,
+    AESENC256KL,
+    AESDEC256KL,
+    AESENCWIDE128KL,
+    AESDECWIDE128KL,
+    AESENCWIDE256KL,
+    AESDECWIDE256KL,
+
     /// X86 strict FP compare instructions.
     STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
     STRICT_FCMPS,

diff  --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index d13ba5dbc0eb..3a3d141854a1 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -135,6 +135,11 @@ def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;
 def SDT_X86ENQCMD : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
                                          SDTCisPtrTy<1>, SDTCisSameAs<1, 2>]>;
 
+def SDT_X86AESENCDECKL : SDTypeProfile<2, 2, [SDTCisVT<0, v2i64>,
+                                              SDTCisVT<1, i32>,
+                                              SDTCisVT<2, v2i64>,
+                                              SDTCisPtrTy<3>]>;
+
 def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
                             [SDNPHasChain,SDNPSideEffect]>;
 def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
@@ -331,6 +336,15 @@ def X86enqcmd : SDNode<"X86ISD::ENQCMD", SDT_X86ENQCMD,
 def X86enqcmds : SDNode<"X86ISD::ENQCMDS", SDT_X86ENQCMD,
                        [SDNPHasChain, SDNPSideEffect]>;
 
+def X86aesenc128kl : SDNode<"X86ISD::AESENC128KL", SDT_X86AESENCDECKL,
+                            [SDNPHasChain, SDNPSideEffect]>;
+def X86aesdec128kl : SDNode<"X86ISD::AESDEC128KL", SDT_X86AESENCDECKL,
+                            [SDNPHasChain, SDNPSideEffect]>;
+def X86aesenc256kl : SDNode<"X86ISD::AESENC256KL", SDT_X86AESENCDECKL,
+                            [SDNPHasChain, SDNPSideEffect]>;
+def X86aesdec256kl : SDNode<"X86ISD::AESDEC256KL", SDT_X86AESENCDECKL,
+                            [SDNPHasChain, SDNPSideEffect]>;
+
 //===----------------------------------------------------------------------===//
 // X86 Operand Definitions.
 //

diff  --git a/llvm/lib/Target/X86/X86InstrKL.td b/llvm/lib/Target/X86/X86InstrKL.td
index 77e011fe14d6..0c05c7a0ab2c 100644
--- a/llvm/lib/Target/X86/X86InstrKL.td
+++ b/llvm/lib/Target/X86/X86InstrKL.td
@@ -36,16 +36,24 @@ let SchedRW = [WriteSystem], Predicates = [HasKL] in {
   let Constraints = "$src1 = $dst",
       Defs = [EFLAGS] in {
    def AESENC128KL : I<0xDC, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
-                        "aesenc128kl\t{$src2, $src1|$src1, $src2}", []>, T8XS;
+                       "aesenc128kl\t{$src2, $src1|$src1, $src2}",
+                       [(set VR128:$dst, EFLAGS,
+                         (X86aesenc128kl VR128:$src1, addr:$src2))]>, T8XS;
 
    def AESDEC128KL : I<0xDD, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
-                        "aesdec128kl\t{$src2, $src1|$src1, $src2}", []>, T8XS;
+                       "aesdec128kl\t{$src2, $src1|$src1, $src2}",
+                       [(set VR128:$dst, EFLAGS,
+                         (X86aesdec128kl VR128:$src1, addr:$src2))]>, T8XS;
 
    def AESENC256KL : I<0xDE, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
-                        "aesenc256kl\t{$src2, $src1|$src1, $src2}", []>, T8XS;
+                       "aesenc256kl\t{$src2, $src1|$src1, $src2}",
+                       [(set VR128:$dst, EFLAGS,
+                         (X86aesenc256kl VR128:$src1, addr:$src2))]>, T8XS;
 
    def AESDEC256KL : I<0xDF, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
-                        "aesdec256kl\t{$src2, $src1|$src1, $src2}", []>, T8XS;
+                       "aesdec256kl\t{$src2, $src1|$src1, $src2}",
+                       [(set VR128:$dst, EFLAGS,
+                         (X86aesdec256kl VR128:$src1, addr:$src2))]>, T8XS;
   }
 
 } // SchedRW, Predicates

diff  --git a/llvm/test/CodeGen/X86/keylocker-intrinsics.ll b/llvm/test/CodeGen/X86/keylocker-intrinsics.ll
index d577ffd12e08..e48affb80d5f 100644
--- a/llvm/test/CodeGen/X86/keylocker-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/keylocker-intrinsics.ll
@@ -540,3 +540,103 @@ entry:
   %9 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 0
   ret i8 %9
 }
+
+; Tests to make sure we can select an appropriate addressing mode for a global.
+
+@foo = external global [64 x i8]
+
+define i8 @test_mm_aesenc256kl_u8_global(<2 x i64> %data, <2 x i64>* %out) {
+; X64-LABEL: test_mm_aesenc256kl_u8_global:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    aesenc256kl {{.*}}(%rip), %xmm0
+; X64-NEXT:    sete %al
+; X64-NEXT:    movaps %xmm0, (%rdi)
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_mm_aesenc256kl_u8_global:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    aesenc256kl foo, %xmm0
+; X32-NEXT:    sete %al
+; X32-NEXT:    vmovaps %xmm0, (%ecx)
+; X32-NEXT:    retl
+entry:
+  %h = bitcast [64 x i8]* @foo to i8*
+  %0 = tail call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> %data, i8* %h)
+  %1 = extractvalue { i8, <2 x i64> } %0, 1
+  store <2 x i64> %1, <2 x i64>* %out
+  %2 = extractvalue { i8, <2 x i64> } %0, 0
+  ret i8 %2
+}
+
+define i8 @test_mm_aesdecwide256kl_u8_global(<2 x i64> %v0, <2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3, <2 x i64> %v4, <2 x i64> %v5, <2 x i64> %v6, <2 x i64> %v7, <2 x i64>* %out0, <2 x i64>* %out1, <2 x i64>* %out2, <2 x i64>* %out3, <2 x i64>* %out4, <2 x i64>* %out5, <2 x i64>* %out6, <2 x i64>* %out7) nounwind {
+; X64-LABEL: test_mm_aesdecwide256kl_u8_global:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT:    aesdecwide256kl {{.*}}(%rip)
+; X64-NEXT:    sete %al
+; X64-NEXT:    movaps %xmm0, (%rdi)
+; X64-NEXT:    movaps %xmm1, (%rsi)
+; X64-NEXT:    movaps %xmm1, (%rdx)
+; X64-NEXT:    movaps %xmm1, (%rcx)
+; X64-NEXT:    movaps %xmm1, (%r8)
+; X64-NEXT:    movaps %xmm1, (%r9)
+; X64-NEXT:    movaps %xmm1, (%r11)
+; X64-NEXT:    movaps %xmm1, (%r10)
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_mm_aesdecwide256kl_u8_global:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $16, %esp
+; X32-NEXT:    movl 88(%ebp), %eax
+; X32-NEXT:    vmovaps 8(%ebp), %xmm3
+; X32-NEXT:    vmovaps 24(%ebp), %xmm4
+; X32-NEXT:    vmovaps 40(%ebp), %xmm5
+; X32-NEXT:    vmovaps 56(%ebp), %xmm6
+; X32-NEXT:    vmovaps 72(%ebp), %xmm7
+; X32-NEXT:    aesdecwide256kl foo
+; X32-NEXT:    vmovaps %xmm0, (%eax)
+; X32-NEXT:    movl 92(%ebp), %eax
+; X32-NEXT:    vmovaps %xmm1, (%eax)
+; X32-NEXT:    movl 96(%ebp), %eax
+; X32-NEXT:    vmovaps %xmm1, (%eax)
+; X32-NEXT:    movl 100(%ebp), %eax
+; X32-NEXT:    vmovaps %xmm1, (%eax)
+; X32-NEXT:    movl 104(%ebp), %eax
+; X32-NEXT:    vmovaps %xmm1, (%eax)
+; X32-NEXT:    movl 108(%ebp), %eax
+; X32-NEXT:    vmovaps %xmm1, (%eax)
+; X32-NEXT:    movl 112(%ebp), %eax
+; X32-NEXT:    vmovaps %xmm1, (%eax)
+; X32-NEXT:    movl 116(%ebp), %eax
+; X32-NEXT:    vmovaps %xmm1, (%eax)
+; X32-NEXT:    sete %al
+; X32-NEXT:    movl %ebp, %esp
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+entry:
+  %p = bitcast [64 x i8]* @foo to i8*
+  %0 = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* %p, <2 x i64> %v0, <2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3, <2 x i64> %v4, <2 x i64> %v5, <2 x i64> %v6,      <2 x i64> %v7)
+  %1 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 1
+  store <2 x i64> %1, <2 x i64>* %out0
+  %2 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 2
+  store <2 x i64> %2, <2 x i64>* %out1
+  %3 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 3
+  store <2 x i64> %2, <2 x i64>* %out2
+  %4 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 4
+  store <2 x i64> %2, <2 x i64>* %out3
+  %5 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 5
+  store <2 x i64> %2, <2 x i64>* %out4
+  %6 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 6
+  store <2 x i64> %2, <2 x i64>* %out5
+  %7 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 7
+  store <2 x i64> %2, <2 x i64>* %out6
+  %8 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 8
+  store <2 x i64> %2, <2 x i64>* %out7
+  %9 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 0
+  ret i8 %9
+}


        


More information about the llvm-commits mailing list