[llvm] a02b449 - [X86] Sync AESENC/DEC Key Locker builtins with gcc.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sun Oct 4 12:10:38 PDT 2020


Author: Craig Topper
Date: 2020-10-04T12:09:41-07:00
New Revision: a02b449bb1556fe0f17b86eaa69f6bcda945d123

URL: https://github.com/llvm/llvm-project/commit/a02b449bb1556fe0f17b86eaa69f6bcda945d123
DIFF: https://github.com/llvm/llvm-project/commit/a02b449bb1556fe0f17b86eaa69f6bcda945d123.diff

LOG: [X86] Sync AESENC/DEC Key Locker builtins with gcc.

For the wide builtins, pass one input pointer and one output pointer
to the builtins. Emit the GEPs and input loads from CGBuiltin.

Added: 
    

Modified: 
    clang/include/clang/Basic/BuiltinsX86.def
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/lib/Headers/keylockerintrin.h
    clang/test/CodeGen/X86/keylocker.c
    llvm/test/CodeGen/X86/keylocker-intrinsics-fast-isel.ll

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index c33026139b3c..8f9cfe4b6dc5 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -1902,22 +1902,16 @@ TARGET_BUILTIN(__builtin_ia32_enqcmds, "Ucv*vC*", "n", "enqcmd")
 
 // KEY LOCKER
 TARGET_BUILTIN(__builtin_ia32_loadiwkey, "vV2OiV2OiV2OiUi", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_encodekey128_u32,
-               "UiUiV2Oiv*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_encodekey256_u32,
-               "UiUiV2OiV2Oiv*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesenc128kl, "UcV2Oi*V2OivC*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesenc256kl, "UcV2Oi*V2OivC*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesdec128kl, "UcV2Oi*V2OivC*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesdec256kl, "UcV2Oi*V2OivC*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesencwide128kl,
-               "UcvC*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2OiV2OiV2OiV2OiV2OiV2OiV2OiV2Oi", "nV:128:", "kl,widekl")
-TARGET_BUILTIN(__builtin_ia32_aesencwide256kl,
-               "UcvC*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2OiV2OiV2OiV2OiV2OiV2OiV2OiV2Oi", "nV:128:", "kl,widekl")
-TARGET_BUILTIN(__builtin_ia32_aesdecwide128kl,
-               "UcvC*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2OiV2OiV2OiV2OiV2OiV2OiV2OiV2Oi", "nV:128:", "kl,widekl")
-TARGET_BUILTIN(__builtin_ia32_aesdecwide256kl,
-               "UcvC*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2OiV2OiV2OiV2OiV2OiV2OiV2OiV2Oi", "nV:128:", "kl,widekl")
+TARGET_BUILTIN(__builtin_ia32_encodekey128_u32, "UiUiV2Oiv*", "nV:128:", "kl")
+TARGET_BUILTIN(__builtin_ia32_encodekey256_u32, "UiUiV2OiV2Oiv*", "nV:128:", "kl")
+TARGET_BUILTIN(__builtin_ia32_aesenc128kl_u8, "UcV2Oi*V2OivC*", "nV:128:", "kl")
+TARGET_BUILTIN(__builtin_ia32_aesenc256kl_u8, "UcV2Oi*V2OivC*", "nV:128:", "kl")
+TARGET_BUILTIN(__builtin_ia32_aesdec128kl_u8, "UcV2Oi*V2OivC*", "nV:128:", "kl")
+TARGET_BUILTIN(__builtin_ia32_aesdec256kl_u8, "UcV2Oi*V2OivC*", "nV:128:", "kl")
+TARGET_BUILTIN(__builtin_ia32_aesencwide128kl_u8, "UcV2Oi*V2OiC*vC*", "nV:128:", "kl,widekl")
+TARGET_BUILTIN(__builtin_ia32_aesencwide256kl_u8, "UcV2Oi*V2OiC*vC*", "nV:128:", "kl,widekl")
+TARGET_BUILTIN(__builtin_ia32_aesdecwide128kl_u8, "UcV2Oi*V2OiC*vC*", "nV:128:", "kl,widekl")
+TARGET_BUILTIN(__builtin_ia32_aesdecwide256kl_u8, "UcV2Oi*V2OiC*vC*", "nV:128:", "kl,widekl")
 
 // SERIALIZE
 TARGET_BUILTIN(__builtin_ia32_serialize, "v", "n", "serialize")

diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index d3603579844d..dc3cafa5d062 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14070,75 +14070,67 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
 
     return Builder.CreateExtractValue(Call, 0);
   }
-  case X86::BI__builtin_ia32_aesenc128kl:
-  case X86::BI__builtin_ia32_aesdec128kl:
-  case X86::BI__builtin_ia32_aesenc256kl:
-  case X86::BI__builtin_ia32_aesdec256kl:
-  case X86::BI__builtin_ia32_aesencwide128kl:
-  case X86::BI__builtin_ia32_aesdecwide128kl:
-  case X86::BI__builtin_ia32_aesencwide256kl:
-  case X86::BI__builtin_ia32_aesdecwide256kl: {
-    int FirstReturnOp;
-    int ResultCount;
-    SmallVector<Value*, 9> InOps;
-    unsigned ID;
-
+  case X86::BI__builtin_ia32_aesenc128kl_u8:
+  case X86::BI__builtin_ia32_aesdec128kl_u8:
+  case X86::BI__builtin_ia32_aesenc256kl_u8:
+  case X86::BI__builtin_ia32_aesdec256kl_u8: {
+    Intrinsic::ID IID;
     switch (BuiltinID) {
-    default: llvm_unreachable("Unsupported intrinsic!");
-    case X86::BI__builtin_ia32_aesenc128kl:
-    case X86::BI__builtin_ia32_aesdec128kl:
-    case X86::BI__builtin_ia32_aesenc256kl:
-    case X86::BI__builtin_ia32_aesdec256kl: {
-      InOps = {Ops[1], Ops[2]};
-      FirstReturnOp = 0;
-      ResultCount = 1;
-      switch (BuiltinID) {
-      case X86::BI__builtin_ia32_aesenc128kl:
-        ID = Intrinsic::x86_aesenc128kl;
-        break;
-      case X86::BI__builtin_ia32_aesdec128kl:
-        ID = Intrinsic::x86_aesdec128kl;
-        break;
-      case X86::BI__builtin_ia32_aesenc256kl:
-        ID = Intrinsic::x86_aesenc256kl;
-        break;
-      case X86::BI__builtin_ia32_aesdec256kl:
-        ID = Intrinsic::x86_aesdec256kl;
-        break;
-      }
+    default: llvm_unreachable("Unexpected builtin");
+    case X86::BI__builtin_ia32_aesenc128kl_u8:
+      IID = Intrinsic::x86_aesenc128kl;
+      break;
+    case X86::BI__builtin_ia32_aesdec128kl_u8:
+      IID = Intrinsic::x86_aesdec128kl;
+      break;
+    case X86::BI__builtin_ia32_aesenc256kl_u8:
+      IID = Intrinsic::x86_aesenc256kl;
+      break;
+    case X86::BI__builtin_ia32_aesdec256kl_u8:
+      IID = Intrinsic::x86_aesdec256kl;
       break;
     }
-    case X86::BI__builtin_ia32_aesencwide128kl:
-    case X86::BI__builtin_ia32_aesdecwide128kl:
-    case X86::BI__builtin_ia32_aesencwide256kl:
-    case X86::BI__builtin_ia32_aesdecwide256kl: {
-      InOps = {Ops[0], Ops[9], Ops[10], Ops[11], Ops[12], Ops[13],
-               Ops[14], Ops[15], Ops[16]};
-      FirstReturnOp = 1;
-      ResultCount = 8;
-      switch (BuiltinID) {
-      case X86::BI__builtin_ia32_aesencwide128kl:
-        ID = Intrinsic::x86_aesencwide128kl;
-        break;
-      case X86::BI__builtin_ia32_aesdecwide128kl:
-        ID = Intrinsic::x86_aesdecwide128kl;
-        break;
-      case X86::BI__builtin_ia32_aesencwide256kl:
-        ID = Intrinsic::x86_aesencwide256kl;
-        break;
-      case X86::BI__builtin_ia32_aesdecwide256kl:
-        ID = Intrinsic::x86_aesdecwide256kl;
-        break;
-      }
+
+    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[1], Ops[2]});
+
+    Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1),
+                                      Ops[0]);
+
+    return Builder.CreateExtractValue(Call, 0);
+  }
+  case X86::BI__builtin_ia32_aesencwide128kl_u8:
+  case X86::BI__builtin_ia32_aesdecwide128kl_u8:
+  case X86::BI__builtin_ia32_aesencwide256kl_u8:
+  case X86::BI__builtin_ia32_aesdecwide256kl_u8: {
+    Intrinsic::ID IID;
+    switch (BuiltinID) {
+    case X86::BI__builtin_ia32_aesencwide128kl_u8:
+      IID = Intrinsic::x86_aesencwide128kl;
+      break;
+    case X86::BI__builtin_ia32_aesdecwide128kl_u8:
+      IID = Intrinsic::x86_aesdecwide128kl;
+      break;
+    case X86::BI__builtin_ia32_aesencwide256kl_u8:
+      IID = Intrinsic::x86_aesencwide256kl;
+      break;
+    case X86::BI__builtin_ia32_aesdecwide256kl_u8:
+      IID = Intrinsic::x86_aesdecwide256kl;
       break;
     }
+
+    Value *InOps[9];
+    InOps[0] = Ops[2];
+    for (int i = 0; i != 8; ++i) {
+      Value *Ptr = Builder.CreateConstGEP1_32(Ops[1], i);
+      InOps[i + 1] = Builder.CreateAlignedLoad(Ptr, Align(16));
     }
 
-    Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID), InOps);
+    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), InOps);
 
-    for (int i = 0; i < ResultCount; ++i) {
-      Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, i + 1),
-                                        Ops[FirstReturnOp + i]);
+    for (int i = 0; i != 8; ++i) {
+      Value *Extract = Builder.CreateExtractValue(Call, i + 1);
+      Value *Ptr = Builder.CreateConstGEP1_32(Ops[0], i);
+      Builder.CreateAlignedStore(Extract, Ptr, Align(16));
     }
 
     return Builder.CreateExtractValue(Call, 0);

diff  --git a/clang/lib/Headers/keylockerintrin.h b/clang/lib/Headers/keylockerintrin.h
index c31ba16122a5..c15d39c8e392 100644
--- a/clang/lib/Headers/keylockerintrin.h
+++ b/clang/lib/Headers/keylockerintrin.h
@@ -211,7 +211,7 @@ _mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
 /// \endoperation
 static __inline__ unsigned char __DEFAULT_FN_ATTRS
 _mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
-  return __builtin_ia32_aesenc128kl(__odata, __idata, __h);
+  return __builtin_ia32_aesenc128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
 }
 
 /// The AESENC256KL performs 14 rounds of AES to encrypt the __idata using
@@ -248,7 +248,7 @@ _mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
 /// \endoperation
 static __inline__ unsigned char __DEFAULT_FN_ATTRS
 _mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
-  return __builtin_ia32_aesenc256kl(__odata, __idata, __h);
+  return __builtin_ia32_aesenc256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
 }
 
 /// The AESDEC128KL performs 10 rounds of AES to decrypt the __idata using
@@ -285,7 +285,7 @@ _mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
 /// \endoperation
 static __inline__ unsigned char __DEFAULT_FN_ATTRS
 _mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
-  return __builtin_ia32_aesdec128kl(__odata, __idata, __h);
+  return __builtin_ia32_aesdec128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
 }
 
 /// The AESDEC256KL performs 14 rounds of AES to decrypt the __idata using
@@ -322,7 +322,7 @@ _mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
 /// \endoperation
 static __inline__ unsigned char __DEFAULT_FN_ATTRS
 _mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
-  return __builtin_ia32_aesdec256kl(__odata, __idata, __h);
+  return __builtin_ia32_aesdec256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
 }
 
 #undef __DEFAULT_FN_ATTRS
@@ -374,23 +374,8 @@ _mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
 /// \endoperation
 static __inline__ unsigned char __DEFAULT_FN_ATTRS
 _mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
-  return __builtin_ia32_aesencwide128kl(__h,
-                                        __odata,
-                                        __odata + 1,
-                                        __odata + 2,
-                                        __odata + 3,
-                                        __odata + 4,
-                                        __odata + 5,
-                                        __odata + 6,
-                                        __odata + 7,
-                                        __idata[0],
-                                        __idata[1],
-                                        __idata[2],
-                                        __idata[3],
-                                        __idata[4],
-                                        __idata[5],
-                                        __idata[6],
-                                        __idata[7]);
+  return __builtin_ia32_aesencwide128kl_u8((__v2di *)__odata,
+                                           (const __v2di *)__idata, __h);
 }
 
 /// Encrypt __idata[0] to __idata[7] using 256-bit AES key indicated by handle
@@ -429,23 +414,8 @@ _mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
 /// \endoperation
 static __inline__ unsigned char __DEFAULT_FN_ATTRS
 _mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
-  return __builtin_ia32_aesencwide256kl(__h,
-                                        __odata,
-                                        __odata + 1,
-                                        __odata + 2,
-                                        __odata + 3,
-                                        __odata + 4,
-                                        __odata + 5,
-                                        __odata + 6,
-                                        __odata + 7,
-                                        __idata[0],
-                                        __idata[1],
-                                        __idata[2],
-                                        __idata[3],
-                                        __idata[4],
-                                        __idata[5],
-                                        __idata[6],
-                                        __idata[7]);
+  return __builtin_ia32_aesencwide256kl_u8((__v2di *)__odata,
+                                           (const __v2di *)__idata, __h);
 }
 
 /// Decrypt __idata[0] to __idata[7] using 128-bit AES key indicated by handle
@@ -484,23 +454,8 @@ _mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
 /// \endoperation
 static __inline__ unsigned char __DEFAULT_FN_ATTRS
 _mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
-  return __builtin_ia32_aesdecwide128kl(__h,
-                                        __odata,
-                                        __odata + 1,
-                                        __odata + 2,
-                                        __odata + 3,
-                                        __odata + 4,
-                                        __odata + 5,
-                                        __odata + 6,
-                                        __odata + 7,
-                                        __idata[0],
-                                        __idata[1],
-                                        __idata[2],
-                                        __idata[3],
-                                        __idata[4],
-                                        __idata[5],
-                                        __idata[6],
-                                        __idata[7]);
+  return __builtin_ia32_aesdecwide128kl_u8((__v2di *)__odata,
+                                           (const __v2di *)__idata, __h);
 }
 
 /// Decrypt __idata[0] to __idata[7] using 256-bit AES key indicated by handle
@@ -539,23 +494,8 @@ _mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
 /// \endoperation
 static __inline__ unsigned char __DEFAULT_FN_ATTRS
 _mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
-  return __builtin_ia32_aesdecwide256kl(__h,
-                                        __odata,
-                                        __odata + 1,
-                                        __odata + 2,
-                                        __odata + 3,
-                                        __odata + 4,
-                                        __odata + 5,
-                                        __odata + 6,
-                                        __odata + 7,
-                                        __idata[0],
-                                        __idata[1],
-                                        __idata[2],
-                                        __idata[3],
-                                        __idata[4],
-                                        __idata[5],
-                                        __idata[6],
-                                        __idata[7]);
+  return __builtin_ia32_aesdecwide256kl_u8((__v2di *)__odata,
+                                           (const __v2di *)__idata, __h);
 }
 
 #undef __DEFAULT_FN_ATTRS

diff  --git a/clang/test/CodeGen/X86/keylocker.c b/clang/test/CodeGen/X86/keylocker.c
index b410d53b4b83..b87fe22d7761 100644
--- a/clang/test/CodeGen/X86/keylocker.c
+++ b/clang/test/CodeGen/X86/keylocker.c
@@ -78,47 +78,215 @@ unsigned int test_encodekey256_u32(unsigned int htype, __m128i key_lo, __m128i k
 unsigned char test_mm_aesenc256kl_u8(__m128i *odata, __m128i idata, const void *h) {
   //CHECK-LABEL: @test_mm_aesenc256kl_u8
   //CHECK: call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> %{{.*}}, i8* %{{.*}})
+  //CHECK: extractvalue { i8, <2 x i64> } %{{.*}}, 1
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64> } %{{.*}}, 0
   return _mm_aesenc256kl_u8(odata, idata, h);
 }
 
 unsigned char test_mm_aesdec256kl_u8(__m128i *odata, __m128i idata, const void *h) {
   //CHECK-LABEL: @test_mm_aesdec256kl_u8
   //CHECK: call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> %{{.*}}, i8* %{{.*}})
+  //CHECK: extractvalue { i8, <2 x i64> } %{{.*}}, 1
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64> } %{{.*}}, 0
   return _mm_aesdec256kl_u8(odata, idata, h);
 }
 
 unsigned char test_mm_aesenc128kl_u8(__m128i *odata, __m128i idata, const void *h) {
   //CHECK-LABEL: @test_mm_aesenc128kl_u8
   //CHECK: call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> %{{.*}}, i8* %{{.*}})
+  //CHECK: extractvalue { i8, <2 x i64> } %{{.*}}, 1
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64> } %{{.*}}, 0
   return _mm_aesenc128kl_u8(odata, idata, h);
 }
 
 unsigned char test_mm_aesdec128kl_u8(__m128i *odata, __m128i idata, const void *h) {
   //CHECK-LABEL: @test_mm_aesdec128kl_u8
   //CHECK: call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> %{{.*}}, i8* %{{.*}})
+  //CHECK: extractvalue { i8, <2 x i64> } %{{.*}}, 1
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64> } %{{.*}}, 0
   return _mm_aesdec128kl_u8(odata, idata, h);
 }
 
 unsigned char test__mm_aesencwide128kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
   //CHECK-LABEL: @test__mm_aesencwide128kl
-  //CHECK: call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}},      <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 1
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 2
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 3
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 4
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 5
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 6
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 7
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 1
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 2
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 1
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 3
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 2
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 4
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 3
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 5
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 4
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 6
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 5
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 7
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 6
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 8
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 7
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 0
   return _mm_aesencwide128kl_u8(odata, idata, h);
 }
 
 unsigned char test__mm_aesdecwide128kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
   //CHECK-LABEL: @test__mm_aesdecwide128kl
-  //CHECK: call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}},      <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 1
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 2
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 3
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 4
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 5
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 6
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 7
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 1
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 2
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 1
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 3
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 2
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 4
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 3
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 5
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 4
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 6
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 5
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 7
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 6
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 8
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 7
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 0
   return _mm_aesdecwide128kl_u8(odata, idata, h);
 }
 
 unsigned char test__mm_aesencwide256kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
   //CHECK-LABEL: @test__mm_aesencwide256kl
-  //CHECK: call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}},      <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 1
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 2
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 3
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 4
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 5
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 6
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 7
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 1
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 2
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 1
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 3
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 2
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 4
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 3
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 5
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 4
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 6
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 5
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 7
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 6
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 8
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 7
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 0
   return _mm_aesencwide256kl_u8(odata, idata, h);
 }
 
 unsigned char test__mm_aesdecwide256kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
   //CHECK-LABEL: @test__mm_aesdecwide256kl
-  //CHECK: call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}},      <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 1
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 2
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 3
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 4
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 5
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 6
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 7
+  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
+  //CHECK: call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 1
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 2
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 1
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 3
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 2
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 4
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 3
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 5
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 4
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 6
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 5
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 7
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 6
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 8
+  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 7
+  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
+  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 0
   return _mm_aesdecwide256kl_u8(odata, idata, h);
 }

diff  --git a/llvm/test/CodeGen/X86/keylocker-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/keylocker-intrinsics-fast-isel.ll
index b5518ec44dc2..a2443ffbc4e6 100644
--- a/llvm/test/CodeGen/X86/keylocker-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/keylocker-intrinsics-fast-isel.ll
@@ -99,6 +99,346 @@ entry:
   ret i32 %21
 }
 
+define zeroext i8 @test_mm_aesenc256kl_u8(<2 x i64>* %odata, <2 x i64> %idata, i8* %h) {
+; CHECK-LABEL: test_mm_aesenc256kl_u8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    aesenc256kl (%rsi), %xmm0
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-NEXT:    retq
+entry:
+  %0 = tail call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> %idata, i8* %h) #1
+  %1 = extractvalue { i8, <2 x i64> } %0, 1
+  store <2 x i64> %1, <2 x i64>* %odata, align 16
+  %2 = extractvalue { i8, <2 x i64> } %0, 0
+  ret i8 %2
+}
+
+define zeroext i8 @test_mm_aesdec256kl_u8(<2 x i64>* %odata, <2 x i64> %idata, i8* %h) {
+; CHECK-LABEL: test_mm_aesdec256kl_u8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    aesdec256kl (%rsi), %xmm0
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-NEXT:    retq
+entry:
+  %0 = tail call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> %idata, i8* %h) #1
+  %1 = extractvalue { i8, <2 x i64> } %0, 1
+  store <2 x i64> %1, <2 x i64>* %odata, align 16
+  %2 = extractvalue { i8, <2 x i64> } %0, 0
+  ret i8 %2
+}
+
+define zeroext i8 @test_mm_aesenc128kl_u8(<2 x i64>* %odata, <2 x i64> %idata, i8* %h) {
+; CHECK-LABEL: test_mm_aesenc128kl_u8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    aesenc128kl (%rsi), %xmm0
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-NEXT:    retq
+entry:
+  %0 = tail call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> %idata, i8* %h) #1
+  %1 = extractvalue { i8, <2 x i64> } %0, 1
+  store <2 x i64> %1, <2 x i64>* %odata, align 16
+  %2 = extractvalue { i8, <2 x i64> } %0, 0
+  ret i8 %2
+}
+
+define zeroext i8 @test_mm_aesdec128kl_u8(<2 x i64>* %odata, <2 x i64> %idata, i8* %h) {
+; CHECK-LABEL: test_mm_aesdec128kl_u8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    aesdec128kl (%rsi), %xmm0
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-NEXT:    retq
+entry:
+  %0 = tail call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> %idata, i8* %h) #1
+  %1 = extractvalue { i8, <2 x i64> } %0, 1
+  store <2 x i64> %1, <2 x i64>* %odata, align 16
+  %2 = extractvalue { i8, <2 x i64> } %0, 0
+  ret i8 %2
+}
+
+define zeroext i8 @test__mm_aesencwide128kl_u8(<2 x i64>* %odata, <2 x i64>* %idata, i8* %h) {
+; CHECK-LABEL: test__mm_aesencwide128kl_u8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movaps (%rsi), %xmm0
+; CHECK-NEXT:    movaps 16(%rsi), %xmm1
+; CHECK-NEXT:    movaps 32(%rsi), %xmm2
+; CHECK-NEXT:    movaps 48(%rsi), %xmm3
+; CHECK-NEXT:    movaps 64(%rsi), %xmm4
+; CHECK-NEXT:    movaps 80(%rsi), %xmm5
+; CHECK-NEXT:    movaps 96(%rsi), %xmm6
+; CHECK-NEXT:    movaps 112(%rsi), %xmm7
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    aesencwide128kl (%rdx)
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-NEXT:    movaps %xmm1, 16(%rdi)
+; CHECK-NEXT:    movaps %xmm2, 32(%rdi)
+; CHECK-NEXT:    movaps %xmm3, 48(%rdi)
+; CHECK-NEXT:    movaps %xmm4, 64(%rdi)
+; CHECK-NEXT:    movaps %xmm5, 80(%rdi)
+; CHECK-NEXT:    movaps %xmm6, 96(%rdi)
+; CHECK-NEXT:    movaps %xmm7, 112(%rdi)
+; CHECK-NEXT:    retq
+entry:
+  %0 = load <2 x i64>, <2 x i64>* %idata, align 16
+  %1 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 1
+  %2 = load <2 x i64>, <2 x i64>* %1, align 16
+  %3 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 2
+  %4 = load <2 x i64>, <2 x i64>* %3, align 16
+  %5 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 3
+  %6 = load <2 x i64>, <2 x i64>* %5, align 16
+  %7 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 4
+  %8 = load <2 x i64>, <2 x i64>* %7, align 16
+  %9 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 5
+  %10 = load <2 x i64>, <2 x i64>* %9, align 16
+  %11 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 6
+  %12 = load <2 x i64>, <2 x i64>* %11, align 16
+  %13 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 7
+  %14 = load <2 x i64>, <2 x i64>* %13, align 16
+  %15 = tail call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* %h, <2 x i64> %0, <2 x i64> %2, <2 x i64> %4, <2 x i64> %6, <2 x i64> %8, <2 x i64> %10, <2 x i64> %12, <2 x i64> %14) #1
+  %16 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 1
+  store <2 x i64> %16, <2 x i64>* %odata, align 16
+  %17 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 2
+  %18 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 1
+  store <2 x i64> %17, <2 x i64>* %18, align 16
+  %19 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 3
+  %20 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 2
+  store <2 x i64> %19, <2 x i64>* %20, align 16
+  %21 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 4
+  %22 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 3
+  store <2 x i64> %21, <2 x i64>* %22, align 16
+  %23 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 5
+  %24 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 4
+  store <2 x i64> %23, <2 x i64>* %24, align 16
+  %25 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 6
+  %26 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 5
+  store <2 x i64> %25, <2 x i64>* %26, align 16
+  %27 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 7
+  %28 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 6
+  store <2 x i64> %27, <2 x i64>* %28, align 16
+  %29 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 8
+  %30 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 7
+  store <2 x i64> %29, <2 x i64>* %30, align 16
+  %31 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 0
+  ret i8 %31
+}
+
+define zeroext i8 @test__mm_aesdecwide128kl_u8(<2 x i64>* %odata, <2 x i64>* %idata, i8* %h) {
+; CHECK-LABEL: test__mm_aesdecwide128kl_u8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movaps (%rsi), %xmm0
+; CHECK-NEXT:    movaps 16(%rsi), %xmm1
+; CHECK-NEXT:    movaps 32(%rsi), %xmm2
+; CHECK-NEXT:    movaps 48(%rsi), %xmm3
+; CHECK-NEXT:    movaps 64(%rsi), %xmm4
+; CHECK-NEXT:    movaps 80(%rsi), %xmm5
+; CHECK-NEXT:    movaps 96(%rsi), %xmm6
+; CHECK-NEXT:    movaps 112(%rsi), %xmm7
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    aesdecwide128kl (%rdx)
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-NEXT:    movaps %xmm1, 16(%rdi)
+; CHECK-NEXT:    movaps %xmm2, 32(%rdi)
+; CHECK-NEXT:    movaps %xmm3, 48(%rdi)
+; CHECK-NEXT:    movaps %xmm4, 64(%rdi)
+; CHECK-NEXT:    movaps %xmm5, 80(%rdi)
+; CHECK-NEXT:    movaps %xmm6, 96(%rdi)
+; CHECK-NEXT:    movaps %xmm7, 112(%rdi)
+; CHECK-NEXT:    retq
+entry:
+  %0 = load <2 x i64>, <2 x i64>* %idata, align 16
+  %1 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 1
+  %2 = load <2 x i64>, <2 x i64>* %1, align 16
+  %3 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 2
+  %4 = load <2 x i64>, <2 x i64>* %3, align 16
+  %5 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 3
+  %6 = load <2 x i64>, <2 x i64>* %5, align 16
+  %7 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 4
+  %8 = load <2 x i64>, <2 x i64>* %7, align 16
+  %9 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 5
+  %10 = load <2 x i64>, <2 x i64>* %9, align 16
+  %11 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 6
+  %12 = load <2 x i64>, <2 x i64>* %11, align 16
+  %13 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 7
+  %14 = load <2 x i64>, <2 x i64>* %13, align 16
+  %15 = tail call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* %h, <2 x i64> %0, <2 x i64> %2, <2 x i64> %4, <2 x i64> %6, <2 x i64> %8, <2 x i64> %10, <2 x i64> %12, <2 x i64> %14) #1
+  %16 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 1
+  store <2 x i64> %16, <2 x i64>* %odata, align 16
+  %17 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 2
+  %18 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 1
+  store <2 x i64> %17, <2 x i64>* %18, align 16
+  %19 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 3
+  %20 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 2
+  store <2 x i64> %19, <2 x i64>* %20, align 16
+  %21 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 4
+  %22 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 3
+  store <2 x i64> %21, <2 x i64>* %22, align 16
+  %23 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 5
+  %24 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 4
+  store <2 x i64> %23, <2 x i64>* %24, align 16
+  %25 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 6
+  %26 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 5
+  store <2 x i64> %25, <2 x i64>* %26, align 16
+  %27 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 7
+  %28 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 6
+  store <2 x i64> %27, <2 x i64>* %28, align 16
+  %29 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 8
+  %30 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 7
+  store <2 x i64> %29, <2 x i64>* %30, align 16
+  %31 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 0
+  ret i8 %31
+}
+
+define zeroext i8 @test__mm_aesencwide256kl_u8(<2 x i64>* %odata, <2 x i64>* %idata, i8* %h) {
+; CHECK-LABEL: test__mm_aesencwide256kl_u8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movaps (%rsi), %xmm0
+; CHECK-NEXT:    movaps 16(%rsi), %xmm1
+; CHECK-NEXT:    movaps 32(%rsi), %xmm2
+; CHECK-NEXT:    movaps 48(%rsi), %xmm3
+; CHECK-NEXT:    movaps 64(%rsi), %xmm4
+; CHECK-NEXT:    movaps 80(%rsi), %xmm5
+; CHECK-NEXT:    movaps 96(%rsi), %xmm6
+; CHECK-NEXT:    movaps 112(%rsi), %xmm7
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    aesencwide256kl (%rdx)
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-NEXT:    movaps %xmm1, 16(%rdi)
+; CHECK-NEXT:    movaps %xmm2, 32(%rdi)
+; CHECK-NEXT:    movaps %xmm3, 48(%rdi)
+; CHECK-NEXT:    movaps %xmm4, 64(%rdi)
+; CHECK-NEXT:    movaps %xmm5, 80(%rdi)
+; CHECK-NEXT:    movaps %xmm6, 96(%rdi)
+; CHECK-NEXT:    movaps %xmm7, 112(%rdi)
+; CHECK-NEXT:    retq
+entry:
+  %0 = load <2 x i64>, <2 x i64>* %idata, align 16
+  %1 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 1
+  %2 = load <2 x i64>, <2 x i64>* %1, align 16
+  %3 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 2
+  %4 = load <2 x i64>, <2 x i64>* %3, align 16
+  %5 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 3
+  %6 = load <2 x i64>, <2 x i64>* %5, align 16
+  %7 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 4
+  %8 = load <2 x i64>, <2 x i64>* %7, align 16
+  %9 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 5
+  %10 = load <2 x i64>, <2 x i64>* %9, align 16
+  %11 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 6
+  %12 = load <2 x i64>, <2 x i64>* %11, align 16
+  %13 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 7
+  %14 = load <2 x i64>, <2 x i64>* %13, align 16
+  %15 = tail call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* %h, <2 x i64> %0, <2 x i64> %2, <2 x i64> %4, <2 x i64> %6, <2 x i64> %8, <2 x i64> %10, <2 x i64> %12, <2 x i64> %14) #1
+  %16 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 1
+  store <2 x i64> %16, <2 x i64>* %odata, align 16
+  %17 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 2
+  %18 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 1
+  store <2 x i64> %17, <2 x i64>* %18, align 16
+  %19 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 3
+  %20 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 2
+  store <2 x i64> %19, <2 x i64>* %20, align 16
+  %21 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 4
+  %22 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 3
+  store <2 x i64> %21, <2 x i64>* %22, align 16
+  %23 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 5
+  %24 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 4
+  store <2 x i64> %23, <2 x i64>* %24, align 16
+  %25 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 6
+  %26 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 5
+  store <2 x i64> %25, <2 x i64>* %26, align 16
+  %27 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 7
+  %28 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 6
+  store <2 x i64> %27, <2 x i64>* %28, align 16
+  %29 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 8
+  %30 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 7
+  store <2 x i64> %29, <2 x i64>* %30, align 16
+  %31 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 0
+  ret i8 %31
+}
+
+define zeroext i8 @test__mm_aesdecwide256kl_u8(<2 x i64>* %odata, <2 x i64>* %idata, i8* %h) {
+; CHECK-LABEL: test__mm_aesdecwide256kl_u8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movaps (%rsi), %xmm0
+; CHECK-NEXT:    movaps 16(%rsi), %xmm1
+; CHECK-NEXT:    movaps 32(%rsi), %xmm2
+; CHECK-NEXT:    movaps 48(%rsi), %xmm3
+; CHECK-NEXT:    movaps 64(%rsi), %xmm4
+; CHECK-NEXT:    movaps 80(%rsi), %xmm5
+; CHECK-NEXT:    movaps 96(%rsi), %xmm6
+; CHECK-NEXT:    movaps 112(%rsi), %xmm7
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    aesdecwide256kl (%rdx)
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-NEXT:    movaps %xmm1, 16(%rdi)
+; CHECK-NEXT:    movaps %xmm2, 32(%rdi)
+; CHECK-NEXT:    movaps %xmm3, 48(%rdi)
+; CHECK-NEXT:    movaps %xmm4, 64(%rdi)
+; CHECK-NEXT:    movaps %xmm5, 80(%rdi)
+; CHECK-NEXT:    movaps %xmm6, 96(%rdi)
+; CHECK-NEXT:    movaps %xmm7, 112(%rdi)
+; CHECK-NEXT:    retq
+entry:
+  %0 = load <2 x i64>, <2 x i64>* %idata, align 16
+  %1 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 1
+  %2 = load <2 x i64>, <2 x i64>* %1, align 16
+  %3 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 2
+  %4 = load <2 x i64>, <2 x i64>* %3, align 16
+  %5 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 3
+  %6 = load <2 x i64>, <2 x i64>* %5, align 16
+  %7 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 4
+  %8 = load <2 x i64>, <2 x i64>* %7, align 16
+  %9 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 5
+  %10 = load <2 x i64>, <2 x i64>* %9, align 16
+  %11 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 6
+  %12 = load <2 x i64>, <2 x i64>* %11, align 16
+  %13 = getelementptr <2 x i64>, <2 x i64>* %idata, i64 7
+  %14 = load <2 x i64>, <2 x i64>* %13, align 16
+  %15 = tail call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* %h, <2 x i64> %0, <2 x i64> %2, <2 x i64> %4, <2 x i64> %6, <2 x i64> %8, <2 x i64> %10, <2 x i64> %12, <2 x i64> %14) #1
+  %16 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 1
+  store <2 x i64> %16, <2 x i64>* %odata, align 16
+  %17 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 2
+  %18 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 1
+  store <2 x i64> %17, <2 x i64>* %18, align 16
+  %19 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 3
+  %20 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 2
+  store <2 x i64> %19, <2 x i64>* %20, align 16
+  %21 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 4
+  %22 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 3
+  store <2 x i64> %21, <2 x i64>* %22, align 16
+  %23 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 5
+  %24 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 4
+  store <2 x i64> %23, <2 x i64>* %24, align 16
+  %25 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 6
+  %26 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 5
+  store <2 x i64> %25, <2 x i64>* %26, align 16
+  %27 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 7
+  %28 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 6
+  store <2 x i64> %27, <2 x i64>* %28, align 16
+  %29 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 8
+  %30 = getelementptr <2 x i64>, <2 x i64>* %odata, i64 7
+  store <2 x i64> %29, <2 x i64>* %30, align 16
+  %31 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 0
+  ret i8 %31
+}
+
 declare void @llvm.x86.loadiwkey(<2 x i64>, <2 x i64>, <2 x i64>, i32)
 declare { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32, <2 x i64>)
 declare { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32, <2 x i64>, <2 x i64>)
+declare { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64>, i8*)
+declare { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64>, i8*)
+declare { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64>, i8*)
+declare { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64>, i8*)
+declare { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>)
+declare { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>)
+declare { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>)
+declare { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>)


        


More information about the llvm-commits mailing list