[clang] 6d234a6 - [X86] Zero some outputs of Keylocker intrinsics in error case

Xiang1 Zhang via cfe-commits cfe-commits at lists.llvm.org
Mon Jun 28 22:37:37 PDT 2021


Author: Xiang1 Zhang
Date: 2021-06-29T13:35:40+08:00
New Revision: 6d234a6908646cbdefcbbb4c0ea1ff2cf4a5482f

URL: https://github.com/llvm/llvm-project/commit/6d234a6908646cbdefcbbb4c0ea1ff2cf4a5482f
DIFF: https://github.com/llvm/llvm-project/commit/6d234a6908646cbdefcbbb4c0ea1ff2cf4a5482f.diff

LOG: [X86] Zero some outputs of Keylocker intrinsics in error case

Reviewed By: WangPengfei

Differential Revision: https://reviews.llvm.org/D104766

Added: 
    

Modified: 
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/lib/Headers/keylockerintrin.h
    clang/test/CodeGen/X86/keylocker.c

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 2e9454921ffa..9579d706b2ae 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14736,27 +14736,56 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_aesenc256kl_u8:
   case X86::BI__builtin_ia32_aesdec256kl_u8: {
     Intrinsic::ID IID;
+    StringRef StrNoErr, StrErr, StrEnd;
     switch (BuiltinID) {
     default: llvm_unreachable("Unexpected builtin");
     case X86::BI__builtin_ia32_aesenc128kl_u8:
       IID = Intrinsic::x86_aesenc128kl;
+      StrNoErr = "aesenc128kl_no_error";
+      StrErr = "aesenc128kl_error";
+      StrEnd = "aesenc128kl_end";
       break;
     case X86::BI__builtin_ia32_aesdec128kl_u8:
       IID = Intrinsic::x86_aesdec128kl;
+      StrNoErr = "aesdec128kl_no_error";
+      StrErr = "aesdec128kl_error";
+      StrEnd = "aesdec128kl_end";
       break;
     case X86::BI__builtin_ia32_aesenc256kl_u8:
       IID = Intrinsic::x86_aesenc256kl;
+      StrNoErr = "aesenc256kl_no_error";
+      StrErr = "aesenc256kl_error";
+      StrEnd = "aesenc256kl_end";
       break;
     case X86::BI__builtin_ia32_aesdec256kl_u8:
       IID = Intrinsic::x86_aesdec256kl;
+      StrNoErr = "aesdec256kl_no_error";
+      StrErr = "aesdec256kl_error";
+      StrEnd = "aesdec256kl_end";
       break;
     }
 
     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[1], Ops[2]});
 
-    Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1),
-                                      Ops[0]);
+    BasicBlock *NoError = createBasicBlock(StrNoErr, this->CurFn);
+    BasicBlock *Error = createBasicBlock(StrErr, this->CurFn);
+    BasicBlock *End = createBasicBlock(StrEnd, this->CurFn);
+
+    Value *Ret = Builder.CreateExtractValue(Call, 0);
+    Value *Succ = Builder.CreateTrunc(Ret, Builder.getInt1Ty());
+    Value *Out = Builder.CreateExtractValue(Call, 1);
+    Builder.CreateCondBr(Succ, NoError, Error);
+
+    Builder.SetInsertPoint(NoError);
+    Builder.CreateDefaultAlignedStore(Out, Ops[0]);
+    Builder.CreateBr(End);
 
+    Builder.SetInsertPoint(Error);
+    Constant *Zero = llvm::Constant::getNullValue(Out->getType());
+    Builder.CreateDefaultAlignedStore(Zero, Ops[0]);
+    Builder.CreateBr(End);
+
+    Builder.SetInsertPoint(End);
     return Builder.CreateExtractValue(Call, 0);
   }
   case X86::BI__builtin_ia32_aesencwide128kl_u8:
@@ -14764,18 +14793,31 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_aesencwide256kl_u8:
   case X86::BI__builtin_ia32_aesdecwide256kl_u8: {
     Intrinsic::ID IID;
+    StringRef StrNoErr, StrErr, StrEnd;
     switch (BuiltinID) {
     case X86::BI__builtin_ia32_aesencwide128kl_u8:
       IID = Intrinsic::x86_aesencwide128kl;
+      StrNoErr = "aesencwide128kl_no_error";
+      StrErr = "aesencwide128kl_error";
+      StrEnd = "aesencwide128kl_end";
       break;
     case X86::BI__builtin_ia32_aesdecwide128kl_u8:
       IID = Intrinsic::x86_aesdecwide128kl;
+      StrNoErr = "aesdecwide128kl_no_error";
+      StrErr = "aesdecwide128kl_error";
+      StrEnd = "aesdecwide128kl_end";
       break;
     case X86::BI__builtin_ia32_aesencwide256kl_u8:
       IID = Intrinsic::x86_aesencwide256kl;
+      StrNoErr = "aesencwide256kl_no_error";
+      StrErr = "aesencwide256kl_error";
+      StrEnd = "aesencwide256kl_end";
       break;
     case X86::BI__builtin_ia32_aesdecwide256kl_u8:
       IID = Intrinsic::x86_aesdecwide256kl;
+      StrNoErr = "aesdecwide256kl_no_error";
+      StrErr = "aesdecwide256kl_error";
+      StrEnd = "aesdecwide256kl_end";
       break;
     }
 
@@ -14789,12 +14831,32 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
 
     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), InOps);
 
+    BasicBlock *NoError = createBasicBlock(StrNoErr, this->CurFn);
+    BasicBlock *Error = createBasicBlock(StrErr, this->CurFn);
+    BasicBlock *End = createBasicBlock(StrEnd, this->CurFn);
+
+    Value *Ret = Builder.CreateExtractValue(Call, 0);
+    Value *Succ = Builder.CreateTrunc(Ret, Builder.getInt1Ty());
+    Builder.CreateCondBr(Succ, NoError, Error);
+
+    Builder.SetInsertPoint(NoError);
     for (int i = 0; i != 8; ++i) {
       Value *Extract = Builder.CreateExtractValue(Call, i + 1);
       Value *Ptr = Builder.CreateConstGEP1_32(Ops[0], i);
       Builder.CreateAlignedStore(Extract, Ptr, Align(16));
     }
+    Builder.CreateBr(End);
+
+    Builder.SetInsertPoint(Error);
+    for (int i = 0; i != 8; ++i) {
+      Value *Out = Builder.CreateExtractValue(Call, i + 1);
+      Constant *Zero = llvm::Constant::getNullValue(Out->getType());
+      Value *Ptr = Builder.CreateConstGEP1_32(Ops[0], i);
+      Builder.CreateAlignedStore(Zero, Ptr, Align(16));
+    }
+    Builder.CreateBr(End);
 
+    Builder.SetInsertPoint(End);
     return Builder.CreateExtractValue(Call, 0);
   }
   }

diff  --git a/clang/lib/Headers/keylockerintrin.h b/clang/lib/Headers/keylockerintrin.h
index c15d39c8e392..68b0a5689618 100644
--- a/clang/lib/Headers/keylockerintrin.h
+++ b/clang/lib/Headers/keylockerintrin.h
@@ -230,10 +230,12 @@ _mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
 ///                    HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
 /// IF (IllegalHandle)
 ///   ZF := 1
+///   MEM[__odata+127:__odata] := 0
 /// ELSE
 ///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
 ///   IF (Authentic == 0)
 ///     ZF := 1
+///     MEM[__odata+127:__odata] := 0
 ///   ELSE
 ///     MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], UnwrappedKey)
 ///     ZF := 0
@@ -267,10 +269,12 @@ _mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
 ///                  HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128)
 /// IF (IllegalHandle)
 ///   ZF := 1
+///   MEM[__odata+127:__odata] := 0
 /// ELSE
 ///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
 ///   IF (Authentic == 0)
 ///     ZF := 1
+///     MEM[__odata+127:__odata] := 0
 ///   ELSE
 ///     MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], UnwrappedKey)
 ///     ZF := 0
@@ -304,10 +308,12 @@ _mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
 ///                   HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256)
 /// IF (IllegalHandle)
 ///   ZF := 1
+///   MEM[__odata+127:__odata] := 0
 /// ELSE
 ///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
 ///   IF (Authentic == 0)
 ///     ZF := 1
+///     MEM[__odata+127:__odata] := 0
 ///   ELSE
 ///     MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], UnwrappedKey)
 ///     ZF := 0
@@ -354,10 +360,16 @@ _mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
 ///                    HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
 /// IF (IllegalHandle)
 ///   ZF := 1
+///   FOR i := 0 to 7
+///     __odata[i] := 0
+///   ENDFOR
 /// ELSE
 ///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
 ///   IF Authentic == 0
 ///     ZF := 1
+///     FOR i := 0 to 7
+///       __odata[i] := 0
+///     ENDFOR
 ///   ELSE
 ///     FOR i := 0 to 7
 ///       __odata[i] := AES128Encrypt (__idata[i], UnwrappedKey)
@@ -394,10 +406,16 @@ _mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
 ///                    HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES512 )
 /// IF (IllegalHandle)
 ///   ZF := 1
+///   FOR i := 0 to 7
+///     __odata[i] := 0
+///   ENDFOR
 /// ELSE
 ///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
 ///   IF Authentic == 0
 ///     ZF := 1
+///     FOR i := 0 to 7
+///       __odata[i] := 0
+///     ENDFOR
 ///   ELSE
 ///     FOR i := 0 to 7
 ///       __odata[i] := AES256Encrypt (__idata[i], UnwrappedKey)
@@ -434,10 +452,16 @@ _mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
 ///                    HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES128 )
 /// IF (IllegalHandle)
 ///   ZF := 1
+///   FOR i := 0 to 7
+///     __odata[i] := 0
+///   ENDFOR
 /// ELSE
 ///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
 ///   IF Authentic == 0
 ///     ZF := 1
+///     FOR i := 0 to 7
+///       __odata[i] := 0
+///     ENDFOR
 ///   ELSE
 ///     FOR i := 0 to 7
 ///       __odata[i] := AES128Decrypt (__idata[i], UnwrappedKey)
@@ -474,10 +498,16 @@ _mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
 ///                   HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES512 )
 /// If (IllegalHandle)
 ///   ZF := 1
+///   FOR i := 0 to 7
+///     __odata[i] := 0
+///   ENDFOR
 /// ELSE
 ///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
 ///   IF Authentic == 0
 ///     ZF := 1
+///     FOR i := 0 to 7
+///       __odata[i] := 0
+///     ENDFOR
 ///   ELSE
 ///     FOR i := 0 to 7
 ///       __odata[i] := AES256Decrypt (__idata[i], UnwrappedKey)

diff  --git a/clang/test/CodeGen/X86/keylocker.c b/clang/test/CodeGen/X86/keylocker.c
index b87fe22d7761..ded6e57bfb8b 100644
--- a/clang/test/CodeGen/X86/keylocker.c
+++ b/clang/test/CodeGen/X86/keylocker.c
@@ -1,292 +1,1339 @@
-// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +kl -target-feature +widekl -emit-llvm -o - -Wall -Werror | FileCheck %s
-// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +kl -target-feature +widekl -emit-llvm -o - -Wall -Werror | FileCheck %s
-// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +widekl -emit-llvm -o - -Wall -Werror | FileCheck %s
-// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +widekl -emit-llvm -o - -Wall -Werror | FileCheck %s
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 %s -O0 -ffreestanding -triple=x86_64-unknown-unknown -target-feature +kl -target-feature +widekl -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=CHECK64
+// RUN: %clang_cc1 %s -O0 -ffreestanding -triple=i386-unknown-unknown -target-feature +kl -target-feature +widekl -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=CHECK32
 
 #include <x86intrin.h>
 
+// CHECK64-LABEL: @test_loadiwkey(
+// CHECK64-NEXT:  entry:
+// CHECK64-NEXT:    [[__CTL_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK64-NEXT:    [[__INTKEY_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[__ENKEY_LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[__ENKEY_HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[CTL_ADDR:%.*]] = alloca i32, align 4
+// CHECK64-NEXT:    [[INTKEY_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[ENKEY_LO_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[ENKEY_HI_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    store i32 [[CTL:%.*]], i32* [[CTL_ADDR]], align 4
+// CHECK64-NEXT:    store <2 x i64> [[INTKEY:%.*]], <2 x i64>* [[INTKEY_ADDR]], align 16
+// CHECK64-NEXT:    store <2 x i64> [[ENKEY_LO:%.*]], <2 x i64>* [[ENKEY_LO_ADDR]], align 16
+// CHECK64-NEXT:    store <2 x i64> [[ENKEY_HI:%.*]], <2 x i64>* [[ENKEY_HI_ADDR]], align 16
+// CHECK64-NEXT:    [[TMP0:%.*]] = load i32, i32* [[CTL_ADDR]], align 4
+// CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[INTKEY_ADDR]], align 16
+// CHECK64-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[ENKEY_LO_ADDR]], align 16
+// CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ENKEY_HI_ADDR]], align 16
+// CHECK64-NEXT:    store i32 [[TMP0]], i32* [[__CTL_ADDR_I]], align 4
+// CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__INTKEY_ADDR_I]], align 16
+// CHECK64-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* [[__ENKEY_LO_ADDR_I]], align 16
+// CHECK64-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* [[__ENKEY_HI_ADDR_I]], align 16
+// CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__INTKEY_ADDR_I]], align 16
+// CHECK64-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_LO_ADDR_I]], align 16
+// CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_HI_ADDR_I]], align 16
+// CHECK64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[__CTL_ADDR_I]], align 4
+// CHECK64-NEXT:    call void @llvm.x86.loadiwkey(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], i32 [[TMP7]]) #[[ATTR1:[0-9]+]]
+// CHECK64-NEXT:    ret void
+//
+// CHECK32-LABEL: @test_loadiwkey(
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[__CTL_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK32-NEXT:    [[__INTKEY_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[__ENKEY_LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[__ENKEY_HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[CTL_ADDR:%.*]] = alloca i32, align 4
+// CHECK32-NEXT:    [[INTKEY_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[ENKEY_LO_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[ENKEY_HI_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    store i32 [[CTL:%.*]], i32* [[CTL_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64> [[INTKEY:%.*]], <2 x i64>* [[INTKEY_ADDR]], align 16
+// CHECK32-NEXT:    store <2 x i64> [[ENKEY_LO:%.*]], <2 x i64>* [[ENKEY_LO_ADDR]], align 16
+// CHECK32-NEXT:    store <2 x i64> [[ENKEY_HI:%.*]], <2 x i64>* [[ENKEY_HI_ADDR]], align 16
+// CHECK32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[CTL_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[INTKEY_ADDR]], align 16
+// CHECK32-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[ENKEY_LO_ADDR]], align 16
+// CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ENKEY_HI_ADDR]], align 16
+// CHECK32-NEXT:    store i32 [[TMP0]], i32* [[__CTL_ADDR_I]], align 4
+// CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__INTKEY_ADDR_I]], align 16
+// CHECK32-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* [[__ENKEY_LO_ADDR_I]], align 16
+// CHECK32-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* [[__ENKEY_HI_ADDR_I]], align 16
+// CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__INTKEY_ADDR_I]], align 16
+// CHECK32-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_LO_ADDR_I]], align 16
+// CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_HI_ADDR_I]], align 16
+// CHECK32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[__CTL_ADDR_I]], align 4
+// CHECK32-NEXT:    call void @llvm.x86.loadiwkey(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], i32 [[TMP7]]) #[[ATTR1:[0-9]+]]
+// CHECK32-NEXT:    ret void
+//
 void test_loadiwkey(unsigned int ctl, __m128i intkey, __m128i enkey_lo, __m128i enkey_hi) {
-  //CHECK-LABEL: @test_loadiwkey
-  //CHECK: @llvm.x86.loadiwkey
   _mm_loadiwkey(ctl, intkey, enkey_lo, enkey_hi);
 }
 
+// CHECK64-LABEL: @test_encodekey128_u32(
+// CHECK64-NEXT:  entry:
+// CHECK64-NEXT:    [[__HTYPE_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK64-NEXT:    [[__KEY_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    [[HTYPE_ADDR:%.*]] = alloca i32, align 4
+// CHECK64-NEXT:    [[KEY_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    store i32 [[HTYPE:%.*]], i32* [[HTYPE_ADDR]], align 4
+// CHECK64-NEXT:    store <2 x i64> [[KEY:%.*]], <2 x i64>* [[KEY_ADDR]], align 16
+// CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP0:%.*]] = load i32, i32* [[HTYPE_ADDR]], align 4
+// CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[KEY_ADDR]], align 16
+// CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    store i32 [[TMP0]], i32* [[__HTYPE_ADDR_I]], align 4
+// CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__KEY_ADDR_I]], align 16
+// CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[__HTYPE_ADDR_I]], align 4
+// CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_ADDR_I]], align 16
+// CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP6:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 [[TMP3]], <2 x i64> [[TMP4]]) #[[ATTR1]]
+// CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 1
+// CHECK64-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP5]] to <2 x i64>*
+// CHECK64-NEXT:    store <2 x i64> [[TMP7]], <2 x i64>* [[TMP8]], align 1
+// CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 2
+// CHECK64-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[TMP5]], i32 16
+// CHECK64-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <2 x i64>*
+// CHECK64-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP11]], align 1
+// CHECK64-NEXT:    [[TMP12:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 3
+// CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr i8, i8* [[TMP5]], i32 32
+// CHECK64-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <2 x i64>*
+// CHECK64-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* [[TMP14]], align 1
+// CHECK64-NEXT:    [[TMP15:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 4
+// CHECK64-NEXT:    [[TMP16:%.*]] = getelementptr i8, i8* [[TMP5]], i32 48
+// CHECK64-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <2 x i64>*
+// CHECK64-NEXT:    store <2 x i64> [[TMP15]], <2 x i64>* [[TMP17]], align 1
+// CHECK64-NEXT:    [[TMP18:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 5
+// CHECK64-NEXT:    [[TMP19:%.*]] = getelementptr i8, i8* [[TMP5]], i32 64
+// CHECK64-NEXT:    [[TMP20:%.*]] = bitcast i8* [[TMP19]] to <2 x i64>*
+// CHECK64-NEXT:    store <2 x i64> [[TMP18]], <2 x i64>* [[TMP20]], align 1
+// CHECK64-NEXT:    [[TMP21:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 6
+// CHECK64-NEXT:    [[TMP22:%.*]] = getelementptr i8, i8* [[TMP5]], i32 80
+// CHECK64-NEXT:    [[TMP23:%.*]] = bitcast i8* [[TMP22]] to <2 x i64>*
+// CHECK64-NEXT:    store <2 x i64> [[TMP21]], <2 x i64>* [[TMP23]], align 1
+// CHECK64-NEXT:    [[TMP24:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 0
+// CHECK64-NEXT:    ret i32 [[TMP24]]
+//
+// CHECK32-LABEL: @test_encodekey128_u32(
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[__HTYPE_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK32-NEXT:    [[__KEY_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    [[HTYPE_ADDR:%.*]] = alloca i32, align 4
+// CHECK32-NEXT:    [[KEY_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    store i32 [[HTYPE:%.*]], i32* [[HTYPE_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64> [[KEY:%.*]], <2 x i64>* [[KEY_ADDR]], align 16
+// CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[HTYPE_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[KEY_ADDR]], align 16
+// CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    store i32 [[TMP0]], i32* [[__HTYPE_ADDR_I]], align 4
+// CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__KEY_ADDR_I]], align 16
+// CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[__HTYPE_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_ADDR_I]], align 16
+// CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP6:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 [[TMP3]], <2 x i64> [[TMP4]]) #[[ATTR1]]
+// CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 1
+// CHECK32-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP5]] to <2 x i64>*
+// CHECK32-NEXT:    store <2 x i64> [[TMP7]], <2 x i64>* [[TMP8]], align 1
+// CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 2
+// CHECK32-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[TMP5]], i32 16
+// CHECK32-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <2 x i64>*
+// CHECK32-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP11]], align 1
+// CHECK32-NEXT:    [[TMP12:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 3
+// CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr i8, i8* [[TMP5]], i32 32
+// CHECK32-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <2 x i64>*
+// CHECK32-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* [[TMP14]], align 1
+// CHECK32-NEXT:    [[TMP15:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 4
+// CHECK32-NEXT:    [[TMP16:%.*]] = getelementptr i8, i8* [[TMP5]], i32 48
+// CHECK32-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <2 x i64>*
+// CHECK32-NEXT:    store <2 x i64> [[TMP15]], <2 x i64>* [[TMP17]], align 1
+// CHECK32-NEXT:    [[TMP18:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 5
+// CHECK32-NEXT:    [[TMP19:%.*]] = getelementptr i8, i8* [[TMP5]], i32 64
+// CHECK32-NEXT:    [[TMP20:%.*]] = bitcast i8* [[TMP19]] to <2 x i64>*
+// CHECK32-NEXT:    store <2 x i64> [[TMP18]], <2 x i64>* [[TMP20]], align 1
+// CHECK32-NEXT:    [[TMP21:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 6
+// CHECK32-NEXT:    [[TMP22:%.*]] = getelementptr i8, i8* [[TMP5]], i32 80
+// CHECK32-NEXT:    [[TMP23:%.*]] = bitcast i8* [[TMP22]] to <2 x i64>*
+// CHECK32-NEXT:    store <2 x i64> [[TMP21]], <2 x i64>* [[TMP23]], align 1
+// CHECK32-NEXT:    [[TMP24:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 0
+// CHECK32-NEXT:    ret i32 [[TMP24]]
+//
 unsigned int test_encodekey128_u32(unsigned int htype, __m128i key, void *h) {
-  //CHECK-LABEL: @test_encodekey128_u32
-  //CHECK: call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 %{{.*}}, <2 x i64> %{{.*}})
-  //CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 1
-  //CHECK: itcast i8* %{{.*}} to <2 x i64>*
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
-  //CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 2
-  //CHECK: getelementptr i8, i8* %{{.*}}, i32 16
-  //CHECK: bitcast i8* %{{.*}} to <2 x i64>*
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
-  //CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 3
-  //CHECK: getelementptr i8, i8* %{{.*}}, i32 32
-  //CHECK: bitcast i8* %{{.*}} to <2 x i64>*
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
-  //CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 4
-  //CHECK: getelementptr i8, i8* %{{.*}}, i32 48
-  //CHECK: bitcast i8* %{{.*}} to <2 x i64>*
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
-  //CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 5
-  //CHECK: getelementptr i8, i8* %{{.*}}, i32 64
-  //CHECK: bitcast i8* %{{.*}} to <2 x i64>*
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
-  //CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 6
-  //CHECK: getelementptr i8, i8* %{{.*}}, i32 80
-  //CHECK: bitcast i8* %{{.*}} to <2 x i64>*
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
-  //CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 0
   return _mm_encodekey128_u32(htype, key, h);
 }
 
+// CHECK64-LABEL: @test_encodekey256_u32(
+// CHECK64-NEXT:  entry:
+// CHECK64-NEXT:    [[__HTYPE_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK64-NEXT:    [[__KEY_LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[__KEY_HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    [[HTYPE_ADDR:%.*]] = alloca i32, align 4
+// CHECK64-NEXT:    [[KEY_LO_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[KEY_HI_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    store i32 [[HTYPE:%.*]], i32* [[HTYPE_ADDR]], align 4
+// CHECK64-NEXT:    store <2 x i64> [[KEY_LO:%.*]], <2 x i64>* [[KEY_LO_ADDR]], align 16
+// CHECK64-NEXT:    store <2 x i64> [[KEY_HI:%.*]], <2 x i64>* [[KEY_HI_ADDR]], align 16
+// CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP0:%.*]] = load i32, i32* [[HTYPE_ADDR]], align 4
+// CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[KEY_LO_ADDR]], align 16
+// CHECK64-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[KEY_HI_ADDR]], align 16
+// CHECK64-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    store i32 [[TMP0]], i32* [[__HTYPE_ADDR_I]], align 4
+// CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__KEY_LO_ADDR_I]], align 16
+// CHECK64-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* [[__KEY_HI_ADDR_I]], align 16
+// CHECK64-NEXT:    store i8* [[TMP3]], i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[__HTYPE_ADDR_I]], align 4
+// CHECK64-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_LO_ADDR_I]], align 16
+// CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_HI_ADDR_I]], align 16
+// CHECK64-NEXT:    [[TMP7:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP8:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]]) #[[ATTR1]]
+// CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 1
+// CHECK64-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP7]] to <2 x i64>*
+// CHECK64-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 1
+// CHECK64-NEXT:    [[TMP11:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 2
+// CHECK64-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP7]], i32 16
+// CHECK64-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <2 x i64>*
+// CHECK64-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* [[TMP13]], align 1
+// CHECK64-NEXT:    [[TMP14:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 3
+// CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr i8, i8* [[TMP7]], i32 32
+// CHECK64-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP15]] to <2 x i64>*
+// CHECK64-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* [[TMP16]], align 1
+// CHECK64-NEXT:    [[TMP17:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 4
+// CHECK64-NEXT:    [[TMP18:%.*]] = getelementptr i8, i8* [[TMP7]], i32 48
+// CHECK64-NEXT:    [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <2 x i64>*
+// CHECK64-NEXT:    store <2 x i64> [[TMP17]], <2 x i64>* [[TMP19]], align 1
+// CHECK64-NEXT:    [[TMP20:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 5
+// CHECK64-NEXT:    [[TMP21:%.*]] = getelementptr i8, i8* [[TMP7]], i32 64
+// CHECK64-NEXT:    [[TMP22:%.*]] = bitcast i8* [[TMP21]] to <2 x i64>*
+// CHECK64-NEXT:    store <2 x i64> [[TMP20]], <2 x i64>* [[TMP22]], align 1
+// CHECK64-NEXT:    [[TMP23:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 6
+// CHECK64-NEXT:    [[TMP24:%.*]] = getelementptr i8, i8* [[TMP7]], i32 80
+// CHECK64-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP24]] to <2 x i64>*
+// CHECK64-NEXT:    store <2 x i64> [[TMP23]], <2 x i64>* [[TMP25]], align 1
+// CHECK64-NEXT:    [[TMP26:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 7
+// CHECK64-NEXT:    [[TMP27:%.*]] = getelementptr i8, i8* [[TMP7]], i32 96
+// CHECK64-NEXT:    [[TMP28:%.*]] = bitcast i8* [[TMP27]] to <2 x i64>*
+// CHECK64-NEXT:    store <2 x i64> [[TMP26]], <2 x i64>* [[TMP28]], align 1
+// CHECK64-NEXT:    [[TMP29:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 0
+// CHECK64-NEXT:    ret i32 [[TMP29]]
+//
+// CHECK32-LABEL: @test_encodekey256_u32(
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[__HTYPE_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK32-NEXT:    [[__KEY_LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[__KEY_HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    [[HTYPE_ADDR:%.*]] = alloca i32, align 4
+// CHECK32-NEXT:    [[KEY_LO_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[KEY_HI_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    store i32 [[HTYPE:%.*]], i32* [[HTYPE_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64> [[KEY_LO:%.*]], <2 x i64>* [[KEY_LO_ADDR]], align 16
+// CHECK32-NEXT:    store <2 x i64> [[KEY_HI:%.*]], <2 x i64>* [[KEY_HI_ADDR]], align 16
+// CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[HTYPE_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[KEY_LO_ADDR]], align 16
+// CHECK32-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[KEY_HI_ADDR]], align 16
+// CHECK32-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    store i32 [[TMP0]], i32* [[__HTYPE_ADDR_I]], align 4
+// CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__KEY_LO_ADDR_I]], align 16
+// CHECK32-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* [[__KEY_HI_ADDR_I]], align 16
+// CHECK32-NEXT:    store i8* [[TMP3]], i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[__HTYPE_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_LO_ADDR_I]], align 16
+// CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_HI_ADDR_I]], align 16
+// CHECK32-NEXT:    [[TMP7:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP8:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]]) #[[ATTR1]]
+// CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 1
+// CHECK32-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP7]] to <2 x i64>*
+// CHECK32-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 1
+// CHECK32-NEXT:    [[TMP11:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 2
+// CHECK32-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP7]], i32 16
+// CHECK32-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <2 x i64>*
+// CHECK32-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* [[TMP13]], align 1
+// CHECK32-NEXT:    [[TMP14:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 3
+// CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr i8, i8* [[TMP7]], i32 32
+// CHECK32-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP15]] to <2 x i64>*
+// CHECK32-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* [[TMP16]], align 1
+// CHECK32-NEXT:    [[TMP17:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 4
+// CHECK32-NEXT:    [[TMP18:%.*]] = getelementptr i8, i8* [[TMP7]], i32 48
+// CHECK32-NEXT:    [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <2 x i64>*
+// CHECK32-NEXT:    store <2 x i64> [[TMP17]], <2 x i64>* [[TMP19]], align 1
+// CHECK32-NEXT:    [[TMP20:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 5
+// CHECK32-NEXT:    [[TMP21:%.*]] = getelementptr i8, i8* [[TMP7]], i32 64
+// CHECK32-NEXT:    [[TMP22:%.*]] = bitcast i8* [[TMP21]] to <2 x i64>*
+// CHECK32-NEXT:    store <2 x i64> [[TMP20]], <2 x i64>* [[TMP22]], align 1
+// CHECK32-NEXT:    [[TMP23:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 6
+// CHECK32-NEXT:    [[TMP24:%.*]] = getelementptr i8, i8* [[TMP7]], i32 80
+// CHECK32-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP24]] to <2 x i64>*
+// CHECK32-NEXT:    store <2 x i64> [[TMP23]], <2 x i64>* [[TMP25]], align 1
+// CHECK32-NEXT:    [[TMP26:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 7
+// CHECK32-NEXT:    [[TMP27:%.*]] = getelementptr i8, i8* [[TMP7]], i32 96
+// CHECK32-NEXT:    [[TMP28:%.*]] = bitcast i8* [[TMP27]] to <2 x i64>*
+// CHECK32-NEXT:    store <2 x i64> [[TMP26]], <2 x i64>* [[TMP28]], align 1
+// CHECK32-NEXT:    [[TMP29:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 0
+// CHECK32-NEXT:    ret i32 [[TMP29]]
+//
 unsigned int test_encodekey256_u32(unsigned int htype, __m128i key_lo, __m128i key_hi, void *h) {
-  //CHECK-LABEL: @test_encodekey256_u32
-  //CHECK: call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
-  //CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 1
-  //CHECK: itcast i8* %{{.*}} to <2 x i64>*
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
-  //CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 2
-  //CHECK: getelementptr i8, i8* %{{.*}}, i32 16
-  //CHECK: bitcast i8* %{{.*}} to <2 x i64>*
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
-  //CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 3
-  //CHECK: getelementptr i8, i8* %{{.*}}, i32 32
-  //CHECK: bitcast i8* %{{.*}} to <2 x i64>*
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
-  //CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 4
-  //CHECK: getelementptr i8, i8* %{{.*}}, i32 48
-  //CHECK: bitcast i8* %{{.*}} to <2 x i64>*
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
-  //CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 5
-  //CHECK: getelementptr i8, i8* %{{.*}}, i32 64
-  //CHECK: bitcast i8* %{{.*}} to <2 x i64>*
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
-  //CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 6
-  //CHECK: getelementptr i8, i8* %{{.*}}, i32 80
-  //CHECK: bitcast i8* %{{.*}} to <2 x i64>*
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
-  //CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 7
-  //CHECK: getelementptr i8, i8* %{{.*}}, i32 96
-  //CHECK: bitcast i8* %{{.*}} to <2 x i64>*
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
-  //CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 0
   return _mm_encodekey256_u32(htype, key_lo, key_hi, h);
 }
 
+// CHECK64-LABEL: @test_mm_aesenc256kl_u8(
+// CHECK64-NEXT:  entry:
+// CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
+// CHECK64-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
+// CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
+// CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
+// CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
+// CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
+// CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
+// CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
+// CHECK64-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
+// CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
+// CHECK64-NEXT:    br i1 [[TMP8]], label [[AESENC256KL_NO_ERROR_I:%.*]], label [[AESENC256KL_ERROR_I:%.*]]
+// CHECK64:       aesenc256kl_no_error.i:
+// CHECK64-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
+// CHECK64-NEXT:    br label [[_MM_AESENC256KL_U8_EXIT:%.*]]
+// CHECK64:       aesenc256kl_error.i:
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
+// CHECK64-NEXT:    br label [[_MM_AESENC256KL_U8_EXIT]]
+// CHECK64:       _mm_aesenc256kl_u8.exit:
+// CHECK64-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
+// CHECK64-NEXT:    ret i8 [[TMP10]]
+//
+// CHECK32-LABEL: @test_mm_aesenc256kl_u8(
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
+// CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
+// CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
+// CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
+// CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
+// CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
+// CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
+// CHECK32-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
+// CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
+// CHECK32-NEXT:    br i1 [[TMP8]], label [[AESENC256KL_NO_ERROR_I:%.*]], label [[AESENC256KL_ERROR_I:%.*]]
+// CHECK32:       aesenc256kl_no_error.i:
+// CHECK32-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
+// CHECK32-NEXT:    br label [[_MM_AESENC256KL_U8_EXIT:%.*]]
+// CHECK32:       aesenc256kl_error.i:
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
+// CHECK32-NEXT:    br label [[_MM_AESENC256KL_U8_EXIT]]
+// CHECK32:       _mm_aesenc256kl_u8.exit:
+// CHECK32-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
+// CHECK32-NEXT:    ret i8 [[TMP10]]
+//
 unsigned char test_mm_aesenc256kl_u8(__m128i *odata, __m128i idata, const void *h) {
-  //CHECK-LABEL: @test_mm_aesenc256kl_u8
-  //CHECK: call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> %{{.*}}, i8* %{{.*}})
-  //CHECK: extractvalue { i8, <2 x i64> } %{{.*}}, 1
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64> } %{{.*}}, 0
   return _mm_aesenc256kl_u8(odata, idata, h);
 }
 
+// CHECK64-LABEL: @test_mm_aesdec256kl_u8(
+// CHECK64-NEXT:  entry:
+// CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
+// CHECK64-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
+// CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
+// CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
+// CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
+// CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
+// CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
+// CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
+// CHECK64-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
+// CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
+// CHECK64-NEXT:    br i1 [[TMP8]], label [[AESDEC256KL_NO_ERROR_I:%.*]], label [[AESDEC256KL_ERROR_I:%.*]]
+// CHECK64:       aesdec256kl_no_error.i:
+// CHECK64-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
+// CHECK64-NEXT:    br label [[_MM_AESDEC256KL_U8_EXIT:%.*]]
+// CHECK64:       aesdec256kl_error.i:
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
+// CHECK64-NEXT:    br label [[_MM_AESDEC256KL_U8_EXIT]]
+// CHECK64:       _mm_aesdec256kl_u8.exit:
+// CHECK64-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
+// CHECK64-NEXT:    ret i8 [[TMP10]]
+//
+// CHECK32-LABEL: @test_mm_aesdec256kl_u8(
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
+// CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
+// CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
+// CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
+// CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
+// CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
+// CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
+// CHECK32-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
+// CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
+// CHECK32-NEXT:    br i1 [[TMP8]], label [[AESDEC256KL_NO_ERROR_I:%.*]], label [[AESDEC256KL_ERROR_I:%.*]]
+// CHECK32:       aesdec256kl_no_error.i:
+// CHECK32-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
+// CHECK32-NEXT:    br label [[_MM_AESDEC256KL_U8_EXIT:%.*]]
+// CHECK32:       aesdec256kl_error.i:
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
+// CHECK32-NEXT:    br label [[_MM_AESDEC256KL_U8_EXIT]]
+// CHECK32:       _mm_aesdec256kl_u8.exit:
+// CHECK32-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
+// CHECK32-NEXT:    ret i8 [[TMP10]]
+//
 unsigned char test_mm_aesdec256kl_u8(__m128i *odata, __m128i idata, const void *h) {
-  //CHECK-LABEL: @test_mm_aesdec256kl_u8
-  //CHECK: call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> %{{.*}}, i8* %{{.*}})
-  //CHECK: extractvalue { i8, <2 x i64> } %{{.*}}, 1
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64> } %{{.*}}, 0
   return _mm_aesdec256kl_u8(odata, idata, h);
 }
 
+// CHECK64-LABEL: @test_mm_aesenc128kl_u8(
+// CHECK64-NEXT:  entry:
+// CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
+// CHECK64-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
+// CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
+// CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
+// CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
+// CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
+// CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
+// CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
+// CHECK64-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
+// CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
+// CHECK64-NEXT:    br i1 [[TMP8]], label [[AESENC128KL_NO_ERROR_I:%.*]], label [[AESENC128KL_ERROR_I:%.*]]
+// CHECK64:       aesenc128kl_no_error.i:
+// CHECK64-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
+// CHECK64-NEXT:    br label [[_MM_AESENC128KL_U8_EXIT:%.*]]
+// CHECK64:       aesenc128kl_error.i:
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
+// CHECK64-NEXT:    br label [[_MM_AESENC128KL_U8_EXIT]]
+// CHECK64:       _mm_aesenc128kl_u8.exit:
+// CHECK64-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
+// CHECK64-NEXT:    ret i8 [[TMP10]]
+//
+// CHECK32-LABEL: @test_mm_aesenc128kl_u8(
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
+// CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
+// CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
+// CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
+// CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
+// CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
+// CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
+// CHECK32-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
+// CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
+// CHECK32-NEXT:    br i1 [[TMP8]], label [[AESENC128KL_NO_ERROR_I:%.*]], label [[AESENC128KL_ERROR_I:%.*]]
+// CHECK32:       aesenc128kl_no_error.i:
+// CHECK32-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
+// CHECK32-NEXT:    br label [[_MM_AESENC128KL_U8_EXIT:%.*]]
+// CHECK32:       aesenc128kl_error.i:
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
+// CHECK32-NEXT:    br label [[_MM_AESENC128KL_U8_EXIT]]
+// CHECK32:       _mm_aesenc128kl_u8.exit:
+// CHECK32-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
+// CHECK32-NEXT:    ret i8 [[TMP10]]
+//
 unsigned char test_mm_aesenc128kl_u8(__m128i *odata, __m128i idata, const void *h) {
-  //CHECK-LABEL: @test_mm_aesenc128kl_u8
-  //CHECK: call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> %{{.*}}, i8* %{{.*}})
-  //CHECK: extractvalue { i8, <2 x i64> } %{{.*}}, 1
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64> } %{{.*}}, 0
   return _mm_aesenc128kl_u8(odata, idata, h);
 }
 
+// CHECK64-LABEL: @test_mm_aesdec128kl_u8(
+// CHECK64-NEXT:  entry:
+// CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
+// CHECK64-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
+// CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
+// CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
+// CHECK64-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
+// CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
+// CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
+// CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
+// CHECK64-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
+// CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
+// CHECK64-NEXT:    br i1 [[TMP8]], label [[AESDEC128KL_NO_ERROR_I:%.*]], label [[AESDEC128KL_ERROR_I:%.*]]
+// CHECK64:       aesdec128kl_no_error.i:
+// CHECK64-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
+// CHECK64-NEXT:    br label [[_MM_AESDEC128KL_U8_EXIT:%.*]]
+// CHECK64:       aesdec128kl_error.i:
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
+// CHECK64-NEXT:    br label [[_MM_AESDEC128KL_U8_EXIT]]
+// CHECK64:       _mm_aesdec128kl_u8.exit:
+// CHECK64-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
+// CHECK64-NEXT:    ret i8 [[TMP10]]
+//
+// CHECK32-LABEL: @test_mm_aesdec128kl_u8(
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64> [[IDATA:%.*]], <2 x i64>* [[IDATA_ADDR]], align 16
+// CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[IDATA_ADDR]], align 16
+// CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
+// CHECK32-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[__IDATA_ADDR_I]], align 16
+// CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
+// CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
+// CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
+// CHECK32-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
+// CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
+// CHECK32-NEXT:    br i1 [[TMP8]], label [[AESDEC128KL_NO_ERROR_I:%.*]], label [[AESDEC128KL_ERROR_I:%.*]]
+// CHECK32:       aesdec128kl_no_error.i:
+// CHECK32-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP3]], align 16
+// CHECK32-NEXT:    br label [[_MM_AESDEC128KL_U8_EXIT:%.*]]
+// CHECK32:       aesdec128kl_error.i:
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
+// CHECK32-NEXT:    br label [[_MM_AESDEC128KL_U8_EXIT]]
+// CHECK32:       _mm_aesdec128kl_u8.exit:
+// CHECK32-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
+// CHECK32-NEXT:    ret i8 [[TMP10]]
+//
 unsigned char test_mm_aesdec128kl_u8(__m128i *odata, __m128i idata, const void *h) {
-  //CHECK-LABEL: @test_mm_aesdec128kl_u8
-  //CHECK: call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> %{{.*}}, i8* %{{.*}})
-  //CHECK: extractvalue { i8, <2 x i64> } %{{.*}}, 1
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64> } %{{.*}}, 0
   return _mm_aesdec128kl_u8(odata, idata, h);
 }
 
+// CHECK64-LABEL: @test__mm_aesencwide128kl_u8(
+// CHECK64-NEXT:  entry:
+// CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
+// CHECK64-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 8
+// CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
+// CHECK64-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 8
+// CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
+// CHECK64-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
+// CHECK64-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
+// CHECK64-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
+// CHECK64-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
+// CHECK64-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
+// CHECK64-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
+// CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
+// CHECK64-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
+// CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
+// CHECK64-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
+// CHECK64-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
+// CHECK64-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
+// CHECK64-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
+// CHECK64-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
+// CHECK64-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
+// CHECK64-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
+// CHECK64-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
+// CHECK64-NEXT:    br i1 [[TMP23]], label [[AESENCWIDE128KL_NO_ERROR_I:%.*]], label [[AESENCWIDE128KL_ERROR_I:%.*]]
+// CHECK64:       aesencwide128kl_no_error.i:
+// CHECK64-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
+// CHECK64-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
+// CHECK64-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
+// CHECK64-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
+// CHECK64-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
+// CHECK64-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
+// CHECK64-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
+// CHECK64-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
+// CHECK64-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
+// CHECK64-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
+// CHECK64-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
+// CHECK64-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
+// CHECK64-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
+// CHECK64-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
+// CHECK64-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
+// CHECK64-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
+// CHECK64-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
+// CHECK64-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
+// CHECK64-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
+// CHECK64-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
+// CHECK64-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
+// CHECK64-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
+// CHECK64-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
+// CHECK64-NEXT:    br label [[_MM_AESENCWIDE128KL_U8_EXIT:%.*]]
+// CHECK64:       aesencwide128kl_error.i:
+// CHECK64-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
+// CHECK64-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
+// CHECK64-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
+// CHECK64-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
+// CHECK64-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
+// CHECK64-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
+// CHECK64-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
+// CHECK64-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
+// CHECK64-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
+// CHECK64-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
+// CHECK64-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
+// CHECK64-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
+// CHECK64-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
+// CHECK64-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
+// CHECK64-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
+// CHECK64-NEXT:    br label [[_MM_AESENCWIDE128KL_U8_EXIT]]
+// CHECK64:       _mm_aesencwide128kl_u8.exit:
+// CHECK64-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
+// CHECK64-NEXT:    ret i8 [[TMP54]]
+//
+// CHECK32-LABEL: @test__mm_aesencwide128kl_u8(
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 4
+// CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
+// CHECK32-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 4
+// CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
+// CHECK32-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
+// CHECK32-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
+// CHECK32-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
+// CHECK32-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
+// CHECK32-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
+// CHECK32-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
+// CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
+// CHECK32-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
+// CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
+// CHECK32-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
+// CHECK32-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
+// CHECK32-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
+// CHECK32-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
+// CHECK32-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
+// CHECK32-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
+// CHECK32-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
+// CHECK32-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
+// CHECK32-NEXT:    br i1 [[TMP23]], label [[AESENCWIDE128KL_NO_ERROR_I:%.*]], label [[AESENCWIDE128KL_ERROR_I:%.*]]
+// CHECK32:       aesencwide128kl_no_error.i:
+// CHECK32-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
+// CHECK32-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
+// CHECK32-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
+// CHECK32-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
+// CHECK32-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
+// CHECK32-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
+// CHECK32-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
+// CHECK32-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
+// CHECK32-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
+// CHECK32-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
+// CHECK32-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
+// CHECK32-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
+// CHECK32-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
+// CHECK32-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
+// CHECK32-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
+// CHECK32-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
+// CHECK32-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
+// CHECK32-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
+// CHECK32-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
+// CHECK32-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
+// CHECK32-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
+// CHECK32-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
+// CHECK32-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
+// CHECK32-NEXT:    br label [[_MM_AESENCWIDE128KL_U8_EXIT:%.*]]
+// CHECK32:       aesencwide128kl_error.i:
+// CHECK32-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
+// CHECK32-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
+// CHECK32-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
+// CHECK32-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
+// CHECK32-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
+// CHECK32-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
+// CHECK32-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
+// CHECK32-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
+// CHECK32-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
+// CHECK32-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
+// CHECK32-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
+// CHECK32-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
+// CHECK32-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
+// CHECK32-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
+// CHECK32-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
+// CHECK32-NEXT:    br label [[_MM_AESENCWIDE128KL_U8_EXIT]]
+// CHECK32:       _mm_aesencwide128kl_u8.exit:
+// CHECK32-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
+// CHECK32-NEXT:    ret i8 [[TMP54]]
+//
 unsigned char test__mm_aesencwide128kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
-  //CHECK-LABEL: @test__mm_aesencwide128kl
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 1
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 2
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 3
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 4
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 5
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 6
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 7
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 1
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 2
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 1
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 3
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 2
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 4
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 3
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 5
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 4
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 6
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 5
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 7
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 6
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 8
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 7
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 0
   return _mm_aesencwide128kl_u8(odata, idata, h);
 }
 
+// CHECK64-LABEL: @test__mm_aesdecwide128kl_u8(
+// CHECK64-NEXT:  entry:
+// CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
+// CHECK64-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 8
+// CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
+// CHECK64-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 8
+// CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
+// CHECK64-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
+// CHECK64-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
+// CHECK64-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
+// CHECK64-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
+// CHECK64-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
+// CHECK64-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
+// CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
+// CHECK64-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
+// CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
+// CHECK64-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
+// CHECK64-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
+// CHECK64-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
+// CHECK64-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
+// CHECK64-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
+// CHECK64-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
+// CHECK64-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
+// CHECK64-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
+// CHECK64-NEXT:    br i1 [[TMP23]], label [[AESDECWIDE128KL_NO_ERROR_I:%.*]], label [[AESDECWIDE128KL_ERROR_I:%.*]]
+// CHECK64:       aesdecwide128kl_no_error.i:
+// CHECK64-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
+// CHECK64-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
+// CHECK64-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
+// CHECK64-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
+// CHECK64-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
+// CHECK64-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
+// CHECK64-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
+// CHECK64-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
+// CHECK64-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
+// CHECK64-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
+// CHECK64-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
+// CHECK64-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
+// CHECK64-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
+// CHECK64-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
+// CHECK64-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
+// CHECK64-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
+// CHECK64-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
+// CHECK64-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
+// CHECK64-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
+// CHECK64-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
+// CHECK64-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
+// CHECK64-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
+// CHECK64-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
+// CHECK64-NEXT:    br label [[_MM_AESDECWIDE128KL_U8_EXIT:%.*]]
+// CHECK64:       aesdecwide128kl_error.i:
+// CHECK64-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
+// CHECK64-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
+// CHECK64-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
+// CHECK64-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
+// CHECK64-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
+// CHECK64-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
+// CHECK64-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
+// CHECK64-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
+// CHECK64-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
+// CHECK64-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
+// CHECK64-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
+// CHECK64-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
+// CHECK64-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
+// CHECK64-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
+// CHECK64-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
+// CHECK64-NEXT:    br label [[_MM_AESDECWIDE128KL_U8_EXIT]]
+// CHECK64:       _mm_aesdecwide128kl_u8.exit:
+// CHECK64-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
+// CHECK64-NEXT:    ret i8 [[TMP54]]
+//
+// CHECK32-LABEL: @test__mm_aesdecwide128kl_u8(
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 4
+// CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
+// CHECK32-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 4
+// CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
+// CHECK32-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
+// CHECK32-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
+// CHECK32-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
+// CHECK32-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
+// CHECK32-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
+// CHECK32-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
+// CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
+// CHECK32-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
+// CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
+// CHECK32-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
+// CHECK32-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
+// CHECK32-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
+// CHECK32-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
+// CHECK32-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
+// CHECK32-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
+// CHECK32-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
+// CHECK32-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
+// CHECK32-NEXT:    br i1 [[TMP23]], label [[AESDECWIDE128KL_NO_ERROR_I:%.*]], label [[AESDECWIDE128KL_ERROR_I:%.*]]
+// CHECK32:       aesdecwide128kl_no_error.i:
+// CHECK32-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
+// CHECK32-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
+// CHECK32-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
+// CHECK32-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
+// CHECK32-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
+// CHECK32-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
+// CHECK32-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
+// CHECK32-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
+// CHECK32-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
+// CHECK32-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
+// CHECK32-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
+// CHECK32-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
+// CHECK32-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
+// CHECK32-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
+// CHECK32-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
+// CHECK32-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
+// CHECK32-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
+// CHECK32-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
+// CHECK32-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
+// CHECK32-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
+// CHECK32-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
+// CHECK32-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
+// CHECK32-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
+// CHECK32-NEXT:    br label [[_MM_AESDECWIDE128KL_U8_EXIT:%.*]]
+// CHECK32:       aesdecwide128kl_error.i:
+// CHECK32-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
+// CHECK32-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
+// CHECK32-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
+// CHECK32-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
+// CHECK32-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
+// CHECK32-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
+// CHECK32-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
+// CHECK32-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
+// CHECK32-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
+// CHECK32-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
+// CHECK32-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
+// CHECK32-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
+// CHECK32-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
+// CHECK32-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
+// CHECK32-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
+// CHECK32-NEXT:    br label [[_MM_AESDECWIDE128KL_U8_EXIT]]
+// CHECK32:       _mm_aesdecwide128kl_u8.exit:
+// CHECK32-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
+// CHECK32-NEXT:    ret i8 [[TMP54]]
+//
 unsigned char test__mm_aesdecwide128kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
-  //CHECK-LABEL: @test__mm_aesdecwide128kl
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 1
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 2
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 3
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 4
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 5
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 6
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 7
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 1
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 2
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 1
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 3
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 2
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 4
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 3
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 5
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 4
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 6
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 5
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 7
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 6
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 8
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 7
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 0
   return _mm_aesdecwide128kl_u8(odata, idata, h);
 }
 
+// CHECK64-LABEL: @test__mm_aesencwide256kl_u8(
+// CHECK64-NEXT:  entry:
+// CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
+// CHECK64-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 8
+// CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
+// CHECK64-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 8
+// CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
+// CHECK64-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
+// CHECK64-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
+// CHECK64-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
+// CHECK64-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
+// CHECK64-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
+// CHECK64-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
+// CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
+// CHECK64-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
+// CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
+// CHECK64-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
+// CHECK64-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
+// CHECK64-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
+// CHECK64-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
+// CHECK64-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
+// CHECK64-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
+// CHECK64-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
+// CHECK64-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
+// CHECK64-NEXT:    br i1 [[TMP23]], label [[AESENCWIDE256KL_NO_ERROR_I:%.*]], label [[AESENCWIDE256KL_ERROR_I:%.*]]
+// CHECK64:       aesencwide256kl_no_error.i:
+// CHECK64-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
+// CHECK64-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
+// CHECK64-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
+// CHECK64-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
+// CHECK64-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
+// CHECK64-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
+// CHECK64-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
+// CHECK64-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
+// CHECK64-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
+// CHECK64-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
+// CHECK64-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
+// CHECK64-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
+// CHECK64-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
+// CHECK64-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
+// CHECK64-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
+// CHECK64-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
+// CHECK64-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
+// CHECK64-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
+// CHECK64-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
+// CHECK64-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
+// CHECK64-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
+// CHECK64-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
+// CHECK64-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
+// CHECK64-NEXT:    br label [[_MM_AESENCWIDE256KL_U8_EXIT:%.*]]
+// CHECK64:       aesencwide256kl_error.i:
+// CHECK64-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
+// CHECK64-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
+// CHECK64-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
+// CHECK64-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
+// CHECK64-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
+// CHECK64-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
+// CHECK64-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
+// CHECK64-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
+// CHECK64-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
+// CHECK64-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
+// CHECK64-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
+// CHECK64-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
+// CHECK64-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
+// CHECK64-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
+// CHECK64-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
+// CHECK64-NEXT:    br label [[_MM_AESENCWIDE256KL_U8_EXIT]]
+// CHECK64:       _mm_aesencwide256kl_u8.exit:
+// CHECK64-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
+// CHECK64-NEXT:    ret i8 [[TMP54]]
+//
+// CHECK32-LABEL: @test__mm_aesencwide256kl_u8(
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 4
+// CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
+// CHECK32-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 4
+// CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
+// CHECK32-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
+// CHECK32-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
+// CHECK32-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
+// CHECK32-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
+// CHECK32-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
+// CHECK32-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
+// CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
+// CHECK32-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
+// CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
+// CHECK32-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
+// CHECK32-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
+// CHECK32-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
+// CHECK32-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
+// CHECK32-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
+// CHECK32-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
+// CHECK32-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
+// CHECK32-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
+// CHECK32-NEXT:    br i1 [[TMP23]], label [[AESENCWIDE256KL_NO_ERROR_I:%.*]], label [[AESENCWIDE256KL_ERROR_I:%.*]]
+// CHECK32:       aesencwide256kl_no_error.i:
+// CHECK32-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
+// CHECK32-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
+// CHECK32-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
+// CHECK32-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
+// CHECK32-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
+// CHECK32-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
+// CHECK32-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
+// CHECK32-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
+// CHECK32-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
+// CHECK32-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
+// CHECK32-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
+// CHECK32-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
+// CHECK32-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
+// CHECK32-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
+// CHECK32-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
+// CHECK32-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
+// CHECK32-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
+// CHECK32-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
+// CHECK32-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
+// CHECK32-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
+// CHECK32-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
+// CHECK32-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
+// CHECK32-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
+// CHECK32-NEXT:    br label [[_MM_AESENCWIDE256KL_U8_EXIT:%.*]]
+// CHECK32:       aesencwide256kl_error.i:
+// CHECK32-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
+// CHECK32-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
+// CHECK32-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
+// CHECK32-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
+// CHECK32-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
+// CHECK32-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
+// CHECK32-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
+// CHECK32-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
+// CHECK32-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
+// CHECK32-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
+// CHECK32-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
+// CHECK32-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
+// CHECK32-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
+// CHECK32-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
+// CHECK32-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
+// CHECK32-NEXT:    br label [[_MM_AESENCWIDE256KL_U8_EXIT]]
+// CHECK32:       _mm_aesencwide256kl_u8.exit:
+// CHECK32-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
+// CHECK32-NEXT:    ret i8 [[TMP54]]
+//
 unsigned char test__mm_aesencwide256kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
-  //CHECK-LABEL: @test__mm_aesencwide256kl
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 1
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 2
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 3
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 4
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 5
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 6
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 7
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 1
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 2
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 1
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 3
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 2
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 4
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 3
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 5
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 4
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 6
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 5
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 7
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 6
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 8
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 7
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 0
   return _mm_aesencwide256kl_u8(odata, idata, h);
 }
 
+// CHECK64-LABEL: @test__mm_aesdecwide256kl_u8(
+// CHECK64-NEXT:  entry:
+// CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 8
+// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 8
+// CHECK64-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 8
+// CHECK64-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 8
+// CHECK64-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 8
+// CHECK64-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 8
+// CHECK64-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 8
+// CHECK64-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 8
+// CHECK64-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
+// CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
+// CHECK64-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
+// CHECK64-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
+// CHECK64-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
+// CHECK64-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
+// CHECK64-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
+// CHECK64-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
+// CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
+// CHECK64-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
+// CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
+// CHECK64-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
+// CHECK64-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
+// CHECK64-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
+// CHECK64-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
+// CHECK64-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
+// CHECK64-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
+// CHECK64-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
+// CHECK64-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
+// CHECK64-NEXT:    br i1 [[TMP23]], label [[AESDECWIDE256KL_NO_ERROR_I:%.*]], label [[AESDECWIDE256KL_ERROR_I:%.*]]
+// CHECK64:       aesdecwide256kl_no_error.i:
+// CHECK64-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
+// CHECK64-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
+// CHECK64-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
+// CHECK64-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
+// CHECK64-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
+// CHECK64-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
+// CHECK64-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
+// CHECK64-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
+// CHECK64-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
+// CHECK64-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
+// CHECK64-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
+// CHECK64-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
+// CHECK64-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
+// CHECK64-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
+// CHECK64-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
+// CHECK64-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
+// CHECK64-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
+// CHECK64-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
+// CHECK64-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
+// CHECK64-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
+// CHECK64-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
+// CHECK64-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
+// CHECK64-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
+// CHECK64-NEXT:    br label [[_MM_AESDECWIDE256KL_U8_EXIT:%.*]]
+// CHECK64:       aesdecwide256kl_error.i:
+// CHECK64-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
+// CHECK64-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
+// CHECK64-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
+// CHECK64-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
+// CHECK64-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
+// CHECK64-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
+// CHECK64-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
+// CHECK64-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
+// CHECK64-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
+// CHECK64-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
+// CHECK64-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
+// CHECK64-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
+// CHECK64-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
+// CHECK64-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
+// CHECK64-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
+// CHECK64-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
+// CHECK64-NEXT:    br label [[_MM_AESDECWIDE256KL_U8_EXIT]]
+// CHECK64:       _mm_aesdecwide256kl_u8.exit:
+// CHECK64-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
+// CHECK64-NEXT:    ret i8 [[TMP54]]
+//
+// CHECK32-LABEL: @test__mm_aesdecwide256kl_u8(
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>*, align 4
+// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca i8*, align 4
+// CHECK32-NEXT:    store <2 x i64>* [[ODATA:%.*]], <2 x i64>** [[ODATA_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64>* [[IDATA:%.*]], <2 x i64>** [[IDATA_ADDR]], align 4
+// CHECK32-NEXT:    store i8* [[H:%.*]], i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP0:%.*]] = load <2 x i64>*, <2 x i64>** [[ODATA_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>*, <2 x i64>** [[IDATA_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[H_ADDR]], align 4
+// CHECK32-NEXT:    store <2 x i64>* [[TMP0]], <2 x i64>** [[__ODATA_ADDR_I]], align 4
+// CHECK32-NEXT:    store <2 x i64>* [[TMP1]], <2 x i64>** [[__IDATA_ADDR_I]], align 4
+// CHECK32-NEXT:    store i8* [[TMP2]], i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>*, <2 x i64>** [[__IDATA_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
+// CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 16
+// CHECK32-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 1
+// CHECK32-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 16
+// CHECK32-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 2
+// CHECK32-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 16
+// CHECK32-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 3
+// CHECK32-NEXT:    [[TMP12:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 16
+// CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 4
+// CHECK32-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 16
+// CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 5
+// CHECK32-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 16
+// CHECK32-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 6
+// CHECK32-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
+// CHECK32-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
+// CHECK32-NEXT:    [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
+// CHECK32-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
+// CHECK32-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
+// CHECK32-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
+// CHECK32-NEXT:    br i1 [[TMP23]], label [[AESDECWIDE256KL_NO_ERROR_I:%.*]], label [[AESDECWIDE256KL_ERROR_I:%.*]]
+// CHECK32:       aesdecwide256kl_no_error.i:
+// CHECK32-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
+// CHECK32-NEXT:    store <2 x i64> [[TMP24]], <2 x i64>* [[TMP3]], align 16
+// CHECK32-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
+// CHECK32-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
+// CHECK32-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[TMP26]], align 16
+// CHECK32-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
+// CHECK32-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
+// CHECK32-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[TMP28]], align 16
+// CHECK32-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
+// CHECK32-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
+// CHECK32-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[TMP30]], align 16
+// CHECK32-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
+// CHECK32-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
+// CHECK32-NEXT:    store <2 x i64> [[TMP31]], <2 x i64>* [[TMP32]], align 16
+// CHECK32-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
+// CHECK32-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
+// CHECK32-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[TMP34]], align 16
+// CHECK32-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
+// CHECK32-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
+// CHECK32-NEXT:    store <2 x i64> [[TMP35]], <2 x i64>* [[TMP36]], align 16
+// CHECK32-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
+// CHECK32-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
+// CHECK32-NEXT:    store <2 x i64> [[TMP37]], <2 x i64>* [[TMP38]], align 16
+// CHECK32-NEXT:    br label [[_MM_AESDECWIDE256KL_U8_EXIT:%.*]]
+// CHECK32:       aesdecwide256kl_error.i:
+// CHECK32-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP3]], align 16
+// CHECK32-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
+// CHECK32-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 1
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP41]], align 16
+// CHECK32-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
+// CHECK32-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 2
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP43]], align 16
+// CHECK32-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
+// CHECK32-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 3
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP45]], align 16
+// CHECK32-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
+// CHECK32-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 4
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP47]], align 16
+// CHECK32-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
+// CHECK32-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 5
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP49]], align 16
+// CHECK32-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
+// CHECK32-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 6
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP51]], align 16
+// CHECK32-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
+// CHECK32-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP3]], i32 7
+// CHECK32-NEXT:    store <2 x i64> zeroinitializer, <2 x i64>* [[TMP53]], align 16
+// CHECK32-NEXT:    br label [[_MM_AESDECWIDE256KL_U8_EXIT]]
+// CHECK32:       _mm_aesdecwide256kl_u8.exit:
+// CHECK32-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
+// CHECK32-NEXT:    ret i8 [[TMP54]]
+//
 unsigned char test__mm_aesdecwide256kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
-  //CHECK-LABEL: @test__mm_aesdecwide256kl
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 1
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 2
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 3
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 4
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 5
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 6
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 7
-  //CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
-  //CHECK: call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 1
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 2
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 1
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 3
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 2
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 4
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 3
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 5
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 4
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 6
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 5
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 7
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 6
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 8
-  //CHECK: getelementptr <2 x i64>, <2 x i64>* %{{.*}}, i32 7
-  //CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
-  //CHECK: extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 0
   return _mm_aesdecwide256kl_u8(odata, idata, h);
 }


        


More information about the cfe-commits mailing list