[clang] [CIR][X86] Add support for `aes` and `aeswide` builtins (PR #175892)

Haocong Lu via cfe-commits cfe-commits at lists.llvm.org
Thu Jan 15 00:45:55 PST 2026


https://github.com/Luhaocong updated https://github.com/llvm/llvm-project/pull/175892

>From c1fdb9cf6c28e0373df5016ce54b8d1ea87acdba Mon Sep 17 00:00:00 2001
From: Haocong Lu <haocong.lu at witintech.com>
Date: Thu, 15 Jan 2026 16:50:31 +0800
Subject: [PATCH] [CIR][X86] Add support for `aes` and `aeswide` builtins

- Support CIR codegen for the following builtins: `aesenc`, `aesdec`,
  `aesencwide` and `aesdecwide`
---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    |  166 ++-
 .../test/CIR/CodeGenBuiltins/X86/keylocker.c  | 1176 +++++++++++++++++
 2 files changed, 1339 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/keylocker.c

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 44af8d6cc0ef4..497462a465145 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -597,6 +597,119 @@ static mlir::Value emitX86Fpclass(CIRGenBuilderTy &builder, mlir::Location loc,
   return emitX86MaskedCompareResult(builder, fpclass, numElts, maskIn, loc);
 }
 
+static mlir::Value emitX86Aes(CIRGenBuilderTy &builder, mlir::Location loc,
+                              llvm::StringRef intrinsicName, mlir::Type retType,
+                              llvm::ArrayRef<mlir::Value> ops) {
+  // ops[0] is the output pointer; ops[1] and ops[2] are forwarded to the
+  // intrinsic, whose {retType, vector} result is modelled as a record.
+  mlir::Type blockTy =
+      mlir::cast<cir::PointerType>(ops[0].getType()).getPointee();
+  cir::RecordType resultTy = builder.getAnonRecordTy({retType, blockTy});
+  mlir::Value result = emitIntrinsicCallOp(
+      builder, loc, intrinsicName, resultTy, mlir::ValueRange{ops[1], ops[2]});
+
+  // Narrow member 0 to a single bit and use it as the branch condition.
+  mlir::Value status =
+      cir::ExtractMemberOp::create(builder, loc, result, /*index=*/0);
+  mlir::Value statusBit = builder.createCast(
+      loc, cir::CastKind::integral, status, builder.getUIntNTy(1));
+  mlir::Value ok = builder.createCast(loc, cir::CastKind::int_to_bool,
+                                      statusBit, builder.getBoolTy());
+
+  // Member 1 is stored on success; otherwise the output is zeroed.
+  mlir::Value payload =
+      cir::ExtractMemberOp::create(builder, loc, result, /*index=*/1);
+  Address outAddr(ops[0], /*align=*/CharUnits::fromQuantity(16));
+  auto storeAndYield = [&](mlir::Value stored) {
+    builder.createStore(loc, stored, outAddr);
+    builder.createYield(loc);
+  };
+  cir::IfOp::create(
+      builder, loc, ok, /*withElseRegion=*/true,
+      /*thenBuilder=*/
+      [&](mlir::OpBuilder &, mlir::Location) { storeAndYield(payload); },
+      /*elseBuilder=*/
+      [&](mlir::OpBuilder &, mlir::Location) {
+        mlir::Value zero = builder.getNullValue(blockTy, loc);
+        storeAndYield(zero);
+      });
+
+  return cir::ExtractMemberOp::create(builder, loc, result, /*index=*/0);
+}
+
+static mlir::Value emitX86Aeswide(CIRGenBuilderTy &builder, mlir::Location loc,
+                                  llvm::StringRef intrinsicName,
+                                  mlir::Type retType,
+                                  llvm::ArrayRef<mlir::Value> ops) {
+  // ops[0] and ops[1] are indexed below as 8-element output and input
+  // arrays; ops[2] becomes the first operand of the intrinsic call. Every
+  // load and store emitted here uses an alignment of 16.
+  constexpr int numBlocks = 8;
+  const CharUnits blockAlign = CharUnits::fromQuantity(16);
+  mlir::Type blockTy =
+      mlir::cast<cir::PointerType>(ops[1].getType()).getPointee();
+
+  // Emits the constant index and the element pointer for one slot of `base`.
+  auto slotPtr = [&](mlir::Value base, int slot) -> mlir::Value {
+    cir::ConstantOp slotIdx = builder.getUInt32(slot, loc);
+    return builder.getArrayElement(loc, loc, base, blockTy, slotIdx,
+                                   /*shouldDecay=*/false);
+  };
+
+  // Load every input element; the loads follow ops[2] in the operand list.
+  mlir::Value callArgs[numBlocks + 1];
+  callArgs[0] = ops[2];
+  for (int slot = 0; slot != numBlocks; ++slot) {
+    mlir::Value inPtr = slotPtr(ops[1], slot);
+    callArgs[slot + 1] =
+        builder.createAlignedLoad(loc, blockTy, inPtr, /*align=*/blockAlign);
+  }
+
+  // The intrinsic result is a record: retType followed by eight vectors.
+  mlir::Type memberTys[numBlocks + 1] = {retType, blockTy, blockTy,
+                                         blockTy, blockTy, blockTy,
+                                         blockTy, blockTy, blockTy};
+  cir::RecordType resultTy = builder.getAnonRecordTy(memberTys);
+  mlir::Value result =
+      emitIntrinsicCallOp(builder, loc, intrinsicName, resultTy, callArgs);
+
+  // Narrow member 0 to a single bit and use it as the branch condition.
+  mlir::Value status =
+      cir::ExtractMemberOp::create(builder, loc, result, /*index=*/0);
+  mlir::Value statusBit = builder.createCast(
+      loc, cir::CastKind::integral, status, builder.getUIntNTy(1));
+  mlir::Value ok = builder.createCast(loc, cir::CastKind::int_to_bool,
+                                      statusBit, builder.getBoolTy());
+
+  // Success: copy members 1..8 into the output slots.
+  auto storeResults = [&](mlir::OpBuilder &, mlir::Location) {
+    for (int slot = 0; slot != numBlocks; ++slot) {
+      mlir::Value out = cir::ExtractMemberOp::create(builder, loc, result,
+                                                     /*index=*/slot + 1);
+      Address outAddr(slotPtr(ops[0], slot), /*align=*/blockAlign);
+      builder.createStore(loc, out, outAddr);
+    }
+    builder.createYield(loc);
+  };
+
+  // Failure: overwrite every output slot with a zero vector.
+  auto storeZeros = [&](mlir::OpBuilder &, mlir::Location) {
+    mlir::Value zero = builder.getNullValue(blockTy, loc);
+    for (int slot = 0; slot != numBlocks; ++slot) {
+      Address outAddr(slotPtr(ops[0], slot), /*align=*/blockAlign);
+      builder.createStore(loc, zero, outAddr);
+    }
+    builder.createYield(loc);
+  };
+
+  cir::IfOp::create(builder, loc, ok, /*withElseRegion=*/true,
+                    /*thenBuilder=*/storeResults,
+                    /*elseBuilder=*/storeZeros);
+
+  // The caller receives member 0, extracted again after the branch.
+  return cir::ExtractMemberOp::create(builder, loc, result, /*index=*/0);
+}
+
 std::optional<mlir::Value>
 CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -2147,15 +2260,62 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
   case X86::BI__readgsdword:
   case X86::BI__readgsqword:
   case X86::BI__builtin_ia32_encodekey128_u32:
-  case X86::BI__builtin_ia32_encodekey256_u32:
+  case X86::BI__builtin_ia32_encodekey256_u32: {
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return mlir::Value{};
+  }
   case X86::BI__builtin_ia32_aesenc128kl_u8:
   case X86::BI__builtin_ia32_aesdec128kl_u8:
   case X86::BI__builtin_ia32_aesenc256kl_u8:
-  case X86::BI__builtin_ia32_aesdec256kl_u8:
+  case X86::BI__builtin_ia32_aesdec256kl_u8: {
+    llvm::StringRef intrinsicName;
+    switch (builtinID) {
+    default:
+      llvm_unreachable("Unexpected builtin");
+    case X86::BI__builtin_ia32_aesenc128kl_u8:
+      intrinsicName = "x86.aesenc128kl";
+      break;
+    case X86::BI__builtin_ia32_aesdec128kl_u8:
+      intrinsicName = "x86.aesdec128kl";
+      break;
+    case X86::BI__builtin_ia32_aesenc256kl_u8:
+      intrinsicName = "x86.aesenc256kl";
+      break;
+    case X86::BI__builtin_ia32_aesdec256kl_u8:
+      intrinsicName = "x86.aesdec256kl";
+      break;
+    }
+
+    return emitX86Aes(builder, getLoc(expr->getExprLoc()), intrinsicName,
+                      convertType(expr->getType()), ops);
+  }
   case X86::BI__builtin_ia32_aesencwide128kl_u8:
   case X86::BI__builtin_ia32_aesdecwide128kl_u8:
   case X86::BI__builtin_ia32_aesencwide256kl_u8:
-  case X86::BI__builtin_ia32_aesdecwide256kl_u8:
+  case X86::BI__builtin_ia32_aesdecwide256kl_u8: {
+    llvm::StringRef intrinsicName;
+    switch (builtinID) {
+    default:
+      llvm_unreachable("Unexpected builtin");
+    case X86::BI__builtin_ia32_aesencwide128kl_u8:
+      intrinsicName = "x86.aesencwide128kl";
+      break;
+    case X86::BI__builtin_ia32_aesdecwide128kl_u8:
+      intrinsicName = "x86.aesdecwide128kl";
+      break;
+    case X86::BI__builtin_ia32_aesencwide256kl_u8:
+      intrinsicName = "x86.aesencwide256kl";
+      break;
+    case X86::BI__builtin_ia32_aesdecwide256kl_u8:
+      intrinsicName = "x86.aesdecwide256kl";
+      break;
+    }
+
+    return emitX86Aeswide(builder, getLoc(expr->getExprLoc()), intrinsicName,
+                          convertType(expr->getType()), ops);
+  }
   case X86::BI__builtin_ia32_vfcmaddcph512_mask:
   case X86::BI__builtin_ia32_vfmaddcph512_mask:
   case X86::BI__builtin_ia32_vfcmaddcsh_round_mask:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/keylocker.c b/clang/test/CIR/CodeGenBuiltins/X86/keylocker.c
new file mode 100644
index 0000000000000..a4995a108c8c8
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/keylocker.c
@@ -0,0 +1,1176 @@
+// RUN: %clang_cc1 -x c -ffreestanding -triple x86_64-unknown-linux -target-feature +kl -target-feature +widekl -Wno-implicit-function-declaration -fclangir -emit-cir -o %t.cir %s
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -ffreestanding -triple x86_64-unknown-linux -target-feature +kl -target-feature +widekl -Wno-implicit-function-declaration -fclangir -emit-cir -o %t.cir %s
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c -ffreestanding -triple x86_64-unknown-linux -target-feature +kl -target-feature +widekl -Wno-implicit-function-declaration -fclangir -emit-llvm -o %t.ll %s
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c++ -ffreestanding -triple x86_64-unknown-linux -target-feature +kl -target-feature +widekl -Wno-implicit-function-declaration -fclangir -emit-llvm -o %t.ll %s
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -ffreestanding -triple=x86_64-unknown-linux -target-feature +kl -target-feature +widekl -emit-llvm -Wall -Werror %s -o - | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -ffreestanding -triple=x86_64-unknown-linux -target-feature +kl -target-feature +widekl -emit-llvm -Wall -Werror %s -o - | FileCheck %s -check-prefix=OGCG
+
+// This test mimics clang/test/CodeGen/X86/keylocker.c, which eventually
+// CIR shall be able to support fully.
+
+#include <x86intrin.h>
+
+// CIR: !rec_anon_struct = !cir.record<struct  {!u8i, !cir.vector<2 x !s64i>}>
+// CIR: !rec_anon_struct1 = !cir.record<struct  {!u8i, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>}>
+
+unsigned char test_mm_aesenc256kl_u8(__m128i *odata, __m128i idata, const void *h) {
+  // CIR-LABEL: _mm_aesenc256kl_u8
+  // CIR:  %[[RESULT:.+]] = cir.call_llvm_intrinsic "x86.aesenc256kl" %[[IDATA:.+]], %[[H:.+]] : (!cir.vector<2 x !s64i>, !cir.ptr<!void>) -> !rec_anon_struct
+  // CIR:  %[[FLAG:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct -> !u8i
+  // CIR:  %[[FLAG_BIT0:.+]] = cir.cast integral %[[FLAG]] : !u8i -> !cir.int<u, 1>
+  // CIR:  %[[SUCC:.+]] = cir.cast int_to_bool %[[FLAG_BIT0]] : !cir.int<u, 1> -> !cir.bool
+  // CIR:  %[[OUT:.+]] = cir.extract_member %[[RESULT]][1] : !rec_anon_struct -> !cir.vector<2 x !s64i>
+  // CIR:  cir.if %[[SUCC]] {
+  // CIR:    cir.store align(16) %[[OUT]], %[[ODATA_PTR:.+]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  } else {
+  // CIR:    %[[NULL:.+]] = cir.const #cir.zero : !cir.vector<2 x !s64i>
+  // CIR:    cir.store align(16) %[[NULL]], %[[ODATA_PTR]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  }
+  // CIR:  %[[FLAG1:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct -> !u8i
+  // CIR:  cir.store %[[FLAG1]], %[[RET_ADDR:.+]] : !u8i, !cir.ptr<!u8i>
+  // CIR:  %[[RET:.+]] = cir.load %[[RET_ADDR]] : !cir.ptr<!u8i>, !u8i
+  // CIR:  cir.return %[[RET]] : !u8i
+
+  // LLVM-LABEL: _mm_aesenc256kl_u8
+  // LLVM:   %[[RESULT:.+]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> %[[IDATA:.+]], ptr %[[H:.+]])
+  // LLVM:   %[[FLAG:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+  // LLVM:   %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+  // LLVM:   %[[OUT:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 1
+  // LLVM:   br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+  // LLVM: [[NO_ERROR]]:
+  // LLVM:   store <2 x i64> %[[OUT]], ptr %[[ODATA_PTR:.+]], align 16
+  // LLVM:   br label %[[EXIT:.+]]
+  // LLVM: [[ERROR]]:
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+  // LLVM:   br label %[[EXIT]]
+  // LLVM: [[EXIT]]:
+  // LLVM:   %[[FLAG1:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+  // LLVM:   store i8 %[[FLAG1]], ptr %[[FLAG1_ADDR:.+]], align 1
+  // LLVM:   %[[FLAG2:.+]] = load i8, ptr %[[FLAG1_ADDR]], align 1
+  // LLVM:   store i8 %[[FLAG2]], ptr %[[FLAG2_ADDR:.+]], align 1
+  // LLVM:   %[[RET:.+]] = load i8, ptr %[[FLAG2_ADDR]], align 1
+  // LLVM:   ret i8 %[[RET]]
+
+  // OGCG-LABEL: test_mm_aesenc256kl_u8
+  // OGCG:   %[[RESULT:.+]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> %[[IDATA:.+]], ptr %[[H:.+]])
+  // OGCG:   %[[FLAG:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+  // OGCG:   %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+  // OGCG:   %[[OUT:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 1
+  // OGCG:   br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+  // OGCG: [[NO_ERROR]]:
+  // OGCG:   store <2 x i64> %[[OUT]], ptr %[[ODATA_PTR:.+]], align 16
+  // OGCG:   br label %[[EXIT:.+]]
+  // OGCG: [[ERROR]]:
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+  // OGCG:   br label %[[EXIT]]
+  // OGCG: [[EXIT]]:
+  // OGCG:   %[[RET:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+  // OGCG:   ret i8 %[[RET]]
+  return _mm_aesenc256kl_u8(odata, idata, h);
+}
+
+unsigned char test_mm_aesdec256kl_u8(__m128i *odata, __m128i idata, const void *h) {
+  // CIR-LABEL: _mm_aesdec256kl_u8
+  // CIR:  %[[RESULT:.+]] = cir.call_llvm_intrinsic "x86.aesdec256kl" %[[IDATA:.+]], %[[H:.+]] : (!cir.vector<2 x !s64i>, !cir.ptr<!void>) -> !rec_anon_struct
+  // CIR:  %[[FLAG:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct -> !u8i
+  // CIR:  %[[FLAG_BIT0:.+]] = cir.cast integral %[[FLAG]] : !u8i -> !cir.int<u, 1>
+  // CIR:  %[[SUCC:.+]] = cir.cast int_to_bool %[[FLAG_BIT0]] : !cir.int<u, 1> -> !cir.bool
+  // CIR:  %[[OUT:.+]] = cir.extract_member %[[RESULT]][1] : !rec_anon_struct -> !cir.vector<2 x !s64i>
+  // CIR:  cir.if %[[SUCC]] {
+  // CIR:    cir.store align(16) %[[OUT]], %[[ODATA_PTR:.+]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  } else {
+  // CIR:    %[[NULL:.+]] = cir.const #cir.zero : !cir.vector<2 x !s64i>
+  // CIR:    cir.store align(16) %[[NULL]], %[[ODATA_PTR]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  }
+  // CIR:  %[[FLAG1:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct -> !u8i
+  // CIR:  cir.store %[[FLAG1]], %[[RET_ADDR:.+]] : !u8i, !cir.ptr<!u8i>
+  // CIR:  %[[RET:.+]] = cir.load %[[RET_ADDR]] : !cir.ptr<!u8i>, !u8i
+  // CIR:  cir.return %[[RET]] : !u8i
+
+  // LLVM-LABEL: _mm_aesdec256kl_u8
+  // LLVM:   %[[RESULT:.+]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> %[[IDATA:.+]], ptr %[[H:.+]])
+  // LLVM:   %[[FLAG:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+  // LLVM:   %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+  // LLVM:   %[[OUT:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 1
+  // LLVM:   br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+  // LLVM: [[NO_ERROR]]:
+  // LLVM:   store <2 x i64> %[[OUT]], ptr %[[ODATA_PTR:.+]], align 16
+  // LLVM:   br label %[[EXIT:.+]]
+  // LLVM: [[ERROR]]:
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+  // LLVM:   br label %[[EXIT]]
+  // LLVM: [[EXIT]]:
+  // LLVM:   %[[FLAG1:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+  // LLVM:   store i8 %[[FLAG1]], ptr %[[FLAG1_ADDR:.+]], align 1
+  // LLVM:   %[[FLAG2:.+]] = load i8, ptr %[[FLAG1_ADDR]], align 1
+  // LLVM:   store i8 %[[FLAG2]], ptr %[[FLAG2_ADDR:.+]], align 1
+  // LLVM:   %[[RET:.+]] = load i8, ptr %[[FLAG2_ADDR]], align 1
+  // LLVM:   ret i8 %[[RET]]
+
+  // OGCG-LABEL: test_mm_aesdec256kl_u8
+  // OGCG:   %[[RESULT:.+]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> %[[IDATA:.+]], ptr %[[H:.+]])
+  // OGCG:   %[[FLAG:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+  // OGCG:   %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+  // OGCG:   %[[OUT:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 1
+  // OGCG:   br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+  // OGCG: [[NO_ERROR]]:
+  // OGCG:   store <2 x i64> %[[OUT]], ptr %[[ODATA_PTR:.+]], align 16
+  // OGCG:   br label %[[EXIT:.+]]
+  // OGCG: [[ERROR]]:
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+  // OGCG:   br label %[[EXIT]]
+  // OGCG: [[EXIT]]:
+  // OGCG:   %[[RET:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+  // OGCG:   ret i8 %[[RET]]
+  return _mm_aesdec256kl_u8(odata, idata, h);
+}
+
+unsigned char test_mm_aesenc128kl_u8(__m128i *odata, __m128i idata, const void *h) {
+  // CIR-LABEL: _mm_aesenc128kl_u8
+  // CIR:  %[[RESULT:.+]] = cir.call_llvm_intrinsic "x86.aesenc128kl" %[[IDATA:.+]], %[[H:.+]] : (!cir.vector<2 x !s64i>, !cir.ptr<!void>) -> !rec_anon_struct
+  // CIR:  %[[FLAG:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct -> !u8i
+  // CIR:  %[[FLAG_BIT0:.+]] = cir.cast integral %[[FLAG]] : !u8i -> !cir.int<u, 1>
+  // CIR:  %[[SUCC:.+]] = cir.cast int_to_bool %[[FLAG_BIT0]] : !cir.int<u, 1> -> !cir.bool
+  // CIR:  %[[OUT:.+]] = cir.extract_member %[[RESULT]][1] : !rec_anon_struct -> !cir.vector<2 x !s64i>
+  // CIR:  cir.if %[[SUCC]] {
+  // CIR:    cir.store align(16) %[[OUT]], %[[ODATA_PTR:.+]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  } else {
+  // CIR:    %[[NULL:.+]] = cir.const #cir.zero : !cir.vector<2 x !s64i>
+  // CIR:    cir.store align(16) %[[NULL]], %[[ODATA_PTR]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  }
+  // CIR:  %[[FLAG1:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct -> !u8i
+  // CIR:  cir.store %[[FLAG1]], %[[RET_ADDR:.+]] : !u8i, !cir.ptr<!u8i>
+  // CIR:  %[[RET:.+]] = cir.load %[[RET_ADDR]] : !cir.ptr<!u8i>, !u8i
+  // CIR:  cir.return %[[RET]] : !u8i
+
+  // LLVM-LABEL: _mm_aesenc128kl_u8
+  // LLVM:   %[[RESULT:.+]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> %[[IDATA:.+]], ptr %[[H:.+]])
+  // LLVM:   %[[FLAG:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+  // LLVM:   %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+  // LLVM:   %[[OUT:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 1
+  // LLVM:   br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+  // LLVM: [[NO_ERROR]]:
+  // LLVM:   store <2 x i64> %[[OUT]], ptr %[[ODATA_PTR:.+]], align 16
+  // LLVM:   br label %[[EXIT:.+]]
+  // LLVM: [[ERROR]]:
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+  // LLVM:   br label %[[EXIT]]
+  // LLVM: [[EXIT]]:
+  // LLVM:   %[[FLAG1:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+  // LLVM:   store i8 %[[FLAG1]], ptr %[[FLAG1_ADDR:.+]], align 1
+  // LLVM:   %[[FLAG2:.+]] = load i8, ptr %[[FLAG1_ADDR]], align 1
+  // LLVM:   store i8 %[[FLAG2]], ptr %[[FLAG2_ADDR:.+]], align 1
+  // LLVM:   %[[RET:.+]] = load i8, ptr %[[FLAG2_ADDR]], align 1
+  // LLVM:   ret i8 %[[RET]]
+
+  // OGCG-LABEL: test_mm_aesenc128kl_u8
+  // OGCG:   %[[RESULT:.+]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> %[[IDATA:.+]], ptr %[[H:.+]])
+  // OGCG:   %[[FLAG:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+  // OGCG:   %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+  // OGCG:   %[[OUT:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 1
+  // OGCG:   br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+  // OGCG: [[NO_ERROR]]:
+  // OGCG:   store <2 x i64> %[[OUT]], ptr %[[ODATA_PTR:.+]], align 16
+  // OGCG:   br label %[[EXIT:.+]]
+  // OGCG: [[ERROR]]:
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+  // OGCG:   br label %[[EXIT]]
+  // OGCG: [[EXIT]]:
+  // OGCG:   %[[RET:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+  // OGCG:   ret i8 %[[RET]]
+  return _mm_aesenc128kl_u8(odata, idata, h);
+}
+
+unsigned char test_mm_aesdec128kl_u8(__m128i *odata, __m128i idata, const void *h) {
+  // CIR-LABEL: _mm_aesdec128kl_u8
+  // CIR:  %[[RESULT:.+]] = cir.call_llvm_intrinsic "x86.aesdec128kl" %[[IDATA:.+]], %[[H:.+]] : (!cir.vector<2 x !s64i>, !cir.ptr<!void>) -> !rec_anon_struct
+  // CIR:  %[[FLAG:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct -> !u8i
+  // CIR:  %[[FLAG_BIT0:.+]] = cir.cast integral %[[FLAG]] : !u8i -> !cir.int<u, 1>
+  // CIR:  %[[SUCC:.+]] = cir.cast int_to_bool %[[FLAG_BIT0]] : !cir.int<u, 1> -> !cir.bool
+  // CIR:  %[[OUT:.+]] = cir.extract_member %[[RESULT]][1] : !rec_anon_struct -> !cir.vector<2 x !s64i>
+  // CIR:  cir.if %[[SUCC]] {
+  // CIR:    cir.store align(16) %[[OUT]], %[[ODATA_PTR:.+]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  } else {
+  // CIR:    %[[NULL:.+]] = cir.const #cir.zero : !cir.vector<2 x !s64i>
+  // CIR:    cir.store align(16) %[[NULL]], %[[ODATA_PTR]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  }
+  // CIR:  %[[FLAG1:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct -> !u8i
+  // CIR:  cir.store %[[FLAG1]], %[[RET_ADDR:.+]] : !u8i, !cir.ptr<!u8i>
+  // CIR:  %[[RET:.+]] = cir.load %[[RET_ADDR]] : !cir.ptr<!u8i>, !u8i
+  // CIR:  cir.return %[[RET]] : !u8i
+
+  // LLVM-LABEL: _mm_aesdec128kl_u8
+  // LLVM:   %[[RESULT:.+]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> %[[IDATA:.+]], ptr %[[H:.+]])
+  // LLVM:   %[[FLAG:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+  // LLVM:   %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+  // LLVM:   %[[OUT:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 1
+  // LLVM:   br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+  // LLVM: [[NO_ERROR]]:
+  // LLVM:   store <2 x i64> %[[OUT]], ptr %[[ODATA_PTR:.+]], align 16
+  // LLVM:   br label %[[EXIT:.+]]
+  // LLVM: [[ERROR]]:
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+  // LLVM:   br label %[[EXIT]]
+  // LLVM: [[EXIT]]:
+  // LLVM:   %[[FLAG1:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+  // LLVM:   store i8 %[[FLAG1]], ptr %[[FLAG1_ADDR:.+]], align 1
+  // LLVM:   %[[FLAG2:.+]] = load i8, ptr %[[FLAG1_ADDR]], align 1
+  // LLVM:   store i8 %[[FLAG2]], ptr %[[FLAG2_ADDR:.+]], align 1
+  // LLVM:   %[[RET:.+]] = load i8, ptr %[[FLAG2_ADDR]], align 1
+  // LLVM:   ret i8 %[[RET]]
+
+  // OGCG-LABEL: test_mm_aesdec128kl_u8
+  // OGCG:   %[[RESULT:.+]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> %[[IDATA:.+]], ptr %[[H:.+]])
+  // OGCG:   %[[FLAG:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+  // OGCG:   %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+  // OGCG:   %[[OUT:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 1
+  // OGCG:   br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+  // OGCG: [[NO_ERROR]]:
+  // OGCG:   store <2 x i64> %[[OUT]], ptr %[[ODATA_PTR:.+]], align 16
+  // OGCG:   br label %[[EXIT:.+]]
+  // OGCG: [[ERROR]]:
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+  // OGCG:   br label %[[EXIT]]
+  // OGCG: [[EXIT]]:
+  // OGCG:   %[[RET:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+  // OGCG:   ret i8 %[[RET]]
+  return _mm_aesdec128kl_u8(odata, idata, h);
+}
+
+unsigned char test__mm_aesencwide256kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
+  // CIR-LABEL: _mm_aesencwide256kl_u8
+  // CIR:  %[[ZERO:.+]] = cir.const #cir.int<0> : !u32i
+  // CIR:  %[[IN_ADDR0:.+]] = cir.ptr_stride %[[IDATA_ADDR:.+]], %[[ZERO]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA0:.+]] = cir.load align(16) %[[IN_ADDR0]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[ONE:.+]] = cir.const #cir.int<1> : !u32i
+  // CIR:  %[[IN_ADDR1:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[ONE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA1:.+]] = cir.load align(16) %[[IN_ADDR1]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[TWO:.+]] = cir.const #cir.int<2> : !u32i
+  // CIR:  %[[IN_ADDR2:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[TWO]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA2:.+]] = cir.load align(16) %[[IN_ADDR2]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[THREE:.+]] = cir.const #cir.int<3> : !u32i
+  // CIR:  %[[IN_ADDR3:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[THREE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA3:.+]] = cir.load align(16) %[[IN_ADDR3]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[FOUR:.+]] = cir.const #cir.int<4> : !u32i
+  // CIR:  %[[IN_ADDR4:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[FOUR]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA4:.+]] = cir.load align(16) %[[IN_ADDR4]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[FIVE:.+]] = cir.const #cir.int<5> : !u32i
+  // CIR:  %[[IN_ADDR5:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[FIVE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA5:.+]] = cir.load align(16) %[[IN_ADDR5]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[SIX:.+]] = cir.const #cir.int<6> : !u32i
+  // CIR:  %[[IN_ADDR6:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[SIX]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA6:.+]] = cir.load align(16) %[[IN_ADDR6]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[SEVEN:.+]] = cir.const #cir.int<7> : !u32i
+  // CIR:  %[[IN_ADDR7:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[SEVEN]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA7:.+]] = cir.load align(16) %[[IN_ADDR7]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[RESULT:.+]] = cir.call_llvm_intrinsic "x86.aesencwide256kl" %[[H_ADDR:.+]], %[[IN_DATA0]], %[[IN_DATA1]], %[[IN_DATA2]], %[[IN_DATA3]], %[[IN_DATA4]], %[[IN_DATA5]], %[[IN_DATA6]], %[[IN_DATA7]] : (!cir.ptr<!void>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>) -> !rec_anon_struct1
+  // CIR:  %[[FLAG:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct1 -> !u8i
+  // CIR:  %[[FLAG_BIT0:.+]] = cir.cast integral %[[FLAG]] : !u8i -> !cir.int<u, 1>
+  // CIR:  %[[SUCC:.+]] = cir.cast int_to_bool %[[FLAG_BIT0]] : !cir.int<u, 1> -> !cir.bool
+  // CIR:  cir.if %[[SUCC]] {
+  // CIR:    %[[OUT_DATA0:.+]] = cir.extract_member %[[RESULT]][1] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[ZERO_1:.+]] = cir.const #cir.int<0> : !u32i
+  // CIR:    %[[OUT_ADDR0:.+]] = cir.ptr_stride %[[ODATA_ADDR:.+]], %[[ZERO_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA0]], %[[OUT_ADDR0]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA1:.+]] = cir.extract_member %[[RESULT]][2] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[ONE_1:.+]] = cir.const #cir.int<1> : !u32i
+  // CIR:    %[[OUT_ADDR1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ONE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA1]], %[[OUT_ADDR1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA2:.+]] = cir.extract_member %[[RESULT]][3] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[TWO_1:.+]] = cir.const #cir.int<2> : !u32i
+  // CIR:    %[[OUT_ADDR2:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[TWO_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA2]], %[[OUT_ADDR2]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA3:.+]] = cir.extract_member %[[RESULT]][4] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[THREE_1:.+]] = cir.const #cir.int<3> : !u32i
+  // CIR:    %[[OUT_ADDR3:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[THREE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA3]], %[[OUT_ADDR3]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA4:.+]] = cir.extract_member %[[RESULT]][5] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[FOUR_1:.+]] = cir.const #cir.int<4> : !u32i
+  // CIR:    %[[OUT_ADDR4:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FOUR_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA4]], %[[OUT_ADDR4]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA5:.+]] = cir.extract_member %[[RESULT]][6] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[FIVE_1:.+]] = cir.const #cir.int<5> : !u32i
+  // CIR:    %[[OUT_ADDR5:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FIVE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA5]], %[[OUT_ADDR5]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA6:.+]] = cir.extract_member %[[RESULT]][7] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[SIX_1:.+]] = cir.const #cir.int<6> : !u32i
+  // CIR:    %[[OUT_ADDR6:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SIX_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA6]], %[[OUT_ADDR6]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA7:.+]] = cir.extract_member %[[RESULT]][8] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[SEVEN_1:.+]] = cir.const #cir.int<7> : !u32i
+  // CIR:    %[[OUT_ADDR7:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SEVEN_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA7]], %[[OUT_ADDR7]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  } else {
+  // CIR:    %[[NULL:.+]] = cir.const #cir.zero : !cir.vector<2 x !s64i>
+  // CIR:    %[[ZERO_2:.+]] = cir.const #cir.int<0> : !u32i
+  // CIR:    %[[OUT_ADDR0_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ZERO_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR0_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[ONE_2:.+]] = cir.const #cir.int<1> : !u32i
+  // CIR:    %[[OUT_ADDR1_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ONE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR1_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[TWO_2:.+]] = cir.const #cir.int<2> : !u32i
+  // CIR:    %[[OUT_ADDR2_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[TWO_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR2_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[THREE_2:.+]] = cir.const #cir.int<3> : !u32i
+  // CIR:    %[[OUT_ADDR3_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[THREE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR3_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[FOUR_2:.+]] = cir.const #cir.int<4> : !u32i
+  // CIR:    %[[OUT_ADDR4_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FOUR_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR4_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[FIVE_2:.+]] = cir.const #cir.int<5> : !u32i
+  // CIR:    %[[OUT_ADDR5_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FIVE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR5_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[SIX_2:.+]] = cir.const #cir.int<6> : !u32i
+  // CIR:    %[[OUT_ADDR6_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SIX_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR6_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[SEVEN_2:.+]] = cir.const #cir.int<7> : !u32i
+  // CIR:    %[[OUT_ADDR7_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SEVEN_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR7_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  }
+  // CIR:  %[[FLAG1:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct1 -> !u8i
+  // CIR:  cir.store %[[FLAG1]], %[[RET_ADDR:.+]] : !u8i, !cir.ptr<!u8i>
+  // CIR:  %[[RET:.+]] = cir.load %[[RET_ADDR]] : !cir.ptr<!u8i>, !u8i
+  // CIR:  cir.return %[[RET]] : !u8i
+
+  // LLVM-LABEL: _mm_aesencwide256kl_u8
+  // LLVM:   %[[IN_DATA0:.+]] = load <2 x i64>, ptr %[[IDATA_ADDR:.+]], align 16
+  // LLVM:   %[[IN_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 1
+  // LLVM:   %[[IN_DATA1:.+]] = load <2 x i64>, ptr %[[IN_ADDR1]], align 16
+  // LLVM:   %[[IN_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 2
+  // LLVM:   %[[IN_DATA2:.+]] = load <2 x i64>, ptr %[[IN_ADDR2]], align 16
+  // LLVM:   %[[IN_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 3
+  // LLVM:   %[[IN_DATA3:.+]] = load <2 x i64>, ptr %[[IN_ADDR3]], align 16
+  // LLVM:   %[[IN_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 4
+  // LLVM:   %[[IN_DATA4:.+]] = load <2 x i64>, ptr %[[IN_ADDR4]], align 16
+  // LLVM:   %[[IN_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 5
+  // LLVM:   %[[IN_DATA5:.+]] = load <2 x i64>, ptr %[[IN_ADDR5]], align 16
+  // LLVM:   %[[IN_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 6
+  // LLVM:   %[[IN_DATA6:.+]] = load <2 x i64>, ptr %[[IN_ADDR6]], align 16
+  // LLVM:   %[[IN_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 7
+  // LLVM:   %[[IN_DATA7:.+]] = load <2 x i64>, ptr %[[IN_ADDR7]], align 16
+  // LLVM:   %[[RESULT:.+]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(ptr %[[H_ADDR:.+]], <2 x i64>  %[[IN_DATA0]], <2 x i64>  %[[IN_DATA1]], <2 x i64>  %[[IN_DATA2]], <2 x i64>  %[[IN_DATA3]], <2 x i64>  %[[IN_DATA4]], <2 x i64>  %[[IN_DATA5]], <2 x i64>  %[[IN_DATA6]], <2 x i64>  %[[IN_DATA7]])
+  // LLVM:   %[[FLAG:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+  // LLVM:   %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+  // LLVM:   br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+  // LLVM: [[NO_ERROR]]:
+  // LLVM:   %[[OUT_DATA0:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 1
+  // LLVM:   store <2 x i64> %[[OUT_DATA0]], ptr %[[ODATA_PTR:.+]], align 16
+  // LLVM:   %[[OUT_DATA1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 2
+  // LLVM:   %[[OUT_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 1
+  // LLVM:   store <2 x i64> %[[OUT_DATA1]], ptr %[[OUT_ADDR1]], align 16
+  // LLVM:   %[[OUT_DATA2:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 3
+  // LLVM:   %[[OUT_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 2
+  // LLVM:   store <2 x i64> %[[OUT_DATA2]], ptr %[[OUT_ADDR2]], align 16
+  // LLVM:   %[[OUT_DATA3:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 4
+  // LLVM:   %[[OUT_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 3
+  // LLVM:   store <2 x i64> %[[OUT_DATA3]], ptr %[[OUT_ADDR3]], align 16
+  // LLVM:   %[[OUT_DATA4:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 5
+  // LLVM:   %[[OUT_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 4
+  // LLVM:   store <2 x i64> %[[OUT_DATA4]], ptr %[[OUT_ADDR4]], align 16
+  // LLVM:   %[[OUT_DATA5:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 6
+  // LLVM:   %[[OUT_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 5
+  // LLVM:   store <2 x i64> %[[OUT_DATA5]], ptr %[[OUT_ADDR5]], align 16
+  // LLVM:   %[[OUT_DATA6:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 7
+  // LLVM:   %[[OUT_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 6
+  // LLVM:   store <2 x i64> %[[OUT_DATA6]], ptr %[[OUT_ADDR6]], align 16
+  // LLVM:   %[[OUT_DATA7:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 8
+  // LLVM:   %[[OUT_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 7
+  // LLVM:   store <2 x i64> %[[OUT_DATA7]], ptr %[[OUT_ADDR7]], align 16
+  // LLVM:   br label %[[EXIT:.+]]
+  // LLVM: [[ERROR]]:
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+  // LLVM:   %[[OUT_ADDR0_1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 1
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_1]], align 16
+  // LLVM:   %[[OUT_ADDR0_2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 2
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_2]], align 16
+  // LLVM:   %[[OUT_ADDR0_3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 3
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_3]], align 16
+  // LLVM:   %[[OUT_ADDR0_4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 4
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_4]], align 16
+  // LLVM:   %[[OUT_ADDR0_5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 5
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_5]], align 16
+  // LLVM:   %[[OUT_ADDR0_6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 6
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_6]], align 16
+  // LLVM:   %[[OUT_ADDR0_7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 7
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_7]], align 16
+  // LLVM:   br label %[[EXIT]]
+  // LLVM: [[EXIT]]:
+  // LLVM:   %[[FLAG1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+  // LLVM:   store i8 %[[FLAG1]], ptr %[[FLAG1_ADDR:.+]], align 1
+  // LLVM:   %[[FLAG2:.+]] = load i8, ptr %[[FLAG1_ADDR]], align 1
+  // LLVM:   store i8 %[[FLAG2]], ptr %[[FLAG2_ADDR:.+]], align 1
+  // LLVM:   %[[RET:.+]] = load i8, ptr %[[FLAG2_ADDR]], align 1
+  // LLVM:   ret i8 %[[RET]]
+
+  // OGCG-LABEL: _mm_aesencwide256kl_u8
+  // OGCG:   %[[IN_DATA0:.+]] = load <2 x i64>, ptr %[[IDATA_ADDR:.+]], align 16
+  // OGCG:   %[[IN_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 1
+  // OGCG:   %[[IN_DATA1:.+]] = load <2 x i64>, ptr %[[IN_ADDR1]], align 16
+  // OGCG:   %[[IN_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 2
+  // OGCG:   %[[IN_DATA2:.+]] = load <2 x i64>, ptr %[[IN_ADDR2]], align 16
+  // OGCG:   %[[IN_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 3
+  // OGCG:   %[[IN_DATA3:.+]] = load <2 x i64>, ptr %[[IN_ADDR3]], align 16
+  // OGCG:   %[[IN_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 4
+  // OGCG:   %[[IN_DATA4:.+]] = load <2 x i64>, ptr %[[IN_ADDR4]], align 16
+  // OGCG:   %[[IN_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 5
+  // OGCG:   %[[IN_DATA5:.+]] = load <2 x i64>, ptr %[[IN_ADDR5]], align 16
+  // OGCG:   %[[IN_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 6
+  // OGCG:   %[[IN_DATA6:.+]] = load <2 x i64>, ptr %[[IN_ADDR6]], align 16
+  // OGCG:   %[[IN_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 7
+  // OGCG:   %[[IN_DATA7:.+]] = load <2 x i64>, ptr %[[IN_ADDR7]], align 16
+  // OGCG:   %[[RESULT:.+]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(ptr %[[H_ADDR:.+]], <2 x i64>  %[[IN_DATA0]], <2 x i64>  %[[IN_DATA1]], <2 x i64>  %[[IN_DATA2]], <2 x i64>  %[[IN_DATA3]], <2 x i64>  %[[IN_DATA4]], <2 x i64>  %[[IN_DATA5]], <2 x i64>  %[[IN_DATA6]], <2 x i64>  %[[IN_DATA7]])
+  // OGCG:   %[[FLAG:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+  // OGCG:   %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+  // OGCG:   br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+  // OGCG: [[NO_ERROR]]:
+  // OGCG:   %[[OUT_DATA0:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 1
+  // OGCG:   store <2 x i64> %[[OUT_DATA0]], ptr %[[ODATA_PTR:.+]], align 16
+  // OGCG:   %[[OUT_DATA1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 2
+  // OGCG:   %[[OUT_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 1
+  // OGCG:   store <2 x i64> %[[OUT_DATA1]], ptr %[[OUT_ADDR1]], align 16
+  // OGCG:   %[[OUT_DATA2:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 3
+  // OGCG:   %[[OUT_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 2
+  // OGCG:   store <2 x i64> %[[OUT_DATA2]], ptr %[[OUT_ADDR2]], align 16
+  // OGCG:   %[[OUT_DATA3:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 4
+  // OGCG:   %[[OUT_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 3
+  // OGCG:   store <2 x i64> %[[OUT_DATA3]], ptr %[[OUT_ADDR3]], align 16
+  // OGCG:   %[[OUT_DATA4:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 5
+  // OGCG:   %[[OUT_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 4
+  // OGCG:   store <2 x i64> %[[OUT_DATA4]], ptr %[[OUT_ADDR4]], align 16
+  // OGCG:   %[[OUT_DATA5:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 6
+  // OGCG:   %[[OUT_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 5
+  // OGCG:   store <2 x i64> %[[OUT_DATA5]], ptr %[[OUT_ADDR5]], align 16
+  // OGCG:   %[[OUT_DATA6:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 7
+  // OGCG:   %[[OUT_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 6
+  // OGCG:   store <2 x i64> %[[OUT_DATA6]], ptr %[[OUT_ADDR6]], align 16
+  // OGCG:   %[[OUT_DATA7:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 8
+  // OGCG:   %[[OUT_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 7
+  // OGCG:   store <2 x i64> %[[OUT_DATA7]], ptr %[[OUT_ADDR7]], align 16
+  // OGCG:   br label %[[EXIT:.+]]
+  // OGCG: [[ERROR]]:
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+  // OGCG:   %[[OUT_ADDR0_1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 1
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_1]], align 16
+  // OGCG:   %[[OUT_ADDR0_2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 2
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_2]], align 16
+  // OGCG:   %[[OUT_ADDR0_3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 3
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_3]], align 16
+  // OGCG:   %[[OUT_ADDR0_4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 4
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_4]], align 16
+  // OGCG:   %[[OUT_ADDR0_5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 5
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_5]], align 16
+  // OGCG:   %[[OUT_ADDR0_6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 6
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_6]], align 16
+  // OGCG:   %[[OUT_ADDR0_7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 7
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_7]], align 16
+  // OGCG:   br label %[[EXIT]]
+  // OGCG: [[EXIT]]:
+  // OGCG:   %[[RET:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+  // OGCG:   ret i8 %[[RET]]
+  return _mm_aesencwide256kl_u8(odata, idata, h);
+}
+
+unsigned char test__mm_aesdecwide256kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
+  // CIR-LABEL: _mm_aesdecwide256kl_u8
+  // CIR:  %[[ZERO:.+]] = cir.const #cir.int<0> : !u32i
+  // CIR:  %[[IN_ADDR0:.+]] = cir.ptr_stride %[[IDATA_ADDR:.+]], %[[ZERO]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA0:.+]] = cir.load align(16) %[[IN_ADDR0]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[ONE:.+]] = cir.const #cir.int<1> : !u32i
+  // CIR:  %[[IN_ADDR1:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[ONE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA1:.+]] = cir.load align(16) %[[IN_ADDR1]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[TWO:.+]] = cir.const #cir.int<2> : !u32i
+  // CIR:  %[[IN_ADDR2:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[TWO]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA2:.+]] = cir.load align(16) %[[IN_ADDR2]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[THREE:.+]] = cir.const #cir.int<3> : !u32i
+  // CIR:  %[[IN_ADDR3:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[THREE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA3:.+]] = cir.load align(16) %[[IN_ADDR3]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[FOUR:.+]] = cir.const #cir.int<4> : !u32i
+  // CIR:  %[[IN_ADDR4:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[FOUR]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA4:.+]] = cir.load align(16) %[[IN_ADDR4]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[FIVE:.+]] = cir.const #cir.int<5> : !u32i
+  // CIR:  %[[IN_ADDR5:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[FIVE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA5:.+]] = cir.load align(16) %[[IN_ADDR5]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[SIX:.+]] = cir.const #cir.int<6> : !u32i
+  // CIR:  %[[IN_ADDR6:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[SIX]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA6:.+]] = cir.load align(16) %[[IN_ADDR6]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[SEVEN:.+]] = cir.const #cir.int<7> : !u32i
+  // CIR:  %[[IN_ADDR7:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[SEVEN]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA7:.+]] = cir.load align(16) %[[IN_ADDR7]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[RESULT:.+]] = cir.call_llvm_intrinsic "x86.aesdecwide256kl" %[[H_ADDR:.+]], %[[IN_DATA0]], %[[IN_DATA1]], %[[IN_DATA2]], %[[IN_DATA3]], %[[IN_DATA4]], %[[IN_DATA5]], %[[IN_DATA6]], %[[IN_DATA7]] : (!cir.ptr<!void>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>) -> !rec_anon_struct1
+  // CIR:  %[[FLAG:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct1 -> !u8i
+  // CIR:  %[[FLAG_BIT0:.+]] = cir.cast integral %[[FLAG]] : !u8i -> !cir.int<u, 1>
+  // CIR:  %[[SUCC:.+]] = cir.cast int_to_bool %[[FLAG_BIT0]] : !cir.int<u, 1> -> !cir.bool
+  // CIR:  cir.if %[[SUCC]] {
+  // CIR:    %[[OUT_DATA0:.+]] = cir.extract_member %[[RESULT]][1] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[ZERO_1:.+]] = cir.const #cir.int<0> : !u32i
+  // CIR:    %[[OUT_ADDR0:.+]] = cir.ptr_stride %[[ODATA_ADDR:.+]], %[[ZERO_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA0]], %[[OUT_ADDR0]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA1:.+]] = cir.extract_member %[[RESULT]][2] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[ONE_1:.+]] = cir.const #cir.int<1> : !u32i
+  // CIR:    %[[OUT_ADDR1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ONE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA1]], %[[OUT_ADDR1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA2:.+]] = cir.extract_member %[[RESULT]][3] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[TWO_1:.+]] = cir.const #cir.int<2> : !u32i
+  // CIR:    %[[OUT_ADDR2:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[TWO_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA2]], %[[OUT_ADDR2]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA3:.+]] = cir.extract_member %[[RESULT]][4] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[THREE_1:.+]] = cir.const #cir.int<3> : !u32i
+  // CIR:    %[[OUT_ADDR3:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[THREE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA3]], %[[OUT_ADDR3]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA4:.+]] = cir.extract_member %[[RESULT]][5] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[FOUR_1:.+]] = cir.const #cir.int<4> : !u32i
+  // CIR:    %[[OUT_ADDR4:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FOUR_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA4]], %[[OUT_ADDR4]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA5:.+]] = cir.extract_member %[[RESULT]][6] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[FIVE_1:.+]] = cir.const #cir.int<5> : !u32i
+  // CIR:    %[[OUT_ADDR5:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FIVE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA5]], %[[OUT_ADDR5]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA6:.+]] = cir.extract_member %[[RESULT]][7] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[SIX_1:.+]] = cir.const #cir.int<6> : !u32i
+  // CIR:    %[[OUT_ADDR6:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SIX_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA6]], %[[OUT_ADDR6]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA7:.+]] = cir.extract_member %[[RESULT]][8] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[SEVEN_1:.+]] = cir.const #cir.int<7> : !u32i
+  // CIR:    %[[OUT_ADDR7:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SEVEN_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA7]], %[[OUT_ADDR7]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  } else {
+  // CIR:    %[[NULL:.+]] = cir.const #cir.zero : !cir.vector<2 x !s64i>
+  // CIR:    %[[ZERO_2:.+]] = cir.const #cir.int<0> : !u32i
+  // CIR:    %[[OUT_ADDR0_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ZERO_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR0_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[ONE_2:.+]] = cir.const #cir.int<1> : !u32i
+  // CIR:    %[[OUT_ADDR1_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ONE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR1_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[TWO_2:.+]] = cir.const #cir.int<2> : !u32i
+  // CIR:    %[[OUT_ADDR2_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[TWO_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR2_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[THREE_2:.+]] = cir.const #cir.int<3> : !u32i
+  // CIR:    %[[OUT_ADDR3_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[THREE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR3_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[FOUR_2:.+]] = cir.const #cir.int<4> : !u32i
+  // CIR:    %[[OUT_ADDR4_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FOUR_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR4_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[FIVE_2:.+]] = cir.const #cir.int<5> : !u32i
+  // CIR:    %[[OUT_ADDR5_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FIVE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR5_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[SIX_2:.+]] = cir.const #cir.int<6> : !u32i
+  // CIR:    %[[OUT_ADDR6_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SIX_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR6_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[SEVEN_2:.+]] = cir.const #cir.int<7> : !u32i
+  // CIR:    %[[OUT_ADDR7_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SEVEN_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR7_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  }
+  // CIR:  %[[FLAG1:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct1 -> !u8i
+  // CIR:  cir.store %[[FLAG1]], %[[RET_ADDR:.+]] : !u8i, !cir.ptr<!u8i>
+  // CIR:  %[[RET:.+]] = cir.load %[[RET_ADDR]] : !cir.ptr<!u8i>, !u8i
+  // CIR:  cir.return %[[RET]] : !u8i
+
+  // LLVM-LABEL: _mm_aesdecwide256kl_u8
+  // LLVM:   %[[IN_DATA0:.+]] = load <2 x i64>, ptr %[[IDATA_ADDR:.+]], align 16
+  // LLVM:   %[[IN_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 1
+  // LLVM:   %[[IN_DATA1:.+]] = load <2 x i64>, ptr %[[IN_ADDR1]], align 16
+  // LLVM:   %[[IN_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 2
+  // LLVM:   %[[IN_DATA2:.+]] = load <2 x i64>, ptr %[[IN_ADDR2]], align 16
+  // LLVM:   %[[IN_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 3
+  // LLVM:   %[[IN_DATA3:.+]] = load <2 x i64>, ptr %[[IN_ADDR3]], align 16
+  // LLVM:   %[[IN_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 4
+  // LLVM:   %[[IN_DATA4:.+]] = load <2 x i64>, ptr %[[IN_ADDR4]], align 16
+  // LLVM:   %[[IN_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 5
+  // LLVM:   %[[IN_DATA5:.+]] = load <2 x i64>, ptr %[[IN_ADDR5]], align 16
+  // LLVM:   %[[IN_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 6
+  // LLVM:   %[[IN_DATA6:.+]] = load <2 x i64>, ptr %[[IN_ADDR6]], align 16
+  // LLVM:   %[[IN_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 7
+  // LLVM:   %[[IN_DATA7:.+]] = load <2 x i64>, ptr %[[IN_ADDR7]], align 16
+  // LLVM:   %[[RESULT:.+]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(ptr %[[H_ADDR:.+]], <2 x i64>  %[[IN_DATA0]], <2 x i64>  %[[IN_DATA1]], <2 x i64>  %[[IN_DATA2]], <2 x i64>  %[[IN_DATA3]], <2 x i64>  %[[IN_DATA4]], <2 x i64>  %[[IN_DATA5]], <2 x i64>  %[[IN_DATA6]], <2 x i64>  %[[IN_DATA7]])
+  // LLVM:   %[[FLAG:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+  // LLVM:   %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+  // LLVM:   br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+  // LLVM: [[NO_ERROR]]:
+  // LLVM:   %[[OUT_DATA0:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 1
+  // LLVM:   store <2 x i64> %[[OUT_DATA0]], ptr %[[ODATA_PTR:.+]], align 16
+  // LLVM:   %[[OUT_DATA1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 2
+  // LLVM:   %[[OUT_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 1
+  // LLVM:   store <2 x i64> %[[OUT_DATA1]], ptr %[[OUT_ADDR1]], align 16
+  // LLVM:   %[[OUT_DATA2:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 3
+  // LLVM:   %[[OUT_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 2
+  // LLVM:   store <2 x i64> %[[OUT_DATA2]], ptr %[[OUT_ADDR2]], align 16
+  // LLVM:   %[[OUT_DATA3:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 4
+  // LLVM:   %[[OUT_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 3
+  // LLVM:   store <2 x i64> %[[OUT_DATA3]], ptr %[[OUT_ADDR3]], align 16
+  // LLVM:   %[[OUT_DATA4:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 5
+  // LLVM:   %[[OUT_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 4
+  // LLVM:   store <2 x i64> %[[OUT_DATA4]], ptr %[[OUT_ADDR4]], align 16
+  // LLVM:   %[[OUT_DATA5:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 6
+  // LLVM:   %[[OUT_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 5
+  // LLVM:   store <2 x i64> %[[OUT_DATA5]], ptr %[[OUT_ADDR5]], align 16
+  // LLVM:   %[[OUT_DATA6:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 7
+  // LLVM:   %[[OUT_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 6
+  // LLVM:   store <2 x i64> %[[OUT_DATA6]], ptr %[[OUT_ADDR6]], align 16
+  // LLVM:   %[[OUT_DATA7:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 8
+  // LLVM:   %[[OUT_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 7
+  // LLVM:   store <2 x i64> %[[OUT_DATA7]], ptr %[[OUT_ADDR7]], align 16
+  // LLVM:   br label %[[EXIT:.+]]
+  // LLVM: [[ERROR]]:
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+  // LLVM:   %[[OUT_ADDR0_1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 1
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_1]], align 16
+  // LLVM:   %[[OUT_ADDR0_2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 2
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_2]], align 16
+  // LLVM:   %[[OUT_ADDR0_3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 3
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_3]], align 16
+  // LLVM:   %[[OUT_ADDR0_4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 4
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_4]], align 16
+  // LLVM:   %[[OUT_ADDR0_5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 5
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_5]], align 16
+  // LLVM:   %[[OUT_ADDR0_6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 6
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_6]], align 16
+  // LLVM:   %[[OUT_ADDR0_7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 7
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_7]], align 16
+  // LLVM:   br label %[[EXIT]]
+  // LLVM: [[EXIT]]:
+  // LLVM:   %[[FLAG1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+  // LLVM:   store i8 %[[FLAG1]], ptr %[[FLAG1_ADDR:.+]], align 1
+  // LLVM:   %[[FLAG2:.+]] = load i8, ptr %[[FLAG1_ADDR]], align 1
+  // LLVM:   store i8 %[[FLAG2]], ptr %[[FLAG2_ADDR:.+]], align 1
+  // LLVM:   %[[RET:.+]] = load i8, ptr %[[FLAG2_ADDR]], align 1
+  // LLVM:   ret i8 %[[RET]]
+
+  // OGCG-LABEL: _mm_aesdecwide256kl_u8
+  // OGCG:   %[[IN_DATA0:.+]] = load <2 x i64>, ptr %[[IDATA_ADDR:.+]], align 16
+  // OGCG:   %[[IN_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 1
+  // OGCG:   %[[IN_DATA1:.+]] = load <2 x i64>, ptr %[[IN_ADDR1]], align 16
+  // OGCG:   %[[IN_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 2
+  // OGCG:   %[[IN_DATA2:.+]] = load <2 x i64>, ptr %[[IN_ADDR2]], align 16
+  // OGCG:   %[[IN_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 3
+  // OGCG:   %[[IN_DATA3:.+]] = load <2 x i64>, ptr %[[IN_ADDR3]], align 16
+  // OGCG:   %[[IN_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 4
+  // OGCG:   %[[IN_DATA4:.+]] = load <2 x i64>, ptr %[[IN_ADDR4]], align 16
+  // OGCG:   %[[IN_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 5
+  // OGCG:   %[[IN_DATA5:.+]] = load <2 x i64>, ptr %[[IN_ADDR5]], align 16
+  // OGCG:   %[[IN_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 6
+  // OGCG:   %[[IN_DATA6:.+]] = load <2 x i64>, ptr %[[IN_ADDR6]], align 16
+  // OGCG:   %[[IN_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 7
+  // OGCG:   %[[IN_DATA7:.+]] = load <2 x i64>, ptr %[[IN_ADDR7]], align 16
+  // OGCG:   %[[RESULT:.+]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(ptr %[[H_ADDR:.+]], <2 x i64>  %[[IN_DATA0]], <2 x i64>  %[[IN_DATA1]], <2 x i64>  %[[IN_DATA2]], <2 x i64>  %[[IN_DATA3]], <2 x i64>  %[[IN_DATA4]], <2 x i64>  %[[IN_DATA5]], <2 x i64>  %[[IN_DATA6]], <2 x i64>  %[[IN_DATA7]])
+  // OGCG:   %[[FLAG:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+  // OGCG:   %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+  // OGCG:   br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+  // OGCG: [[NO_ERROR]]:
+  // OGCG:   %[[OUT_DATA0:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 1
+  // OGCG:   store <2 x i64> %[[OUT_DATA0]], ptr %[[ODATA_PTR:.+]], align 16
+  // OGCG:   %[[OUT_DATA1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 2
+  // OGCG:   %[[OUT_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 1
+  // OGCG:   store <2 x i64> %[[OUT_DATA1]], ptr %[[OUT_ADDR1]], align 16
+  // OGCG:   %[[OUT_DATA2:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 3
+  // OGCG:   %[[OUT_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 2
+  // OGCG:   store <2 x i64> %[[OUT_DATA2]], ptr %[[OUT_ADDR2]], align 16
+  // OGCG:   %[[OUT_DATA3:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 4
+  // OGCG:   %[[OUT_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 3
+  // OGCG:   store <2 x i64> %[[OUT_DATA3]], ptr %[[OUT_ADDR3]], align 16
+  // OGCG:   %[[OUT_DATA4:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 5
+  // OGCG:   %[[OUT_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 4
+  // OGCG:   store <2 x i64> %[[OUT_DATA4]], ptr %[[OUT_ADDR4]], align 16
+  // OGCG:   %[[OUT_DATA5:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 6
+  // OGCG:   %[[OUT_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 5
+  // OGCG:   store <2 x i64> %[[OUT_DATA5]], ptr %[[OUT_ADDR5]], align 16
+  // OGCG:   %[[OUT_DATA6:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 7
+  // OGCG:   %[[OUT_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 6
+  // OGCG:   store <2 x i64> %[[OUT_DATA6]], ptr %[[OUT_ADDR6]], align 16
+  // OGCG:   %[[OUT_DATA7:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 8
+  // OGCG:   %[[OUT_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 7
+  // OGCG:   store <2 x i64> %[[OUT_DATA7]], ptr %[[OUT_ADDR7]], align 16
+  // OGCG:   br label %[[EXIT:.+]]
+  // OGCG: [[ERROR]]:
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+  // OGCG:   %[[OUT_ADDR0_1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 1
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_1]], align 16
+  // OGCG:   %[[OUT_ADDR0_2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 2
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_2]], align 16
+  // OGCG:   %[[OUT_ADDR0_3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 3
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_3]], align 16
+  // OGCG:   %[[OUT_ADDR0_4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 4
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_4]], align 16
+  // OGCG:   %[[OUT_ADDR0_5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 5
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_5]], align 16
+  // OGCG:   %[[OUT_ADDR0_6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 6
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_6]], align 16
+  // OGCG:   %[[OUT_ADDR0_7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 7
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_7]], align 16
+  // OGCG:   br label %[[EXIT]]
+  // OGCG: [[EXIT]]:
+  // OGCG:   %[[RET:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+  // OGCG:   ret i8 %[[RET]]
+  return _mm_aesdecwide256kl_u8(odata, idata, h);
+}
+
+unsigned char test__mm_aesencwide128kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) { // Exercises _mm_aesencwide128kl_u8: 8 input vectors, 8 output vectors (zeroed on failure), status byte returned.
+  // CIR-LABEL: _mm_aesencwide128kl_u8
+  // CIR:  %[[ZERO:.+]] = cir.const #cir.int<0> : !u32i
+  // CIR:  %[[IN_ADDR0:.+]] = cir.ptr_stride %[[IDATA_ADDR:.+]], %[[ZERO]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA0:.+]] = cir.load align(16) %[[IN_ADDR0]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[ONE:.+]] = cir.const #cir.int<1> : !u32i
+  // CIR:  %[[IN_ADDR1:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[ONE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA1:.+]] = cir.load align(16) %[[IN_ADDR1]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[TWO:.+]] = cir.const #cir.int<2> : !u32i
+  // CIR:  %[[IN_ADDR2:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[TWO]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA2:.+]] = cir.load align(16) %[[IN_ADDR2]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[THREE:.+]] = cir.const #cir.int<3> : !u32i
+  // CIR:  %[[IN_ADDR3:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[THREE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA3:.+]] = cir.load align(16) %[[IN_ADDR3]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[FOUR:.+]] = cir.const #cir.int<4> : !u32i
+  // CIR:  %[[IN_ADDR4:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[FOUR]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA4:.+]] = cir.load align(16) %[[IN_ADDR4]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[FIVE:.+]] = cir.const #cir.int<5> : !u32i
+  // CIR:  %[[IN_ADDR5:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[FIVE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA5:.+]] = cir.load align(16) %[[IN_ADDR5]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[SIX:.+]] = cir.const #cir.int<6> : !u32i
+  // CIR:  %[[IN_ADDR6:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[SIX]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA6:.+]] = cir.load align(16) %[[IN_ADDR6]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[SEVEN:.+]] = cir.const #cir.int<7> : !u32i
+  // CIR:  %[[IN_ADDR7:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[SEVEN]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA7:.+]] = cir.load align(16) %[[IN_ADDR7]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[RESULT:.+]] = cir.call_llvm_intrinsic "x86.aesencwide128kl" %[[H_ADDR:.+]], %[[IN_DATA0]], %[[IN_DATA1]], %[[IN_DATA2]], %[[IN_DATA3]], %[[IN_DATA4]], %[[IN_DATA5]], %[[IN_DATA6]], %[[IN_DATA7]] : (!cir.ptr<!void>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>) -> !rec_anon_struct1
+  // CIR:  %[[FLAG:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct1 -> !u8i
+  // CIR:  %[[FLAG_BIT0:.+]] = cir.cast integral %[[FLAG]] : !u8i -> !cir.int<u, 1>
+  // CIR:  %[[SUCC:.+]] = cir.cast int_to_bool %[[FLAG_BIT0]] : !cir.int<u, 1> -> !cir.bool
+  // CIR:  cir.if %[[SUCC]] {
+  // CIR:    %[[OUT_DATA0:.+]] = cir.extract_member %[[RESULT]][1] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[ZERO_1:.+]] = cir.const #cir.int<0> : !u32i
+  // CIR:    %[[OUT_ADDR0:.+]] = cir.ptr_stride %[[ODATA_ADDR:.+]], %[[ZERO_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA0]], %[[OUT_ADDR0]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA1:.+]] = cir.extract_member %[[RESULT]][2] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[ONE_1:.+]] = cir.const #cir.int<1> : !u32i
+  // CIR:    %[[OUT_ADDR1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ONE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA1]], %[[OUT_ADDR1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA2:.+]] = cir.extract_member %[[RESULT]][3] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[TWO_1:.+]] = cir.const #cir.int<2> : !u32i
+  // CIR:    %[[OUT_ADDR2:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[TWO_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA2]], %[[OUT_ADDR2]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA3:.+]] = cir.extract_member %[[RESULT]][4] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[THREE_1:.+]] = cir.const #cir.int<3> : !u32i
+  // CIR:    %[[OUT_ADDR3:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[THREE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA3]], %[[OUT_ADDR3]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA4:.+]] = cir.extract_member %[[RESULT]][5] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[FOUR_1:.+]] = cir.const #cir.int<4> : !u32i
+  // CIR:    %[[OUT_ADDR4:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FOUR_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA4]], %[[OUT_ADDR4]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA5:.+]] = cir.extract_member %[[RESULT]][6] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[FIVE_1:.+]] = cir.const #cir.int<5> : !u32i
+  // CIR:    %[[OUT_ADDR5:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FIVE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA5]], %[[OUT_ADDR5]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA6:.+]] = cir.extract_member %[[RESULT]][7] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[SIX_1:.+]] = cir.const #cir.int<6> : !u32i
+  // CIR:    %[[OUT_ADDR6:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SIX_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA6]], %[[OUT_ADDR6]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA7:.+]] = cir.extract_member %[[RESULT]][8] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[SEVEN_1:.+]] = cir.const #cir.int<7> : !u32i
+  // CIR:    %[[OUT_ADDR7:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SEVEN_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA7]], %[[OUT_ADDR7]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  } else {
+  // CIR:    %[[NULL:.+]] = cir.const #cir.zero : !cir.vector<2 x !s64i>
+  // CIR:    %[[ZERO_2:.+]] = cir.const #cir.int<0> : !u32i
+  // CIR:    %[[OUT_ADDR0_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ZERO_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR0_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[ONE_2:.+]] = cir.const #cir.int<1> : !u32i
+  // CIR:    %[[OUT_ADDR1_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ONE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR1_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[TWO_2:.+]] = cir.const #cir.int<2> : !u32i
+  // CIR:    %[[OUT_ADDR2_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[TWO_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR2_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[THREE_2:.+]] = cir.const #cir.int<3> : !u32i
+  // CIR:    %[[OUT_ADDR3_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[THREE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR3_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[FOUR_2:.+]] = cir.const #cir.int<4> : !u32i
+  // CIR:    %[[OUT_ADDR4_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FOUR_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR4_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[FIVE_2:.+]] = cir.const #cir.int<5> : !u32i
+  // CIR:    %[[OUT_ADDR5_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FIVE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR5_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[SIX_2:.+]] = cir.const #cir.int<6> : !u32i
+  // CIR:    %[[OUT_ADDR6_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SIX_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR6_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[SEVEN_2:.+]] = cir.const #cir.int<7> : !u32i
+  // CIR:    %[[OUT_ADDR7_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SEVEN_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR7_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  }
+  // CIR:  %[[FLAG1:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct1 -> !u8i
+  // CIR:  cir.store %[[FLAG1]], %[[RET_ADDR:.+]] : !u8i, !cir.ptr<!u8i>
+  // CIR:  %[[RET:.+]] = cir.load %[[RET_ADDR]] : !cir.ptr<!u8i>, !u8i
+  // CIR:  cir.return %[[RET]] : !u8i
+
+  // LLVM-LABEL: _mm_aesencwide128kl_u8
+  // LLVM:   %[[IN_DATA0:.+]] = load <2 x i64>, ptr %[[IDATA_ADDR:.+]], align 16
+  // LLVM:   %[[IN_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 1
+  // LLVM:   %[[IN_DATA1:.+]] = load <2 x i64>, ptr %[[IN_ADDR1]], align 16
+  // LLVM:   %[[IN_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 2
+  // LLVM:   %[[IN_DATA2:.+]] = load <2 x i64>, ptr %[[IN_ADDR2]], align 16
+  // LLVM:   %[[IN_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 3
+  // LLVM:   %[[IN_DATA3:.+]] = load <2 x i64>, ptr %[[IN_ADDR3]], align 16
+  // LLVM:   %[[IN_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 4
+  // LLVM:   %[[IN_DATA4:.+]] = load <2 x i64>, ptr %[[IN_ADDR4]], align 16
+  // LLVM:   %[[IN_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 5
+  // LLVM:   %[[IN_DATA5:.+]] = load <2 x i64>, ptr %[[IN_ADDR5]], align 16
+  // LLVM:   %[[IN_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 6
+  // LLVM:   %[[IN_DATA6:.+]] = load <2 x i64>, ptr %[[IN_ADDR6]], align 16
+  // LLVM:   %[[IN_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 7
+  // LLVM:   %[[IN_DATA7:.+]] = load <2 x i64>, ptr %[[IN_ADDR7]], align 16
+  // LLVM:   %[[RESULT:.+]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(ptr %[[H_ADDR:.+]], <2 x i64>  %[[IN_DATA0]], <2 x i64>  %[[IN_DATA1]], <2 x i64>  %[[IN_DATA2]], <2 x i64>  %[[IN_DATA3]], <2 x i64>  %[[IN_DATA4]], <2 x i64>  %[[IN_DATA5]], <2 x i64>  %[[IN_DATA6]], <2 x i64>  %[[IN_DATA7]])
+  // LLVM:   %[[FLAG:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+  // LLVM:   %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+  // LLVM:   br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+  // LLVM: [[NO_ERROR]]:
+  // LLVM:   %[[OUT_DATA0:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 1
+  // LLVM:   store <2 x i64> %[[OUT_DATA0]], ptr %[[ODATA_PTR:.+]], align 16
+  // LLVM:   %[[OUT_DATA1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 2
+  // LLVM:   %[[OUT_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 1
+  // LLVM:   store <2 x i64> %[[OUT_DATA1]], ptr %[[OUT_ADDR1]], align 16
+  // LLVM:   %[[OUT_DATA2:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 3
+  // LLVM:   %[[OUT_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 2
+  // LLVM:   store <2 x i64> %[[OUT_DATA2]], ptr %[[OUT_ADDR2]], align 16
+  // LLVM:   %[[OUT_DATA3:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 4
+  // LLVM:   %[[OUT_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 3
+  // LLVM:   store <2 x i64> %[[OUT_DATA3]], ptr %[[OUT_ADDR3]], align 16
+  // LLVM:   %[[OUT_DATA4:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 5
+  // LLVM:   %[[OUT_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 4
+  // LLVM:   store <2 x i64> %[[OUT_DATA4]], ptr %[[OUT_ADDR4]], align 16
+  // LLVM:   %[[OUT_DATA5:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 6
+  // LLVM:   %[[OUT_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 5
+  // LLVM:   store <2 x i64> %[[OUT_DATA5]], ptr %[[OUT_ADDR5]], align 16
+  // LLVM:   %[[OUT_DATA6:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 7
+  // LLVM:   %[[OUT_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 6
+  // LLVM:   store <2 x i64> %[[OUT_DATA6]], ptr %[[OUT_ADDR6]], align 16
+  // LLVM:   %[[OUT_DATA7:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 8
+  // LLVM:   %[[OUT_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 7
+  // LLVM:   store <2 x i64> %[[OUT_DATA7]], ptr %[[OUT_ADDR7]], align 16
+  // LLVM:   br label %[[EXIT:.+]]
+  // LLVM: [[ERROR]]:
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+  // LLVM:   %[[OUT_ADDR0_1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 1
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_1]], align 16
+  // LLVM:   %[[OUT_ADDR0_2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 2
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_2]], align 16
+  // LLVM:   %[[OUT_ADDR0_3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 3
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_3]], align 16
+  // LLVM:   %[[OUT_ADDR0_4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 4
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_4]], align 16
+  // LLVM:   %[[OUT_ADDR0_5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 5
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_5]], align 16
+  // LLVM:   %[[OUT_ADDR0_6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 6
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_6]], align 16
+  // LLVM:   %[[OUT_ADDR0_7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 7
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_7]], align 16
+  // LLVM:   br label %[[EXIT]]
+  // LLVM: [[EXIT]]:
+  // LLVM:   %[[FLAG1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+  // LLVM:   store i8 %[[FLAG1]], ptr %[[FLAG1_ADDR:.+]], align 1
+  // LLVM:   %[[FLAG2:.+]] = load i8, ptr %[[FLAG1_ADDR]], align 1
+  // LLVM:   store i8 %[[FLAG2]], ptr %[[FLAG2_ADDR:.+]], align 1
+  // LLVM:   %[[RET:.+]] = load i8, ptr %[[FLAG2_ADDR]], align 1
+  // LLVM:   ret i8 %[[RET]]
+
+  // OGCG-LABEL: _mm_aesencwide128kl_u8
+  // OGCG:   %[[IN_DATA0:.+]] = load <2 x i64>, ptr %[[IDATA_ADDR:.+]], align 16
+  // OGCG:   %[[IN_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 1
+  // OGCG:   %[[IN_DATA1:.+]] = load <2 x i64>, ptr %[[IN_ADDR1]], align 16
+  // OGCG:   %[[IN_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 2
+  // OGCG:   %[[IN_DATA2:.+]] = load <2 x i64>, ptr %[[IN_ADDR2]], align 16
+  // OGCG:   %[[IN_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 3
+  // OGCG:   %[[IN_DATA3:.+]] = load <2 x i64>, ptr %[[IN_ADDR3]], align 16
+  // OGCG:   %[[IN_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 4
+  // OGCG:   %[[IN_DATA4:.+]] = load <2 x i64>, ptr %[[IN_ADDR4]], align 16
+  // OGCG:   %[[IN_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 5
+  // OGCG:   %[[IN_DATA5:.+]] = load <2 x i64>, ptr %[[IN_ADDR5]], align 16
+  // OGCG:   %[[IN_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 6
+  // OGCG:   %[[IN_DATA6:.+]] = load <2 x i64>, ptr %[[IN_ADDR6]], align 16
+  // OGCG:   %[[IN_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 7
+  // OGCG:   %[[IN_DATA7:.+]] = load <2 x i64>, ptr %[[IN_ADDR7]], align 16
+  // OGCG:   %[[RESULT:.+]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(ptr %[[H_ADDR:.+]], <2 x i64>  %[[IN_DATA0]], <2 x i64>  %[[IN_DATA1]], <2 x i64>  %[[IN_DATA2]], <2 x i64>  %[[IN_DATA3]], <2 x i64>  %[[IN_DATA4]], <2 x i64>  %[[IN_DATA5]], <2 x i64>  %[[IN_DATA6]], <2 x i64>  %[[IN_DATA7]])
+  // OGCG:   %[[FLAG:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+  // OGCG:   %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+  // OGCG:   br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+  // OGCG: [[NO_ERROR]]:
+  // OGCG:   %[[OUT_DATA0:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 1
+  // OGCG:   store <2 x i64> %[[OUT_DATA0]], ptr %[[ODATA_PTR:.+]], align 16
+  // OGCG:   %[[OUT_DATA1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 2
+  // OGCG:   %[[OUT_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 1
+  // OGCG:   store <2 x i64> %[[OUT_DATA1]], ptr %[[OUT_ADDR1]], align 16
+  // OGCG:   %[[OUT_DATA2:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 3
+  // OGCG:   %[[OUT_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 2
+  // OGCG:   store <2 x i64> %[[OUT_DATA2]], ptr %[[OUT_ADDR2]], align 16
+  // OGCG:   %[[OUT_DATA3:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 4
+  // OGCG:   %[[OUT_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 3
+  // OGCG:   store <2 x i64> %[[OUT_DATA3]], ptr %[[OUT_ADDR3]], align 16
+  // OGCG:   %[[OUT_DATA4:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 5
+  // OGCG:   %[[OUT_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 4
+  // OGCG:   store <2 x i64> %[[OUT_DATA4]], ptr %[[OUT_ADDR4]], align 16
+  // OGCG:   %[[OUT_DATA5:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 6
+  // OGCG:   %[[OUT_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 5
+  // OGCG:   store <2 x i64> %[[OUT_DATA5]], ptr %[[OUT_ADDR5]], align 16
+  // OGCG:   %[[OUT_DATA6:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 7
+  // OGCG:   %[[OUT_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 6
+  // OGCG:   store <2 x i64> %[[OUT_DATA6]], ptr %[[OUT_ADDR6]], align 16
+  // OGCG:   %[[OUT_DATA7:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 8
+  // OGCG:   %[[OUT_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 7
+  // OGCG:   store <2 x i64> %[[OUT_DATA7]], ptr %[[OUT_ADDR7]], align 16
+  // OGCG:   br label %[[EXIT:.+]]
+  // OGCG: [[ERROR]]:
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+  // OGCG:   %[[OUT_ADDR0_1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 1
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_1]], align 16
+  // OGCG:   %[[OUT_ADDR0_2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 2
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_2]], align 16
+  // OGCG:   %[[OUT_ADDR0_3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 3
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_3]], align 16
+  // OGCG:   %[[OUT_ADDR0_4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 4
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_4]], align 16
+  // OGCG:   %[[OUT_ADDR0_5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 5
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_5]], align 16
+  // OGCG:   %[[OUT_ADDR0_6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 6
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_6]], align 16
+  // OGCG:   %[[OUT_ADDR0_7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 7
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_7]], align 16
+  // OGCG:   br label %[[EXIT]]
+  // OGCG: [[EXIT]]:
+  // OGCG:   %[[RET:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+  // OGCG:   ret i8 %[[RET]]
+  return _mm_aesencwide128kl_u8(odata, idata, h);
+}
+
+unsigned char test__mm_aesdecwide128kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
+  // CIR-LABEL: _mm_aesdecwide128kl_u8
+  // CIR:  %[[ZERO:.+]] = cir.const #cir.int<0> : !u32i
+  // CIR:  %[[IN_ADDR0:.+]] = cir.ptr_stride %[[IDATA_ADDR:.+]], %[[ZERO]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA0:.+]] = cir.load align(16) %[[IN_ADDR0]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[ONE:.+]] = cir.const #cir.int<1> : !u32i
+  // CIR:  %[[IN_ADDR1:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[ONE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA1:.+]] = cir.load align(16) %[[IN_ADDR1]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[TWO:.+]] = cir.const #cir.int<2> : !u32i
+  // CIR:  %[[IN_ADDR2:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[TWO]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA2:.+]] = cir.load align(16) %[[IN_ADDR2]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[THREE:.+]] = cir.const #cir.int<3> : !u32i
+  // CIR:  %[[IN_ADDR3:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[THREE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA3:.+]] = cir.load align(16) %[[IN_ADDR3]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[FOUR:.+]] = cir.const #cir.int<4> : !u32i
+  // CIR:  %[[IN_ADDR4:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[FOUR]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA4:.+]] = cir.load align(16) %[[IN_ADDR4]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[FIVE:.+]] = cir.const #cir.int<5> : !u32i
+  // CIR:  %[[IN_ADDR5:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[FIVE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA5:.+]] = cir.load align(16) %[[IN_ADDR5]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[SIX:.+]] = cir.const #cir.int<6> : !u32i
+  // CIR:  %[[IN_ADDR6:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[SIX]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA6:.+]] = cir.load align(16) %[[IN_ADDR6]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[SEVEN:.+]] = cir.const #cir.int<7> : !u32i
+  // CIR:  %[[IN_ADDR7:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[SEVEN]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  %[[IN_DATA7:.+]] = cir.load align(16) %[[IN_ADDR7]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+  // CIR:  %[[RESULT:.+]] = cir.call_llvm_intrinsic "x86.aesdecwide128kl" %[[H_ADDR:.+]], %[[IN_DATA0]], %[[IN_DATA1]], %[[IN_DATA2]], %[[IN_DATA3]], %[[IN_DATA4]], %[[IN_DATA5]], %[[IN_DATA6]], %[[IN_DATA7]] : (!cir.ptr<!void>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>) -> !rec_anon_struct1
+  // CIR:  %[[FLAG:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct1 -> !u8i
+  // CIR:  %[[FLAG_BIT0:.+]] = cir.cast integral %[[FLAG]] : !u8i -> !cir.int<u, 1>
+  // CIR:  %[[SUCC:.+]] = cir.cast int_to_bool %[[FLAG_BIT0]] : !cir.int<u, 1> -> !cir.bool
+  // CIR:  cir.if %[[SUCC]] {
+  // CIR:    %[[OUT_DATA0:.+]] = cir.extract_member %[[RESULT]][1] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[ZERO_1:.+]] = cir.const #cir.int<0> : !u32i
+  // CIR:    %[[OUT_ADDR0:.+]] = cir.ptr_stride %[[ODATA_ADDR:.+]], %[[ZERO_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA0]], %[[OUT_ADDR0]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA1:.+]] = cir.extract_member %[[RESULT]][2] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[ONE_1:.+]] = cir.const #cir.int<1> : !u32i
+  // CIR:    %[[OUT_ADDR1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ONE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA1]], %[[OUT_ADDR1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA2:.+]] = cir.extract_member %[[RESULT]][3] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[TWO_1:.+]] = cir.const #cir.int<2> : !u32i
+  // CIR:    %[[OUT_ADDR2:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[TWO_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA2]], %[[OUT_ADDR2]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA3:.+]] = cir.extract_member %[[RESULT]][4] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[THREE_1:.+]] = cir.const #cir.int<3> : !u32i
+  // CIR:    %[[OUT_ADDR3:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[THREE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA3]], %[[OUT_ADDR3]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA4:.+]] = cir.extract_member %[[RESULT]][5] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[FOUR_1:.+]] = cir.const #cir.int<4> : !u32i
+  // CIR:    %[[OUT_ADDR4:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FOUR_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA4]], %[[OUT_ADDR4]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA5:.+]] = cir.extract_member %[[RESULT]][6] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[FIVE_1:.+]] = cir.const #cir.int<5> : !u32i
+  // CIR:    %[[OUT_ADDR5:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FIVE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA5]], %[[OUT_ADDR5]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA6:.+]] = cir.extract_member %[[RESULT]][7] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[SIX_1:.+]] = cir.const #cir.int<6> : !u32i
+  // CIR:    %[[OUT_ADDR6:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SIX_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA6]], %[[OUT_ADDR6]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[OUT_DATA7:.+]] = cir.extract_member %[[RESULT]][8] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+  // CIR:    %[[SEVEN_1:.+]] = cir.const #cir.int<7> : !u32i
+  // CIR:    %[[OUT_ADDR7:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SEVEN_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[OUT_DATA7]], %[[OUT_ADDR7]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  } else {
+  // CIR:    %[[NULL:.+]] = cir.const #cir.zero : !cir.vector<2 x !s64i>
+  // CIR:    %[[ZERO_2:.+]] = cir.const #cir.int<0> : !u32i
+  // CIR:    %[[OUT_ADDR0_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ZERO_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR0_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[ONE_2:.+]] = cir.const #cir.int<1> : !u32i
+  // CIR:    %[[OUT_ADDR1_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ONE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR1_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[TWO_2:.+]] = cir.const #cir.int<2> : !u32i
+  // CIR:    %[[OUT_ADDR2_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[TWO_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR2_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[THREE_2:.+]] = cir.const #cir.int<3> : !u32i
+  // CIR:    %[[OUT_ADDR3_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[THREE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR3_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[FOUR_2:.+]] = cir.const #cir.int<4> : !u32i
+  // CIR:    %[[OUT_ADDR4_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FOUR_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR4_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[FIVE_2:.+]] = cir.const #cir.int<5> : !u32i
+  // CIR:    %[[OUT_ADDR5_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FIVE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR5_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[SIX_2:.+]] = cir.const #cir.int<6> : !u32i
+  // CIR:    %[[OUT_ADDR6_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SIX_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR6_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    %[[SEVEN_2:.+]] = cir.const #cir.int<7> : !u32i
+  // CIR:    %[[OUT_ADDR7_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SEVEN_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:    cir.store align(16) %[[NULL]], %[[OUT_ADDR7_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+  // CIR:  }
+  // CIR:  %[[FLAG1:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct1 -> !u8i
+  // CIR:  cir.store %[[FLAG1]], %[[RET_ADDR:.+]] : !u8i, !cir.ptr<!u8i>
+  // CIR:  %[[RET:.+]] = cir.load %[[RET_ADDR]] : !cir.ptr<!u8i>, !u8i
+  // CIR:  cir.return %[[RET]] : !u8i
+
+  // LLVM-LABEL: _mm_aesdecwide128kl_u8
+  // LLVM:   %[[IN_DATA0:.+]] = load <2 x i64>, ptr %[[IDATA_ADDR:.+]], align 16
+  // LLVM:   %[[IN_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 1
+  // LLVM:   %[[IN_DATA1:.+]] = load <2 x i64>, ptr %[[IN_ADDR1]], align 16
+  // LLVM:   %[[IN_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 2
+  // LLVM:   %[[IN_DATA2:.+]] = load <2 x i64>, ptr %[[IN_ADDR2]], align 16
+  // LLVM:   %[[IN_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 3
+  // LLVM:   %[[IN_DATA3:.+]] = load <2 x i64>, ptr %[[IN_ADDR3]], align 16
+  // LLVM:   %[[IN_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 4
+  // LLVM:   %[[IN_DATA4:.+]] = load <2 x i64>, ptr %[[IN_ADDR4]], align 16
+  // LLVM:   %[[IN_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 5
+  // LLVM:   %[[IN_DATA5:.+]] = load <2 x i64>, ptr %[[IN_ADDR5]], align 16
+  // LLVM:   %[[IN_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 6
+  // LLVM:   %[[IN_DATA6:.+]] = load <2 x i64>, ptr %[[IN_ADDR6]], align 16
+  // LLVM:   %[[IN_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 7
+  // LLVM:   %[[IN_DATA7:.+]] = load <2 x i64>, ptr %[[IN_ADDR7]], align 16
+  // LLVM:   %[[RESULT:.+]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(ptr %[[H_ADDR:.+]], <2 x i64>  %[[IN_DATA0]], <2 x i64>  %[[IN_DATA1]], <2 x i64>  %[[IN_DATA2]], <2 x i64>  %[[IN_DATA3]], <2 x i64>  %[[IN_DATA4]], <2 x i64>  %[[IN_DATA5]], <2 x i64>  %[[IN_DATA6]], <2 x i64>  %[[IN_DATA7]])
+  // LLVM:   %[[FLAG:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+  // LLVM:   %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+  // LLVM:   br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+  // LLVM: [[NO_ERROR]]:
+  // LLVM:   %[[OUT_DATA0:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 1
+  // LLVM:   store <2 x i64> %[[OUT_DATA0]], ptr %[[ODATA_PTR:.+]], align 16
+  // LLVM:   %[[OUT_DATA1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 2
+  // LLVM:   %[[OUT_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 1
+  // LLVM:   store <2 x i64> %[[OUT_DATA1]], ptr %[[OUT_ADDR1]], align 16
+  // LLVM:   %[[OUT_DATA2:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 3
+  // LLVM:   %[[OUT_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 2
+  // LLVM:   store <2 x i64> %[[OUT_DATA2]], ptr %[[OUT_ADDR2]], align 16
+  // LLVM:   %[[OUT_DATA3:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 4
+  // LLVM:   %[[OUT_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 3
+  // LLVM:   store <2 x i64> %[[OUT_DATA3]], ptr %[[OUT_ADDR3]], align 16
+  // LLVM:   %[[OUT_DATA4:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 5
+  // LLVM:   %[[OUT_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 4
+  // LLVM:   store <2 x i64> %[[OUT_DATA4]], ptr %[[OUT_ADDR4]], align 16
+  // LLVM:   %[[OUT_DATA5:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 6
+  // LLVM:   %[[OUT_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 5
+  // LLVM:   store <2 x i64> %[[OUT_DATA5]], ptr %[[OUT_ADDR5]], align 16
+  // LLVM:   %[[OUT_DATA6:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 7
+  // LLVM:   %[[OUT_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 6
+  // LLVM:   store <2 x i64> %[[OUT_DATA6]], ptr %[[OUT_ADDR6]], align 16
+  // LLVM:   %[[OUT_DATA7:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 8
+  // LLVM:   %[[OUT_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 7
+  // LLVM:   store <2 x i64> %[[OUT_DATA7]], ptr %[[OUT_ADDR7]], align 16
+  // LLVM:   br label %[[EXIT:.+]]
+  // LLVM: [[ERROR]]:
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+  // LLVM:   %[[OUT_ADDR0_1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 1
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_1]], align 16
+  // LLVM:   %[[OUT_ADDR0_2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 2
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_2]], align 16
+  // LLVM:   %[[OUT_ADDR0_3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 3
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_3]], align 16
+  // LLVM:   %[[OUT_ADDR0_4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 4
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_4]], align 16
+  // LLVM:   %[[OUT_ADDR0_5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 5
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_5]], align 16
+  // LLVM:   %[[OUT_ADDR0_6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 6
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_6]], align 16
+  // LLVM:   %[[OUT_ADDR0_7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 7
+  // LLVM:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_7]], align 16
+  // LLVM:   br label %[[EXIT]]
+  // LLVM: [[EXIT]]:
+  // LLVM:   %[[FLAG1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+  // LLVM:   store i8 %[[FLAG1]], ptr %[[FLAG1_ADDR:.+]], align 1
+  // LLVM:   %[[FLAG2:.+]] = load i8, ptr %[[FLAG1_ADDR]], align 1
+  // LLVM:   store i8 %[[FLAG2]], ptr %[[FLAG2_ADDR:.+]], align 1
+  // LLVM:   %[[RET:.+]] = load i8, ptr %[[FLAG2_ADDR]], align 1
+  // LLVM:   ret i8 %[[RET]]
+
+  // OGCG-LABEL: _mm_aesdecwide128kl_u8
+  // OGCG:   %[[IN_DATA0:.+]] = load <2 x i64>, ptr %[[IDATA_ADDR:.+]], align 16
+  // OGCG:   %[[IN_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 1
+  // OGCG:   %[[IN_DATA1:.+]] = load <2 x i64>, ptr %[[IN_ADDR1]], align 16
+  // OGCG:   %[[IN_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 2
+  // OGCG:   %[[IN_DATA2:.+]] = load <2 x i64>, ptr %[[IN_ADDR2]], align 16
+  // OGCG:   %[[IN_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 3
+  // OGCG:   %[[IN_DATA3:.+]] = load <2 x i64>, ptr %[[IN_ADDR3]], align 16
+  // OGCG:   %[[IN_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 4
+  // OGCG:   %[[IN_DATA4:.+]] = load <2 x i64>, ptr %[[IN_ADDR4]], align 16
+  // OGCG:   %[[IN_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 5
+  // OGCG:   %[[IN_DATA5:.+]] = load <2 x i64>, ptr %[[IN_ADDR5]], align 16
+  // OGCG:   %[[IN_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 6
+  // OGCG:   %[[IN_DATA6:.+]] = load <2 x i64>, ptr %[[IN_ADDR6]], align 16
+  // OGCG:   %[[IN_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 7
+  // OGCG:   %[[IN_DATA7:.+]] = load <2 x i64>, ptr %[[IN_ADDR7]], align 16
+  // OGCG:   %[[RESULT:.+]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(ptr %[[H_ADDR:.+]], <2 x i64>  %[[IN_DATA0]], <2 x i64>  %[[IN_DATA1]], <2 x i64>  %[[IN_DATA2]], <2 x i64>  %[[IN_DATA3]], <2 x i64>  %[[IN_DATA4]], <2 x i64>  %[[IN_DATA5]], <2 x i64>  %[[IN_DATA6]], <2 x i64>  %[[IN_DATA7]])
+  // OGCG:   %[[FLAG:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+  // OGCG:   %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+  // OGCG:   br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+  // OGCG: [[NO_ERROR]]:
+  // OGCG:   %[[OUT_DATA0:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 1
+  // OGCG:   store <2 x i64> %[[OUT_DATA0]], ptr %[[ODATA_PTR:.+]], align 16
+  // OGCG:   %[[OUT_DATA1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 2
+  // OGCG:   %[[OUT_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 1
+  // OGCG:   store <2 x i64> %[[OUT_DATA1]], ptr %[[OUT_ADDR1]], align 16
+  // OGCG:   %[[OUT_DATA2:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 3
+  // OGCG:   %[[OUT_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 2
+  // OGCG:   store <2 x i64> %[[OUT_DATA2]], ptr %[[OUT_ADDR2]], align 16
+  // OGCG:   %[[OUT_DATA3:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 4
+  // OGCG:   %[[OUT_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 3
+  // OGCG:   store <2 x i64> %[[OUT_DATA3]], ptr %[[OUT_ADDR3]], align 16
+  // OGCG:   %[[OUT_DATA4:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 5
+  // OGCG:   %[[OUT_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 4
+  // OGCG:   store <2 x i64> %[[OUT_DATA4]], ptr %[[OUT_ADDR4]], align 16
+  // OGCG:   %[[OUT_DATA5:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 6
+  // OGCG:   %[[OUT_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 5
+  // OGCG:   store <2 x i64> %[[OUT_DATA5]], ptr %[[OUT_ADDR5]], align 16
+  // OGCG:   %[[OUT_DATA6:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 7
+  // OGCG:   %[[OUT_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 6
+  // OGCG:   store <2 x i64> %[[OUT_DATA6]], ptr %[[OUT_ADDR6]], align 16
+  // OGCG:   %[[OUT_DATA7:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 8
+  // OGCG:   %[[OUT_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 7
+  // OGCG:   store <2 x i64> %[[OUT_DATA7]], ptr %[[OUT_ADDR7]], align 16
+  // OGCG:   br label %[[EXIT:.+]]
+  // OGCG: [[ERROR]]:
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+  // OGCG:   %[[OUT_ADDR0_1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 1
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_1]], align 16
+  // OGCG:   %[[OUT_ADDR0_2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 2
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_2]], align 16
+  // OGCG:   %[[OUT_ADDR0_3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 3
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_3]], align 16
+  // OGCG:   %[[OUT_ADDR0_4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 4
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_4]], align 16
+  // OGCG:   %[[OUT_ADDR0_5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 5
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_5]], align 16
+  // OGCG:   %[[OUT_ADDR0_6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 6
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_6]], align 16
+  // OGCG:   %[[OUT_ADDR0_7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 7
+  // OGCG:   store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_7]], align 16
+  // OGCG:   br label %[[EXIT]]
+  // OGCG: [[EXIT]]:
+  // OGCG:   %[[RET:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+  // OGCG:   ret i8 %[[RET]]
+  return _mm_aesdecwide128kl_u8(odata, idata, h);
+}



More information about the cfe-commits mailing list