mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-01-31 03:27:39 +00:00
957c61cbbf
This change upgrades to GCC 12.3 and GNU binutils 2.42. The GNU linker appears to have changed things so that only a single de-duplicated str table is present in the binary, and it gets placed wherever the linker wants, regardless of what the linker script says. To cope with that we need to stop using .ident to embed licenses. As such, this change does significant work to revamp how third party licenses are defined in the codebase, using `.section .notice,"aR",@progbits`. This new GCC 12.3 toolchain has support for GNU indirect functions. It lets us support __target_clones__ for the first time. This is used for optimizing the performance of libc string functions such as strlen and friends so far on x86, by ensuring AVX systems favor a second codepath that uses VEX encoding. It shaves some latency off certain operations. It's a useful feature to have for scientific computing for the reasons explained by the test/libcxx/openmp_test.cc example which compiles for fifteen different microarchitectures. Thanks to the upgrades, it's now also possible to use newer instruction sets, such as AVX512FP16, VNNI. Cosmo now uses the %gs register on x86 by default for TLS. Doing it is helpful for any program that links `cosmo_dlopen()`. Such programs had to recompile their binaries at startup to change the TLS instructions. That's not great, since it means every page in the executable needs to be faulted. The work of rewriting TLS-related x86 opcodes, is moved to fixupobj.com instead. This is great news for MacOS x86 users, since we previously needed to morph the binary every time for that platform but now that's no longer necessary. The only platforms where we need fixup of TLS x86 opcodes at runtime are now Windows, OpenBSD, and NetBSD. On Windows we morph TLS to point deeper into the TIB, based on a TlsAlloc assignment, and on OpenBSD/NetBSD we morph %gs back into %fs since the kernels do not allow us to specify a value for the %gs register. 
OpenBSD users are now required to use APE Loader to run Cosmo binaries and assimilation is no longer possible. OpenBSD kernel needs to change to allow programs to specify a value for the %gs register, or it needs to stop marking executable pages loaded by the kernel as mimmutable(). This release fixes __constructor__, .ctor, .init_array, and lastly the .preinit_array so they behave the exact same way as glibc. We no longer use hex constants to define math.h symbols like M_PI.
452 lines
20 KiB
C
452 lines
20 KiB
C
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:4;coding:utf-8 -*-│
|
|
│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │
|
|
╞══════════════════════════════════════════════════════════════════════════════╡
|
|
│ Copyright The Mbed TLS Contributors │
|
|
│ │
|
|
│ Licensed under the Apache License, Version 2.0 (the "License"); │
|
|
│ you may not use this file except in compliance with the License. │
|
|
│ You may obtain a copy of the License at │
|
|
│ │
|
|
│ http://www.apache.org/licenses/LICENSE-2.0 │
|
|
│ │
|
|
│ Unless required by applicable law or agreed to in writing, software │
|
|
│ distributed under the License is distributed on an "AS IS" BASIS, │
|
|
│ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. │
|
|
│ See the License for the specific language governing permissions and │
|
|
│ limitations under the License. │
|
|
╚─────────────────────────────────────────────────────────────────────────────*/
|
|
#include "third_party/mbedtls/aesni.h"
|
|
#include "libc/serialize.h"
|
|
#include "libc/str/str.h"
|
|
#include "third_party/mbedtls/common.h"
|
|
__static_yoink("mbedtls_notice");
|
|
|
|
/*
|
|
* [AES-WP] http://software.intel.com/en-us/articles/intel-advanced-encryption-standard-aes-instructions-set
|
|
* [CLMUL-WP] http://software.intel.com/en-us/articles/intel-carry-less-multiplication-instruction-and-its-usage-for-computing-the-gcm-mode/
|
|
*/
|
|
|
|
#if defined(MBEDTLS_AESNI_C)
|
|
#if defined(MBEDTLS_HAVE_X86_64)
|
|
|
|
/*
 * Raw encodings for the AES-NI and CLMUL instructions.
 *
 * Binutils older than 2.19 does not know these mnemonics, so the
 * instruction bytes can be emitted directly with `.byte`. Opcodes are
 * taken from the Intel architecture reference manual, vol. 3.
 *
 * Every opcode macro ends with a trailing comma: a ModRM macro (and,
 * where the instruction takes one, an immediate) is appended to it by
 * string-literal concatenation.
 *
 * Only register-to-register forms are used, so no prefixes for memory
 * operands are required. ModRM macros are named in gas order (src_dst)
 * rather than Intel order (dst, src) so that they read like the
 * surrounding assembly.
 */

/* AES round instructions (0x66 0x0F 0x38 opcode map) */
#define AESENC ".byte 0x66,0x0F,0x38,0xDC,"
#define AESENCLAST ".byte 0x66,0x0F,0x38,0xDD,"
#define AESDEC ".byte 0x66,0x0F,0x38,0xDE,"
#define AESDECLAST ".byte 0x66,0x0F,0x38,0xDF,"
#define AESIMC ".byte 0x66,0x0F,0x38,0xDB,"

/* Instructions followed by an immediate byte (0x66 0x0F 0x3A opcode map) */
#define AESKEYGENA ".byte 0x66,0x0F,0x3A,0xDF,"
#define PCLMULQDQ ".byte 0x66,0x0F,0x3A,0x44,"

/*
 * ModRM bytes, register-direct (top two bits set): bits 5..3 select the
 * destination register and bits 2..0 the source register.
 */
#define xmm0_xmm0 "0xC0"
#define xmm0_xmm1 "0xC8"
#define xmm0_xmm2 "0xD0"
#define xmm0_xmm3 "0xD8"
#define xmm0_xmm4 "0xE0"
#define xmm1_xmm0 "0xC1"
#define xmm1_xmm2 "0xD1"
|
|
|
|
/**
|
|
* \brief Internal AES-NI AES-ECB block encryption and decryption
|
|
*
|
|
* \note This function is only for internal use by other library
|
|
* functions; you must not call it directly.
|
|
*
|
|
* \param ctx AES context
|
|
* \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT
|
|
* \param input 16-byte input block
|
|
* \param output 16-byte output block
|
|
*
|
|
* \return 0 on success (cannot fail)
|
|
*/
|
|
int mbedtls_aesni_crypt_ecb( mbedtls_aes_context *ctx,
|
|
int mode,
|
|
const unsigned char input[16],
|
|
unsigned char output[16] )
|
|
{
|
|
asm( "movdqu (%3), %%xmm0 \n\t" // load input
|
|
"movdqu (%1), %%xmm1 \n\t" // load round key 0
|
|
"pxor %%xmm1, %%xmm0 \n\t" // round 0
|
|
"add $16, %1 \n\t" // point to next round key
|
|
"subl $1, %0 \n\t" // normal rounds = nr - 1
|
|
"test %2, %2 \n\t" // mode?
|
|
"jz 2f \n\t" // 0 = decrypt
|
|
|
|
"1: \n\t" // encryption loop
|
|
"movdqu (%1), %%xmm1 \n\t" // load round key
|
|
"aesenc %%xmm1, %%xmm0 \n\t" // do round
|
|
"add $16, %1 \n\t" // point to next round key
|
|
"subl $1, %0 \n\t" // loop
|
|
"jnz 1b \n\t"
|
|
"movdqu (%1), %%xmm1 \n\t" // load round key
|
|
"aesenclast %%xmm1, %%xmm0 \n\t" // last round
|
|
"jmp 3f \n\t"
|
|
|
|
"2: \n\t" // decryption loop
|
|
"movdqu (%1), %%xmm1 \n\t"
|
|
"aesdec %%xmm1, %%xmm0 \n\t" // do round
|
|
"add $16, %1 \n\t"
|
|
"subl $1, %0 \n\t"
|
|
"jnz 2b \n\t"
|
|
"movdqu (%1), %%xmm1 \n\t" // load round key
|
|
"aesdeclast %%xmm1,%%xmm0 \n\t" // last round
|
|
|
|
"3: \n\t"
|
|
"movdqu %%xmm0, (%4) \n\t" // export output
|
|
:
|
|
: "r" (ctx->nr), "r" (ctx->rk), "r" (mode), "r" (input), "r" (output)
|
|
: "memory", "cc", "xmm0", "xmm1" );
|
|
|
|
|
|
return( 0 );
|
|
}
|
|
|
|
/**
|
|
* \brief Internal GCM multiplication: a = a * b in GF(2^128)
|
|
* Based on [CLMUL-WP] algorithms 1 (with equation 27) and 5.
|
|
*
|
|
* \note This function is intended for internal use.
|
|
*
|
|
* \param a First operand (big endian)
|
|
* \param b Second operand (host endian)
|
|
*
|
|
* \note Both operands and result are bit strings interpreted as
|
|
* elements of GF(2^128) as per the GCM spec.
|
|
*/
|
|
void mbedtls_aesni_gcm_mult( unsigned char a[16], const uint64_t b[2] )
|
|
{
|
|
uint64_t aa __attribute__((__vector_size__(16), __aligned__(16)));
|
|
uint64_t bb __attribute__((__vector_size__(16), __aligned__(16)));
|
|
|
|
/* The inputs are in big-endian order, so byte-reverse them */
|
|
aa[0] = READ64BE(a+8);
|
|
aa[1] = READ64BE(a+0);
|
|
memcpy(&bb, b, 16);
|
|
|
|
asm( /*
|
|
* Caryless multiplication xmm2:xmm1 = xmm0 * xmm1
|
|
* using [CLMUL-WP] algorithm 1 (p. 13).
|
|
*/
|
|
"movdqa %1, %%xmm2 \n\t" // copy of b1:b0
|
|
"movdqa %1, %%xmm3 \n\t" // same
|
|
"movdqa %1, %%xmm4 \n\t" // same
|
|
"pclmullqlqdq %0,%1 \n\t" // a0*b0 = c1:c0
|
|
"pclmulhqhqdq %0,%%xmm2 \n\t" // a1*b1 = d1:d0
|
|
"pclmullqhqdq %0,%%xmm3 \n\t" // a0*b1 = e1:e0
|
|
"pclmulhqlqdq %0,%%xmm4 \n\t" // a1*b0 = f1:f0
|
|
"pxor %%xmm3, %%xmm4 \n\t" // e1+f1:e0+f0
|
|
"movdqa %%xmm4, %%xmm3 \n\t" // same
|
|
"psrldq $8, %%xmm4 \n\t" // 0:e1+f1
|
|
"pslldq $8, %%xmm3 \n\t" // e0+f0:0
|
|
"pxor %%xmm4, %%xmm2 \n\t" // d1:d0+e1+f1
|
|
"pxor %%xmm3, %1 \n\t" // c1+e0+f1:c0
|
|
|
|
/*
|
|
* Now shift the result one bit to the left,
|
|
* taking advantage of [CLMUL-WP] eq 27 (p. 20)
|
|
*/
|
|
"movdqa %1, %%xmm3 \n\t" // r1:r0
|
|
"movdqa %%xmm2, %%xmm4 \n\t" // r3:r2
|
|
"psllq $1, %1 \n\t" // r1<<1:r0<<1
|
|
"psllq $1, %%xmm2 \n\t" // r3<<1:r2<<1
|
|
"psrlq $63, %%xmm3 \n\t" // r1>>63:r0>>63
|
|
"psrlq $63, %%xmm4 \n\t" // r3>>63:r2>>63
|
|
"movdqa %%xmm3, %%xmm5 \n\t" // r1>>63:r0>>63
|
|
"pslldq $8, %%xmm3 \n\t" // r0>>63:0
|
|
"pslldq $8, %%xmm4 \n\t" // r2>>63:0
|
|
"psrldq $8, %%xmm5 \n\t" // 0:r1>>63
|
|
"por %%xmm3, %1 \n\t" // r1<<1|r0>>63:r0<<1
|
|
"por %%xmm4, %%xmm2 \n\t" // r3<<1|r2>>62:r2<<1
|
|
"por %%xmm5, %%xmm2 \n\t" // r3<<1|r2>>62:r2<<1|r1>>63
|
|
|
|
/*
|
|
* Now reduce modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1
|
|
* using [CLMUL-WP] algorithm 5 (p. 20).
|
|
* Currently xmm2:xmm1 holds x3:x2:x1:x0 (already shifted).
|
|
*/
|
|
/* Step 2 (1) */
|
|
"movdqa %1, %%xmm3 \n\t" // x1:x0
|
|
"movdqa %1, %%xmm4 \n\t" // same
|
|
"movdqa %1, %%xmm5 \n\t" // same
|
|
"psllq $63, %%xmm3 \n\t" // x1<<63:x0<<63 = stuff:a
|
|
"psllq $62, %%xmm4 \n\t" // x1<<62:x0<<62 = stuff:b
|
|
"psllq $57, %%xmm5 \n\t" // x1<<57:x0<<57 = stuff:c
|
|
|
|
/* Step 2 (2) */
|
|
"pxor %%xmm4, %%xmm3 \n\t" // stuff:a+b
|
|
"pxor %%xmm5, %%xmm3 \n\t" // stuff:a+b+c
|
|
"pslldq $8, %%xmm3 \n\t" // a+b+c:0
|
|
"pxor %%xmm3, %1 \n\t" // x1+a+b+c:x0 = d:x0
|
|
|
|
/* Steps 3 and 4 */
|
|
"movdqa %1,%0 \n\t" // d:x0
|
|
"movdqa %1,%%xmm4 \n\t" // same
|
|
"movdqa %1,%%xmm5 \n\t" // same
|
|
"psrlq $1, %0 \n\t" // e1:x0>>1 = e1:e0'
|
|
"psrlq $2, %%xmm4 \n\t" // f1:x0>>2 = f1:f0'
|
|
"psrlq $7, %%xmm5 \n\t" // g1:x0>>7 = g1:g0'
|
|
"pxor %%xmm4, %0 \n\t" // e1+f1:e0'+f0'
|
|
"pxor %%xmm5, %0 \n\t" // e1+f1+g1:e0'+f0'+g0'
|
|
// e0'+f0'+g0' is almost e0+f0+g0, ex\tcept for some missing
|
|
// bits carried from d. Now get those\t bits back in.
|
|
"movdqa %1,%%xmm3 \n\t" // d:x0
|
|
"movdqa %1,%%xmm4 \n\t" // same
|
|
"movdqa %1,%%xmm5 \n\t" // same
|
|
"psllq $63, %%xmm3 \n\t" // d<<63:stuff
|
|
"psllq $62, %%xmm4 \n\t" // d<<62:stuff
|
|
"psllq $57, %%xmm5 \n\t" // d<<57:stuff
|
|
"pxor %%xmm4, %%xmm3 \n\t" // d<<63+d<<62:stuff
|
|
"pxor %%xmm5, %%xmm3 \n\t" // missing bits of d:stuff
|
|
"psrldq $8, %%xmm3 \n\t" // 0:missing bits of d
|
|
"pxor %%xmm3, %0 \n\t" // e1+f1+g1:e0+f0+g0
|
|
"pxor %1, %0 \n\t" // h1:h0
|
|
"pxor %%xmm2, %0 \n\t" // x3+h1:x2+h0
|
|
: "+x" (aa)
|
|
: "x" (bb)
|
|
: "xmm2", "xmm3", "xmm4", "xmm5" );
|
|
|
|
/* Now byte-reverse the outputs */
|
|
WRITE64BE(a+0, aa[1]);
|
|
WRITE64BE(a+8, aa[0]);
|
|
}
|
|
|
|
/**
|
|
* \brief Internal round key inversion. This function computes
|
|
* decryption round keys from the encryption round keys.
|
|
*
|
|
* \note This function is only for internal use by other library
|
|
* functions; you must not call it directly.
|
|
*
|
|
* \param invkey Round keys for the equivalent inverse cipher
|
|
* \param fwdkey Original round keys (for encryption)
|
|
* \param nr Number of rounds (that is, number of round keys minus one)
|
|
*/
|
|
void mbedtls_aesni_inverse_key( unsigned char *invkey,
|
|
const unsigned char *fwdkey, int nr )
|
|
{
|
|
unsigned char *ik = invkey;
|
|
const unsigned char *fk = fwdkey + 16 * nr;
|
|
|
|
memcpy( ik, fk, 16 );
|
|
|
|
for( fk -= 16, ik += 16; fk > fwdkey; fk -= 16, ik += 16 )
|
|
asm( "movdqu (%0), %%xmm0 \n\t"
|
|
AESIMC xmm0_xmm0 "\n\t"
|
|
"movdqu %%xmm0, (%1) \n\t"
|
|
:
|
|
: "r" (fk), "r" (ik)
|
|
: "memory", "xmm0" );
|
|
|
|
memcpy( ik, fk, 16 );
|
|
}
|
|
|
|
/*
|
|
* Key expansion, 128-bit case
|
|
*/
|
|
static void aesni_setkey_enc_128( unsigned char *rk,
|
|
const unsigned char *key )
|
|
{
|
|
asm( "movdqu (%1), %%xmm0 \n\t" // copy the original key
|
|
"movdqu %%xmm0, (%0) \n\t" // as round key 0
|
|
"jmp 2f \n\t" // skip auxiliary routine
|
|
|
|
/*
|
|
* Finish generating the next round key.
|
|
*
|
|
* On entry xmm0 is r3:r2:r1:r0 and xmm1 is X:stuff:stuff:stuff
|
|
* with X = rot( sub( r3 ) ) ^ RCON.
|
|
*
|
|
* On exit, xmm0 is r7:r6:r5:r4
|
|
* with r4 = X + r0, r5 = r4 + r1, r6 = r5 + r2, r7 = r6 + r3
|
|
* and those are written to the round key buffer.
|
|
*/
|
|
"1: \n\t"
|
|
"pshufd $0xff, %%xmm1, %%xmm1 \n\t" // X:X:X:X
|
|
"pxor %%xmm0, %%xmm1 \n\t" // X+r3:X+r2:X+r1:r4
|
|
"pslldq $4, %%xmm0 \n\t" // r2:r1:r0:0
|
|
"pxor %%xmm0, %%xmm1 \n\t" // X+r3+r2:X+r2+r1:r5:r4
|
|
"pslldq $4, %%xmm0 \n\t" // etc
|
|
"pxor %%xmm0, %%xmm1 \n\t"
|
|
"pslldq $4, %%xmm0 \n\t"
|
|
"pxor %%xmm1, %%xmm0 \n\t" // update xmm0 for next time!
|
|
"add $16, %0 \n\t" // point to next round key
|
|
"movdqu %%xmm0, (%0) \n\t" // write it
|
|
"ret \n\t"
|
|
|
|
/* Main "loop" */
|
|
"2: \n\t"
|
|
AESKEYGENA xmm0_xmm1 ",0x01 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm0_xmm1 ",0x02 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm0_xmm1 ",0x04 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm0_xmm1 ",0x08 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm0_xmm1 ",0x10 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm0_xmm1 ",0x20 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm0_xmm1 ",0x40 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm0_xmm1 ",0x80 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm0_xmm1 ",0x1B \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm0_xmm1 ",0x36 \n\tcall 1b \n\t"
|
|
:
|
|
: "r" (rk), "r" (key)
|
|
: "memory", "cc", "0" );
|
|
}
|
|
|
|
/*
|
|
* Key expansion, 192-bit case
|
|
*/
|
|
static void aesni_setkey_enc_192( unsigned char *rk,
|
|
const unsigned char *key )
|
|
{
|
|
asm( "movdqu (%1), %%xmm0 \n\t" // copy original round key
|
|
"movdqu %%xmm0, (%0) \n\t"
|
|
"add $16, %0 \n\t"
|
|
"movq 16(%1), %%xmm1 \n\t"
|
|
"movq %%xmm1, (%0) \n\t"
|
|
"add $8, %0 \n\t"
|
|
"jmp 2f \n\t" // skip auxiliary routine
|
|
|
|
/*
|
|
* Finish generating the next 6 quarter-keys.
|
|
*
|
|
* On entry xmm0 is r3:r2:r1:r0, xmm1 is stuff:stuff:r5:r4
|
|
* and xmm2 is stuff:stuff:X:stuff with X = rot( sub( r3 ) ) ^ RCON.
|
|
*
|
|
* On exit, xmm0 is r9:r8:r7:r6 and xmm1 is stuff:stuff:r11:r10
|
|
* and those are written to the round key buffer.
|
|
*/
|
|
"1: \n\t"
|
|
"pshufd $0x55, %%xmm2, %%xmm2 \n\t" // X:X:X:X
|
|
"pxor %%xmm0, %%xmm2 \n\t" // X+r3:X+r2:X+r1:r4
|
|
"pslldq $4, %%xmm0 \n\t" // etc
|
|
"pxor %%xmm0, %%xmm2 \n\t"
|
|
"pslldq $4, %%xmm0 \n\t"
|
|
"pxor %%xmm0, %%xmm2 \n\t"
|
|
"pslldq $4, %%xmm0 \n\t"
|
|
"pxor %%xmm2, %%xmm0 \n\t" // update xmm0 = r9:r8:r7:r6
|
|
"movdqu %%xmm0, (%0) \n\t"
|
|
"add $16, %0 \n\t"
|
|
"pshufd $0xff, %%xmm0, %%xmm2 \n\t" // r9:r9:r9:r9
|
|
"pxor %%xmm1, %%xmm2 \n\t" // stuff:stuff:r9+r5:r10
|
|
"pslldq $4, %%xmm1 \n\t" // r2:r1:r0:0
|
|
"pxor %%xmm2, %%xmm1 \n\t" // xmm1 = stuff:stuff:r11:r10
|
|
"movq %%xmm1, (%0) \n\t"
|
|
"add $8, %0 \n\t"
|
|
"ret \n\t"
|
|
|
|
"2: \n\t"
|
|
AESKEYGENA xmm1_xmm2 ",0x01 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm1_xmm2 ",0x02 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm1_xmm2 ",0x04 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm1_xmm2 ",0x08 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm1_xmm2 ",0x10 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm1_xmm2 ",0x20 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm1_xmm2 ",0x40 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm1_xmm2 ",0x80 \n\tcall 1b \n\t"
|
|
|
|
:
|
|
: "r" (rk), "r" (key)
|
|
: "memory", "cc", "0" );
|
|
}
|
|
|
|
/*
|
|
* Key expansion, 256-bit case
|
|
*/
|
|
static void aesni_setkey_enc_256( unsigned char *rk,
|
|
const unsigned char *key )
|
|
{
|
|
asm( "movdqu (%1), %%xmm0 \n\t"
|
|
"movdqu %%xmm0, (%0) \n\t"
|
|
"add $16, %0 \n\t"
|
|
"movdqu 16(%1), %%xmm1 \n\t"
|
|
"movdqu %%xmm1, (%0) \n\t"
|
|
"jmp 2f \n\t" // skip auxiliary routine
|
|
|
|
/*
|
|
* Finish generating the next two round keys.
|
|
*
|
|
* On entry xmm0 is r3:r2:r1:r0, xmm1 is r7:r6:r5:r4 and
|
|
* xmm2 is X:stuff:stuff:stuff with X = rot( sub( r7 )) ^ RCON
|
|
*
|
|
* On exit, xmm0 is r11:r10:r9:r8 and xmm1 is r15:r14:r13:r12
|
|
* and those have been written to the output buffer.
|
|
*/
|
|
"1: \n\t"
|
|
"pshufd $0xff, %%xmm2, %%xmm2 \n\t"
|
|
"pxor %%xmm0, %%xmm2 \n\t"
|
|
"pslldq $4, %%xmm0 \n\t"
|
|
"pxor %%xmm0, %%xmm2 \n\t"
|
|
"pslldq $4, %%xmm0 \n\t"
|
|
"pxor %%xmm0, %%xmm2 \n\t"
|
|
"pslldq $4, %%xmm0 \n\t"
|
|
"pxor %%xmm2, %%xmm0 \n\t"
|
|
"add $16, %0 \n\t"
|
|
"movdqu %%xmm0, (%0) \n\t"
|
|
|
|
/* Set xmm2 to stuff:Y:stuff:stuff with Y = subword( r11 )
|
|
* and proceed to generate next round key from there */
|
|
AESKEYGENA xmm0_xmm2 ",0x00 \n\t"
|
|
"pshufd $0xaa, %%xmm2, %%xmm2 \n\t"
|
|
"pxor %%xmm1, %%xmm2 \n\t"
|
|
"pslldq $4, %%xmm1 \n\t"
|
|
"pxor %%xmm1, %%xmm2 \n\t"
|
|
"pslldq $4, %%xmm1 \n\t"
|
|
"pxor %%xmm1, %%xmm2 \n\t"
|
|
"pslldq $4, %%xmm1 \n\t"
|
|
"pxor %%xmm2, %%xmm1 \n\t"
|
|
"add $16, %0 \n\t"
|
|
"movdqu %%xmm1, (%0) \n\t"
|
|
"ret \n\t"
|
|
|
|
/*
|
|
* Main "loop" - Generating one more key than necessary,
|
|
* see definition of mbedtls_aes_context.buf
|
|
*/
|
|
"2: \n\t"
|
|
AESKEYGENA xmm1_xmm2 ",0x01 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm1_xmm2 ",0x02 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm1_xmm2 ",0x04 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm1_xmm2 ",0x08 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm1_xmm2 ",0x10 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm1_xmm2 ",0x20 \n\tcall 1b \n\t"
|
|
AESKEYGENA xmm1_xmm2 ",0x40 \n\tcall 1b \n\t"
|
|
:
|
|
: "r" (rk), "r" (key)
|
|
: "memory", "cc", "0" );
|
|
}
|
|
|
|
/**
|
|
* \brief Internal key expansion for encryption
|
|
*
|
|
* \note This function is only for internal use by other library
|
|
* functions; you must not call it directly.
|
|
*
|
|
* \param rk Destination buffer where the round keys are written
|
|
* \param key Encryption key
|
|
* \param bits Key size in bits (must be 128, 192 or 256)
|
|
*
|
|
* \return 0 if successful, or MBEDTLS_ERR_AES_INVALID_KEY_LENGTH
|
|
*/
|
|
int mbedtls_aesni_setkey_enc( unsigned char *rk,
|
|
const unsigned char *key,
|
|
size_t bits )
|
|
{
|
|
switch( bits )
|
|
{
|
|
case 128: aesni_setkey_enc_128( rk, key ); break;
|
|
case 192: aesni_setkey_enc_192( rk, key ); break;
|
|
case 256: aesni_setkey_enc_256( rk, key ); break;
|
|
default : return( MBEDTLS_ERR_AES_INVALID_KEY_LENGTH );
|
|
}
|
|
return( 0 );
|
|
}
|
|
|
|
#endif /* MBEDTLS_HAVE_X86_64 */
|
|
#endif /* MBEDTLS_AESNI_C */
|