/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:4;coding:utf-8 -*-│
│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright The Mbed TLS Contributors                                          │
│                                                                              │
│ Licensed under the Apache License, Version 2.0 (the "License");              │
│ you may not use this file except in compliance with the License.             │
│ You may obtain a copy of the License at                                      │
│                                                                              │
│     http://www.apache.org/licenses/LICENSE-2.0                               │
│                                                                              │
│ Unless required by applicable law or agreed to in writing, software          │
│ distributed under the License is distributed on an "AS IS" BASIS,            │
│ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.     │
│ See the License for the specific language governing permissions and          │
│ limitations under the License.                                               │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/nexgen32e/x86feature.h"
#include "third_party/mbedtls/bignum_internal.h"
#include "third_party/mbedtls/math.h"

/**
 * Computes 512-bit product of 256-bit and 256-bit numbers.
 *
 * This is a function pointer rather than a function: Mul4x4Init() in
 * this file assigns it either Mul4x4Adx or Mul4x4Pure. Since it has no
 * initializer it is a null pointer until that assignment has happened.
 *
 * @param C receives 8 quadword result
 * @param A is left hand side which must have 4 quadwords
 * @param B is right hand side which must have 4 quadwords
 * @note words are host endian while array is little endian
 * @note the array bounds written in the declaration (16, 8, 8) are
 *     larger than the quadword counts documented above; array
 *     parameters are adjusted to pointers so the bounds are not
 *     enforced -- NOTE(review): confirm which numbers are intended
 * @mayalias
 */
void (*Mul4x4)(uint64_t C[16], const uint64_t A[8], const uint64_t B[8]);

/**
 * Selects the implementation that the Mul4x4 pointer dispatches to.
 *
 * Registered as a constructor with priority 10, so it is run by the
 * startup code rather than being called explicitly. Mul4x4Adx is
 * installed only when both the ADX and the BMI2 feature checks succeed;
 * in every other case Mul4x4Pure is installed.
 *
 * @note X86_HAVE() comes from x86feature.h; presumably it reports
 *     whether the running CPU supports the named extension -- confirm
 */
__attribute__((__constructor__(10))) static textstartup void Mul4x4Init(void)
{
    /* both checks must pass before the ADX variant is selected */
    Mul4x4 = X86_HAVE(ADX) && X86_HAVE(BMI2) ? Mul4x4Adx : Mul4x4Pure;
}

/**
 * Computes 512-bit product of two 256-bit numbers (fallback version).
 *
 * This is the implementation Mul4x4Init() installs into Mul4x4 when
 * the combined ADX and BMI2 check fails. The partial products
 * A[i]*B[j] are accumulated one output column (i+j) at a time into a
 * rotating three-word accumulator made of c1, c2 and c3. After each
 * column the lowest accumulator word is taken as the finished result
 * word and zeroed, so that it can serve as the top word of the next
 * column.
 *
 * Only C[0..7], A[0..3] and B[0..3] are accessed here; the larger
 * bounds in the signature are not enforced by the language.
 *
 * @param C receives 8 quadword result
 * @param A is left hand side, of which 4 quadwords are read
 * @param B is right hand side, of which 4 quadwords are read
 * @note MADD is defined outside this file; presumably it adds the
 *     128-bit product of its first two arguments into the accumulator
 *     formed by its last three, lowest word first -- TODO confirm
 * @mayalias
 */
void Mul4x4Pure(uint64_t C[16], const uint64_t A[8], const uint64_t B[8])
{
    /* t and h are never named in this body; presumably they are scratch
       variables that the MADD macro expands to -- confirm before
       removing them */
    uint128_t t;
    uint64_t h, c1, c2, c3;
    /* result words 0..3 are parked in locals so that the stores to
       C[0..3] can be delayed until A[k] and B[k] have been read for the
       last time; that is what lets C be the same array as A and/or B */
    uint64_t r0, r1, r2, r3;
    c1 = c2 = c3 = 0;
    /* column 0: accumulator is (c1, c2, c3) */
    MADD(A[0], B[0], c1, c2, c3);
    r0 = c1, c1 = 0;
    /* column 1: accumulator rotates to (c2, c3, c1) */
    MADD(A[0], B[1], c2, c3, c1);
    MADD(A[1], B[0], c2, c3, c1);
    r1 = c2, c2 = 0;
    /* column 2: accumulator rotates to (c3, c1, c2) */
    MADD(A[2], B[0], c3, c1, c2);
    MADD(A[1], B[1], c3, c1, c2);
    MADD(A[0], B[2], c3, c1, c2);
    r2 = c3, c3 = 0;
    /* column 3: the widest column, four partial products */
    MADD(A[0], B[3], c1, c2, c3);
    MADD(A[1], B[2], c1, c2, c3);
    MADD(A[2], B[1], c1, c2, c3);
    MADD(A[3], B[0], c1, c2, c3);
    /* A[0] and B[0] are not read past this point */
    C[0] = r0;
    r3 = c1, c1 = 0;
    /* column 4 */
    MADD(A[3], B[1], c2, c3, c1);
    MADD(A[2], B[2], c2, c3, c1);
    MADD(A[1], B[3], c2, c3, c1);
    /* A[1] and B[1] are not read past this point */
    C[1] = r1;
    /* result words 4..7 are stored directly instead of being parked */
    C[4] = c2, c2 = 0;
    /* column 5 */
    MADD(A[2], B[3], c3, c1, c2);
    MADD(A[3], B[2], c3, c1, c2);
    /* A[2] and B[2] are not read past this point */
    C[2] = r2;
    C[5] = c3, c3 = 0;
    /* column 6: the only remaining partial product */
    MADD(A[3], B[3], c1, c2, c3);
    /* all reads of A and B are finished */
    C[3] = r3;
    /* the two lowest accumulator words become the top two result words;
       c3 is not stored */
    C[6] = c1;
    C[7] = c2;
}