cosmopolitan/third_party/python/Modules/_decimal/libmpdec/transpose.c

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
│ vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8                               :vi │
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright (c) 2008-2016 Stefan Krah. All rights reserved.                    │
│                                                                              │
│ Redistribution and use in source and binary forms, with or without           │
│ modification, are permitted provided that the following conditions           │
│ are met:                                                                     │
│                                                                              │
│ 1. Redistributions of source code must retain the above copyright            │
│    notice, this list of conditions and the following disclaimer.             │
│                                                                              │
│ 2. Redistributions in binary form must reproduce the above copyright         │
│    notice, this list of conditions and the following disclaimer in           │
│    the documentation and/or other materials provided with the                │
│    distribution.                                                             │
│                                                                              │
│ THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND         │
│ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE        │
│ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR           │
│ PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS         │
│ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,          │
│ OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT         │
│ OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           │
│ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,        │
│ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE         │
│ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,            │
│ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                           │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "third_party/python/Modules/_decimal/libmpdec/transpose.h"
#include "libc/mem/gc.h"
#include "libc/mem/mem.h"
#include "third_party/python/Modules/_decimal/libmpdec/bits.h"
#include "third_party/python/Modules/_decimal/libmpdec/constants.h"
#include "third_party/python/Modules/_decimal/libmpdec/mpdecimal.h"
#include "third_party/python/Modules/_decimal/libmpdec/typearith.h"
__static_yoink("libmpdec_notice");

#define BUFSIZE 4096
#define SIDE 128

/* Bignum: The transpose functions are used for very large transforms
   in sixstep.c and fourstep.c. */

/* Definition of the matrix transpose */
void
std_trans(mpd_uint_t dest[], mpd_uint_t src[], mpd_size_t rows, mpd_size_t cols)
{
    mpd_size_t idest, isrc;
    mpd_size_t r, c;
    for (r = 0; r < rows; r++) {
        isrc = r * cols;
        idest = r;
        for (c = 0; c < cols; c++) {
            dest[idest] = src[isrc];
            isrc += 1;
            idest += rows;
        }
    }
}

/*
 * Swap half-rows of 2^n * (2*2^n) matrix.
 * FORWARD_CYCLE: even/odd permutation of the halfrows.
 * BACKWARD_CYCLE: reverse the even/odd permutation.
 */
static int
swap_halfrows_pow2(mpd_uint_t *matrix, mpd_size_t rows, mpd_size_t cols, int dir)
{
    mpd_uint_t *buf1 = gc(malloc(BUFSIZE*sizeof(mpd_uint_t)));
    mpd_uint_t *buf2 = gc(malloc(BUFSIZE*sizeof(mpd_uint_t)));
    mpd_uint_t *readbuf, *writebuf, *hp;
    mpd_size_t *done, dbits;
    mpd_size_t b = BUFSIZE, stride;
    mpd_size_t hn, hmax; /* halfrow number */
    mpd_size_t m, r=0;
    mpd_size_t offset;
    mpd_size_t next;
    assert(cols == mul_size_t(2, rows));
    if (dir == FORWARD_CYCLE) {
        r = rows;
    }
    else if (dir == BACKWARD_CYCLE) {
        r = 2;
    }
    else {
        abort(); /* GCOV_NOT_REACHED */
    }
    m = cols - 1;
    hmax = rows; /* cycles start at odd halfrows */
    dbits = 8 * sizeof *done;
    if ((done = mpd_calloc(hmax/(sizeof *done) + 1, sizeof *done)) == NULL) {
        return 0;
    }
    for (hn = 1; hn <= hmax; hn += 2) {
        if (done[hn/dbits] & mpd_bits[hn%dbits]) {
            continue;
        }
        readbuf = buf1; writebuf = buf2;
        for (offset = 0; offset < cols/2; offset += b) {
            stride = (offset + b < cols/2) ? b : cols/2-offset;
            hp = matrix + hn*cols/2;
            memcpy(readbuf, hp+offset, stride*(sizeof *readbuf));
            pointerswap(&readbuf, &writebuf);
            next = mulmod_size_t(hn, r, m);
            hp = matrix + next*cols/2;
            while (next != hn) {
                memcpy(readbuf, hp+offset, stride*(sizeof *readbuf));
                memcpy(hp+offset, writebuf, stride*(sizeof *writebuf));
                pointerswap(&readbuf, &writebuf);
                done[next/dbits] |= mpd_bits[next%dbits];
                next = mulmod_size_t(next, r, m);
                    hp = matrix + next*cols/2;
            }
            memcpy(hp+offset, writebuf, stride*(sizeof *writebuf));
            done[hn/dbits] |= mpd_bits[hn%dbits];
        }
    }
    mpd_free(done);
    return 1;
}

/* In-place transpose of a square matrix */
static inline void
squaretrans(mpd_uint_t *buf, mpd_size_t cols)
{
    mpd_uint_t tmp;
    mpd_size_t idest, isrc;
    mpd_size_t r, c;
    for (r = 0; r < cols; r++) {
        c = r+1;
        isrc = r*cols + c;
        idest = c*cols + r;
        for (c = r+1; c < cols; c++) {
            tmp = buf[isrc];
            buf[isrc] = buf[idest];
            buf[idest] = tmp;
            isrc += 1;
            idest += cols;
        }
    }
}

/*
 * Transpose 2^n * 2^n matrix. For cache efficiency, the matrix is split into
 * square blocks with side length 'SIDE'. First, the blocks are transposed,
 * then a square transposition is done on each individual block.
 */
static void
squaretrans_pow2(mpd_uint_t *matrix, mpd_size_t size)
{
    mpd_uint_t *buf1 = gc(malloc(SIDE*SIDE*sizeof(mpd_uint_t)));
    mpd_uint_t *buf2 = gc(malloc(SIDE*SIDE*sizeof(mpd_uint_t)));
    mpd_uint_t *to, *from;
    mpd_size_t b = size;
    mpd_size_t r, c;
    mpd_size_t i;
    while (b > SIDE) b >>= 1;
    for (r = 0; r < size; r += b) {
        for (c = r; c < size; c += b) {
            from = matrix + r*size + c;
            to = buf1;
            for (i = 0; i < b; i++) {
                memcpy(to, from, b*(sizeof *to));
                from += size;
                to += b;
            }
            squaretrans(buf1, b);
            if (r == c) {
                to = matrix + r*size + c;
                from = buf1;
                for (i = 0; i < b; i++) {
                    memcpy(to, from, b*(sizeof *to));
                    from += b;
                    to += size;
                }
                continue;
            }
            else {
                from = matrix + c*size + r;
                to = buf2;
                for (i = 0; i < b; i++) {
                    memcpy(to, from, b*(sizeof *to));
                    from += size;
                    to += b;
                }
                squaretrans(buf2, b);
                to = matrix + c*size + r;
                from = buf1;
                for (i = 0; i < b; i++) {
                    memcpy(to, from, b*(sizeof *to));
                    from += b;
                    to += size;
                }
                to = matrix + r*size + c;
                from = buf2;
                for (i = 0; i < b; i++) {
                    memcpy(to, from, b*(sizeof *to));
                    from += b;
                    to += size;
                }
            }
        }
    }
}

/*
 * In-place transposition of a 2^n x 2^n or a 2^n x (2*2^n)
 * or a (2*2^n) x 2^n matrix.
 */
int
transpose_pow2(mpd_uint_t *matrix, mpd_size_t rows, mpd_size_t cols)
{
    mpd_size_t size = mul_size_t(rows, cols);
    assert(ispower2(rows));
    assert(ispower2(cols));
    if (cols == rows) {
        squaretrans_pow2(matrix, rows);
    }
    else if (cols == mul_size_t(2, rows)) {
        if (!swap_halfrows_pow2(matrix, rows, cols, FORWARD_CYCLE)) {
            return 0;
        }
        squaretrans_pow2(matrix, rows);
        squaretrans_pow2(matrix+(size/2), rows);
    }
    else if (rows == mul_size_t(2, cols)) {
        squaretrans_pow2(matrix, cols);
        squaretrans_pow2(matrix+(size/2), cols);
        if (!swap_halfrows_pow2(matrix, cols, rows, BACKWARD_CYCLE)) {
            return 0;
        }
    }
    else {
        __builtin_unreachable();
    }
    return 1;
}