Import gcrypt public-key cryptography and implement signature checking.

This commit is contained in:
Vladimir 'phcoder' Serbinenko 2013-01-11 21:32:42 +01:00
parent 535714bdcf
commit 5e3b8dcbb5
238 changed files with 40500 additions and 417 deletions

View file

@ -0,0 +1,115 @@
Copyright 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
INTEL PENTIUM-4 MPN SUBROUTINES
This directory contains mpn functions optimized for Intel Pentium-4.
The mmx subdirectory has routines using MMX instructions, the sse2
subdirectory has routines using SSE2 instructions. All P4s have these, the
separate directories are just so configure can omit that code if the
assembler doesn't support it.
STATUS
cycles/limb
mpn_add_n/sub_n 4 normal, 6 in-place
mpn_mul_1 4 normal, 6 in-place
mpn_addmul_1 6
mpn_submul_1 7
mpn_mul_basecase 6 cycles/crossproduct (approx)
mpn_sqr_basecase 3.5 cycles/crossproduct (approx)
or 7.0 cycles/triangleproduct (approx)
mpn_l/rshift 1.75
The shifts ought to be able to go at 1.5 c/l, but not much effort has been
applied to them yet.
In-place operations, and all addmul, submul, mul_basecase and sqr_basecase
calls, suffer from pipeline anomalies associated with write combining and
movd reads and writes to the same or nearby locations. The movq
instructions do not trigger the same hardware problems. Unfortunately,
using movq and splitting/combining seems to require too many extra
instructions to help. Perhaps future chip steppings will be better.
NOTES
The Pentium-4 "Netburst" pipeline provides quite a number of surprises.
Many traditional x86 instructions run very slowly, requiring the use of
alternative instructions for acceptable performance.
adcl and sbbl are quite slow at 8 cycles for reg->reg. paddq of 32-bits
within a 64-bit mmx register seems better, though the combination
paddq/psrlq when propagating a carry is still a 4 cycle latency.
incl and decl should be avoided, instead use add $1 and sub $1. Apparently
the carry flag is not separately renamed, so incl and decl depend on all
previous flags-setting instructions.
shll and shrl have a 4 cycle latency, or 8 times the latency of the fastest
integer instructions (addl, subl, orl, andl, and some more). shldl and
shrdl seem to have 13 and 15 cycles latency, respectively. Bizarre.
movq mmx -> mmx does have 6 cycle latency, as noted in the documentation.
pxor/por or similar combination at 2 cycles latency can be used instead.
The movq however executes in the float unit, thereby saving MMX execution
resources. With the right juggling, data moves shouldn't be on a dependent
chain.
L1 is write-through, but the write-combining sounds like it does enough to
not require explicit destination prefetching.
xmm registers so far haven't found a use, but not much effort has been
expended. A configure test for whether the operating system knows
fxsave/fxrstor will be needed if they're used.
REFERENCES
Intel Pentium-4 processor manuals,
http://developer.intel.com/design/pentium4/manuals
"Intel Pentium 4 Processor Optimization Reference Manual", Intel, 2001,
order number 248966. Available on-line:
http://developer.intel.com/design/pentium4/manuals/248966.htm
----------------
Local variables:
mode: text
fill-column: 76
End:

View file

@ -0,0 +1,3 @@
README

View file

@ -0,0 +1,2 @@
mpih-lshift.S
mpih-rshift.S

View file

@ -0,0 +1,457 @@
/* Intel Pentium-4 mpn_lshift -- left shift.
*
* Copyright 2001, 2002 Free Software Foundation, Inc.
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
*
* Note: This code is heavily based on the GNU MP Library.
* Actually it's the same code with only minor changes in the
* way the data is stored; this is to support the abstraction
* of an optional secure memory allocation which may be used
* to avoid revealing of sensitive data due to paging etc.
*/
#include "sysdep.h"
#include "asm-syntax.h"
/*******************
 * mpi_limb_t
 * _gcry_mpih_lshift( mpi_ptr_t wp, (sp + 4)
 * mpi_ptr_t up, (sp + 8)
 * mpi_size_t usize, (sp + 12)
 * unsigned cnt) (sp + 16)
 *
 * Shift the usize-limb vector at up left by cnt bits, store the result
 * at wp and return in %eax the bits shifted out of the most significant
 * limb.  Limbs are processed from the most significant end downwards.
 *
 * The (sp + N) offsets above are those at function entry; after the two
 * pushes below the arguments are read from 12/16/20/24(%esp).
 *
 * NOTE(review): the code appears to assume usize >= 1, 1 <= cnt <= 31 and
 * 4-byte aligned limbs (the unrolled path adds 32 to the shift count and
 * later tests bit 5 of 64-shift) -- confirm against the callers.
 *
 * P4 Willamette, Northwood: 1.75 cycles/limb
 * P4 Prescott: 2.0 cycles/limb
 */
.text
ALIGN (3)
.globl C_SYMBOL_NAME(_gcry_mpih_lshift)
C_SYMBOL_NAME(_gcry_mpih_lshift:)
/* ebx and edi are used as scratch and restored before every ret. */
pushl %ebx
pushl %edi
/* eax = usize, edx = wp, ebx = up, ecx = cnt */
movl 20(%esp), %eax
movl 12(%esp), %edx
movl 16(%esp), %ebx
movl 24(%esp), %ecx
/* Five or more limbs (unsigned compare) take the unrolled MMX path. */
cmp $5, %eax
jae .Lunroll
/* edi = up[usize-1], the most significant source limb. */
movl -4(%ebx,%eax,4), %edi
/* eax = usize-1; non-zero means more than one limb. */
decl %eax
jnz .Lsimple
/* usize == 1: eax is zero here, so shldl leaves in eax the cnt bits
   shifted out of the top of edi, which is the return value. */
shldl %cl, %edi, %eax
shll %cl, %edi
movl %edi, (%edx)
popl %edi
popl %ebx
/* No MMX register has been touched on this path, hence no emms. */
ret
.Lsimple:
/* Fewer than five limbs, more than one.
   On entry: eax = usize-1, ebx = up, ecx = cnt, edx = wp.
   Set up: mm5 = return value, mm6 = cnt, mm7 = 32-cnt. */
movd (%ebx,%eax,4), %mm5
movd %ecx, %mm6
negl %ecx
psllq %mm6, %mm5
addl $32, %ecx
movd %ecx, %mm7
/* High dword of (top limb << cnt) = the bits shifted out. */
psrlq $32, %mm5
.Lsimple_top:
/* Load the limb pair up[eax-1], up[eax] as one qword.  Shifting it right
   by 32-cnt leaves (up[eax] << cnt) | (up[eax-1] >> (32-cnt)) in the low
   dword, which is stored to wp[eax] (eax as it was before the decl).
   The jnz consumes the flags from decl; the MMX instructions in between
   do not modify EFLAGS. */
movq -4(%ebx,%eax,4), %mm0
decl %eax
psrlq %mm7, %mm0
movd %mm0, 4(%edx,%eax,4)
jnz .Lsimple_top
/* The lowest limb has no lower neighbour: wp[0] = up[0] << cnt.
   eax receives the return value from mm5. */
movd (%ebx), %mm0
movd %mm5, %eax
psllq %mm6, %mm0
popl %edi
popl %ebx
movd %mm0, (%edx)
emms
ret
.align 8, 0x90
.Lunroll:
/* usize >= 5.  eax = usize, ebx = up, ecx = cnt, edx = wp.
   mm5 = top limb << cnt (reduced to the return value further down),
   mm6 = cnt.  edi = address just past the source; its bit 2 tells
   whether the top of the source ends on an 8-byte boundary.  The jz uses
   the flags from andl (psllq does not touch them). */
movd -4(%ebx,%eax,4), %mm5
leal (%ebx,%eax,4), %edi
movd %ecx, %mm6
andl $4, %edi
psllq %mm6, %mm5
jz .Lstart_src_aligned
/* Source end not 8-byte aligned: produce the top destination limb on its
   own from an unaligned qword load and drop it from the limb count, so
   that the remaining source qword loads are aligned. */
movq -8(%ebx,%eax,4), %mm0
psllq %mm6, %mm0
decl %eax
psrlq $32, %mm0
movd %mm0, (%edx,%eax,4)
.Lstart_src_aligned:
/* mm1 = highest remaining source qword, mm3 = the next qword down.
   edi now tests the alignment of the destination end &wp[eax]; the jz
   below consumes the flags of that andl. */
movq -8(%ebx,%eax,4), %mm1
leal (%edx,%eax,4), %edi
andl $4, %edi
/* mm5 = return value (bits shifted out of the top limb). */
psrlq $32, %mm5
movq -16(%ebx,%eax,4), %mm3
jz .Lstart_dst_aligned
/* Destination end not 8-byte aligned: store its top limb separately,
   then move edx down by 4 bytes and use a shift of cnt+32 instead, so
   that the qword stores which follow are aligned. */
movq %mm1, %mm0
addl $32, %ecx
psllq %mm6, %mm0
movd %ecx, %mm6
psrlq $32, %mm0
movd %mm0, -4(%edx,%eax,4)
subl $4, %edx
.Lstart_dst_aligned:
/* mm6 = left shift, mm7 = 64 - left shift (ecx keeps 64-shift for the
   alignment test in the tail).  mm3 becomes the first destination qword
   ready to store, mm2 keeps the raw source qword that feeds the next one.
   subl $8 borrows when fewer than 8 limbs are left, in which case the
   main loop is skipped (psrlq/por leave the carry flag alone). */
psllq %mm6, %mm1
negl %ecx
addl $64, %ecx
movq %mm3, %mm2
movd %ecx, %mm7
subl $8, %eax
psrlq %mm7, %mm3
por %mm1, %mm3
jc .Lfinish
.align 8, 0x90
.Lunroll_loop:
/* Four limbs (two qwords) per iteration.
   eax = limb counter, ebx = up, edx = wp (possibly biased by -4)
   mm2 = source qword loaded previously
   mm3 = destination qword ready to store
   mm5 = return value, mm6 = left shift, mm7 = right shift
   The loop ends when subl $4 borrows. */
movq 8(%ebx,%eax,4), %mm0
psllq %mm6, %mm2
movq %mm0, %mm1
psrlq %mm7, %mm0
movq %mm3, 24(%edx,%eax,4)
por %mm2, %mm0
movq (%ebx,%eax,4), %mm3
psllq %mm6, %mm1
movq %mm0, 16(%edx,%eax,4)
movq %mm3, %mm2
psrlq %mm7, %mm3
subl $4, %eax
por %mm1, %mm3
jnc .Lunroll_loop
.Lfinish:
/* eax is -4..-1 here; its two low bits give the number of source limbs
   (0..3) that have not been loaded yet.  Bit 1 set: two or three left,
   so do one more qword step without fetching a further qword. */
testb $2, %al
jz .Lfinish_no_two
movq 8(%ebx,%eax,4), %mm0
psllq %mm6, %mm2
movq %mm0, %mm1
psrlq %mm7, %mm0
movq %mm3, 24(%edx,%eax,4)
por %mm2, %mm0
movq %mm1, %mm2
movq %mm0, %mm3
subl $2, %eax
.Lfinish_no_two:
/* Bit 0 of al: one source limb still to fetch, or none.  eax is
   overwritten with the return value right after the test; movd and popl
   leave the flags intact for the jz.
   mm2 = previous source qword, mm3 = destination qword still to store. */
testb $1, %al
movd %mm5, %eax
popl %edi
jz .Lfinish_zero
/* One extra source limb, up[0].  It is placed in the high dword of a
   qword and combined with mm2 to form the qword stored at 4(%edx).
   ecx = 64-shift: bit 5 is set only if the shift was not increased by
   32, i.e. the destination was aligned, and then the lowest limb is
   stored separately to (%edx). */
movd (%ebx), %mm0
psllq %mm6, %mm2
movq %mm3, 12(%edx)
psllq $32, %mm0
movq %mm0, %mm1
psrlq %mm7, %mm0
por %mm2, %mm0
psllq %mm6, %mm1
movq %mm0, 4(%edx)
psrlq $32, %mm1
andl $32, %ecx
popl %ebx
jz .Lfinish_one_unaligned
movd %mm1, (%edx)
.Lfinish_one_unaligned:
emms
ret
.Lfinish_zero:
/* No extra source limb.  mm3 is stored, then the last source qword
   shifted left supplies the lowest limbs: a full qword to (%edx) when the
   destination was aligned (bit 5 of ecx set), and in either case its high
   dword to 4(%edx).  In the aligned case that movd rewrites the same data
   the movq has just stored there. */
movq %mm3, 8(%edx)
andl $32, %ecx
psllq %mm6, %mm2
jz .Lfinish_zero_unaligned
movq %mm2, (%edx)
.Lfinish_zero_unaligned:
psrlq $32, %mm2
popl %ebx
/* eax already holds this value from .Lfinish_no_two; reloaded here. */
movd %mm5, %eax
movd %mm2, 4(%edx)
emms
ret

View file

@ -0,0 +1,453 @@
/* Intel Pentium-4 mpn_rshift -- right shift.
*
* Copyright 2001, 2002 Free Software Foundation, Inc.
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
*
* Note: This code is heavily based on the GNU MP Library.
* Actually it's the same code with only minor changes in the
* way the data is stored; this is to support the abstraction
* of an optional secure memory allocation which may be used
* to avoid revealing of sensitive data due to paging etc.
*/
#include "sysdep.h"
#include "asm-syntax.h"
/*******************
 * mpi_limb_t
 * _gcry_mpih_rshift( mpi_ptr_t wp, (sp + 4)
 * mpi_ptr_t up, (sp + 8)
 * mpi_size_t usize, (sp + 12)
 * unsigned cnt) (sp + 16)
 *
 * Shift the usize-limb vector at up right by cnt bits, store the result
 * at wp and return in %eax the bits shifted out of the least significant
 * limb, positioned at the top of the returned limb.  Limbs are processed
 * from the least significant end upwards.
 *
 * The (sp + N) offsets above are those at function entry; after the two
 * pushes below the arguments are read from 12/16/20/24(%esp).
 *
 * NOTE(review): the code appears to assume usize >= 1, 1 <= cnt <= 31 and
 * 4-byte aligned limbs (the unrolled path adds 32 to the shift count and
 * later tests bit 5 of 64-shift) -- confirm against the callers.
 *
 * P4 Willamette, Northwood: 1.75 cycles/limb
 * P4 Prescott: 2.0 cycles/limb
 */
.text
ALIGN (3)
.globl C_SYMBOL_NAME(_gcry_mpih_rshift)
C_SYMBOL_NAME(_gcry_mpih_rshift:)
/* ebx and edi are used as scratch and restored before every ret. */
pushl %ebx
pushl %edi
/* eax = usize, edx = wp, ebx = up, ecx = cnt */
movl 20(%esp), %eax
movl 12(%esp), %edx
movl 16(%esp), %ebx
movl 24(%esp), %ecx
/* Five or more limbs (unsigned compare) take the unrolled MMX path. */
cmp $5, %eax
jae .Lunroll
/* eax = usize-1.  The movl loading up[0] into edi does not alter the
   flags, so the jnz still tests the result of decl. */
decl %eax
movl (%ebx), %edi
jnz .Lsimple
/* usize == 1: eax is zero here, so shrdl leaves the low cnt bits of edi
   at the top of eax, which is the return value. */
shrdl %cl, %edi, %eax
shrl %cl, %edi
movl %edi, (%edx)
popl %edi
popl %ebx
/* No MMX register has been touched on this path, hence no emms. */
ret
.align 8, 0x90
.Lsimple:
/* Fewer than five limbs, more than one.
   Set up: mm5 = up[0] << 32 (becomes the return value), mm6 = cnt,
   ebx = &up[usize-1], edx = &wp[usize-2], eax = -(usize-1) counting up
   towards zero. */
movd (%ebx), %mm5
leal (%ebx,%eax,4), %ebx
movd %ecx, %mm6
leal -4(%edx,%eax,4), %edx
psllq $32, %mm5
negl %eax
.Lsimple_top:
/* Load a source limb and its upper neighbour as one qword; after the
   right shift the low dword is (low limb >> cnt) | (high limb << (32-cnt))
   and is stored to the matching destination limb.  The jnz consumes the
   flags from incl; the MMX instructions in between do not modify EFLAGS. */
movq (%ebx,%eax,4), %mm0
incl %eax
psrlq %mm6, %mm0
movd %mm0, (%edx,%eax,4)
jnz .Lsimple_top
/* The top limb has no upper neighbour: wp[usize-1] = up[usize-1] >> cnt.
   mm5 >> cnt leaves the bits shifted out of up[0] in its low dword. */
movd (%ebx), %mm0
psrlq %mm6, %mm5
psrlq %mm6, %mm0
popl %edi
movd %mm5, %eax
popl %ebx
movd %mm0, 4(%edx)
emms
ret
.align 8, 0x90
.Lunroll:
/* usize >= 5.  eax = usize, ebx = up, ecx = cnt, edx = wp.
   mm5 = up[0] << 32 (reduced to the return value further down),
   mm6 = cnt, edi = 4 = mask for testing bit 2 of a pointer.  The jz uses
   the flags from testl (psllq does not touch them). */
movd (%ebx), %mm5
movl $4, %edi
movd %ecx, %mm6
testl %edi, %ebx
psllq $32, %mm5
jz .Lstart_src_aligned
/* Source not 8-byte aligned: produce wp[0] on its own from an unaligned
   qword load, then advance both pointers by one limb and drop that limb
   from the count. */
movq (%ebx), %mm0
psrlq %mm6, %mm0
addl $4, %ebx
decl %eax
movd %mm0, (%edx)
addl $4, %edx
.Lstart_src_aligned:
/* mm1 = lowest remaining source qword; now test the destination
   alignment.  mm5 = return value (bits shifted out of up[0]). */
movq (%ebx), %mm1
testl %edi, %edx
psrlq %mm6, %mm5
jz .Lstart_dst_aligned
/* Destination not 8-byte aligned: store its lowest limb separately, move
   edx up by 4 bytes and use a shift of cnt+32 instead, so that the qword
   stores which follow are aligned. */
movq %mm1, %mm0
addl $32, %ecx
psrlq %mm6, %mm0
movd %ecx, %mm6
movd %mm0, (%edx)
addl $4, %edx
.Lstart_dst_aligned:
/* mm6 = right shift, mm7 = 64 - right shift (ecx keeps 64-shift for the
   alignment test in the tail).  mm3 becomes the first destination qword
   ready to store, mm2 keeps the raw second source qword.  ebx and edx are
   re-based near the end of the vectors and eax = 7 - (limbs left) becomes
   a counter running up towards zero; the jns tests the result of negl and
   skips the main loop when 7 or fewer limbs are left. */
movq 8(%ebx), %mm3
negl %ecx
movq %mm3, %mm2
addl $64, %ecx
movd %ecx, %mm7
psrlq %mm6, %mm1
leal -12(%ebx,%eax,4), %ebx
leal -20(%edx,%eax,4), %edx
psllq %mm7, %mm3
subl $7, %eax
por %mm1, %mm3
negl %eax
jns .Lfinish
.align 8, 0x90
.Lunroll_loop:
/* Four limbs (two qwords) per iteration.
   eax = negative limb counter, ebx = re-based up, edx = re-based wp
   mm2 = source qword loaded previously
   mm3 = destination qword ready to store
   mm5 = return value, mm6 = right shift, mm7 = left shift
   The loop runs while eax stays negative after addl $4. */
movq (%ebx,%eax,4), %mm0
psrlq %mm6, %mm2
movq %mm0, %mm1
psllq %mm7, %mm0
movq %mm3, -8(%edx,%eax,4)
por %mm2, %mm0
movq 8(%ebx,%eax,4), %mm3
psrlq %mm6, %mm1
movq %mm0, (%edx,%eax,4)
movq %mm3, %mm2
psllq %mm7, %mm3
addl $4, %eax
por %mm1, %mm3
js .Lunroll_loop
.Lfinish:
/* eax is 0..3 here, standing for 3..0 source limbs not yet loaded.
   Bit 1 clear: two or three are left, so do one more qword step without
   fetching a further qword. */
testb $2, %al
jnz .Lfinish_no_two
movq (%ebx,%eax,4), %mm0
psrlq %mm6, %mm2
movq %mm0, %mm1
psllq %mm7, %mm0
movq %mm3, -8(%edx,%eax,4)
por %mm2, %mm0
movq %mm1, %mm2
movq %mm0, %mm3
addl $2, %eax
.Lfinish_no_two:
/* eax is 2 or 3 here: bit 0 clear means one source limb is still to be
   fetched, bit 0 set means none.  eax is overwritten with the return
   value after the test; popl and movd leave the flags intact for the jnz.
   mm2 = previous source qword, mm3 = destination qword still to store. */
testb $1, %al
popl %edi
movd %mm5, %eax
jnz .Lfinish_zero
/* One extra source limb, loaded from 8(%ebx) and combined with mm2 into
   the qword stored at 8(%edx).  ecx = 64-shift: bit 5 is set only if the
   shift was not increased by 32, i.e. the destination was aligned, and
   then the top limb is stored separately to 16(%edx). */
movd 8(%ebx), %mm0
psrlq %mm6, %mm2
movq %mm0, %mm1
psllq %mm7, %mm0
movq %mm3, (%edx)
por %mm2, %mm0
psrlq %mm6, %mm1
andl $32, %ecx
popl %ebx
jz .Lfinish_one_unaligned
movd %mm1, 16(%edx)
.Lfinish_one_unaligned:
movq %mm0, 8(%edx)
emms
ret
.Lfinish_zero:
/* No extra source limb.  mm3 is stored, then the last source qword
   shifted right supplies the top limbs: always its low dword to 12(%edx),
   and additionally the full qword to the same address when the
   destination was aligned (bit 5 of ecx set).  In the aligned case the
   movq rewrites the dword the movd has just stored. */
movq %mm3, 4(%edx)
psrlq %mm6, %mm2
movd %mm2, 12(%edx)
andl $32, %ecx
popl %ebx
jz .Lfinish_zero_unaligned
movq %mm2, 12(%edx)
.Lfinish_zero_unaligned:
emms
ret

View file

@ -0,0 +1,5 @@
mpih-add1.S
mpih-mul1.S
mpih-mul2.S
mpih-mul3.S
mpih-sub1.S

View file

@ -0,0 +1,91 @@
/* Intel Pentium-4 mpn_add_n -- mpn addition.
*
* Copyright 2001, 2002 Free Software Foundation, Inc.
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
*
* Note: This code is heavily based on the GNU MP Library.
* Actually it's the same code with only minor changes in the
* way the data is stored; this is to support the abstraction
* of an optional secure memory allocation which may be used
* to avoid revealing of sensitive data due to paging etc.
*/
#include "sysdep.h"
#include "asm-syntax.h"
/*******************
 * mpi_limb_t
 * _gcry_mpih_add_n( mpi_ptr_t res_ptr,	(sp + 4)
 *		     mpi_ptr_t s1_ptr,		(sp + 8)
 *		     mpi_ptr_t s2_ptr,		(sp + 12)
 *		     mpi_size_t size)		(sp + 16)
 *
 * Add the size-limb vectors at s1_ptr and s2_ptr, store the sum at
 * res_ptr and return the carry out of the most significant limb (0 or 1)
 * in %eax.
 *
 * ebx is preserved by parking it in the s1_ptr argument slot, so no
 * extra stack space is used.
 *
 * NOTE(review): size is presumably >= 1; the loop body runs before the
 * counter is tested.
 *
 * P4 Willamette, Northwood: 4.0 cycles/limb if dst!=src1 and dst!=src2
 *			     6.0 cycles/limb if dst==src1 or dst==src2
 * P4 Prescott:		     >= 5 cycles/limb
 *
 * The 4 c/l achieved here isn't particularly good, but is better than 9 c/l
 * for a basic adc loop.
 */
	TEXT
	ALIGN (3)
	GLOBL	C_SYMBOL_NAME(_gcry_mpih_add_n)
C_SYMBOL_NAME(_gcry_mpih_add_n:)
	pxor	%mm0, %mm0		/* carry = 0 */
	movl	8(%esp), %eax		/* s1_ptr */
	movl	%ebx, 8(%esp)		/* save ebx in the s1_ptr slot */
	movl	12(%esp), %ebx		/* s2_ptr */
	movl	4(%esp), %edx		/* res_ptr */
	movl	16(%esp), %ecx		/* size */
	leal	(%eax,%ecx,4), %eax	/* src1 end */
	leal	(%ebx,%ecx,4), %ebx	/* src2 end */
	leal	(%edx,%ecx,4), %edx	/* dst end */
	negl	%ecx			/* -size, counts up to zero */
.Ltop:
	/*
	 * eax	src1 end
	 * ebx	src2 end
	 * ecx	counter, limbs, negative
	 * edx	dst end
	 * mm0	carry bit
	 */
	movd	(%eax,%ecx,4), %mm1	/* zero-extended src1 limb */
	movd	(%ebx,%ecx,4), %mm2	/* zero-extended src2 limb */
	paddq	%mm2, %mm1		/* 33-bit sum of the two limbs */
	paddq	%mm1, %mm0		/* plus incoming carry */
	movd	%mm0, (%edx,%ecx,4)	/* low 32 bits are the result limb */
	psrlq	$32, %mm0		/* bit 32 becomes the new carry */
	addl	$1, %ecx		/* add, not inc: see README notes */
	jnz	.Ltop
	movd	%mm0, %eax		/* return the final carry */
	movl	8(%esp), %ebx		/* restore saved EBX */
	emms
	ret

View file

@ -0,0 +1,96 @@
/* Intel Pentium-4 mpn_mul_1 -- Multiply a limb vector with a limb and store
* the result in a second limb vector.
*
* Copyright 2001, 2002, 2003, 2005 Free Software Foundation, Inc.
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
*
* Note: This code is heavily based on the GNU MP Library.
* Actually it's the same code with only minor changes in the
* way the data is stored; this is to support the abstraction
* of an optional secure memory allocation which may be used
* to avoid revealing of sensitive data due to paging etc.
*/
#include "sysdep.h"
#include "asm-syntax.h"
/*******************
 * mpi_limb_t
 * _gcry_mpih_mul_1( mpi_ptr_t res_ptr,	(sp + 4)
 *		     mpi_ptr_t s1_ptr,		(sp + 8)
 *		     mpi_size_t s1_size,	(sp + 12)
 *		     mpi_limb_t s2_limb)	(sp + 16)
 *
 * Multiply each of the s1_size limbs at s1_ptr by s2_limb, store the low
 * limbs of the running result at res_ptr and return the final carry limb
 * in %eax.  Only eax, ecx, edx and MMX registers are used.
 *
 * NOTE(review): s1_size is presumably >= 1; the loop body runs before
 * the counter is tested.
 *
 * cycles/limb			src != dst	src == dst
 * P6 model 9  (Banias)		   ?.?
 * P6 model 13 (Dothan)		   4.75		   4.75
 * P4 model 0  (Willamette)	   4.0		   6.0
 * P4 model 1  (?)		   4.0		   6.0
 * P4 model 2  (Northwood)	   4.0		   6.0
 * P4 model 3  (Prescott)	   ?.?		   ?.?
 * P4 model 4  (Nocona)		   ?.?		   ?.?
 *
 * When src == dst the write-combining behaviour described in
 * pentium4/README costs two extra cycles per limb.
 */
	TEXT
	ALIGN (3)
	GLOBL	C_SYMBOL_NAME(_gcry_mpih_mul_1)
C_SYMBOL_NAME(_gcry_mpih_mul_1:)
	pxor	%mm0, %mm0		/* carry limb starts at zero */
.Lstart_1c:
	/* NOTE(review): this label is not referenced in this file;
	   presumably a leftover carry-in entry point. */
	movl	4(%esp), %edx		/* edx = res_ptr */
	movl	8(%esp), %eax		/* eax = s1_ptr */
	movl	12(%esp), %ecx		/* ecx = s1_size */
	movd	16(%esp), %mm7		/* mm7 = s2_limb */
.Lmul1_limb:
	/*
	 * eax	source pointer, advancing
	 * ecx	limbs still to do
	 * edx	destination pointer, advancing
	 * mm0	carry limb
	 * mm7	multiplier
	 */
	movd	(%eax), %mm1		/* next source limb */
	addl	$4, %eax
	pmuludq	%mm7, %mm1		/* 64-bit product */
	paddq	%mm1, %mm0		/* product + carry */
	movd	%mm0, (%edx)		/* low half is the result limb */
	addl	$4, %edx
	psrlq	$32, %mm0		/* high half is the next carry */
	subl	$1, %ecx
	jnz	.Lmul1_limb
	movd	%mm0, %eax		/* return the carry limb */
	emms
	ret

View file

@ -0,0 +1,136 @@
/* Intel Pentium-4 mpn_addmul_1 -- Multiply a limb vector with a limb and add
* the result to a second limb vector.
*
* Copyright 2001, 2002, 2004, 2005 Free Software Foundation, Inc.
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
*
* Note: This code is heavily based on the GNU MP Library.
* Actually it's the same code with only minor changes in the
* way the data is stored; this is to support the abstraction
* of an optional secure memory allocation which may be used
* to avoid revealing of sensitive data due to paging etc.
*/
#include "sysdep.h"
#include "asm-syntax.h"
/*******************
 * mpi_limb_t
 * _gcry_mpih_addmul_1( mpi_ptr_t res_ptr, (sp + 4)
 * mpi_ptr_t s1_ptr, (sp + 8)
 * mpi_size_t s1_size, (sp + 12)
 * mpi_limb_t s2_limb) (sp + 16)
 *
 * Multiply the s1_size limbs at s1_ptr by s2_limb, add the products to
 * the limbs at res_ptr in place and return the final carry limb in %eax.
 *
 * NOTE(review): s1_size is presumably >= 1; the first source limb is
 * loaded before the size is examined.
 *
 * cycles/limb:
 * P6 model 9 (Banias) ?.?
 * P6 model 13 (Dothan) 5.8
 * P4 model 0 (Willamette) 5.5
 * P4 model 1 (?) 5.5
 * P4 model 2 (Northwood) 5.5
 * P4 model 3 (Prescott) 6.0
 * P4 model 4 (Nocona)
 *
 * Only the carry limb propagation is on the dependent chain, but some other
 * Pentium4 pipeline magic brings down performance to 6 cycles/l from the
 * ideal 4 cycles/l.
 */
TEXT
ALIGN (4)
GLOBL C_SYMBOL_NAME(_gcry_mpih_addmul_1)
C_SYMBOL_NAME(_gcry_mpih_addmul_1:)
/* mm4 = running carry, initially zero. */
pxor %mm4, %mm4
/* NOTE(review): .Lstart_1c is not referenced in this block; presumably a
   leftover carry-in entry point. */
.Lstart_1c:
/* eax = s1_ptr, ecx = s1_size, edx = res_ptr, mm7 = s2_limb */
movl 8(%esp), %eax
movl 12(%esp), %ecx
movl 4(%esp), %edx
movd 16(%esp), %mm7
/*
 * eax src, incrementing
 * ecx loop counter, decrementing
 * edx dst, incrementing
 *
 * mm4 carry, low 32-bits
 * mm7 multiplier
 */
/* mm2 = first source limb * multiplier (64-bit product). */
movd (%eax), %mm2
pmuludq %mm7, %mm2
/* ecx = number of limb pairs; the carry flag receives the low bit of the
   size, so jnc is taken when the size is even. */
shrl $1, %ecx
jnc .Leven
/* Odd size: finish the first limb on its own.
   dst limb + product + carry; low dword stored, high dword = new carry. */
leal 4(%eax), %eax
movd (%edx), %mm1
paddq %mm2, %mm1
paddq %mm1, %mm4
movd %mm4, (%edx)
psrlq $32, %mm4
/* No pairs left means the size was 1: return the carry. */
testl %ecx, %ecx
jz .Lrtn
/* Otherwise step dst and precompute the product for the next limb, which
   is what .Leven expects in mm2. */
leal 4(%edx), %edx
movd (%eax), %mm2
pmuludq %mm7, %mm2
.Leven:
/* Prime the pipeline for a pair: mm2 = product of the pair's first limb
   (already computed), mm0 = product of its second limb, mm1 = first dst
   limb.  If this is the last pair, skip the loop. */
movd 4(%eax), %mm0
movd (%edx), %mm1
pmuludq %mm7, %mm0
subl $1, %ecx
jz .Lend
.Lloop:
/* Two limbs per iteration.  While the current pair is accumulated into
   mm4 and stored, the products (mm2, mm0) and the first dst limb (mm1) of
   the following pair are already being loaded; the loop is only entered
   or repeated when such a following pair exists.  The jnz consumes the
   flags from subl; leal does not modify them. */
paddq %mm2, %mm1
movd 8(%eax), %mm2
paddq %mm1, %mm4
movd 4(%edx), %mm3
pmuludq %mm7, %mm2
movd %mm4, (%edx)
psrlq $32, %mm4
paddq %mm0, %mm3
movd 12(%eax), %mm0
paddq %mm3, %mm4
movd 8(%edx), %mm1
pmuludq %mm7, %mm0
movd %mm4, 4(%edx)
psrlq $32, %mm4
leal 8(%eax), %eax
leal 8(%edx), %edx
subl $1, %ecx
jnz .Lloop
.Lend:
/* Last pair: same accumulation as in the loop, without fetching any
   further source or destination limbs. */
paddq %mm2, %mm1
paddq %mm1, %mm4
movd 4(%edx), %mm3
movd %mm4, (%edx)
psrlq $32, %mm4
paddq %mm0, %mm3
paddq %mm3, %mm4
movd %mm4, 4(%edx)
psrlq $32, %mm4
.Lrtn:
/* Return the final carry limb. */
movd %mm4, %eax
emms
ret

View file

@ -0,0 +1,127 @@
/* Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and
* subtract the result from a second limb vector.
*
* Copyright 2001, 2002 Free Software Foundation, Inc.
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
*
* Note: This code is heavily based on the GNU MP Library.
* Actually it's the same code with only minor changes in the
* way the data is stored; this is to support the abstraction
* of an optional secure memory allocation which may be used
* to avoid revealing of sensitive data due to paging etc.
*/
#include "sysdep.h"
#include "asm-syntax.h"
/*******************
 * mpi_limb_t
 * _gcry_mpih_submul_1( mpi_ptr_t res_ptr,	(sp + 4)
 *			mpi_ptr_t s1_ptr,	(sp + 8)
 *			mpi_size_t s1_size,	(sp + 12)
 *			mpi_limb_t s2_limb)	(sp + 16)
 *
 * Multiply the s1_size limbs at s1_ptr by s2_limb, subtract the products
 * from the limbs at res_ptr in place and return the final borrow limb in
 * %eax.
 *
 * NOTE(review): s1_size is presumably >= 1; the loop body runs before
 * the counter is tested.
 *
 * P4: 7 cycles/limb, unstable timing, at least on early Pentium4 silicon
 *     (stepping 10).
 *
 * 7 c/l is not particularly good.  The dependent chain is only 4 c/l and
 * there are only 4 MMX unit instructions, so it is unclear why that speed
 * is not reached.
 *
 * How the two-instruction dependent chain works:
 *
 * The borrow is kept as a negative quantity so that a single paddq yields
 * a low limb ready to store and a high limb that becomes the next borrow
 * after a psrlq.
 *
 * A plain twos-complement negative would need the psrlq to shift in 0 or
 * 1 bits depending on whether the high half was zero, i.e. a sign
 * extension, which would cost another instruction on the chain.  Instead
 * the high limb is kept offset as 0xFFFFFFFF+c.  With c between
 * -0xFFFFFFFF and 0 that value lies between 0 and 0xFFFFFFFF, so it is
 * never negative and the zero bits psrlq shifts in are always right.
 *
 * The offset has to be removed again before c is used, which happens off
 * the dependent chain: adding 0xFFFFFFFF00000000 offsets the new borrow,
 * subtracting 0x00000000FFFFFFFF removes the offset from the current one,
 * for a net addition of 0xFFFFFFFE00000001.  That constant is added to
 * each destination limb as it is fetched.
 *
 * The 0xFFFFFFFF offset can also be read as a ones-complement negative,
 * which is how it is undone for the return value (notl).
 */
	TEXT
	ALIGN (4)
	GLOBL	C_SYMBOL_NAME(_gcry_mpih_submul_1)
C_SYMBOL_NAME(_gcry_mpih_submul_1:)
	pxor	%mm1, %mm1		/* initial borrow = 0 */
.Lstart_1c:
	/* NOTE(review): this label is not referenced in this file;
	   presumably a leftover borrow-in entry point. */
	movl	4(%esp), %edx		/* edx = res_ptr */
	movl	8(%esp), %eax		/* eax = s1_ptr */
	movl	12(%esp), %ecx		/* ecx = s1_size */
	movd	16(%esp), %mm7		/* mm7 = s2_limb */
	pcmpeqd	%mm0, %mm0		/* mm0 = all ones */
	pcmpeqd	%mm6, %mm6		/* mm6 = all ones */
	psrlq	$32, %mm0		/* mm0 = 0x00000000FFFFFFFF */
	psllq	$32, %mm6		/* mm6 = 0xFFFFFFFF00000000 */
	psubq	%mm0, %mm6		/* mm6 = 0xFFFFFFFE00000001 */
	psubq	%mm1, %mm0		/* mm0 = 0xFFFFFFFF - borrow */
.Lsubmul_limb:
	/*
	 * eax	source pointer, advancing
	 * ecx	limbs still to do
	 * edx	destination pointer, advancing
	 * mm0	0xFFFFFFFF - borrow
	 * mm6	0xFFFFFFFE00000001
	 * mm7	multiplier
	 */
	movd	(%eax), %mm1		/* source limb */
	leal	4(%eax), %eax
	movd	(%edx), %mm2		/* destination limb */
	paddq	%mm6, %mm2		/* apply the combined offset */
	pmuludq	%mm7, %mm1		/* 64-bit product */
	psubq	%mm1, %mm2		/* dst + offset - product */
	paddq	%mm2, %mm0		/* fold in the offset borrow */
	subl	$1, %ecx		/* flags survive until the jnz */
	movd	%mm0, (%edx)		/* low half is the result limb */
	psrlq	$32, %mm0		/* high half = 0xFFFFFFFF - borrow */
	leal	4(%edx), %edx
	jnz	.Lsubmul_limb
	movd	%mm0, %eax
	notl	%eax			/* undo the offset: eax = borrow */
	emms
	ret

View file

@ -0,0 +1,112 @@
/* Intel Pentium-4 mpn_sub_n -- mpn subtraction.
*
* Copyright 2001, 2002 Free Software Foundation, Inc.
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
*
* Note: This code is heavily based on the GNU MP Library.
* Actually it's the same code with only minor changes in the
* way the data is stored; this is to support the abstraction
* of an optional secure memory allocation which may be used
* to avoid revealing of sensitive data due to paging etc.
*/
#include "sysdep.h"
#include "asm-syntax.h"
/*******************
 * mpi_limb_t
 * _gcry_mpih_sub_n( mpi_ptr_t res_ptr,	(sp + 4)
 *		     mpi_ptr_t s1_ptr,		(sp + 8)
 *		     mpi_ptr_t s2_ptr,		(sp + 12)
 *		     mpi_size_t size)		(sp + 16)
 *
 * Subtract the size-limb vector at s2_ptr from the one at s1_ptr, store
 * the difference at res_ptr and return the borrow out of the most
 * significant limb (0 or 1) in %eax.
 *
 * ebx is preserved by parking it in the s1_ptr argument slot.
 *
 * NOTE(review): size is presumably >= 1; the loop body runs before the
 * counter is tested.
 *
 * P4 Willamette, Northwood: 4.0 cycles/limb if dst!=src1 and dst!=src2
 *			     6.0 cycles/limb if dst==src1 or dst==src2
 * P4 Prescott:		     >= 5 cycles/limb
 *
 * The loop handles two limbs per pass so that the borrow can ping-pong
 * between mm0 and mm1 instead of being copied.
 */
	.text
	ALIGN (3)
	.globl	C_SYMBOL_NAME(_gcry_mpih_sub_n)
C_SYMBOL_NAME(_gcry_mpih_sub_n:)
	pxor	%mm0, %mm0		/* incoming borrow = 0 */
.Lstart_nc:
	/* NOTE(review): this label is not referenced in this file;
	   presumably a leftover borrow-in entry point. */
	movl	16(%esp), %ecx		/* ecx = size */
	movl	4(%esp), %edx		/* edx = res_ptr */
	movl	8(%esp), %eax		/* eax = s1_ptr */
	movl	%ebx, 8(%esp)		/* save ebx in the s1_ptr slot */
	movl	12(%esp), %ebx		/* ebx = s2_ptr */
	leal	(%edx,%ecx,4), %edx	/* dst end */
	leal	(%eax,%ecx,4), %eax	/* src1 end */
	leal	(%ebx,%ecx,4), %ebx	/* src2 end */
	negl	%ecx			/* -size, counts up to zero */
.Lsub_pair:
	/*
	 * eax	src1 end
	 * ebx	src2 end
	 * ecx	counter, limbs, negative
	 * edx	dst end
	 * mm0	borrow bit coming in
	 */

	/* First limb of the pair: borrow in mm0, borrow out mm1. */
	movd	(%eax,%ecx,4), %mm1
	movd	(%ebx,%ecx,4), %mm2
	psubq	%mm2, %mm1
	psubq	%mm0, %mm1
	movd	%mm1, (%edx,%ecx,4)
	psrlq	$63, %mm1		/* sign bit of the 64-bit result */
	addl	$1, %ecx
	jz	.Lborrow_in_mm1

	/* Second limb of the pair: borrow in mm1, borrow out mm0. */
	movd	(%eax,%ecx,4), %mm0
	movd	(%ebx,%ecx,4), %mm2
	psubq	%mm2, %mm0
	psubq	%mm1, %mm0
	movd	%mm0, (%edx,%ecx,4)
	psrlq	$63, %mm0
	addl	$1, %ecx
	jnz	.Lsub_pair

	/* Even size: the final borrow is in mm0. */
	movd	%mm0, %eax
	movl	8(%esp), %ebx		/* restore ebx */
	emms
	ret
.Lborrow_in_mm1:
	/* Odd size: the final borrow is in mm1. */
	movd	%mm1, %eax
	movl	8(%esp), %ebx		/* restore ebx */
	emms
	ret