cosmopolitan/libc/nexgen32e/div10.greg.S

/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
│                                                                              │
│ This program is free software; you can redistribute it and/or modify         │
│ it under the terms of the GNU General Public License as published by         │
│ the Free Software Foundation; version 2 of the License.                      │
│                                                                              │
│ This program is distributed in the hope that it will be useful, but          │
│ WITHOUT ANY WARRANTY; without even the implied warranty of                   │
│ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU             │
│ General Public License for more details.                                     │
│                                                                              │
│ You should have received a copy of the GNU General Public License            │
│ along with this program; if not, write to the Free Software                  │
│ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA                │
│ 02110-1301 USA                                                               │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/macros.h"

/	Divides 128-bit unsigned integer by 10 without using div or mod.
/
/	Without this one-off function, our palandprintf() implementation
/	would cause nearly everything to need a soft math library. It
/	also somehow goes faster than 64-bit IDIV.
/
/	When the high word is zero, the quotient comes from a single
/	multiply by ceil(2^67/10). Otherwise a shift-subtract loop
/	produces one quotient bit per round.
/
/	@param	rdi:rsi is the number (rdi = low word, rsi = high word)
/	@param	rdx points to where 32-bit remainder goes (zero = skip)
/	@return	rax:rdx is result of division (rax = low, rdx = high)
/	@note	scratches rcx, rsi, rdi, r8, r9, r10, r11 and flags
/	@see	“Division by Invariant Integers using Multiplication”
/	@see	llog10() and div10int64() are a tiny bit faster
/
/	NOTE(review): .leafprologue, .profilable and .leafepilogue are
/	macros from libc/macros.h; presumably the epilogue also performs
/	the return. Confirm against that header.
div10:	.leafprologue
	.profilable
	push	%rbx			# rbx is scratch inside the loop
	mov	%rdx,%r8		# free rdx; r8 = remainder pointer
	test	%rsi,%rsi
	je	1f			# high word is zero: fast path
/	High word is nonzero. Let sr = 125 - clz(hi), so sr is 62..125.
/	The three cases below load rax:rsi:rdi:r11, used by the loop as
/	rem.hi:rem.lo:quo.hi:quo.lo, with the 256-bit value
/	n << (128 - sr). The loop then runs sr rounds.
	bsr	%rsi,%r10
	xor	$63,%r10d		# r10d = clz(hi)
	mov	$125,%r9d
	sub	%r10d,%r9d		# r9d = sr
	cmp	$64,%r9d
	jne	6f
/	sr == 64: rsi and rdi already sit in the rem.lo and quo.hi slots.
	xor	%eax,%eax		# rem.hi = 0
	xor	%r11d,%r11d		# quo.lo = 0
	jmp	9f
/	Fast path for hi == 0. Since 0xcccccccccccccccd = ceil(2^67/10),
/	the quotient is the high half of the product shifted right by 3.
/	The product is computed once and feeds both remainder and result.
1:	movabs	$0xcccccccccccccccd,%rcx
	mov	%rdi,%rax
	mul	%rcx			# rdx:rax = lo * ceil(2^67/10)
	shr	$3,%rdx			# rdx = lo / 10
	test	%r8,%r8
	je	3f			# remainder not requested
	lea	(%rdx,%rdx,4),%eax	# eax = 5 * quotient (mod 2^32)
	add	%eax,%eax		# eax = 10 * quotient (mod 2^32)
	sub	%eax,%edi		# edi = lo - 10 * quotient
	mov	%edi,(%r8)		# 32-bit remainder store
3:	mov	%rdx,%rax		# low word of result
	xor	%edi,%edi		# high word of result is zero
	jmp	14f
/	sr != 64. Shift counts are taken mod 64 by the hardware, so
/	cl = -sr acts as a left shift by (64 - sr) or (128 - sr).
6:	mov	%r9d,%ecx
	neg	%cl
	cmp	$62,%r10d
	jb	8f			# clz(hi) < 62 means sr > 64
/	sr is 62 or 63 (hi is 1, 2 or 3).
	mov	%rdi,%rdx
	shl	%cl,%rdx		# rdx = lo << (64 - sr)
	mov	%rsi,%rax
	mov	%r9d,%ecx		# cl = sr
	shr	%cl,%rax		# rem.hi = hi >> sr
	shrd	%cl,%rsi,%rdi		# rdi = hi:lo >> sr, low 64 bits
	xor	%r11d,%r11d		# quo.lo = 0
	mov	%rdi,%rsi		# rem.lo
	mov	%rdx,%rdi		# quo.hi
	jmp	9f
/	sr is 65..125.
8:	mov	%rdi,%r11
	shl	%cl,%r11		# quo.lo = lo << (128 - sr)
	mov	%rsi,%rax
	shl	%cl,%rax		# rax = hi << (128 - sr)
	mov	%r9d,%ecx		# cl = sr, shifts use sr - 64
	shr	%cl,%rdi
	or	%rax,%rdi		# quo.hi = hi:lo >> (sr - 64)
	shr	%cl,%rsi		# rem.lo = hi >> (sr - 64)
	xor	%eax,%eax		# rem.hi = 0
/	Loop setup: r10d counts up from -sr to zero, rcx carries the
/	quotient bit of the previous round, r9 = 9 for the compare.
9:	add	$-125,%r10d		# r10d = clz(hi) - 125 = -sr
	xor	%ecx,%ecx
	mov	$9,%r9d
/	Each round shifts rem:quo left one bit as a 256-bit value, feeds
/	the previous quotient bit into bit 0, then subtracts 10 from rem
/	when rem is above 9.
10:	shld	$1,%rsi,%rax
	shld	$1,%rdi,%rsi
	shld	$1,%r11,%rdi
	mov	%r11,%rdx
	add	%r11,%rdx		# rdx = quo.lo << 1
	mov	%rcx,%r11
	or	%rdx,%r11		# quo.lo = quo.lo << 1 | previous bit
	cmp	%rsi,%r9		# CF set when rem.lo is above 9
	mov	$0,%ebx			# mov keeps CF alive for the sbb
	sbb	%rax,%rbx		# rbx = -rem.hi - CF
	sar	$63,%rbx		# rbx = all ones if negative, else 0
	mov	%ebx,%ecx
	and	$1,%ecx			# rcx = quotient bit of this round
	and	$10,%ebx		# rbx = 10 or 0
	sub	%rbx,%rsi
	sbb	$0,%rax			# rem -= rbx, across both words
	inc	%r10d
	jne	10b
	test	%r8,%r8
	je	13f			# remainder not requested
	mov	%esi,(%r8)		# 32-bit remainder store
/	Shift in the quotient bit of the final round. rdx still holds
/	the pre-merge quo.lo << 1, whose top bit equals that of r11.
13:	lea	(%rcx,%r11,2),%rax	# result.lo = quo.lo << 1 | last bit
	shld	$1,%rdx,%rdi		# result.hi = quo.hi << 1 | carry out
14:	mov	%rdi,%rdx		# high word of result
	pop	%rbx
	.leafepilogue
	.endfn	div10,globl,hidden
	.source	__FILE__
Polish up repository and other revisions 2020-06-16 13:38:43 +00:00			`/-- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│`
Initial import 2020-06-15 14:18:57 +00:00			`│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│`
			`╞══════════════════════════════════════════════════════════════════════════════╡`
			`│ Copyright 2020 Justine Alexandra Roberts Tunney │`
			`│ │`
			`│ This program is free software; you can redistribute it and/or modify │`
			`│ it under the terms of the GNU General Public License as published by │`
			`│ the Free Software Foundation; version 2 of the License. │`
			`│ │`
			`│ This program is distributed in the hope that it will be useful, but │`
			`│ WITHOUT ANY WARRANTY; without even the implied warranty of │`
			`│ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU │`
			`│ General Public License for more details. │`
			`│ │`
			`│ You should have received a copy of the GNU General Public License │`
			`│ along with this program; if not, write to the Free Software │`
			`│ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA │`
			`│ 02110-1301 USA │`
			`╚─────────────────────────────────────────────────────────────────────────────*/`
			`#include "libc/macros.h"`

			`/ Performs 128-bit div+mod by 10 without using div or mod.`
			`/`
			`/ If we didn't have this one-off function, our palandprintf()`
			`/ implementation would cause nearly everything to need a soft`
			`/ math library. It also somehow goes faster than 64-bit IDIV.`
			`/`
			`/ @param rdi:rsi is the number`
			`/ @param rdx points to where remainder goes`
			`/ @return rax:rdx is result of division`
			`/ @see “Division by Invariant Integers using Multiplication”`
			`/ @see llog10() and div10int64() is a tiny bit faster`
			`div10: .leafprologue`
			`.profilable`
			`push %rbx`
			`mov %rdx,%r8`
			`test %rsi,%rsi`
			`je 1f`
			`bsr %rsi,%r10`
			`xor $63,%r10d`
			`mov $125,%r9d`
			`sub %r10d,%r9d`
			`cmp $64,%r9d`
			`jne 6f`
			`xor %eax,%eax`
			`xor %r11d,%r11d`
			`jmp 9f`
			`1: test %r8,%r8`
			`je 3f`
			`movabs $0xcccccccccccccccd,%rcx`
			`mov %rdi,%rax`
			`mul %rcx`
			`shr $3,%rdx`
			`add %edx,%edx`
			`lea (%rdx,%rdx,4),%eax`
			`mov %edi,%ecx`
			`sub %eax,%ecx`
			`mov %ecx,(%r8)`
			`3: movabs $0xcccccccccccccccd,%rcx`
			`mov %rdi,%rax`
			`mul %rcx`
			`mov %rdx,%rax`
			`shr $3,%rax`
			`xor %edi,%edi`
			`jmp 14f`
			`6: mov %r9d,%ecx`
			`neg %cl`
			`cmp $62,%r10d`
			`jb 8f`
			`mov %rdi,%rdx`
			`shl %cl,%rdx`
			`mov %rsi,%rax`
			`mov %r9d,%ecx`
			`shr %cl,%rax`
			`shrd %cl,%rsi,%rdi`
			`xor %r11d,%r11d`
			`mov %rdi,%rsi`
			`mov %rdx,%rdi`
			`jmp 9f`
			`8: mov %rdi,%r11`
			`shl %cl,%r11`
			`mov %rsi,%rax`
			`shl %cl,%rax`
			`mov %r9d,%ecx`
			`shr %cl,%rdi`
			`or %rax,%rdi`
			`shr %cl,%rsi`
			`xor %eax,%eax`
			`9: add $-125,%r10d`
			`xor %ecx,%ecx`
			`mov $9,%r9d`
			`10: shld $1,%rsi,%rax`
			`shld $1,%rdi,%rsi`
			`shld $1,%r11,%rdi`
			`mov %r11,%rdx`
			`add %r11,%rdx`
			`mov %rcx,%r11`
			`or %rdx,%r11`
			`cmp %rsi,%r9`
			`mov $0,%ebx`
			`sbb %rax,%rbx`
			`sar $63,%rbx`
			`mov %ebx,%ecx`
			`and $1,%ecx`
			`and $10,%ebx`
			`sub %rbx,%rsi`
			`sbb $0,%rax`
			`inc %r10d`
			`jne 10b`
			`test %r8,%r8`
			`je 13f`
			`mov %esi,(%r8)`
			`13: lea (%rcx,%r11,2),%rax`
			`shld $1,%rdx,%rdi`
			`14: mov %rdi,%rdx`
			`pop %rbx`
			`.leafepilogue`
			`.endfn div10,globl,hidden`
Add scouts honor escape hatch for source embedding 2020-06-16 02:01:28 +00:00			`.source __FILE__`