/* mpn_sqrtrem -- square root and remainder
Contributed to the GNU project by Paul Zimmermann ( most code ) ,
Torbjorn Granlund ( mpn_sqrtrem1 ) and Marco Bodrato ( mpn_dc_sqrt ) .
THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE
INTERFACES . IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES .
IN FACT , IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A
FUTURE GMP RELEASE .
Copyright 1999 - 2002 , 2004 , 2005 , 2008 , 2010 , 2012 , 2015 , 2017 Free Software
Foundation , Inc .
This file is part of the GNU MP Library .
The GNU MP Library is free software ; you can redistribute it and / or modify
it under the terms of either :
* the GNU Lesser General Public License as published by the Free
Software Foundation ; either version 3 of the License , or ( at your
option ) any later version .
or
* the GNU General Public License as published by the Free Software
Foundation ; either version 2 of the License , or ( at your option ) any
later version .
or both in parallel , as here .
The GNU MP Library is distributed in the hope that it will be useful , but
WITHOUT ANY WARRANTY ; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE . See the GNU General Public License
for more details .
You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library . If not ,
see https://www.gnu.org/licenses/. */
/* See "Karatsuba Square Root", reference in gmp.texi. */
#include <stdio.h>
#include <stdlib.h>
#include "gmp-impl.h"
#include "longlong.h"
#define USE_DIVAPPR_Q
1
#define TRACE(x)
static const unsigned char invsqrttab[
384 ] =
/* The common 0x100 was removed */
{
0 xff,
0 xfd,
0 xfb,
0 xf9,
0 xf7,
0 xf5,
0 xf3,
0 xf2,
/* sqrt(1/80)..sqrt(1/87) */
0 xf0,
0 xee,
0 xec,
0 xea,
0 xe9,
0 xe7,
0 xe5,
0 xe4,
/* sqrt(1/88)..sqrt(1/8f) */
0 xe2,
0 xe0,
0 xdf,
0 xdd,
0 xdb,
0 xda,
0 xd8,
0 xd7,
/* sqrt(1/90)..sqrt(1/97) */
0 xd5,
0 xd4,
0 xd2,
0 xd1,
0 xcf,
0 xce,
0 xcc,
0 xcb,
/* sqrt(1/98)..sqrt(1/9f) */
0 xc9,
0 xc8,
0 xc6,
0 xc5,
0 xc4,
0 xc2,
0 xc1,
0 xc0,
/* sqrt(1/a0)..sqrt(1/a7) */
0 xbe,
0 xbd,
0 xbc,
0 xba,
0 xb9,
0 xb8,
0 xb7,
0 xb5,
/* sqrt(1/a8)..sqrt(1/af) */
0 xb4,
0 xb3,
0 xb2,
0 xb0,
0 xaf,
0 xae,
0 xad,
0 xac,
/* sqrt(1/b0)..sqrt(1/b7) */
0 xaa,
0 xa9,
0 xa8,
0 xa7,
0 xa6,
0 xa5,
0 xa4,
0 xa3,
/* sqrt(1/b8)..sqrt(1/bf) */
0 xa2,
0 xa0,
0 x9f,
0 x9e,
0 x9d,
0 x9c,
0 x9b,
0 x9a,
/* sqrt(1/c0)..sqrt(1/c7) */
0 x99,
0 x98,
0 x97,
0 x96,
0 x95,
0 x94,
0 x93,
0 x92,
/* sqrt(1/c8)..sqrt(1/cf) */
0 x91,
0 x90,
0 x8f,
0 x8e,
0 x8d,
0 x8c,
0 x8c,
0 x8b,
/* sqrt(1/d0)..sqrt(1/d7) */
0 x8a,
0 x89,
0 x88,
0 x87,
0 x86,
0 x85,
0 x84,
0 x83,
/* sqrt(1/d8)..sqrt(1/df) */
0 x83,
0 x82,
0 x81,
0 x80,
0 x7f,
0 x7e,
0 x7e,
0 x7d,
/* sqrt(1/e0)..sqrt(1/e7) */
0 x7c,
0 x7b,
0 x7a,
0 x79,
0 x79,
0 x78,
0 x77,
0 x76,
/* sqrt(1/e8)..sqrt(1/ef) */
0 x76,
0 x75,
0 x74,
0 x73,
0 x72,
0 x72,
0 x71,
0 x70,
/* sqrt(1/f0)..sqrt(1/f7) */
0 x6f,
0 x6f,
0 x6e,
0 x6d,
0 x6d,
0 x6c,
0 x6b,
0 x6a,
/* sqrt(1/f8)..sqrt(1/ff) */
0 x6a,
0 x69,
0 x68,
0 x68,
0 x67,
0 x66,
0 x66,
0 x65,
/* sqrt(1/100)..sqrt(1/107) */
0 x64,
0 x64,
0 x63,
0 x62,
0 x62,
0 x61,
0 x60,
0 x60,
/* sqrt(1/108)..sqrt(1/10f) */
0 x5f,
0 x5e,
0 x5e,
0 x5d,
0 x5c,
0 x5c,
0 x5b,
0 x5a,
/* sqrt(1/110)..sqrt(1/117) */
0 x5a,
0 x59,
0 x59,
0 x58,
0 x57,
0 x57,
0 x56,
0 x56,
/* sqrt(1/118)..sqrt(1/11f) */
0 x55,
0 x54,
0 x54,
0 x53,
0 x53,
0 x52,
0 x52,
0 x51,
/* sqrt(1/120)..sqrt(1/127) */
0 x50,
0 x50,
0 x4f,
0 x4f,
0 x4e,
0 x4e,
0 x4d,
0 x4d,
/* sqrt(1/128)..sqrt(1/12f) */
0 x4c,
0 x4b,
0 x4b,
0 x4a,
0 x4a,
0 x49,
0 x49,
0 x48,
/* sqrt(1/130)..sqrt(1/137) */
0 x48,
0 x47,
0 x47,
0 x46,
0 x46,
0 x45,
0 x45,
0 x44,
/* sqrt(1/138)..sqrt(1/13f) */
0 x44,
0 x43,
0 x43,
0 x42,
0 x42,
0 x41,
0 x41,
0 x40,
/* sqrt(1/140)..sqrt(1/147) */
0 x40,
0 x3f,
0 x3f,
0 x3e,
0 x3e,
0 x3d,
0 x3d,
0 x3c,
/* sqrt(1/148)..sqrt(1/14f) */
0 x3c,
0 x3b,
0 x3b,
0 x3a,
0 x3a,
0 x39,
0 x39,
0 x39,
/* sqrt(1/150)..sqrt(1/157) */
0 x38,
0 x38,
0 x37,
0 x37,
0 x36,
0 x36,
0 x35,
0 x35,
/* sqrt(1/158)..sqrt(1/15f) */
0 x35,
0 x34,
0 x34,
0 x33,
0 x33,
0 x32,
0 x32,
0 x32,
/* sqrt(1/160)..sqrt(1/167) */
0 x31,
0 x31,
0 x30,
0 x30,
0 x2f,
0 x2f,
0 x2f,
0 x2e,
/* sqrt(1/168)..sqrt(1/16f) */
0 x2e,
0 x2d,
0 x2d,
0 x2d,
0 x2c,
0 x2c,
0 x2b,
0 x2b,
/* sqrt(1/170)..sqrt(1/177) */
0 x2b,
0 x2a,
0 x2a,
0 x29,
0 x29,
0 x29,
0 x28,
0 x28,
/* sqrt(1/178)..sqrt(1/17f) */
0 x27,
0 x27,
0 x27,
0 x26,
0 x26,
0 x26,
0 x25,
0 x25,
/* sqrt(1/180)..sqrt(1/187) */
0 x24,
0 x24,
0 x24,
0 x23,
0 x23,
0 x23,
0 x22,
0 x22,
/* sqrt(1/188)..sqrt(1/18f) */
0 x21,
0 x21,
0 x21,
0 x20,
0 x20,
0 x20,
0 x1f,
0 x1f,
/* sqrt(1/190)..sqrt(1/197) */
0 x1f,
0 x1e,
0 x1e,
0 x1e,
0 x1d,
0 x1d,
0 x1d,
0 x1c,
/* sqrt(1/198)..sqrt(1/19f) */
0 x1c,
0 x1b,
0 x1b,
0 x1b,
0 x1a,
0 x1a,
0 x1a,
0 x19,
/* sqrt(1/1a0)..sqrt(1/1a7) */
0 x19,
0 x19,
0 x18,
0 x18,
0 x18,
0 x18,
0 x17,
0 x17,
/* sqrt(1/1a8)..sqrt(1/1af) */
0 x17,
0 x16,
0 x16,
0 x16,
0 x15,
0 x15,
0 x15,
0 x14,
/* sqrt(1/1b0)..sqrt(1/1b7) */
0 x14,
0 x14,
0 x13,
0 x13,
0 x13,
0 x12,
0 x12,
0 x12,
/* sqrt(1/1b8)..sqrt(1/1bf) */
0 x12,
0 x11,
0 x11,
0 x11,
0 x10,
0 x10,
0 x10,
0 x0f,
/* sqrt(1/1c0)..sqrt(1/1c7) */
0 x0f,
0 x0f,
0 x0f,
0 x0e,
0 x0e,
0 x0e,
0 x0d,
0 x0d,
/* sqrt(1/1c8)..sqrt(1/1cf) */
0 x0d,
0 x0c,
0 x0c,
0 x0c,
0 x0c,
0 x0b,
0 x0b,
0 x0b,
/* sqrt(1/1d0)..sqrt(1/1d7) */
0 x0a,
0 x0a,
0 x0a,
0 x0a,
0 x09,
0 x09,
0 x09,
0 x09,
/* sqrt(1/1d8)..sqrt(1/1df) */
0 x08,
0 x08,
0 x08,
0 x07,
0 x07,
0 x07,
0 x07,
0 x06,
/* sqrt(1/1e0)..sqrt(1/1e7) */
0 x06,
0 x06,
0 x06,
0 x05,
0 x05,
0 x05,
0 x04,
0 x04,
/* sqrt(1/1e8)..sqrt(1/1ef) */
0 x04,
0 x04,
0 x03,
0 x03,
0 x03,
0 x03,
0 x02,
0 x02,
/* sqrt(1/1f0)..sqrt(1/1f7) */
0 x02,
0 x02,
0 x01,
0 x01,
0 x01,
0 x01,
0 x00,
0 x00
/* sqrt(1/1f8)..sqrt(1/1ff) */
};
/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2. */
#if GMP_NUMB_BITS >
32
#define MAGIC CNST_LIMB(
0 x10000000000)
/* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(
0 x100000)
/* 0xfee6f < MAGIC < 0x29cbc8 */
#endif
static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS >
32
mp_limb_t a1;
#endif
mp_limb_t x0, t2, t, x2;
unsigned abits;
ASSERT_ALWAYS (GMP_NAIL_BITS ==
0 );
ASSERT_ALWAYS (GMP_LIMB_BITS ==
32 || GMP_LIMB_BITS ==
64 );
ASSERT (a0 >= GMP_NUMB_HIGHBIT /
2 );
/* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
since we can do the former without division . As part of the last
iteration convert from 1/sqrt(a) to sqrt(a). */
abits = a0 >> (GMP_LIMB_BITS -
1 -
8 );
/* extract bits for table lookup */
x0 =
0 x100 | invsqrttab[abits -
0 x80];
/* initial 1/sqrt(a) */
/* x0 is now an 8 bits approximation of 1/sqrt(a0) */
#if GMP_NUMB_BITS >
32
a1 = a0 >> (GMP_LIMB_BITS -
1 -
32 );
t = (mp_limb_signed_t) (CNST_LIMB(
0 x2000000000000) -
0 x30000 - a1 * x0 * x0) >>
16 ;
x0 = (x0 <<
16 ) + ((mp_limb_signed_t) (x0 * t) >> (
16 +
2 ));
/* x0 is now a 16 bits approximation of 1/sqrt(a0) */
t2 = x0 * (a0 >> (
32 -
8 ));
t = t2 >>
25 ;
t = ((mp_limb_signed_t) ((a0 <<
14 ) - t * t - MAGIC) >> (
32 -
8 ));
x0 = t2 + ((mp_limb_signed_t) (x0 * t) >>
15 );
x0 >>=
32 ;
#else
t2 = x0 * (a0 >> (
16 -
8 ));
t = t2 >>
13 ;
t = ((mp_limb_signed_t) ((a0 <<
6 ) - t * t - MAGIC) >> (
16 -
8 ));
x0 = t2 + ((mp_limb_signed_t) (x0 * t) >>
7 );
x0 >>=
16 ;
#endif
/* x0 is now a full limb approximation of sqrt(a0) */
x2 = x0 * x0;
if (x2 +
2 *x0 <= a0 -
1 )
{
x2 +=
2 *x0 +
1 ;
x0++;
}
*rp = a0 - x2;
return x0;
}
#define Prec (GMP_NUMB_BITS >>
1 )
#if !
defined (SQRTREM2_INPLACE)
#define SQRTREM2_INPLACE
0
#endif
/* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
#if SQRTREM2_INPLACE
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp)
{
mp_srcptr np = rp;
#else
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
#endif
mp_limb_t q, u, np0, sp0, rp0, q2;
int cc;
ASSERT (np[
1 ] >= GMP_NUMB_HIGHBIT /
2 );
np0 = np[
0 ];
sp0 = mpn_sqrtrem1 (rp, np[
1 ]);
rp0 = rp[
0 ];
/* rp0 <= 2*sp0 < 2^(Prec + 1) */
rp0 = (rp0 << (Prec -
1 )) + (np0 >> (Prec +
1 ));
q = rp0 / sp0;
/* q <= 2^Prec, if q = 2^Prec, reduce the overestimate. */
q -= q >> Prec;
/* now we have q < 2^Prec */
u = rp0 - q * sp0;
/* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
sp0 = (sp0 << Prec) | q;
cc = u >> (Prec -
1 );
rp0 = ((u << (Prec +
1 )) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (
1 ) << (Prec +
1 )) -
1 ));
/* subtract q * q from rp */
q2 = q * q;
cc -= rp0 < q2;
rp0 -= q2;
if (cc <
0 )
{
rp0 += sp0;
cc += rp0 < sp0;
--sp0;
rp0 += sp0;
cc += rp0 < sp0;
}
rp[
0 ] = rp0;
sp[
0 ] = sp0;
return cc;
}
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
and in { np , n } the low n limbs of the remainder , returns the high
limb of the remainder ( which is 0 or 1 ) .
Assumes { np , 2 n } is normalized , i . e . np [ 2 n - 1 ] > = B / 4
where B = 2 ^ GMP_NUMB_BITS .
Needs a scratch of n/2+1 limbs. */
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
{
mp_limb_t q;
/* carry out of {sp, n} */
int c, b;
/* carry out of remainder */
mp_size_t l, h;
ASSERT (n >
1 );
ASSERT (np[
2 * n -
1 ] >= GMP_NUMB_HIGHBIT /
2 );
l = n /
2 ;
h = n - l;
if (h ==
1 )
q = CALL_SQRTREM2_INPLACE (sp + l, np +
2 * l);
else
q = mpn_dc_sqrtrem (sp + l, np +
2 * l, h,
0 , scratch);
if (q !=
0 )
ASSERT_CARRY (mpn_sub_n (np +
2 * l, np +
2 * l, sp + l, h));
TRACE(printf(
"tdiv_qr(,,,,%u,,%u) -> %u\n" , (
unsigned ) n, (
unsigned ) h, (
unsigned ) (n - h +
1 )));
mpn_tdiv_qr (scratch, np + l, 0 , np + l, n, sp + l, h);
q += scratch[l];
c = scratch[0 ] & 1 ;
mpn_rshift (sp, scratch, l, 1 );
sp[l - 1 ] |= (q << (GMP_NUMB_BITS - 1 )) & GMP_NUMB_MASK;
if (UNLIKELY ((sp[0 ] & approx) != 0 )) /* (sp[0] & mask) > 1 */
return 1 ; /* Remainder is non-zero */
q >>= 1 ;
if (c != 0 )
c = mpn_add_n (np + l, np + l, sp + l, h);
TRACE(printf("sqr(,,%u)\n" , (unsigned ) l));
mpn_sqr (np + n, sp, l);
b = q + mpn_sub_n (np, np, np + n, 2 * l);
c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1 , (mp_limb_t) b);
if (c < 0 )
{
q = mpn_add_1 (sp + l, sp + l, h, q);
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
#else
c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2 )) + 2 * q;
#endif
c -= mpn_sub_1 (np, np, n, CNST_LIMB(1 ));
q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1 ));
}
return c;
}
#if USE_DIVAPPR_Q
static void
mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
{
gmp_pi1_t inv;
mp_limb_t qh;
ASSERT (dn > 2 );
ASSERT (nn >= dn);
ASSERT ((dp[dn-1 ] & GMP_NUMB_HIGHBIT) != 0 );
MPN_COPY (scratch, np, nn);
invert_pi1 (inv, dp[dn-1 ], dp[dn-2 ]);
if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
else
{
mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0 );
TMP_DECL;
TMP_MARK;
/* Sadly, scratch is too small. */
qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
TMP_FREE;
}
qp [nn - dn] = qh;
}
#endif
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
returns zero if the operand was a perfect square , one otherwise .
Assumes { np , 2 n - odd } * 4 ^ nsh is normalized , i . e . B > np [ 2 n - 1 - odd ] * 4 ^ nsh > = B / 4
where B = 2 ^ GMP_NUMB_BITS .
THINK : In the odd case , three more ( dummy ) limbs are taken into account ,
when nsh is maximal , two limbs are discarded from the result of the
division. Too much? Is a single dummy limb enough? */
static int
mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
{
mp_limb_t q; /* carry out of {sp, n} */
int c; /* carry out of remainder */
mp_size_t l, h;
mp_ptr qp, tp, scratch;
TMP_DECL;
TMP_MARK;
ASSERT (np[2 * n - 1 - odd] != 0 );
ASSERT (n > 4 );
ASSERT (nsh < GMP_NUMB_BITS / 2 );
l = (n - 1 ) / 2 ;
h = n - l;
ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
if (nsh != 0 )
{
/* o is used to exactly set the lowest bits of the dividend, is it needed? */
int o = l > (1 + odd);
ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
}
else
MPN_COPY (tp, np + l - 1 - odd, n + h + 1 );
q = mpn_dc_sqrtrem (sp + l, tp + l + 1 , h, 0 , scratch);
if (q != 0 )
ASSERT_CARRY (mpn_sub_n (tp + l + 1 , tp + l + 1 , sp + l, h));
qp = tp + n + 1 ; /* l + 2 */
TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n" , (unsigned ) n+1 , (unsigned ) h, (unsigned ) (n + 1 - h + 1 )));
#if USE_DIVAPPR_Q
mpn_divappr_q (qp, tp, n + 1 , sp + l, h, scratch);
#else
mpn_div_q (qp, tp, n + 1 , sp + l, h, scratch);
#endif
q += qp [l + 1 ];
c = 1 ;
if (q > 1 )
{
/* FIXME: if s!=0 we will shift later, a noop on this area. */
MPN_FILL (sp, l, GMP_NUMB_MAX);
}
else
{
/* FIXME: if s!=0 we will shift again later, shift just once. */
mpn_rshift (sp, qp + 1 , l, 1 );
sp[l - 1 ] |= q << (GMP_NUMB_BITS - 1 );
if (((qp[0 ] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
(qp[1 ] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd)- nsh - 1 )))) == 0 )
{
mp_limb_t cy;
/* Approximation is not good enough, the extra limb(+ nsh bits)
is smaller than needed to absorb the possible error. */
/* {qp + 1, l + 1} equals 2*{sp, l} */
/* FIXME: use mullo or wrap-around, or directly evaluate
remainder with a single sqrmod_bnm1. */
TRACE(printf("mul(,,%u,,%u)\n" , (unsigned ) h, (unsigned ) (l+1 )));
ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1 , l + 1 ));
/* Compute the remainder of the previous mpn_div(appr)_q. */
cy = mpn_sub_n (tp + 1 , tp + 1 , scratch, h);
#if USE_DIVAPPR_Q || WANT_ASSERT
MPN_DECR_U (tp + 1 + h, l, cy);
#if USE_DIVAPPR_Q
ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0 );
if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0 )
{
/* May happen only if div result was not exact. */
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
cy = mpn_addlsh1_n_ip1 (tp + 1 , sp + l, h);
#else
cy = mpn_addmul_1 (tp + 1 , sp + l, h, CNST_LIMB(2 ));
#endif
ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
MPN_DECR_U (sp, l, 1 );
}
/* Can the root be exact when a correction was needed? We
did not find an example , but it depends on divappr
internals, and we can not assume it true in general...*/
/* else */
#else /* WANT_ASSERT */
ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0 );
#endif
#endif
if (mpn_zero_p (tp + l + 1 , h - l))
{
TRACE(printf("sqr(,,%u)\n" , (unsigned ) l));
mpn_sqr (scratch, sp, l);
c = mpn_cmp (tp + 1 , scratch + l, l);
if (c == 0 )
{
if (nsh != 0 )
{
mpn_lshift (tp, np, l, 2 * nsh);
np = tp;
}
c = mpn_cmp (np, scratch + odd, l - odd);
}
if (c < 0 )
{
MPN_DECR_U (sp, l, 1 );
c = 1 ;
}
}
}
}
TMP_FREE;
if ((odd | nsh) != 0 )
mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0 ));
return c;
}
mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
mp_limb_t cc, high, rl;
int c;
mp_size_t rn, tn;
TMP_DECL;
ASSERT (nn > 0 );
ASSERT_MPN (np, nn);
ASSERT (np[nn - 1 ] != 0 );
ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1 ) / 2 , rp, nn));
ASSERT (! MPN_OVERLAP_P (sp, (nn + 1 ) / 2 , np, nn));
high = np[nn - 1 ];
if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2 )))
c = 0 ;
else
{
count_leading_zeros (c, high);
c -= GMP_NAIL_BITS;
c = c / 2 ; /* we have to shift left by 2c bits to normalize {np, nn} */
}
if (nn == 1 ) {
if (c == 0 )
{
sp[0 ] = mpn_sqrtrem1 (&rl, high);
if (rp != NULL)
rp[0 ] = rl;
}
else
{
cc = mpn_sqrtrem1 (&rl, high << (2 *c)) >> c;
sp[0 ] = cc;
if (rp != NULL)
rp[0 ] = rl = high - cc*cc;
}
return rl != 0 ;
}
if (nn == 2 ) {
mp_limb_t tp [2 ];
if (rp == NULL) rp = tp;
if (c == 0 )
{
#if SQRTREM2_INPLACE
rp[1 ] = high;
rp[0 ] = np[0 ];
cc = CALL_SQRTREM2_INPLACE (sp, rp);
#else
cc = mpn_sqrtrem2 (sp, rp, np);
#endif
rp[1 ] = cc;
return ((rp[0 ] | cc) != 0 ) + cc;
}
else
{
rl = np[0 ];
rp[1 ] = (high << (2 *c)) | (rl >> (GMP_NUMB_BITS - 2 *c));
rp[0 ] = rl << (2 *c);
CALL_SQRTREM2_INPLACE (sp, rp);
cc = sp[0 ] >>= c; /* c != 0, the highest bit of the root cc is 0. */
rp[0 ] = rl -= cc*cc; /* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
return rl != 0 ;
}
}
tn = (nn + 1 ) / 2 ; /* 2*tn is the smallest even integer >= nn */
if ((rp == NULL) && (nn > 8 ))
return mpn_dc_sqrt (sp, np, tn, c, nn & 1 );
TMP_MARK;
if (((nn & 1 ) | c) != 0 )
{
mp_limb_t s0[1 ], mask;
mp_ptr tp, scratch;
TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1 );
tp[0 ] = 0 ; /* needed only when 2*tn > nn, but saves a test */
if (c != 0 )
mpn_lshift (tp + (nn & 1 ), np, nn, 2 * c);
else
MPN_COPY (tp + (nn & 1 ), np, nn);
c += (nn & 1 ) ? GMP_NUMB_BITS / 2 : 0 ; /* c now represents k */
mask = (CNST_LIMB (1 ) << c) - 1 ;
rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0 , scratch);
/* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
s0[0 ] = sp[0 ] & mask; /* S mod 2^k */
rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0 ]); /* R = R + 2*s0*S */
cc = mpn_submul_1 (tp, s0, 1 , s0[0 ]);
rl -= (tn > 1 ) ? mpn_sub_1 (tp + 1 , tp + 1 , tn - 1 , cc) : cc;
mpn_rshift (sp, sp, tn, c);
tp[tn] = rl;
if (rp == NULL)
rp = tp;
c = c << 1 ;
if (c < GMP_NUMB_BITS)
tn++;
else
{
tp++;
c -= GMP_NUMB_BITS;
}
if (c != 0 )
mpn_rshift (rp, tp, tn, c);
else
MPN_COPY_INCR (rp, tp, tn);
rn = tn;
}
else
{
if (rp != np)
{
if (rp == NULL) /* nn <= 8 */
rp = TMP_SALLOC_LIMBS (nn);
MPN_COPY (rp, np, nn);
}
rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0 , TMP_ALLOC_LIMBS(tn / 2 + 1 )));
}
MPN_NORMALIZE (rp, rn);
TMP_FREE;
return rn;
}
Messung V0.5 in Prozent C=85 H=100 G=92
¤ Dauer der Verarbeitung: 0.9 Sekunden
¤
*© Formatika GbR, Deutschland