// file kernel/n/x86/smod.S: operations on residues modulo BASE^n - 1
/*-----------------------------------------------------------------------+
 |  Copyright 2005, Michel Quercia (michel.quercia@prepas.org)           |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                     Arithmetic modulo BASE^n - 1                      |
 |                                                                       |
 +-----------------------------------------------------------------------*/

                    # +----------------------------------+
                    # |  Subtraction modulo BASE^n - 1   |
                    # +----------------------------------+

# void xn(ssub)(chiffre *a, long la, chiffre *b, long lb)
#
# input:
# a = natural number of length la
# b = natural number of length n > 0   (n is passed as lb)
#
# output:
# b <- (a - b) mod (BASE^n - 1), not normalized
#
# Method, as implemented below: b is subtracted in place from the low
# digits of a, every further slice of lb digits of a is then added to b
# (BASE^n = 1 modulo BASE^n - 1), and each borrow or carry leaving the
# top of b is fed back into its lowest digit.
#
# NOTE(review): the code relies on the external helpers .Lsn_fsub_1,
# .Lsn_finc_1 and .Lsn_finc leaving their pointers just past the digits
# they processed and the borrow/carry in CF - confirm against them.

#ifdef assembly_sn_ssub
#undef L
#define L(x) .Lsn_ssub_##x
ENTER(sn_ssub)

        # local variables: the four arguments, addressed from esp
        # (offsets presumably account for what the entry macro pushes - confirm)
        #undef _a_
        #undef _b_
        #undef _la_
        #undef _lb_
        #define _a_  20(%esp)
        #define _la_ 24(%esp)
        #define _b_  28(%esp)
        #define _lb_ 32(%esp)

        movl   _a_,     %esi
        movl   _la_,    %edx
        movl   _b_,     %edi
        movl   _lb_,    %ecx
        movl   %edi,    %ebx            # ebx <- &b
        subl   %ecx,    %edx            # edx <- la - lb
        ja     L(big_a)                 # la > lb: a must be folded slice by slice

        # short a case (la <= lb)

        # subtract the common digits
        negl   %edx
        pushl  %edx                     # save lb-la
        subl   %edx,    %ecx            # ecx <- la
        jecxz  1f                       # la = 0: no common digit (CF is clear here)
        call   .Lsn_fsub_1
1:
        # subtract the remaining digits of b from 0
        popl   %ecx                     # ecx <- lb-la
        jecxz  3f
        leal   (%edi,%ecx,4), %edi      # edi <- &b[lb]
        not %ecx; incl %ecx             # ecx <- la-lb (not/incl leave CF untouched)
        ALIGN(4)
2:
        movl   $0,     %eax             # movl keeps the pending borrow in CF
        sbbl   (%edi,%ecx,4), %eax
        movl   %eax,   (%edi,%ecx,4)
        incl   %ecx
        jne    2b
3:
        # recycle the borrow
        jnb    L(done)
4:
        movl   _lb_,    %ecx
        not    %ecx; incl %ecx          # ecx <- -lb
5:
        subl   $1,      (%edi,%ecx,4)
        jnb    L(done)
        incl   %ecx
        jne    5b
        jmp    4b                       # borrow left the top again: restart at b[0]
        
        # case la > lb
        ALIGN(4)
L(big_a):

        # subtract the common digits and recycle the borrow
        movl   %edx,    _la_            # la -= lb
        call   .Lsn_fsub_1
        movl   %esi,    %ebx            # ebx <- &a[lb]
        jnb    L(next)
1:
        movl   _lb_,    %ecx
        not    %ecx; incl %ecx          # ecx <- -lb
2:
        subl   $1,      (%edi,%ecx,4)
        jnb    L(next)
        incl   %ecx
        jne    2b
        jmp    1b

        # add the next lb digits and recycle the carry
        ALIGN(4)
L(loop):
        call  .Lsn_finc_1
        jnc    L(next)
1:
        movl   _lb_,    %ecx
        not    %ecx; incl %ecx          # ecx <- -lb
2:
        incl   (%esi,%ecx,4)
        jne    L(next)
        incl   %ecx
        jne    2b
        jmp    1b
        
        # next slice
        ALIGN(4)
L(next):
        movl   _b_,     %esi
        movl   _lb_,    %ecx
        subl   %ecx,    _la_            # la -= lb
        jae    L(loop)                  # at least lb digits of a remain

        # last slice, incomplete
        movl   %ecx,    %edx            # edx <- lb
        addl   _la_,    %ecx            # ecx <- la
        jz     L(done)                  # nothing left to add
        call   .Lsn_finc
        jnc    L(done)
1:
        movl   _lb_,    %ecx
        not    %ecx; incl %ecx          # ecx <- -lb
2:
        incl   (%esi,%ecx,4)
        jne    L(done)
        incl   %ecx
        jne    2b
        jmp    1b

        # Done
        ALIGN(4)
L(done):
        RETURN_WITH_SP

#endif /* assembly_sn_ssub */

        

                      # +-------------------------------+
                      # |  Reduction modulo BASE^n - 1  |
                      # +-------------------------------+

# input:
#   a = natural number of length la    esi = &a,    edx = la
#   b = natural number of length n     edi = &b,    ecx = n
#
# constraints: n > 0, la >= 0
#
# output:
#   b <- a mod BASE^n - 1
#
# modified registers:
#   eax,ebx,ecx,edx,esi,edi,ebp <- undefined
#
# Method: b receives the first n digits of a, then every following block
# of n digits (the last one may be shorter) is added to b.  Carries out
# of the full blocks are counted in a stack slot and added to b[0] at
# the end; a carry leaving the top of b wraps around to b[0].
#
# NOTE(review): relies on the external helpers .Lsn_finc_1 and .Lsn_finc
# returning esi = &b[n], ecx = 0 (for .Lsn_finc_1) and the carry in CF.

        ALIGN(32)
.Lsn_fsred:
        
#undef L
#define L(x) .Lsn_fsred_##x

        # when la <= n, copy a into b and pad with zeros
        cld
        cmpl   %edx,    %ecx
        jb     L(big)
        xchgl  %ecx,    %edx            # ecx <- la
        subl   %ecx,    %edx            # edx <- n-la
        jecxz  1f
        rep movsl                       # b <- a
1:
        movl   %edx,    %ecx
        jecxz  2f
        xorl   %eax,    %eax
        rep    stosl                    # pad with n-la zeros
2:
        ret

        # local variables (valid once the four pushes below are done)
        #undef  _b_
        #undef  _la_
        #undef  _n_
        #undef  _r_
        #define _b_  12(%esp)
        #define _la_  8(%esp)
        #define _n_   4(%esp)
        #define _r_    (%esp)

        ALIGN(4)
L(big):
        pushl  %edi                     # save &b
        subl   %ecx,    %edx            # la -= n
        pushl  %edx                     # save la
        pushl  %ecx                     # save n
        pushl  $0                       # carry counter starts at 0
        rep    movsl                  # copy the first n digits of a into b
        movl   %esi,    %ebx            # ebx <- &a[n]

        # accumulate by blocks of n digits
L(loop):
        movl   _n_,    %ecx
        movl   _b_,    %esi
        movl   _la_,   %edx
        subl   %ecx,   %edx
        jbe    L(last)                  # at most n digits left
        movl   %edx,   _la_             # la -= n
        call   .Lsn_finc_1              # add one block
        adcl   %ecx,   _r_              # update the carry counter (ecx = 0 here)
        jmp    L(loop)
        ALIGN(4)

        # last block
L(last):
        xchgl  %ecx,   %edx             # edx <- n
        addl   %edx,   %ecx             # ecx <- la
        call   .Lsn_finc                # add the last block
        movl   _r_,    %eax
        movl   _n_,    %ecx
        not    %ecx
        incl   %ecx                     # ecx <- -n (not/incl keep CF of the addition)
        adcl   %eax,   (%esi,%ecx,4)    # recycle the carries: b[0] += r + CF
        jnc    2f
        incl   %ecx
        jz     3f                       # n = 1: b has no digit 1, wrap to b[0] at once
1:
        incl   (%esi,%ecx,4)
        jne    2f
        incl   %ecx
        jne    1b
3:
        movl   _n_,     %ecx            # carry left the top of b: restart at b[0]
        negl   %ecx
        jmp    1b
        ALIGN(4)
2:
        leal   16(%esp), %esp           # clean up the stack
        ret

             # +-------------------------------------------------+
             # |   Splitting modulo BASE^p - 1 and BASE^p + 1    |
             # +-------------------------------------------------+

# input:
#  a = natural number of length 2p       esi = &a,   edx = p
#  b = natural number of length p        edi = &b
#  c = natural number of length p+1      ebx = &c
#
# constraint:  p > 0, a,b,c do not overlap
#
# output:
#   b  <- a mod BASE^p - 1
#   c  <- a mod BASE^p + 1
#
# modified registers:
#   eax,ebx,ecx,edx,esi,edi,ebp <- undefined
#
# With a = a0 + a1*BASE^p, b is computed as a0 + a1 and c as a0 - a1;
# a carry (resp. borrow) leaving the top is recycled by adding 1 at the
# bottom of the result.
#
# NOTE(review): the calling conventions of the external helpers
# .Lsn_fadd_sub, .Lsn_fadd_1 and .Lsn_fsub_1 (advanced pointers, ecx = 0
# on return, carry in CF, borrow of the subtraction in eax for the
# combined helper) are inferred from their use here - confirm.

#if defined(assembly_sn_smul) || defined(assembly_sn_ssqr)
        ALIGN(32)
.Lsn_fsplit_even:

#ifdef use_sse2

        movl   %edx,    %ecx
        negl   %edx
        pushl  %edx                     # save -p
        movl   %ebx,    %edx            # edx <- &c
        leal  (%esi,%ecx,4), %ebx       # ebx <- &a[p]
        call   .Lsn_fadd_sub
        movl   %ecx,   (%edx)           # c[p] <- 0 (ecx expected to be 0 here)
        
        # recycle the carry of a0+a1
        jnc    2f
        movl   (%esp),  %ecx            # ecx <- -p
1:
        incl   %ecx
        incl -4(%edi,%ecx,4)            # first pass hits b[0]
        jz     1b                       # digit wrapped to 0: carry on to the next one
2:

        # recycle the borrow of a0-a1
        popl   %ecx                     # ecx <- -p
        testl  %eax,    %eax
        jz     2f
1:
        incl   %ecx
        incl -4(%edx,%ecx,4)            # first pass hits c[0], may reach c[p]
        jz     1b
2:

        emms
        ret             
        
#else /* use_sse2 */
        
        # local variables
        #undef  _a_
        #undef  _c_
        #undef  _p_
        #define _a_   8(%esp)
        #define _c_   4(%esp)
        #define _p_    (%esp)
        pushl   %esi
        pushl   %ebx
        pushl   %edx

        # b <- a mod BASE^p - 1
        movl   %edx,     %ecx
        leal   (%esi,%ecx,4), %ebx      # ebx <- &a[p]
        call   .Lsn_fadd_1              # b <- a0 + a1
        jnc    2f
        movl   _p_,     %ecx            # recycle the carry
        not    %ecx                     # ecx <- -p-1
1:
        incl   %ecx
        incl  (%edi,%ecx,4)             # first pass hits b[0]
        jz     1b
2:

        # c <- a mod BASE^p + 1
        movl   %esi,    %ebx            # ebx <- &a[p]
        movl   _a_,     %esi
        movl   _c_,     %edi
        movl   _p_,     %ecx
        call   .Lsn_fsub_1              # c[0..p-1] <- a0 - a1
        movl   %ecx,   (%edi)           # c[p] <- 0 (movl keeps the borrow in CF)
        jnc    2f
        movl   _p_,     %ecx            # recycle the borrow
        not    %ecx
1:
        incl   %ecx
        incl  (%edi,%ecx,4)             # first pass hits c[0], may reach c[p]
        jz     1b
2:
        leal 12(%esp), %esp             # clean up the stack
        ret

#endif /* use_sse2 */
        
#endif /* defined(assembly_sn_smul) || defined(assembly_sn_ssqr) */
        
       # +-------------------------------------------------------------+
       # |   Splitting modulo BASE^(p+1/2) - 1 and BASE^(p+1/2) + 1    |
       # +-------------------------------------------------------------+

# input:
#  a = natural number of length 2p+1     esi = &a,   edx = p
#  b = natural number of length p+1      edi = &b
#  c = natural number of length p+1      ebx = &c
#
# constraint:  p > 0, a,b,c do not overlap
#
# output:
#   b <- a mod BASE^(p+1/2) - 1
#   c <- a mod BASE^(p+1/2) + 1
#
# modified registers:
#   eax,ebx,ecx,edx,esi,edi,ebp <- undefined
#
# Both variants combine the digits a[0..p] with the digits a[p+1..2p]
# shifted left by 16 bits (half a digit): added for b, subtracted for c.
#
# NOTE(review): the shift by 16 implies 32-bit digits; the conventions of
# the external helpers .Lsn_fshift_up, .Lsn_fsub_1 and .Lsn_finc_1 are
# inferred from their use here - confirm.

#if defined(assembly_sn_smul) || defined(assembly_sn_ssqr)
        ALIGN(32)
.Lsn_fsplit_odd:

#ifdef use_sse2

        movd  (%esi),   %mm2            # mm2 <- a0[0]
        movq   %mm2,    %mm3            # mm3 <- a0[0]
        leal  (%esi,%edx,4), %eax       # eax <- &a[p]
        leal 4(%esi,%edx,8), %esi       # esi <- &a[2p+1]
        leal  (%edi,%edx,4), %edi       # edi <- &b[p]
        leal  (%ebx,%edx,4), %ebx       # ebx <- &c[p]
        negl   %edx                     # edx <- -p
        movl   %edx,    %ecx            # ecx <- -p
        
        # mm2 and mm3 carry the running 64-bit sums for b and c
        ALIGN(4)
1:
        movd  (%esi,%ecx,4), %mm1       # mm1 <- a1[i]
        psllq  $16,       %mm1          # mm1 <- a1[i] << 16
        paddq  %mm1,      %mm2          # mm2 <- carry + a0[i] + a1[i] << 16
        psubq  %mm1,      %mm3          # mm3 <- carry + a0[i] - a1[i] << 16
        movd   %mm2,     (%edi,%ecx,4)  # store b[i]
        movd   %mm3,     (%ebx,%ecx,4)  # store c[i]
        incl   %ecx
        psrlq  $32,       %mm2          # mm2 <- new carry for b
        pshufw $0xfe, %mm3, %mm3        # mm3 <- new carry for c
        movd  (%eax,%ecx,4), %mm0       # mm0 <- a0[i+1]
        paddq  %mm0,      %mm2          # mm2 <- carry + a0[i+1]
        paddq  %mm0,      %mm3          # mm3 <- carry + a0[i+1]
        jne    1b                       # tests ZF of the incl (MMX ops leave flags alone)

        movd   %mm2,     (%edi)         # store b[p]
        movd   %mm3,     (%ebx)         # store c[p]
        psrlq  $32,       %mm2          # mm2 <- new carry for b
        pshufw $0xfe, %mm3, %mm3        # mm3 <- new carry for c

        # recycle the carry out of b
        movd   %mm2,    %eax
        testl  %eax,    %eax
        jz     2f       
        movl   %edx,    %ecx
        addl   $0x10000, (%edi,%ecx,4)  # b[0] += 1 << 16
        jnc    2f
1:
        incl   %ecx
        incl   -4(%edi,%ecx,4)
        jz     1b
2:

        # recycle the carry out of c
        movd   %mm3,    %eax
        testl  %eax,    %eax
        jz     2f       
        andl   $0xffff, (%ebx)          # keep the low half of c[p] only
        movl   %edx,    %ecx
1:
        incl   %ecx
        incl -4(%ebx,%ecx,4)            # first pass hits c[0]
        jz     1b
2:

        emms
        ret

#else /* use_sse2 */
        
        # local variables
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _p_
        #define _a_  12(%esp)
        #define _b_   8(%esp)
        #define _c_   4(%esp)
        #define _p_    (%esp)
        pushl   %esi
        pushl   %edi
        pushl   %ebx
        pushl   %edx

        # b <- a1 << 16
        leal   4(%esi,%edx,4), %esi     # esi <- &a[p+1]
        movl   $16,     %ecx
        call   .Lsn_fshift_up
        shrl   %cl,     %eax            # presumably the bits shifted out - confirm
        movl   %eax,   (%edi)           # stored as the top digit of b

        # c <- a mod BASE^(p+1/2) + 1
        movl   _a_,     %esi
        movl   _b_,     %ebx
        movl   _c_,     %edi
        movl   _p_,     %ecx
        incl   %ecx                     # ecx <- p+1
        call   .Lsn_fsub_1
        jnb    2f
        andl   $0xffff, -4(%edi)        # recycle the borrow
        movl   _p_,     %ecx
        not    %ecx
1:
        incl   %ecx
        incl   -4(%edi,%ecx,4)
        jz     1b
2:

        # b <- a mod BASE^(p+1/2) - 1
        movl   _b_,     %esi
        movl   _a_,     %ebx
        movl   _p_,     %ecx
        incl   %ecx                     # ecx <- p+1
        call   .Lsn_finc_1
        jnc    2f
        movl   _p_,    %ecx             # recycle the carry
        negl   %ecx
        addl   $0x10000, -4(%esi,%ecx,4)
        jnc    2f
1:
        incl   %ecx
        incl   -4(%esi,%ecx,4)
        jz     1b
2:
        leal  16(%esp), %esp            # clean up the stack
        ret

#endif /* use_sse2 */
        
#endif /* defined(assembly_sn_smul) || defined(assembly_sn_ssqr) */
        

                   # +------------------------------------+
                   # |  Multiplication modulo BASE^n - 1  |
                   # +------------------------------------+

# input:
#   a = natural number of length n     esi = &a,   ecx = n
#   b = natural number of length n     ebx = &b
#   c = natural number of length n     edi = &c
#
# constraint: n > 0
#   
# output:
#   c <- a*b mod BASE^n - 1
#
# modified registers:
#   eax,ebx,ecx,edx,esi,edi,ebp <- undefined
#
# Dispatch: lengths up to smul_lim_odd (n odd) or smul_lim_even (n even)
# use a plain product followed by a fold of its two halves; longer
# operands go to the big_odd / big_even code further down.
#
# NOTE(review): .Lsn_ftoommul and .Lsn_fadd_1 are external helpers; the
# register conventions used here (product of esi/ecx by ebx/edx into edi,
# edi advanced past the sum, carry in CF) are inferred - confirm.

#ifdef assembly_sn_smul
        ALIGN(32)
.Lsn_fsmul:

#undef L
#define L(x) .Lsn_fsmul_##x

        # dispatch on length and parity
        movl    %ecx,   %edx
        shrl    $1,     %edx            # edx <- p = n/2, CF <- parity of n
        jnc     1f
        cmpl    $smul_lim_odd, %ecx
        jbe     L(small)
        jmp     L(big_odd)
        ALIGN(4)
1:
        cmpl    $smul_lim_even, %ecx
        ja      L(big_even)             # unsigned test, same family as the odd branch
        
        # small multiplication => Toom
L(small):
        leal   (,%ecx,8), %eax
        subl   %eax,    %esp            # reserve 2n digits on the stack
        pushl  %ecx                     # save n
        pushl  %edi                     # save &c
        movl   %ecx,    %edx            # edx <- n
        leal   8(%esp), %edi            # edi <- &d
        call   .Lsn_ftoommul            # d <- a*b
        
        # landing point for fssqr
.Lsn_smul_aux_small:
        popl   %edi                     # recover &c
        leal   4(%esp), %esi            # esi <- &d
        movl   (%esp),  %ecx            # ecx <- n
        leal   (%esi,%ecx,4), %ebx      # ebx <- &d[n]
        call   .Lsn_fadd_1              # c <- d[0..n-1] + d[n..2n-1]
        jnc    3f
1:
        movl   (%esp),  %ecx            # recycle the carry
        negl   %ecx
2:
        incl   (%edi,%ecx,4)
        jne    3f
        incl   %ecx
        jne    2b
        jmp    1b                       # carry left the top again: restart at c[0]
        ALIGN(4)
3:
        popl   %ecx
        leal   (%esp,%ecx,8), %esp      # clean up the stack
        ret
        ALIGN(4)

        # large even n: split into two modular products, one modulo
        # BASE^p + 1 and one modulo BASE^p - 1 with p = n/2, then
        # recombine them.
        # NOTE(review): .Lsn_mmul, .Lsn_fhalf_add_sub, .Lsn_fdec_1 and
        # .Lsn_fhalf are external; what they return in registers and
        # flags is inferred from the use made of it below - confirm.
L(big_even):

        # local variables
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _p_
        #undef  _r_
        #define _d_ 16(%esp)
        #define _a_ 12(%esp)
        #define _b_  8(%esp)
        #define _c_  4(%esp)
        #define _p_   (%esp)
        #define _r_  _a_
        
        leal  1(%edx,%edx,2), %eax      # reserve 3p+1 digits on the stack
        negl   %eax
        leal  (%esp,%eax,4),  %esp
        pushl  %esi                     # save &a
        pushl  %ebx                     # save &b
        pushl  %edi                     # save &c
        pushl  %edx                     # save p

        # split a and b
        movl   %edi,    %ebx            # c receives a mod BASE^p + 1
        leal   _d_,     %edi
        leal  4(%edi,%edx,4), %edi      # edi <- &d[p+1]
        call  .Lsn_fsplit_even
        movl   _b_,     %esi
        movl   _p_,     %edx
        leal   _d_,     %ebx            # d[0..p] receives b mod BASE^p + 1
        leal  4(%ebx,%edx,8), %edi      # edi <- &d[2p+1]
        call  .Lsn_fsplit_even

        # c[0..p] <- a*b mod BASE^p + 1
        movl   _c_,     %esi
        leal   _d_,     %ebx
        movl   _p_,     %ecx
        pushl  %ecx                     # arguments go on the stack for this call
        pushl  %ebx
        pushl  %esi
        call   .Lsn_mmul
        leal   12(%esp), %esp

        # c[p..2p-1] <- a*b mod BASE^p - 1
        movl   _c_,     %edi
        movl   _p_,     %ecx
        leal   (%edi,%ecx,4), %edi      # edi <- &c[p]
        leal   _d_,     %esi
        leal  4(%esi,%ecx,4), %esi      # esi <- &d[p+1]
        leal   (%esi,%ecx,4), %ebx      # ebx <- &d[2p+1]
        movl   (%edi),  %eax
        movl   %eax,    _r_             # r <- c[p] (reuses the slot of &a)
        call   .Lsn_fsmul               # recursive call on length p

        # landing point for ssqr
.Lsn_smul_aux_big_even:

#ifdef use_sse2

        # make c0+c1 even by adding BASE^p+1 to c0 when needed
        movl   _c_,     %ebx
        movl   _p_,     %ecx
        leal  (%ebx,%ecx,4), %esi       # esi <- &c[p]
        movl  (%ebx),   %eax
        addl  (%esi),   %eax
        shrl   $1,      %eax            # CF <- parity of c0[0]+c1[0]
        jnc    3f
        negl   %ecx
1:
        adcl   $0,     (%esi,%ecx,4)    # c0 += 1
        jnc    2f
        incl   %ecx
        jnz    1b
2:      
        adcl   $1,      _r_             # r += 1 + carry
        movl   _p_,     %ecx
3:
        
        # c0 <- (c0+c1)/2, c1 <- (c1-c0)/2
        movl   %ebx,    %edi            # edi <- &c0
        movl   %esi,    %edx            # edx <- &c1
        call   .Lsn_fhalf_add_sub
        movl   _r_,     %edx            # edx <- saved c0[p]
        shrl   $1,      %edx            # halve it, low bit goes to CF
        movl   %edx,    %ebx
        jnc    1f
        addl   $0x80000000, -4(%edi)    # c += BASE^p/2
        adcl   %ecx,    %edx            # edx <- carry out of c0
        subl   $0x80000000, -4(%esi)    # c -= BASE^(2p)/2
        adcl   %ecx,    %ebx            # ebx <- carry out of c1
1:      
        subl   %eax,    %ebx            # ebx -= eax (original note: borrow of c1-c0)
        
        # propagate the carry out of c0
        movl   _p_,     %ecx
        negl   %ecx
        addl   %edx, (%esi,%ecx,4)
        jnc    3f
        incl   %ecx
2:
        incl   (%esi,%ecx,4)
        jne    3f
        incl   %ecx
        jne    2b
        decl   %ebx
        # remark: ebx >= 0 necessarily holds here, since ebx < 0 would imply
        # r=1, no borrow for c0-c1, and (c0-c1)/2 = BASE^p - 1, impossible
3:      

        # recycle the carry out of c1
        testl  %ebx,    %ebx
        jz     4f
2:      
        movl   _p_,     %ecx
        leal  (,%ecx,2),%ecx
        negl   %ecx                     # ecx <- -2p
        subl   %ebx,   (%esi,%ecx,4)
        jnb    4f
        movl   $1,      %ebx
        incl   %ecx
3:      
        subl   $1,     (%esi,%ecx,4)
        jnb    4f
        incl   %ecx
        jne    3b
        jmp    2b                       # a borrow coming out again is fed back in
        ALIGN(4)
4:      
        emms

#else /* use_sse2 */

        # c[p..2p-1] <- (c[p..2p-1] - c[0..p])/2 mod BASE^p - 1
        movl   _c_,     %ebx
        movl   _p_,     %ecx
        leal   (%ebx,%ecx,4), %esi      # esi <- &c[p]
        call   .Lsn_fdec_1              # c[p..2p-1] -= c[0..p-1]
        movl   _r_,     %eax            # eax <- saved c[p]
1:
        movl   _p_,     %ecx
        not    %ecx
        incl   %ecx                     # ecx <- -p (not/incl leave CF untouched)
2:
        sbbl   %eax,    (%esi,%ecx,4)   # subtract it and propagate the borrow
        jnb    3f
        movl   $0,      %eax            # movl keeps the borrow in CF
        incl   %ecx
        jne    2b
        jmp    1b
        ALIGN(4)
3:
        movl   _c_,     %ebx
        movl   _p_,     %ecx
        leal   (%ebx,%ecx,4), %esi      # esi <- &c[p]
        leal   (%ebx,%ecx,8), %edi      # edi <- &c[2p]
        call   .Lsn_fhalf               # c[p..2p-1] /= 2
        jnc    4f
        bts    $31,     -4(%edi)        # recycle the bit shifted out
4:

        # c += c[p..2p-1]
        xchgl  %esi,    %ebx            # esi <- &c, ebx <- &c[p]
        movl   _p_,     %ecx
        call   .Lsn_finc_1
        movl   _r_,     %eax
        adcl   %eax,    (%esi)          # add the saved c[p] back in
        jnc    2f
        movl   _p_,     %ecx            # recycle the carry
        negl   %ecx
        incl   %ecx
1:
        incl   (%edi,%ecx,4)
        jne    2f
        incl   %ecx
        jne    1b
        movl   _p_,     %ecx
        leal   (,%ecx,2), %ecx
        negl   %ecx                     # ecx <- -2p: wrap around to c[0]
        jmp    1b
        ALIGN(4)
2:

#endif /* use_sse2 */
        
        # done
        movl   _p_,     %ecx            # clean up the stack
        leal   1(%ecx,%ecx,2), %ecx     # ecx <- 3p+1
        leal   16(%esp,%ecx,4), %esp
        ret

        # large odd n = 2p+1: split into two products, one modulo
        # BASE^(p+1/2) + 1 and one modulo BASE^(p+1/2) - 1, then
        # recombine them.
        # NOTE(review): .Lsn_ftoommul, .Lsn_fadd_sub, .Lsn_fdec_1,
        # .Lsn_fhalf, .Lsn_fadd_1, .Lsn_fshift_up and .Lsn_finc_1 are
        # external; what they return in registers and flags is inferred
        # from the use made of it below - confirm.
        ALIGN(4)
L(big_odd):
        
        # local variables
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _p_
        #define _d_ 16(%esp)
        #define _a_ 12(%esp)
        #define _b_  8(%esp)
        #define _c_  4(%esp)
        #define _p_   (%esp)

        leal   3(%ecx,%ecx,2), %ecx     # ecx <- 3n + 3 = 6p + 6
        negl   %ecx   
        leal   (%esp,%ecx,4), %esp      # reserve 6p+6 digits on the stack
        pushl  %esi                     # save &a
        pushl  %ebx                     # save &b
        pushl  %edi                     # save &c
        pushl  %edx                     # save p

        # split a and b
        leal   _d_,     %edi
        leal  8(%edi,%edx,8), %ebx      # ebx <- &d[2p+2]
        call  .Lsn_fsplit_odd
        movl   _b_,     %esi
        movl   _p_,     %edx
        leal   _d_,     %edi
        leal  4(%edi,%edx,4), %edi      # edi <- &d[p+1]
        leal  8(%edi,%edx,8), %ebx      # ebx <- &d[3p+3]
        call  .Lsn_fsplit_odd

        # d[4p+4..6p+5] <- a*b mod BASE^(p+1/2) + 1
        movl   _p_,     %ecx
        incl   %ecx                     # ecx <- p+1
        movl   %ecx,    %edx            # edx <- p+1
        leal   _d_,     %esi
        leal   (%esi,%ecx,8), %esi      # esi <- &d[2p+2]
        leal   (%esi,%ecx,8), %edi      # edi <- &d[4p+4]
        leal   (%esi,%ecx,4), %ebx      # ebx <- &d[3p+3]
        leal   (%edi,%ecx,8), %eax      # eax <- &d[6p+6]
        pushl  %eax                     # save the address
        call   .Lsn_ftoommul
        popl   %edi                     # edi <- &d[6p+6]
        movl   -4(%edi), %eax
        movl   _p_,     %ecx
        leal   1(,%ecx,2), %ecx         # ecx <- 2p+1
        negl   %ecx
        addl   %eax, -4(%edi,%ecx,4)    # feed the digit of rank 2p+1 back in
        jnc    2f
1:
        incl   (%edi,%ecx,4)
        jne    2f
        incl   %ecx
        jne    1b
        movl   _p_,     %ecx
        leal   2(,%ecx,2), %ecx
        negl   %ecx                     # ecx <- -(2p+2): wrap around to rank 0
        jmp    1b
        ALIGN(4)
2:
        
        # d[2p+2..4p+3] <- a*b mod BASE^(p+1/2) - 1
        movl   _p_,     %ecx
        incl   %ecx                     # ecx <- p+1
        movl   %ecx,    %edx            # edx <- p+1
        leal   _d_,     %esi
        leal   (%esi,%ecx,4), %ebx      # ebx <- &d[p+1]
        leal   (%ebx,%ecx,4), %edi      # edi <- &d[2p+2]
        leal   (%edi,%ecx,8), %eax      # eax <- &d[4p+4]
        pushl  %eax                     # save the address
        call   .Lsn_ftoommul
        
        ALIGN(32)                       # landing point for ssqr
.Lsn_smul_aux_big_odd:
        
        popl   %edi                     # edi <- address pushed before the product
                                        # (&d[4p+4] on the path from above)
        movl   -4(%edi), %eax
        movl   _p_,     %ecx
        leal   1(,%ecx,2), %ecx         # ecx <- 2p+1
        negl   %ecx
        addl   %eax, -4(%edi,%ecx,4)    # feed the digit of rank 2p+1 back in
        jnc    2f
1:
        incl   (%edi,%ecx,4)
        jne    2f
        incl   %ecx
        jne    1b
        movl   _p_,     %ecx
        leal   2(,%ecx,2), %ecx
        negl   %ecx                     # ecx <- -(2p+2): wrap around to rank 0
        jmp    1b
        ALIGN(4)
2:

#ifdef use_sse2

        # c <- d[2p+2..4p+2] + d[4p+4..6p+4]  mod BASE^(2p+1)-1
        # d <- d[2p+2..4p+2] - d[4p+4..6p+4]  mod BASE^(2p+1)-1
        movl   _p_,     %ecx
        leal 1(,%ecx,2),%ecx            # ecx <- 2p+1
        movl   _c_,     %edi
        leal   _d_,     %edx
        leal 4(%edx,%ecx,4), %esi       # esi <- &d[2p+2]
        leal 8(%edx,%ecx,8), %ebx       # ebx <- &d[4p+4]
        call  .Lsn_fadd_sub

        # recycle the carry out of c
        jnc   3f
1:      
        movl   _p_,     %ecx
        leal 1(,%ecx,2),%ecx            # ecx <- 2p+1
        negl  %ecx
2:
        incl  (%edi,%ecx,4)
        jne    3f
        incl   %ecx
        jne    2b
        jmp    1b
        ALIGN(4)
3:
        
        # recycle the borrow out of d
        shrl   $1,      %eax            # CF <- low bit of eax
        jnc    3f
1:      
        movl   _p_,     %ecx
        leal 1(,%ecx,2),%ecx            # ecx <- 2p+1
        negl  %ecx
2:
        subl  $1,  (%edx,%ecx,4)
        jnb   3f
        incl  %ecx
        jne   2b
        jmp   1b
        ALIGN(4)
3:

        # c <- (c + d*BASE^(p+1/2))/2 mod BASE^(2p+1)-1
        movl   _p_,     %ecx
        negl   %ecx
        leal -4(%edi,%ecx,4), %esi      # esi <- &c[p]
        leal   (%edx,%ecx,4), %ebx      # ebx <- &d[p+1]
        leal -1(%ecx),  %eax            # eax <- -p-1

        # digit 0
        movd  (%esi,%ecx,4), %mm3       # mm3 <- c[0]
        movq   %mm3,    %mm4
        psllq  $63,     %mm4
        psrlq  $63,     %mm4            # mm4 <- bit 0 of c
        movd  (%edx,%ecx,4), %mm1
        psllq  $16,     %mm1            # mm1 <- d[p+1] << 16
        paddq  %mm1,    %mm3            # mm3 <- c[0] + d[p+1] << 16
        movq   %mm3,    %mm2
        psrlq  $32,     %mm2            # mm2 <- carry
        incl   %ecx
        jmp    2f
        
        # digits 1..p-1
        ALIGN(4)
1:
        movd   %mm3, -8(%esi,%ecx,4)    # store c[i-2]
        psrlq  $31,     %mm3            # finish the shift of mm3
2:
        movd  (%esi,%ecx,4), %mm0       # mm0 <- c[i]
        movd  (%edx,%ecx,4), %mm1
        paddq  %mm0,    %mm2            # mm2 <- carry + c[i]
        psllq  $16,     %mm1            # mm1 <- d[i+p+1] << 16
        incl   %ecx
        paddq  %mm1,    %mm2            # mm2 <- carry + c[i] + d[i+p+1] << 16
        punpckldq %mm2, %mm3            # high(mm3) <- low(mm2)
        psrlq  $32,     %mm2            # mm2 <- new carry
        psrlq  $1,      %mm3            # start the shift of mm3
        jne    1b                       # tests ZF of the incl (MMX ops leave flags alone)

        # digits p..2p-1
        ALIGN(4)
1:
        movd   %mm3, -8(%edi,%eax,4)    # store c[p+i-2]
        psrlq  $31,     %mm3            # finish the shift of mm3
        movd  (%edi,%eax,4), %mm0       # mm0 <- c[p+i]
        movd  (%ebx,%eax,4), %mm1
        paddq  %mm0,    %mm2            # mm2 <- carry + c[p+i]
        psllq  $16,     %mm1            # mm1 <- d[i] << 16
        incl   %eax
        paddq  %mm1,    %mm2            # mm2 <- carry + c[p+i] + d[i] << 16
        punpckldq %mm2, %mm3            # high(mm3) <- low(mm2)
        psrlq  $32,     %mm2            # mm2 <- new carry
        psrlq  $1,      %mm3            # start the shift of mm3
        jne    1b

        # last digit
        movd   %mm3, -8(%edi)           # store c[2p-1]
        psrlq  $31,     %mm3            # finish the shift of mm3
        paddq  %mm4,    %mm2            # feed bit 0 of c back in
        punpckldq %mm2, %mm3            # high(mm3) <- low(mm2)
        psrlq  $1,      %mm3            # start the shift of mm3
        movd   %mm3, -4(%edi)           # store c[2p]
        psrlq  $32,     %mm3            # finish the shift of mm3

        # recycle the carry
        movd   %mm3,    %eax            # eax <- outgoing carry
1:      
        movl   _p_,     %ecx
        leal  (,%ecx,2), %ecx
        negl   %ecx                     # ecx <- -(2p)
        addl   %eax,-4(%edi,%ecx,4)
        jnc    3f
        movl   $1,      %eax
2:
        incl   (%edi,%ecx,4)
        jnz    3f
        incl   %ecx
        jne    2b
        jmp    1b                       # a carry coming out is fed back in
        ALIGN(4)
3:      
        emms

#else /* use_sse2 */

        # d[2p+2..4p+2] <- (d[2p+2..4p+2] - d[4p+4..6p+4])/2 mod BASE^(2p+1) - 1
        movl   _p_,     %ecx
        leal   1(,%ecx,2), %ecx         # ecx <- 2p+1
        leal   _d_,     %esi
        leal   4(%esi,%ecx,4), %esi     # esi <- &d[2p+2]
        leal   4(%esi,%ecx,4), %ebx     # ebx <- &d[4p+4]
        call   .Lsn_fdec_1              # subtract the two residues
        jnb    3f
1:
        movl   _p_,     %ecx            # recycle the borrow
        leal   1(,%ecx,2), %ecx
        negl   %ecx
2:
        subl   $1, (%esi,%ecx,4)
        jnb    3f
        incl   %ecx
        jne    2b
        jmp    1b
        ALIGN(4)
3:
        movl   _p_,     %ecx
        leal   1(,%ecx,2), %ecx         # ecx <- 2p+1
        leal   4(%esi), %ebx            # ebx <- &d[4p+4]
        leal   _d_,     %esi
        leal   4(%esi,%ecx,4), %esi     # esi <- &d[2p+2]
        call   .Lsn_fhalf               # d[2p+2..4p+2] /= 2
        jnc    4f
        bts    $31,    -8(%ebx)         # recycle the bit shifted out
4:

        # c <- d[2p+2..4p+2] + d[4p+4..6p+4]
        movl   _c_,     %edi
        movl   _p_,     %ecx
        leal   1(,%ecx,2), %ecx         # ecx <- 2p+1
        call   .Lsn_fadd_1
        jnc    2f
        movl   _c_,     %edi            # recycle the carry, starting at c[0]
1:
        incl   (%edi)
        leal   4(%edi), %edi            # leal keeps ZF of the incl
        jz     1b
2:

        # d[p+2..3p+2] <- d[2p+2..4p+2] << 16
        movl   _p_,     %edx
        leal   1(,%edx,2), %edx         # edx <- 2p+1
        leal   _d_,     %esi
        leal   4(%esi,%edx,4), %esi     # esi <- &d[2p+2]
        movl   %esi,    %edi            # edi <- &d[2p+2]
        movl   $16,     %ecx
        call   .Lsn_fshift_up
        shrl   %cl,     %eax
        movl   _p_,     %ecx
        leal   _d_,     %edi
        leal   8(%edi,%ecx,4), %edi     # edi <- &d[p+2]
        orl    %eax, (%edi,%ecx,4)      # feed the shifted-out bits back in
        leal   4(%edi,%ecx,8), %esi     # esi <- &d[3p+3]
        cld;   rep movsl                # d[p+2..2p+1] <- d[3p+3..4p+2]

        # c <- c + d[p+2..3p+2]
        movl   _p_,     %ecx
        leal   _d_,     %ebx
        leal   8(%ebx,%ecx,4), %ebx     # ebx <- &d[p+2]
        leal   1(,%ecx,2), %ecx         # ecx <- 2p+1
        movl   _c_,     %esi
        call   .Lsn_finc_1
        jnc    2f
        movl   _p_,     %ecx            # recycle the carry
        leal   1(,%ecx,2), %ecx
        not    %ecx                     # ecx <- -(2p+2)
1:
        incl   %ecx
        incl   (%esi,%ecx,4)            # first pass hits c[0]
        jz     1b
2:

#endif /* use_sse2 */

        # done
        movl   _p_,     %ecx
        leal   3(%ecx,%ecx,2), %ecx     # ecx <- 3p+3
        leal   _d_,     %esp            # clean up the stack
        leal   (%esp,%ecx,8),  %esp
        ret
        
        
                              # +---------------+
                              # |  Interface C  |
                              # +---------------+
                
# void xn(smul) (chiffre *a, long la, chiffre *b, long lb, chiffre *c, long n)
#
# input:
# a = natural number of length la
# b = natural number of length lb
# c = natural number of length n
#
# constraints: n > 0, 0 <= lb <= la
#
# output:
# c <- a*b mod (BASE^n - 1)

#ifdef debug_smul
ENTER(sn_smul_buggy)
#else
ENTER(sn_smul)
#endif

        # C entry point: reduces a and b modulo BASE^n - 1 into two n-digit
        # stack buffers x and y, then calls .Lsn_fsmul with esi = &x,
        # ebx = &y, edi = &c and ecx = n.  The entry symbol is named
        # sn_smul_buggy when debug_smul is defined, sn_smul otherwise.
        # NOTE(review): ebp is saved around every internal call, presumably
        # because the argument macros are ebp-relative and the callees may
        # clobber it - TODO confirm.

        # x <- a mod BASE^n - 1
        movl   arg1,    %esi            # esi <- &a
        movl   arg2,    %edx            # edx <- la
        movl   arg6,    %ecx            # ecx <- n
        leal   (,%ecx,4), %eax          # eax <- 4n = size of n digits in bytes
        subl   %eax,    %esp            # reserve n digits on the stack
        movl   %esp,    %edi            # edi <- &x
        pushl  %ebp
        call   .Lsn_fsred
        popl   %ebp
        
        # y <- b mod BASE^n - 1
        movl   arg3,    %esi            # esi <- &b
        movl   arg4,    %edx            # edx <- lb
        movl   arg6,    %ecx            # ecx <- n
        leal   (,%ecx,4), %eax          # eax <- 4n
        subl   %eax,    %esp            # reserve n more digits, just below x
        movl   %esp,    %edi            # edi <- &y
        pushl  %ebp
        call   .Lsn_fsred
        popl   %ebp

        # multiply the two residues
        movl   arg6,    %ecx            # ecx <- n
        movl   %esp,    %ebx            # ebx <- &y
        leal   (%ebx,%ecx,4), %esi      # esi <- &x (n digits above y)
        movl   arg5,    %edi            # edi <- &c
        pushl  %ebp
#ifdef debug
        movl   $1,%eax                  # debug builds only: xmm7 <- 1 before
        movd %eax,%xmm7                 # the call (purpose not visible here)
#endif
        call   .Lsn_fsmul
        popl   %ebp
        RETURN_WITH_BP
        
#endif /* assembly_sn_smul */   
        
        

                        # +---------------------------+
                        # |  Square modulo BASE^n - 1 |
                        # +---------------------------+

# input:
#   a = natural number of length n     esi = &a,   ecx = n
#   c = natural number of length n     edi = &c
#
# constraint: n > 0
#   
# output:
#   c <- a^2 mod BASE^n - 1
#
# clobbered registers:
#   eax,ebx,ecx,edx,esi,edi,ebp <- undefined

#ifdef assembly_sn_ssqr
        ALIGN(32)
.Lsn_fssqr:

#undef L
#define L(x) .Lsn_fssqr_##x

        # Internal modular squaring, register interface:
        #   esi = &a, edi = &c, ecx = n   (a and c are n digits long)
        # This routine only prepares the operands and the stack frame for one
        # of three strategies, then jumps to the matching .Lsn_smul_aux_*
        # label, which finishes the job (presumably the recombination code
        # shared with the modular product - TODO confirm).

        # dispatch on the length and the parity of n
        movl    %ecx,   %edx
        shrl    $1,     %edx            # edx <- p = n/2, CF <- n mod 2
        jnc     1f                      # n even
        cmpl    $ssqr_lim_odd, %ecx
        jbe     L(small)                # odd n not above ssqr_lim_odd (unsigned test)
        jmp     L(big_odd)
        ALIGN(4)
1:
        cmpl    $ssqr_lim_even, %ecx
        jg      L(big_even)             # even n above ssqr_lim_even (signed test)
        
        # small square => Toom
L(small):
        leal   (,%ecx,8), %eax
        subl   %eax,    %esp            # reserve 2n digits on the stack
        pushl  %ecx                     # save n
        pushl  %edi                     # save &c
        movl   %ecx,    %edx            # edx <- n
        leal   8(%esp), %edi            # edi <- &d (just above the two saved words)
        call   .Lsn_ftoomsqr            # d <- a^2
        jmp    .Lsn_smul_aux_small      # continue with the smul code
        ALIGN(4)

        # large even n: split into two modular products
L(big_even):

        # local variables, esp-relative once the four pushes below are done;
        # r shares the stack slot of a
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _p_
        #undef  _r_
        #define _d_ 16(%esp)
        #define _a_ 12(%esp)
        #define _b_  8(%esp)
        #define _c_  4(%esp)
        #define _p_   (%esp)
        #define _r_   _a_
        
        leal  1(%edx,%edx,2), %eax      # reserve 3p+1 digits on the stack
        negl   %eax
        leal  (%esp,%eax,4),  %esp
        pushl  %esi                     # save &a (slot reused for r below)
        pushl  %esi                     # save &b (= &a)
        pushl  %edi                     # save &c
        pushl  %edx                     # save p

        # split a
        movl   %edi,    %ebx            # ebx <- &c
        leal   _d_,     %edi
        leal  4(%edi,%edx,4), %edi      # edi <- &d[p+1]
        call   .Lsn_fsplit_even

        # c[0..p] <- a^2 mod BASE^p + 1
        movl   _c_,     %esi
        movl   _p_,     %ecx
        pushl  %ecx                     # p and &c are passed on the stack
        pushl  %esi
        call  .Lsn_msqr
        leal   8(%esp), %esp            # drop the two stacked arguments

        # c[p..2p-1] <- a^2 mod BASE^p - 1
        movl   _c_,     %edi
        movl   _p_,     %ecx
        leal   (%edi,%ecx,4), %edi      # edi <- &c[p]
        leal   _d_,     %esi
        leal  4(%esi,%ecx,4), %esi      # esi <- &d[p+1]
        movl   (%edi),  %eax
        movl   %eax,    _r_             # r <- c[p], saved before the call writes at &c[p]
        call   .Lsn_fssqr               # recursive call with n = p
        jmp    .Lsn_smul_aux_big_even   # continue with the smul code
        ALIGN(4)

        # large odd n: split into two squares
L(big_odd):     

        # local variables, esp-relative once the four pushes below are done
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _p_
        #define _d_ 16(%esp)
        #define _a_ 12(%esp)
        #define _b_  8(%esp)
        #define _c_  4(%esp)
        #define _p_   (%esp)

        leal   3(%ecx,%ecx,2), %ecx     # ecx <- 3n+3 = 6p+6
        negl   %ecx   
        leal   (%esp,%ecx,4), %esp      # reserve 6p+6 digits on the stack
        pushl  %esi                     # save &a
        pushl  %esi                     # save &b (= &a)
        pushl  %edi                     # save &c
        pushl  %edx                     # save p

        # split a
        leal   _d_,     %edi
        leal  8(%edi,%edx,8), %ebx      # ebx <- &d[2p+2]
        call  .Lsn_fsplit_odd

        # d[4p+4..6p+5] <- a^2 mod BASE^(p+1/2) + 1
        movl   _p_,     %ecx
        incl   %ecx                     # ecx <- p+1
        movl   %ecx,    %edx            # edx <- p+1
        leal   _d_,     %esi
        leal   (%esi,%ecx,8), %esi      # esi <- &d[2p+2]
        leal   (%esi,%ecx,8), %edi      # edi <- &d[4p+4]
        leal   (%edi,%ecx,8), %eax      # eax <- &d[6p+6]
        pushl  %eax                     # save that address across the call
        call   .Lsn_ftoomsqr            # square of the p+1 digits at d[2p+2]
        popl   %edi                     # edi <- &d[6p+6]
        movl   -4(%edi), %eax           # eax <- d[6p+5], the top digit of the square
        movl   _p_,     %ecx
        leal   1(,%ecx,2), %ecx         # ecx <- 2p+1
        negl   %ecx
        addl   %eax, -4(%edi,%ecx,4)    # fold the digit of rank 2p+1 back into d[4p+4]
        jnc    2f
1:
        incl   (%edi,%ecx,4)            # propagate the carry upwards
        jne    2f
        incl   %ecx
        jne    1b
        movl   _p_,     %ecx            # carry ran past d[6p+5]: restart at d[4p+4]
        leal   2(,%ecx,2), %ecx
        negl   %ecx
        jmp    1b
        ALIGN(4)
2:
        
        # d[2p+2..4p+3] <- a^2 mod BASE^(p+1/2) - 1
        movl   _p_,     %ecx
        incl   %ecx                     # ecx <- p+1
        movl   %ecx,    %edx            # edx <- p+1
        leal   _d_,     %esi            # esi <- &d[0]
        leal   (%esi,%ecx,4), %ebx      # ebx <- &d[p+1]
        leal   (%ebx,%ecx,4), %edi      # edi <- &d[2p+2]
        leal   (%edi,%ecx,8), %eax      # eax <- &d[4p+4]
        pushl  %eax                     # save that address (still stacked at the jump below)
        call   .Lsn_ftoomsqr            # square of the p+1 digits at d[0]
        jmp    .Lsn_smul_aux_big_odd    # continue with the smul code
        
        
                              # +---------------+
                              # |  Interface C  |
                              # +---------------+
                
# void xn(ssqr) (chiffre *a, long la, chiffre *b, long n)
#
# input:
# a = natural number of length la
# b = natural number of length n
#
# constraints: n > 0, la >= 0
#
# output:
# b <- a^2 mod (BASE^n - 1)

#ifdef debug_smul
ENTER(sn_ssqr_buggy)
#else
ENTER(sn_ssqr)
#endif

        # C entry point: reduces a modulo BASE^n - 1 into an n-digit stack
        # buffer x, then calls .Lsn_fssqr with esi = &x, edi = &b, ecx = n.
        # The entry symbol is named sn_ssqr_buggy when debug_smul is defined,
        # sn_ssqr otherwise.
        # NOTE(review): ebp is saved around both internal calls, presumably
        # because the argument macros are ebp-relative and the callees may
        # clobber it - TODO confirm.

        # x <- a mod BASE^n - 1
        movl   arg1,    %esi            # esi <- &a
        movl   arg2,    %edx            # edx <- la
        movl   arg4,    %ecx            # ecx <- n
        leal   (,%ecx,4), %eax          # eax <- 4n = size of n digits in bytes
        subl   %eax,    %esp            # reserve n digits on the stack
        movl   %esp,    %edi            # edi <- &x
        pushl  %ebp
        call   .Lsn_fsred
        popl   %ebp
        
        # square the residue
        movl   arg4,    %ecx            # ecx <- n
        movl   %esp,    %esi            # esi <- &x
        movl   arg3,    %edi            # edi <- &b
        pushl  %ebp
        call   .Lsn_fssqr
        popl   %ebp
        RETURN_WITH_BP
        
#endif /* assembly_sn_ssqr */   
        

                       # +----------------------------+
                       # |  Combination of 3 residues |
                       # +----------------------------+

# void xn(sjoin3)(chiffre *a, long h, long k)
#
#  input:
#  a = natural number of length n+p+q
#  n = (2h+2)k, p = (2h+1)k, q = (2h)k
#
#  constraints: h >= 2, k >= 2
#
#  output:
#  a <- x mod lcm(BASE^n - 1, BASE^p - 1, BASE^q - 1), normalized,
#  with
#    a[0..n-1]       = x mod (BASE^n - 1),
#    a[n..n+p-1]     = x mod (BASE^p - 1),
#    a[n+p..n+p+q-1] = x mod (BASE^q - 1)
#
#  note: lcm = product/(BASE^k - 1)/(BASE^(2k) - 1)

#ifdef assembly_sn_sjoin3
#undef L
#define L(x) .Lsn_sjoin3_##x
#ifdef debug_sjoin
ENTER(sn_sjoin3_buggy)
#else
ENTER(sn_sjoin3)
#endif

        # Combines in place the three residues stored back to back in a:
        # n digits, then p digits (called b below), then q digits (called c
        # below).  The entry symbol is named sn_sjoin3_buggy when debug_sjoin
        # is defined, sn_sjoin3 otherwise.

        # local variables, esp-relative once the six pushes below are done:
        # q, p, n, &b, &c and the running carry r are pushed in that order.
        # NOTE(review): the slots at 44, 48 and 52 are presumably the three
        # C arguments, above whatever the entry macro saved - TODO confirm.
        #undef _a_
        #undef _b_
        #undef _c_
        #undef _h_
        #undef _k_
        #undef _n_
        #undef _p_
        #undef _q_
        #undef _r_
        #define _k_ 52(%esp)
        #define _h_ 48(%esp)
        #define _a_ 44(%esp)
        #define _q_ 20(%esp)
        #define _p_ 16(%esp)
        #define _n_ 12(%esp)
        #define _b_  8(%esp)
        #define _c_  4(%esp)
        #define _r_   (%esp)

        movl   arg2,    %eax
        shll   $1,      %eax            # eax <- 2h
        movl   arg3,    %ebx            # ebx <- k
        mull   %ebx                     # eax <- q = 2*h*k (high half in edx is not used)
        pushl  %eax
        addl   %ebx,    %eax            # eax <- p = q+k
        pushl  %eax
        leal   (%eax,%ebx,1),%edx       # edx <- n = p+k
        pushl  %edx
        movl   arg1,    %esi            # esi <- &a
        leal   (%esi,%edx,4),%edi       # edi <- &b
        pushl  %edi
        leal   (%edi,%eax,4),%edi       # edi <- &c
        pushl  %edi
        pushl  $0                       # r <- 0
        cld                             # string instructions below go upwards
        
        # normalize a: when all n digits equal BASE-1, replace a by 0
        movl   $-1,     %eax
        movl   %esi,    %edi            # edi <- &a
        movl   %edx,    %ecx            # ecx <- n
        repe   scasl                    # a = BASE^n - 1 ? (scan stops at the first other digit)
        jne    1f
        incl   %eax                     # then a <- 0 (eax goes from -1 to 0)
        movl   %edx,    %ecx
        movl   %esi,    %edi
        rep    stosl
        ALIGN(4)
1:
        
        # b <- (a - b) mod (BASE^p - 1)
        movl   _b_,     %ebx
        movl   %ebx,    %edi
        movl   _p_,     %ecx
        call   .Lsn_fsub_1              # subtract over the first p digits
        sbbl   %ecx,    _r_             # r <- r - ecx - CF (records the borrow)
        movl   %esi,    %ebx            # ebx <- &a[p]
        movl   _b_,     %esi
        movl   _k_,     %ecx
        movl   _p_,     %edx
        call   .Lsn_finc                # add the last k digits
        adcl   _r_,     %ecx            # ecx <- sum of the carries
        jz     5f                       # nothing to recycle

#if 0
#----------------------------------------------------------------------
# a positive carry cannot happen here, since it would require
# a = BASE^n - 1 and b = 0, contradicting the normalization of a.
# code kept just in case, but untested because the case is impossible
        js     3f

1:                                       # recycle the positive carry
        movl   _p_,     %ecx
        negl   %ecx
2:
        incl   (%esi,%ecx,4)
        jnz    5f
        incl   %ecx
        jne    2b
        jmp    1b
        ALIGN(4)
#----------------------------------------------------------------------
#endif
3:                                       # recycle the negative carry
        movl   _p_,     %ecx
        negl   %ecx                     # ecx <- -p, index counted from esi
4:
        subl   $1,      (%esi,%ecx,4)
        jnb    5f                       # borrow absorbed
        incl   %ecx
        jne    4b
        jmp    3b                       # borrow left the top digit: wrap around
        ALIGN(4)
5:

        # normalize b upwards: when b = 0, replace it by BASE^p - 1
        movl  _b_,      %edi
        movl  _p_,      %ecx
        xorl  %eax,     %eax
        repe  scasl                     # b = 0 ?
        jne   1f
        decl  %eax                      # then b <- BASE^p - 1 (eax goes from 0 to -1)
        movl  _b_,      %edi
        movl  _p_,      %ecx
        repe  stosl                     # NOTE(review): repe is the same F3 prefix as rep
        ALIGN(4)
1:
        
        # c <- (c - a) + (BASE^k + 1)*b - (BASE^(2*k) - 1) mod (BASE^q - 1)
        movl   _c_,     %esi
        movl   _a_,     %ebx
        movl   _q_,     %ecx
        call   .Lsn_fdec_1              # c <- c - a[0..q-1]
        sbbl   %ecx,    %ecx            # ecx <- -CF
        movl   %ecx,    _r_             # r <- carry
        movl   _c_,     %esi
        movl   _k_,     %ecx
        leal   (,%ecx,2), %ecx          # ecx <- 2k
        movl   _q_,     %edx
        call   .Lsn_fdec                # c <- c - a[q..n-1]
        sbbl   %ecx,    _r_             # update the carry
        movl   _c_,     %esi
        movl   _q_,     %ecx
        call   .Lsn_finc_1              # c <- c + b[0..q-1]
        adcl   %ecx,    _r_             # update the carry
        movl   _c_,     %esi
        movl   _k_,     %ecx
        movl   _q_,     %edx
        call   .Lsn_finc                # c <- c + b[q..p-1]
        adcl   %ecx,    _r_             # update the carry
        movl   _c_,     %esi
        movl   _b_,     %ebx
        movl   _q_,     %ecx
        movl   _k_,     %edx
        leal   (%esi,%edx,4), %esi      # esi <- &c[k]
        subl   %edx,    %ecx            # ecx <- q-k
        call   .Lsn_finc_1              # c[k..q-1] += b[0..q-k-1]
        adcl   %ecx,    _r_             # update the carry
        movl   _c_,     %esi
        movl   _k_,     %ecx
        leal   (,%ecx,2), %ecx          # ecx <- 2k
        call   .Lsn_finc_1              # c <- c + b[q-k..p-1]
        jc     2f
        movl   _q_,    %ecx             # no carry out: subtract BASE^(2k)
        movl   _k_,    %eax
        leal   (,%eax,2), %eax
        subl   %eax,   %ecx             # ecx <- q-2k
        leal   (%esi,%ecx,4), %esi      # esi <- &c[q]
        negl   %ecx
1:
        subl   $1,  (%esi,%ecx,4)
        jnb    2f
        incl   %ecx
        jne    1b
        decl   _r_                      # borrow left c[q-1]: update the carry
        ALIGN(4)
2:
        movl   _c_,     %esi
        movl   _q_,     %ecx
        leal   (%esi,%ecx,4), %esi      # esi <- &c[q]
        negl   %ecx                     # ecx <- -q, so (esi,ecx,4) is c[0]
        movl   _r_,     %eax
        incl   %eax                     # eax <- carry + 1
        jz     6f                       # nothing to recycle
        js     4f                       # negative amount

        addl   %eax,    (%esi,%ecx,4)   # recycle the positive carry
        jnc    6f
        incl   %ecx
3:
        incl   (%esi,%ecx,4)
        jne    6f
        incl   %ecx
        jne    3b
        movl   _q_,     %ecx            # carry left c[q-1]: wrap around to c[0]
        negl   %ecx
        jmp    3b
        ALIGN(4)
4:      addl   %eax,   (%esi,%ecx,4)   # recycle the negative carry
        jc     6f
        incl   %ecx
5:
        subl   $1,     (%esi,%ecx,4)
        jnb    6f
        incl   %ecx
        jne    5b
        movl   _q_,     %ecx            # borrow left c[q-1]: wrap around to c[0]
        negl   %ecx
        jmp    5b
        ALIGN(4)
6:

        # c = 0 mod (BASE^q - 1) ?
        movl   _c_,     %edi
        movl   (%edi),  %eax            # eax <- c[0]
        testl  %eax,    %eax            # c[0] = 0 ?
        jz     1f
        incl   %eax                     # c[0] = BASE-1 ?
        jnz    L(c_non_nul)
        decl   %eax                     # restore eax = BASE-1 for the scan
1:
        movl   _q_,     %ecx
        decl   %ecx                     # ecx <- q-1
        leal   4(%edi), %edi            # edi <- &c[1]
        repe   scasl                    # compare the other digits with c[0]
        jne    L(c_non_nul)
        testl  %eax,    %eax            # when c = 0, set c <- BASE^q - 1
        jne    2f
        movl   _c_,     %edi
        movl   _q_,     %ecx
        decl   %eax
        rep    stosl
2:
        movl   _b_,     %esi
        movl   _p_,     %ecx
        addl   _q_,     %ecx            # ecx <- p+q
        jmp    L(inc_b)                 # c:b += 1
        ALIGN(4)

        # case c <> 0 mod (BASE^q - 1):
        # then c:b <- b + (BASE^p - 1)*(c/(1-BASE^(2*k)) - 1) + BASE^q
L(c_non_nul):
        movl   _c_,     %ebx
        movl   _q_,     %ecx
        movl   _k_,     %edx
        shll   $1,      %edx            # edx <- 2k
        leal   (%ebx,%edx,4), %esi      # esi <- &c[2k]
        subl   %edx,    %ecx            # ecx <- q-2k
        call   .Lsn_finc_1              # c <- c/(1-BASE^(2k))
        movl   _q_,     %ecx
        not    %ecx                     # ecx <- -q-1, incremented before first use
1:
        incl   %ecx
        subl   $1,      (%esi,%ecx,4)   # c <- c-1
        jb     1b
        movl   _b_,     %esi
        movl   _c_,     %ebx
        movl   _q_,     %ecx
        call   .Lsn_fdec_1              # b[0..q-1] <- b[0..q-1] - c
        jb     3f                       # on borrow, do not add BASE^q
        movl   _p_,     %ecx
L(inc_b):
        leal   (%esi,%ecx,4), %esi      # esi <- &c[q]
        negl   %ecx
2:
        incl   (%esi,%ecx,4)            # c:b += BASE^q (c:b += 1 when entered by the jump above)
        jne    3f
        incl   %ecx
        jne    2b
3:

        # c:b:a <- a + (BASE^n - 1)*(c:b)/(1-BASE^k)
        movl   _b_,     %ebx
        movl   _q_,     %ecx
        leal   (,%ecx,2), %ecx          # ecx <- 2q
        movl   _k_,     %edx
        leal   (%ebx,%edx,4), %esi      # esi <- &b[k]
        call   .Lsn_finc_1              # b:c <- b:c/(1 - BASE^k)
        movl   _a_,     %esi
        movl   _b_,     %ebx
        movl   _p_,     %ecx
        addl   _q_,     %ecx            # ecx <- p+q
        call   .Lsn_fdec_1              # a[0..p+q-1] -= b:c
        jnb    2f
1:
        subl   $1,      (%esi)          # propagate the borrow
        leal   4(%esi), %esi            # lea leaves CF from the subl untouched
        jb     1b
2:
        
        leal   24(%esp), %esp           # clean the stack: drop the six pushed words
        RETURN_WITH_SP

#endif /* assembly_sn_sjoin3 */
