// file kernel/n/x86/montgomery.S: Montgomery modular exponentiation
/*-----------------------------------------------------------------------+
 |  Copyright 2005, Michel Quercia (michel.quercia@prepas.org)           |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                   Exponentiation modulaire de Montgomery              |
 |                                                                       |
 +-----------------------------------------------------------------------*/

                          # +----------------------+
                          # |  Division modulaire  |
                          # +----------------------+


# void xn(mgdiv_n2)(chiffre *a, chiffre *c, chiffre d, long n)
#
# input :
# a = natural number of length 2n+1
# c = natural number of length n
# d = -1/c mod BASE
#
# constraints :
# n > 0, a[0..2n-1] <= (BASE^n - 1)^2, a and c do not overlap
#
# output :
# a[n..2n-1] <- a[0..2n-1]/BASE^n mod c, not normalized

#ifdef assembly_sn_mgdiv_n2
#undef L
#define L(x) .Lsn_mgdiv_##x
        
ENTER(sn_mgdiv_n2)

#ifdef use_sse2

        /*
         * MMX/SSE2 variant of the digit-by-digit Montgomery reduction.
         *
         * For each of the n low digits a[i], the multiplier
         * m = d*a[i] mod BASE is formed and m*c is added into
         * a[i..i+n-1]; the carry of that row is pushed into a[i+n] and
         * above.  After the n rows, c is subtracted once from a[n..2n-1]
         * when the extra digit a[2n] is non-zero.
         *
         * Register roles inside the loops:
         *   esi = &a[n+i]    edi = &c[n]    edx = -n    ebp = i-n
         *   ecx = inner index, runs from -n up to 0
         *   mm0 = d          mm1 = d*a[i] (only its low 32 bits are used)
         *   mm2 = 64-bit running sum / carry
         *
         * The conditional jumps test the flags left by the preceding
         * incl; the MMX instructions in between do not modify EFLAGS.
         */

        movl   arg1,    %esi            # esi <- &a
        movl   arg2,    %edi            # edi <- &c
        movd   arg3,    %mm0            # mm0 <- d
        movl   arg4,    %edx            # edx <- n
        leal  (%esi,%edx,4), %esi       # esi <- &a[n]
        leal  (%edi,%edx,4), %edi       # edi <- &c[n]
        movl   $0,     (%esi,%edx,4)    # a[2n] <- 0
        negl   %edx                     # edx <- -n
        movl   %edx,    %ebp            # ebp <- -n

        # loop over i
        ALIGN(4)
L(loop_i):
        movd  (%esi,%edx,4), %mm1
        movq   %mm1,    %mm2            # mm2 <- a[i]
        pmuludq %mm0,   %mm1            # mm1 <- d*a[i]
        movl   %edx,    %ecx            # ecx <- -n

        # loop over j
        ALIGN(4)
L(loop_j):
        movd  (%edi,%ecx,4), %mm3
        pmuludq %mm1,   %mm3            # mm3 <- c[j]*(d*a[i] mod BASE)
        paddq  %mm3,    %mm2            # accumulate into a[i+j]
        movd   %mm2,   (%esi,%ecx,4)    # and store the low word in a[i+j]
        incl   %ecx
        psrlq  $32,     %mm2            # mm2 <- new carry
        movd  (%esi,%ecx,4), %mm3
        paddq  %mm3,    %mm2            # + a[i+j+1]
        jne    L(loop_j)                # ZF still comes from the incl above

        # propagate the carry
        pextrw $2, %mm2, %eax           # eax <- bits 32..47 of the sum (carry out)
        movd   %mm2,    (%esi)          # a[i+n] <- low word of the sum
        testl  %eax,     %eax
        jz     2f
1:
        incl   %ecx                     # ecx was 0 on loop exit: start at a[i+n+1]
        incl  (%esi,%ecx,4)             # add one, continue while the digit wraps to 0
        jz     1b
2:
        # end of the loop over i
        leal  4(%esi),   %esi           # a++
        incl   %ebp                     # one fewer digit left to reduce
        jne    L(loop_i)

        # fold the outgoing carry back in: subtract c when a[2n] is non-zero
        testl  $-1,     (%esi)          # esi = &a[2n] here
        jz     2f
        pxor   %mm0,     %mm0           # mm0 <- running borrow, initially 0
1:
        movd   (%esi,%edx,4), %mm1      # mm1 <- a[n+k]
        movd   (%edi,%edx,4), %mm2      # mm2 <- c[k]
        paddq   %mm1,   %mm0
        psubq   %mm2,   %mm0            # mm0 <- borrow + a[n+k] - c[k]
        movd    %mm0,  (%esi,%edx,4)    # a[n+k] <- low word
        incl    %edx
        pshufw  $0xfe, %mm0, %mm0       # low dword <- high dword, upper dword <- copies of the top word
        jne     1b                      # ZF still comes from the incl above
2:      
        emms                            # leave MMX state before returning
        RETURN_WITH_SP
        
#else /* use_sse2 */
        
        /*
         * Generic i386 variant of the digit-by-digit Montgomery reduction.
         *
         * For each of the n low digits a[i], m = d*a[i] mod BASE is formed
         * and m*c is added into a[i..i+n-1] by a multiply-accumulate loop
         * unrolled 32 times.  The loop is entered through a computed jump so
         * that exactly n digits are processed; the entry address is computed
         * once and kept on the stack.  After the n rows, c is subtracted
         * from a[n..2n-1] when the extra digit a[2n] is non-zero.
         *
         * Stack layout after the two pushes and the call below:
         *    (%esp) br : entry address into the unrolled loop
         *   4(%esp) j  : negative count of remaining 32-digit passes
         *   8(%esp) i  : number of rows still to process
         * The _a_ .. _n_ offsets are used as the incoming arguments seen
         * through these 12 extra bytes (NOTE(review): relies on the frame
         * built by the entry macro, which is defined elsewhere).
         */

        # local variables
        #undef _a_
        #undef _c_
        #undef _d_
        #undef _n_
        #undef _i_
        #undef _j_
        #undef _br_
        #define _n_  44(%esp)
        #define _d_  40(%esp)
        #define _c_  36(%esp)
        #define _a_  32(%esp)
        #define _i_   8(%esp)
        #define _j_   4(%esp)
        #define _br_   (%esp)

        movl   arg1,    %esi            # esi <- &a
        movl   arg2,    %edi            # edi <- &c
        movl   arg4,    %ecx            # ecx <- n
        movl   $0,      (%esi,%ecx,8)   # a[2n] <- 0
        leal -8(%esi,%ecx,4), %esi      # esi <- &a[n-2]
        leal   (%edi,%ecx,4), %edi      # edi <- &c[n]
        pushl  %ecx                     # i <- n
        pushl  $0                       # reserve room for j

        /*
         * Precompute the entry address into the unrolled inner loop.
         *
         * With r = (-n) mod 32, the first product m*c[0] must be consumed by
         * digit step number r+1 (steps are numbered 0..31, step 32 being the
         * loop tail).  Steps 1..31 are 17 bytes each, but step 0 uses a zero
         * displacement which the assembler may encode without a displacement
         * byte, making it 2 bytes shorter.  Counting forward from the loop
         * head with a fixed 17-byte stride therefore lands 2 bytes too far:
         * the leading adcl is skipped, and for r = 31 the jump falls inside
         * an instruction.  The address is instead counted backwards from the
         * tail label:
         *      entry = tail - 17*(32 - (r+1)) = tail - 527 + 17*r
         * which does not depend on the size of step 0.
         */
        call   L(here)                  # pushes the address of the next label: br slot
L(here):
        negl   %ecx                     # ecx <- -n
        movl   %ecx,   %edx
        andl   $31,     %edx            # edx <- r = (-n) % 32
        leal   (,%edx,8), %eax          # eax <- 8*r, so that r + 2*eax = 17*r
        leal   L(tail)-L(here)-527(%edx,%eax,2), %eax   # 527 = 31*17
        addl   %eax, _br_               # br <- tail - 17*(31 - r)

        # loop over i
        ALIGN(4)
L(loop_i):

        movl  8(%esi,%ecx,4), %eax      # eax <- lowest digit of the current window
        movl   (%edi,%ecx,4), %ebx      # ebx <- c[0]
        sarl   $5,      %ecx
        movl   %ecx,    _j_             # j <- -ceil(n/32)
        shll   $5,      %ecx            # ecx <- -32*ceil(n/32)
        leal  4(%esi,%ecx,4), %esi      # rebase both pointers so that the
        leal   (%edi,%ecx,4), %edi      # unrolled offsets line up with digit 0
        mull   _d_          
        movl   %eax,    %ebp            # ebp <- m = d*a[0] mod BASE
        mull   %ebx                     # edx:eax <- m*c[0]
        xorl   %ebx,    %ebx            # clear both carry words (also clears CF)
        xorl   %ecx,    %ecx
        jmp    *_br_                    # enter the unrolled loop at step r+1

        # body of the unrolled loop: two digit steps of 17 bytes each
        # (the very first step is shorter when its zero displacement is dropped)
        # enter a step with edx:eax = pending product, ebx = ecx = 0, CF = 0
        # code inspired by GMP (k7/mul_basecase.asm)
#undef BODY
#define BODY(x,y) \
          adcl   %eax,    %ecx           /* ecx += current low product   */;\
          movl   x(%edi), %eax           /* eax <- next digit of c       */;\
          adcl   %edx,    %ebx           /* ebx <- current high product  */;\
          mull   %ebp                    /* multiply by m                */;\
          addl   %ecx,    x(%esi)        /* add previous low word to a   */;\
          movl   $0,      %ecx           /* clear without touching CF    */;\
          adcl   %eax,    %ebx           /* ebx += current low product   */;\
          movl   y(%edi), %eax           /* eax <- next digit of c       */;\
          adcl   %edx,    %ecx           /* ecx <- current high product  */;\
          mull   %ebp                    /* multiply by m                */;\
          addl   %ebx,    y(%esi)        /* add previous low word to a   */;\
          movl   $0,      %ebx           /* clear without touching CF    */
        
        # multiplication loop unrolled for 32 digits
        ALIGN(4)
L(loop_j):
        BODY(0,4);    BODY(8,12);    BODY(16,20);   BODY(24,28)
        BODY(32,36);  BODY(40,44);   BODY(48,52);   BODY(56,60)
        BODY(64,68);  BODY(72,76);   BODY(80,84);   BODY(88,92)
        BODY(96,100); BODY(104,108); BODY(112,116); BODY(120,124)
        
        # loop tail, also the anchor of the computed entry address;
        # leal and incl leave CF untouched so the pending carry survives
L(tail):
        leal 128(%edi), %edi
        leal 128(%esi), %esi
        incl   _j_
        jne    L(loop_j)

        # propagate the carry (ebx = 0 here)
        adcl   %eax,    %ecx            # ecx <- pending word + last low product + CF
        adcl   %ebx,    %edx            # edx <- last high product + CF
        addl   %ecx,   (%esi)
        adcl   %ebx,    %edx
        addl   %edx,  4(%esi)
        adcl   %ebx,  8(%esi)
        jnc    2f
        leal 12(%esi),  %ebx
1:
        incl   (%ebx)                   # ripple the carry while digits wrap to 0
        leal   4(%ebx), %ebx            # leal keeps the ZF produced by incl
        jz     1b
        ALIGN(4)
2:
        # end of the loop over i
        movl   _n_,     %ecx
        negl   %ecx                     # ecx <- -n
        decl   _i_
        jne    L(loop_i)

        # when a carry came out in a[2n], subtract c
        testl  $-1,     8(%esi)         # esi = &a[2n-2] here
        jz     1f
        leal  8(%esi,%ecx,4), %esi      # esi <- &a[n]
        leal   (%edi,%ecx,4), %ebx      # ebx <- &c
        negl   %ecx                     # ecx <- n
        call   .Lsn_fdec_1              # a <- a-c (helper defined elsewhere)
1:
        leal   12(%esp), %esp           # drop the three local stack slots
        RETURN_WITH_SP
        
#endif /* use_sse2 */
        
#endif /* assembly_sn_mgdiv_n2 */
