// file kernel/n/x86/shift.S: shift of natural integers
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                                Shifts                                 |
 |                                                                       |
 +-----------------------------------------------------------------------*/

                           # +------------------------+
                           # |  Right shift by 1 bit  |
                           # +------------------------+

# .Lsn_fhalf: halve a natural number in place (right shift by one bit)
#
# input:
#   a = natural number of length l     esi = &a, ecx = l
#
# constraint: l > 0
#
# output:
#   a <- a/2
#
# registers modified:
#   eax <- unspecified (scratch)
#   ecx = 0
#   CF  = bit 0 of a (the bit shifted out)

#undef L
#define L(x) .Lsn_fhalf_##x
        
        ALIGN_32
.Lsn_fhalf:
        
        # compute the entry point into the unrolled loop: the call below
        # pushes the address of the here label, that address is patched
        # on the stack, and the ret then jumps into the loop body
        call   L(here)
L(here):
        movl   %ecx,    %eax
        negl   %eax
        andl   $15,     %eax            # eax <- (-l) mod 16 = number of loop slots to skip
        addl   %eax,    %ecx            # ecx <- 16*ceil(l/16)
        leal   L(loop)-L(here)(,%eax,4), %eax   # distance from here to the entry slot, 4 bytes per slot
        addl   %eax,    (%esp)          # patch the return address
        clc                             # a zero bit is rotated into the top word of a
        ret                             # jump to the computed slot

        # loop unrolled 16 times, processing words from the top of a downwards.
        # Every rcrl below must assemble to exactly four bytes, because the
        # entry offset above is computed as four bytes per slot.
        ALIGN_4
L(loop):
        rcrl   $1,   -4(%esi,%ecx,4)
        rcrl   $1,   -8(%esi,%ecx,4)
        rcrl   $1,  -12(%esi,%ecx,4)
        rcrl   $1,  -16(%esi,%ecx,4)
        rcrl   $1,  -20(%esi,%ecx,4)
        rcrl   $1,  -24(%esi,%ecx,4)
        rcrl   $1,  -28(%esi,%ecx,4)
        rcrl   $1,  -32(%esi,%ecx,4)
        rcrl   $1,  -36(%esi,%ecx,4)
        rcrl   $1,  -40(%esi,%ecx,4)
        rcrl   $1,  -44(%esi,%ecx,4)
        rcrl   $1,  -48(%esi,%ecx,4)
        rcrl   $1,  -52(%esi,%ecx,4)
        rcrl   $1,  -56(%esi,%ecx,4)
        rcrl   $1,  -60(%esi,%ecx,4)
        rcrl   $1,  -64(%esi,%ecx,4)
        leal   -15(%ecx), %ecx          # together with the loop instruction ecx drops by 16; lea keeps CF intact
        loop   L(loop)                  # loop leaves the flags untouched, so CF carries over to the next pass

        ret
        

                 # +---------------------------------+
                 # |  Shift by decreasing addresses  |
                 # +---------------------------------+
        
# .Lsn_fshift_down: right shift by k bits, walking from the top word of a
# down to word 0
#
# input:
#   a = natural number of length la    esi = &a, edx = la
#   b = natural number of length la    edi = &b
#   k = shift count                    ecx = k
# constraints: la > 0, 0 < k < HW
#
# output:
#   b <- a >> k
#
# registers modified:
#   eax = a[0],  ebx = b[0]
#   ecx = 32-k,  edx = 0

#ifdef assembly_sn_shift_down
#undef L
#define L(x) .Lsn_fshift_down_##x
        ALIGN_32
.Lsn_fshift_down:

        negl   %ecx
        addl   $32,     %ecx            # ecx <- 32-k
        xorl   %eax,    %eax            # nothing is shifted into the top word
        xorl   %ebx,    %ebx
        incl   %edx
        shrl   $1,      %edx            # edx <- ceil(la/2), CF <- 1 when la is even
        jnc    L(mid)                   # odd la: enter at the second half of the body

        # body unrolled twice: eax and ebx alternate between the roles
        # of current word and word above it
        ALIGN_4
L(loop):
        movl   -4(%esi,%edx,8), %ebx    # ebx <- a[2*edx-1]
        shldl  %cl,     %ebx,   %eax    # eax <- (eax << (32-k)) | (ebx >> k)
        movl   %eax,    -4(%edi,%edx,8) # b[2*edx-1] <- eax
L(mid):
        movl   -8(%esi,%edx,8), %eax    # eax <- a[2*edx-2]
        shldl  %cl,     %eax,   %ebx    # ebx <- (ebx << (32-k)) | (eax >> k)
        movl   %ebx,    -8(%edi,%edx,8) # b[2*edx-2] <- ebx
        decl   %edx
        jnz    L(loop)
        ret
        
                              # +---------------+
                              # |  Interface C  |
                              # +---------------+
        
# chiffre xn(shift_down)(chiffre *a, long la, chiffre *b, int k)
#
# input:
#   a = natural number of length la > 0
#   b = natural number of length la, may be the same array as a
#   k = integer such that 0 <= k < HW
#
# output:
#   b <- a >> k
#   returns a mod 2^k

ENTER(sn_shift_down)

        movl   arg4,    %ecx            # ecx <- k
        movl   arg3,    %edi            # edi <- &b
        movl   arg2,    %edx            # edx <- la
        movl   arg1,    %esi            # esi <- &a
        jecxz  L(copy)                  # k = 0: nothing to shift, plain copy
        call   .Lsn_fshift_down         # on return eax = a[0], ecx = 32-k, edx = 0
        decl   %edx                     # edx <- all ones
        shrl   %cl,     %edx            # edx <- 2^k - 1
        andl   %edx,    %eax            # keep the k low bits of a[0]
        RETURN_WITH_SP

        # k = 0: b <- a, copied from the top word downwards
        ALIGN_4
L(copy):
        movl   %edx,    %ecx            # ecx <- la = number of words to copy
        leal   -4(%esi,%edx,4), %esi    # esi <- &a[la-1]
        leal   -4(%edi,%edx,4), %edi    # edi <- &b[la-1]
        std
        REP(movsl)
        cld                             # leave the direction flag cleared
        xorl   %eax,    %eax            # a mod 2^0 = 0
        RETURN_WITH_SP

        # case where the assembly version is disabled:
        # sn_fshift_down forwards to the C version and then rebuilds
        # the register state documented for the assembly routine
#else
        ALIGN_32
.Lsn_fshift_down:

        pushl  (%esi)                   # save a[0] before the call, since b may be the same array as a
        pushl  %ecx                     # C arguments in reverse order: k,
        pushl  %edi                     #   &b,
        pushl  %edx                     #   la,
        pushl  %esi                     #   &a
        call   SUBR(sn_shift_down)
        popl   %esi                     # pop the arguments back into their input registers
        popl   %edx
        popl   %edi
        popl   %ecx
        subl   $32,     %ecx
        negl   %ecx                     # ecx <- 32-k
        popl   %eax                     # eax <- a[0] (value saved on entry)
        movl   (%edi),  %ebx            # ebx <- b[0]
        xorl   %edx,    %edx            # edx <- 0
        ret
        
#endif /* assembly_sn_shift_down */

                  # +---------------------------------+
                  # |  Shift by increasing addresses  |
                  # +---------------------------------+

# .Lsn_fshift_up: left shift by k bits, walking from word 0 of a
# up to the top word
#
# input:
#   a = natural number of length la    esi = &a, edx = la
#   b = natural number of length la    edi = &b
#   k = shift count                    ecx = k
# constraints: la > 0, 0 < k < HW
#
# output:
#   b <- a << k
#
# registers modified:
#   eax = a[la-1],  ebx = b[la-1]
#   esi = &a[la],   edi = &b[la]
#   ecx = 32-k,     edx = 0

#ifdef assembly_sn_shift_up
#undef L
#define L(x) .Lsn_fshift_up_##x
        ALIGN_32
.Lsn_fshift_up:

        negl   %ecx
        addl   $32,     %ecx            # ecx <- 32-k
        xorl   %eax,    %eax            # zero bits enter below word 0
        xorl   %ebx,    %ebx
        leal   (%edi,%edx,4), %edi      # edi <- &b[la]
        leal   (%esi,%edx,4), %esi      # esi <- &a[la]
        negl   %edx
        sarl   $1,      %edx            # edx <- -ceil(la/2), CF <- 1 when la is odd
        jc     L(mid)                   # odd la: enter at the second half of the body

        # body unrolled twice: eax and ebx alternate between the roles
        # of current word and word below it
        ALIGN_4
L(loop):
        movl   (%esi,%edx,8), %ebx      # ebx <- next even-offset word of a
        shrdl  %cl,     %ebx,   %eax    # eax <- (ebx << k) | (eax >> (32-k))
        movl   %eax,    (%edi,%edx,8)   # store the matching word of b
L(mid):
        movl   4(%esi,%edx,8), %eax     # eax <- following word of a
        shrdl  %cl,     %eax,   %ebx    # ebx <- (eax << k) | (ebx >> (32-k))
        movl   %ebx,    4(%edi,%edx,8)  # store the matching word of b
        incl   %edx
        jnz    L(loop)
        ret

        
                              # +---------------+
                              # |  Interface C  |
                              # +---------------+
        
# chiffre xn(shift_up)(chiffre *a, long la, chiffre *b, int k)
#
# input:
#   a = natural number of length la > 0
#   b = natural number of length la, may be the same array as a
#   k = integer such that 0 <= k < HW
#
# output:
#   b <- a << k
#   returns the k high bits of a

ENTER(sn_shift_up)

        movl   arg4,    %ecx            # ecx <- k
        movl   arg3,    %edi            # edi <- &b
        movl   arg2,    %edx            # edx <- la
        movl   arg1,    %esi            # esi <- &a
        jecxz  L(copy)                  # k = 0: nothing to shift, plain copy
        call   .Lsn_fshift_up           # on return eax = a[la-1], ecx = 32-k
        shrl   %cl,     %eax            # keep the k high bits of a[la-1]
        RETURN_WITH_SP

        # k = 0: b <- a, copied from word 0 upwards
        ALIGN_4
L(copy):
        cld
        movl   %edx,    %ecx            # ecx <- la = number of words to copy
        REP(movsl)
        xorl   %eax,    %eax            # no bits shifted out
        RETURN_WITH_SP

        # case where the assembly version is disabled:
        # sn_fshift_up forwards to the C version and then rebuilds
        # the register state documented for the assembly routine
#else
        ALIGN_32
.Lsn_fshift_up:

        pushl  -4(%esi,%edx,4)          # save a[la-1] before the call, since b may be the same array as a
        pushl  %ecx                     # C arguments in reverse order: k,
        pushl  %edi                     #   &b,
        pushl  %edx                     #   la,
        pushl  %esi                     #   &a
        call   SUBR(sn_shift_up)
        popl   %esi                     # pop the arguments back into their input registers
        popl   %edx
        popl   %edi
        popl   %ecx
        subl   $32,     %ecx
        negl   %ecx                     # ecx <- 32-k
        leal   (%esi,%edx,4), %esi      # esi <- &a[la]
        leal   (%edi,%edx,4), %edi      # edi <- &b[la]
        popl   %eax                     # eax <- a[la-1] (value saved on entry)
        movl   -4(%edi), %ebx           # ebx <- b[la-1]
        xorl   %edx,    %edx            # edx <- 0
        ret
        
#endif /* assembly_sn_shift_up */

