MODULE  ARM_MEMORY

        PUBLIC  ARM_MEMCPY
        PUBLIC  ARM_MEMSET
        PUBLIC  ARM_MEMSET8
        PUBLIC  ARM_MEMSET16
        PUBLIC  ARM_MEMSET32

        SECTION .text:CODE:NOROOT(2)
        CODE32

;-------------------------------------------------------------------------------
; void ARM_MEMCPY(void* pDest, void* pSrc, U32 NumBytes)
;
; Function description
;   Copy NumBytes bytes from pSrc to pDest. Data is always copied from low
;   to high addresses; overlapping buffers get no special treatment.
;
;   Strategy:
;     1. Fewer than 4 bytes: byte copy only.
;     2. Byte-copy until pDest is word aligned.
;     3. If pSrc is then word aligned too: block copy (32 / 16 / 8 / 4 bytes).
;        Otherwise: read aligned source words and merge neighbouring words
;        with shifts (LSR on the current word, LSL on the next one, which
;        matches little-endian byte order).
;     4. Copy the remaining 0..3 bytes.
;
; Register usage:
;
;   R0    pDest    (advanced while copying)
;   R1    pSrc     (advanced while copying)
;   R2    NumBytes (remaining count; may go negative in the word loops,
;                   its low bits still hold the number of trailing bytes)
;
;   R3    Used for data transfers
;   R4    Used for data transfers (saved on the stack in the bulk path)
;   R12   Used for data transfers
;   R14   Used for data transfers (saved on the stack in the bulk path)
;
;   R13   SP
;   R14   LR (contains return address)
;   R15   PC
;
;-------------------------------------------------------------------------------
ARM_MEMCPY:
;-------------------------------------------------------------------------------
        cmp         R2, #+3                           ; R2 = NumBytes
        bls         ARM_MEMCPY_HandleTrailingBytes    ; Less than one complete word: single byte transfer only

        ands        R12, R0, #+3                      ; R12 = mis-alignment of destination (0..3)
        beq         ARM_MEMCPY_DestIsDWordAligned     ; Destination already word aligned ?

;-------------------------------------------------------------------------------
; Copy as many bytes as necessary to word align the destination address.
; R12 = 1, 2, 3  ->  3, 2, 1 bytes are copied. The loads and stores are
; interleaved; all conditional instructions use the flags of the single CMP.
;
        ldrb        R3, [R1], #+1                     ; At least one byte is always needed
        cmp         R12, #+2                          ; LS: mis-alignment 1 or 2, CC: mis-alignment 1
        add         R2, R2, R12                       ; With the SUB below: NumBytes -= (4 - mis-alignment)
        ldrbls      R12, [R1], #+1                    ; LS -> second byte
        strb        R3, [R0], #+1
        ldrbcc      R3, [R1], #+1                     ; CC -> third byte
        strbls      R12, [R0], #+1
        sub         R2, R2, #+4                       ; Adjust NumBytes (flags are left untouched)
        strbcc      R3, [R0], #+1                     ; Destination address is word aligned from here on

;-------------------------------------------------------------------------------
; Choose best way to transfer data
;
ARM_MEMCPY_DestIsDWordAligned:
        ands        R3, R1, #+3                       ; R3 = mis-alignment of source (0..3)
        beq         ARM_MEMCPY_HandleBulkWordData     ; Source and destination aligned: bulk word transfer

        subs        R2, R2, #+4                       ; Bias NumBytes by one word for the shift loops
        bcc         ARM_MEMCPY_HandleTrailingBytes    ; Less than one word left; low 2 bits of R2 are unchanged

        ldr         R12, [R1, -R3]!                   ; Read first source word and word align the source address
        cmp         R3, #+2
        beq         ARM_MEMCPY_Loop16BitShift         ; Source mis-aligned by 2

        bhi         ARM_MEMCPY_Loop24BitShift         ; Source mis-aligned by 3, fall through for 1

;-------------------------------------------------------------------------------
; Handle data in units of word
;
; Each loop keeps the previously loaded aligned source word in R12, shifts
; out the bytes that were already consumed, loads the next aligned word and
; merges in the bytes that are still missing. R2 is biased by -4, so the
; loop runs as long as at least one more full word has to be written.
; After the loop R1 is moved back to the first source byte not yet copied.
;
ARM_MEMCPY_Loop8BitShift:
        mov         R3, R12, LSR #+8           ; Drop the byte that was already consumed
        ldr         R12, [R1, #+4]!            ; Load next aligned source word
        subs        R2, R2, #+4                ; Decrement NumBytes
        orr         R3, R3, R12, LSL #+24      ; Merge in the missing byte
        str         R3, [R0], #+4              ; Store complete word
        bcs         ARM_MEMCPY_Loop8BitShift

        add         R1, R1, #+1                ; Adjust source address
        b           ARM_MEMCPY_HandleTrailingBytes         ; Handle trailing bytes

ARM_MEMCPY_Loop16BitShift:
        mov         R3, R12, LSR #+16          ; Drop the two bytes that were already consumed
        ldr         R12, [R1, #+4]!            ; Load next aligned source word
        subs        R2, R2, #+4                ; Decrement NumBytes
        orr         R3, R3, R12, LSL #+16      ; Merge in the two missing bytes
        str         R3, [R0], #+4              ; Store complete word
        bcs         ARM_MEMCPY_Loop16BitShift

        add         R1, R1, #+2                ; Adjust source address
        b           ARM_MEMCPY_HandleTrailingBytes         ; Handle trailing bytes

ARM_MEMCPY_Loop24BitShift:
        mov         R3, R12, LSR #+24          ; Drop the three bytes that were already consumed
        ldr         R12, [R1, #+4]!            ; Load next aligned source word
        subs        R2, R2, #+4                ; Decrement NumBytes
        orr         R3, R3, R12, LSL #+8       ; Merge in the three missing bytes
        str         R3, [R0], #+4              ; Store complete word
        bcs         ARM_MEMCPY_Loop24BitShift

        add         R1, R1, #+3                ; Adjust source address
        b           ARM_MEMCPY_HandleTrailingBytes         ; Handle trailing bytes

;-------------------------------------------------------------------------------
; Handle large bulk data in blocks of 8 words (32 bytes)
;
ARM_MEMCPY_HandleBulkWordData:
        subs        R2, R2, #+0x20             ; Bias NumBytes by one block
        stmdb       SP!, {R4, LR}              ; R4 and LR are used as data registers (flags unaffected)
        bcc         ARM_MEMCPY_HandleTrailingWords

ARM_MEMCPY_LoopHandleBulkWord:
        ldm         R1!, {R3, R4, R12, LR}     ; Transfer 16 bytes at once
        stm         R0!, {R3, R4, R12, LR}
        ldm         R1!, {R3, R4, R12, LR}     ; Transfer 16 bytes at once
        stm         R0!, {R3, R4, R12, LR}
        subs        R2, R2, #+0x20
        bcs         ARM_MEMCPY_LoopHandleBulkWord

;-------------------------------------------------------------------------------
; Handle trailing 7 words
;
; R2 is negative here, but its low 5 bits still hold the remaining byte count.
;
ARM_MEMCPY_HandleTrailingWords:
        movs        R12, R2, LSL #28           ; C = bit 4, N = bit 3 of NumBytes

        ldmcs       R1!, {R3, R4, R12, LR}     ; Bit 4 set: transfer 16 bytes
        stmcs       R0!, {R3, R4, R12, LR}
        ldmmi       R1!, {R3, R4}              ; Bit 3 set: transfer 8 bytes
        stmmi       R0!, {R3, R4}

        movs        R12, R2, LSL #+30          ; C = bit 2, Z = (bits 1..0 are both clear)

        ldmia       SP!, {R4, LR}              ; Restore R4 and the return address (flags unaffected)
        ldrcs       R3, [R1], #+4              ; Bit 2 set: transfer 4 bytes
        strcs       R3, [R0], #+4
        bxeq        LR                         ; No trailing bytes left: done

;-------------------------------------------------------------------------------
; Handle trailing 3 bytes
;
; MOVS with LSL #31 puts bit 1 of NumBytes into C (last bit shifted out)
; and bit 0 into N (bit 31 of the result):
;   NumBytes = ...10b  ->  N=0, C=1  (two bytes)
;   NumBytes = ...01b  ->  N=1, C=0  (one byte)
;
ARM_MEMCPY_HandleTrailingBytes:
        movs        R2, R2, LSL #+31           ; N = bit 0, C = bit 1 of NumBytes
        ldrbmi      R2, [R1], #+1              ; Bit 0 set: transfer 1 byte
        ldrbcs      R3, [R1], #+1              ; Bit 1 set: transfer 2 bytes
        ldrbcs      R12, [R1], #+1
        strbmi      R2, [R0], #+1
        strbcs      R3, [R0], #+1
        strbcs      R12, [R0], #+1
        bx          LR
;-------------------------------------------------------------------------------
; void ARM_MEMSET(void* pDest, U32 c, U32 NumBytes)
;
; Function description
;   Fill NumBytes bytes starting at pDest with the low 8 bits of c.
;
; Register usage:
;
;   R0    pDest    (advanced while filling)
;   R1    c        (low byte replicated into all four byte lanes)
;   R2    NumBytes (remaining count; may go negative in the bulk loop,
;                   its low bits still hold the number of trailing bytes)
;
;   R3    Used for data transfers
;   R4    Used for data transfers (saved on the stack)
;   R5    Used for data transfers (saved on the stack)
;   R6    Scratch destination of the flag-setting shifts (saved on the stack)
;
;   R13   SP
;   R14   LR (contains return address)
;   R15   PC
;
;-------------------------------------------------------------------------------
ARM_MEMSET:
;-------------------------------------------------------------------------------
        and         R1, R1, #+0xFF             ; Only the low byte is the fill value; upper bits of c
                                               ; (e.g. from a sign-extended char) must not leak into word stores
        orr         R1, R1, R1, LSL #+8        ; Replicate fill byte to 16 bits
        orr         R1, R1, R1, LSL #+16       ; Replicate fill byte to 32 bits

        cmp         R2, #+3                    ; R2 = NumBytes
        bls         ARM_MEMSET_HandleTrailingBytes         ; Less than one complete word: single byte transfer only

        ands        R3, R0, #+3                ; R3 = mis-alignment of destination (0..3)
        beq         ARM_MEMSET_DestIsAligned               ; Destination already word aligned ?

; Store as many bytes as necessary to word align the destination address.
; R3 = 1, 2, 3  ->  3, 2, 1 bytes are stored.

        strb        R1, [R0], #+1              ; At least one byte is always needed
        cmp         R3, #+2                    ; LS: mis-alignment 1 or 2, CC: mis-alignment 1
        add         R2, R2, R3                 ; With the SUB below: NumBytes -= (4 - mis-alignment)
        strbls      R1, [R0], #+1              ; LS -> second byte
        sub         R2, R2, #+4                ; Adjust NumBytes (flags are left untouched)
        strbcc      R1, [R0], #+1              ; CC -> third byte

; Choose best way to transfer data

ARM_MEMSET_DestIsAligned:                      ; destination is aligned, use bulk word transfer

; Handle large bulk data in blocks of 8 words (32 bytes)

ARM_MEMSET_HandleBulkWordData:
        stmdb       SP!, {R4, R5, R6}

        mov         R3, R1                     ; Four registers with the fill word: 16 bytes per STM
        mov         R4, R1
        mov         R5, R1

        subs        R2, R2, #+0x20             ; 32 Bytes = 8 words; bias NumBytes by one block
        bcc         ARM_MEMSET_HandleTrailingWords

ARM_MEMSET_LoopHandleBulkWord:
        stm         R0!, {R1, R3, R4, R5}
        stm         R0!, {R1, R3, R4, R5}
        subs        R2, R2, #+0x20
        bcs         ARM_MEMSET_LoopHandleBulkWord


; Handle trailing 7 words
; R2 is negative here, but its low 5 bits still hold the remaining byte count.

ARM_MEMSET_HandleTrailingWords:
        movs        R6, R2, LSL #28            ; C = bit 4, N = bit 3 of NumBytes
        stmcs       R0!, {R1, R3, R4, R5}      ; Bit 4 set: store 16 bytes
        stmmi       R0!, {R1, R3}              ; Bit 3 set: store 8 bytes

        movs        R6, R2, LSL #+30           ; C = bit 2, Z = (bits 1..0 are both clear)
        strcs       R1, [R0], #+4              ; Bit 2 set: store 4 bytes

        ldmia       SP!, {R4, R5, R6}          ; Restore saved registers (flags unaffected)
        bxeq        LR                         ; No trailing bytes left: done


; Handle trailing 3 bytes

ARM_MEMSET_HandleTrailingBytes:
        movs        R2, R2, LSL #+31           ; N = bit 0, C = bit 1 of NumBytes
        strbmi      R1, [R0], #+1              ; Bit 0 set: store 1 byte
        strbcs      R1, [R0], #+1              ; Bit 1 set: store 2 bytes
        strbcs      R1, [R0], #+1
        bx          LR


; int ARM_MEMSET8(void* pDest, U32 c, U32 NumBytes);
;
;   Fill NumBytes bytes starting at pDest with the low 8 bits of c.
;
;   R0    pDest    (advanced while filling)
;   R1    c        (widened step by step to 16 and 32 bits)
;   R2    NumBytes (remaining count)
;   R3    Copy of the fill word
;   R4    Copy of the fill word (saved on the stack)
;   R5    Copy of the fill word (saved on the stack)
;
;   NOTE(review): the prototype declares an int return value, but the code
;   never sets one up; R0 is left pointing just past the filled area.
;-------------------------------------------------------------------------------
ARM_MEMSET8:
;-------------------------------------------------------------------------------
        stmdb       SP!, {R4, R5}
        and         R1, R1, #0xFF             ; Only the low byte is the fill value; upper bits of c
                                              ; must not leak into the halfword/word stores
        cmp         R2, #4
        blt         ARM_MEMSET8_loop3         ; 0..3 bytes: byte stores only

        ; Alignment is unknown
        tst         R0, #1
        strbne      R1, [R0], #1              ; Odd address: one byte gets us 16-bit aligned
        subne       R2, R2, #1

        ; Now we are 16-bit aligned (need to upgrade 'c' to 16-bit)
        orr         R1, R1, R1, LSL #8
        tst         R0, #2
        strhne      R1, [R0], #2
        subne       R2, R2, #2

        ; Now we are 32-bit aligned (need to upgrade 'c' to 32-bit)
        orr         R1, R1, R1, LSL #16
        mov         R3, R1
        cmp         R2, #16
        blt         ARM_MEMSET8_loop2         ; Too few bytes for the 128-bit alignment steps
        tst         R0, #4
        strne       R1, [R0], #4
        subne       R2, R2, #4
        tst         R0, #8
        stmne       R0!, {R1, R3}
        subne       R2, R2, #8

        ; Now we are 128-bit aligned
        mov         R4, R1
        mov         R5, R1
ARM_MEMSET8_loop1:
        ; Store 4 32-bit values per loop iteration
        subs        R2, R2, #16
        stmge       R0!, {R1, R3, R4, R5}
        bge         ARM_MEMSET8_loop1
        add         R2, R2, #16               ; Undo the last subtraction: R2 = 0..15

ARM_MEMSET8_loop2:
        ; Store up to 3 remaining 32-bit values
        tst         R2, #8
        stmne       R0!, {R1, R3}
        tst         R2, #4
        strne       R1, [R0], #4
        and         R2, R2, #3

ARM_MEMSET8_loop3:
        ; Store up to 3 remaining bytes
        subs        R2, R2, #1
        strbge      R1, [R0], #1
        subs        R2, R2, #1
        strbge      R1, [R0], #1
        subs        R2, R2, #1
        strbge      R1, [R0], #1
        ldmia       SP!, {R4, R5}
        bx          LR

; int ARM_MEMSET16(void* pDest, U32 c, U32 NumHalfWords);
;
;   Fill NumHalfWords 16-bit items starting at pDest with the low 16 bits
;   of c. The code uses halfword and word stores without any byte-wise
;   alignment step, so pDest has to be at least 16-bit aligned.
;
;   R0    pDest        (advanced while filling)
;   R1    c            (low halfword replicated to 32 bits)
;   R2    NumHalfWords (remaining count)
;   R3    Copy of the fill word
;   R4    Copy of the fill word (saved on the stack)
;   R5    Copy of the fill word (saved on the stack)
;
;   NOTE(review): the prototype declares an int return value, but the code
;   never sets one up; R0 is left pointing just past the filled area.
;-------------------------------------------------------------------------------
ARM_MEMSET16:
;-------------------------------------------------------------------------------
        stmdb       SP!, {R4, R5}

        cmp         R2, #2
        blt         ARM_MEMSET16_HandleTrailingHalfWord    ; 1 or 0

        ; Alignment is known to be at least 16-bit
        tst         R0, #2
        strhne      R1, [R0], #2              ; xxxx-xx10 --->
        subne       R2, R2, #1                ; xxxx-xx00

        ; Now we are 32-bit aligned (need to upgrade 'c' to 32-bit).
        ; Shifting up and OR-ing the value shifted back down replicates the
        ; low halfword and at the same time discards any upper bits of c,
        ; so word stores and halfword stores always write the same pattern.
        mov         R1, R1, LSL #16
        orr         R1, R1, R1, LSR #16
        mov         R4, R1

        cmp         R2, #8
        blt         ARM_MEMSET16_HandleTrailingWords       ; 7, 6, ... 0

        tst         R0, #4
        strne       R1, [R0], #4              ; xxxx-x100 --->
        subne       R2, R2, #2                ; xxxx-x000 --->

        ; Now we are 64-bit aligned
        tst         R0, #8
        stmne       R0!, {R1, R4}             ; xxxx-1000 --->
        subne       R2, R2, #4                ; xxxx-0000 --->

ARM_MEMSET16_HandleBulkWordData:
        ; Now we are 128-bit aligned
        mov         R5, R1
        mov         R3, R1

ARM_MEMSET16_LoopHandleBulkWord:
        ; Store 4 32-bit values (8 halfwords) per loop iteration
        subs        R2, R2, #8
        stmge       R0!, {R1, R3, R4, R5}
        bge         ARM_MEMSET16_LoopHandleBulkWord
        add         R2, R2, #8                ; Undo the last subtraction: R2 = 0..7

ARM_MEMSET16_HandleTrailingWords:
        ; Store up to 3 remaining 32-bit values
        tst         R2, #4
        stmne       R0!, {R1, R4}

        tst         R2, #2
        strne       R1, [R0], #4

        and         R2, R2, #1

ARM_MEMSET16_HandleTrailingHalfWord:
        ; Store up to 1 remaining 16-bit value
        subs        R2, R2, #1
        strhge      R1, [R0], #2

        ldmia       SP!, {R4, R5}
        bx          LR


; int ARM_MEMSET32(void* pDest, U32 c, U32 NumWords);
;
;   Store the 32-bit value c into NumWords consecutive words at pDest.
;   Only word and multi-word stores are used, so pDest has to be at least
;   32-bit aligned.
;
;   R0    pDest    (advanced while filling)
;   R1    c
;   R2    NumWords (remaining count)
;   R3    Copy of c
;   R4    Copy of c (saved on the stack)
;   R5    Copy of c (saved on the stack)
;
;   NOTE(review): the prototype declares an int return value, but the code
;   never sets one up; R0 is left pointing just past the filled area.
;-------------------------------------------------------------------------------
ARM_MEMSET32:
;-------------------------------------------------------------------------------
        stmfd       SP!, {R4, R5}

        cmp         R2, #4
        blt         ARM_MEMSET32_Tail         ; 0..3 words: no alignment steps needed

        ; Prepare all copies of the fill value up front
        mov         R3, R1
        mov         R4, R1
        mov         R5, R1

        ; 32-bit aligned only ? One word brings us to 64-bit alignment
        tst         R0, #4
        strne       R1, [R0], #4
        subne       R2, R2, #1

        ; 64-bit aligned only ? Two words bring us to 128-bit alignment
        tst         R0, #8
        stmne       R0!, {R1, R3}
        subne       R2, R2, #2

ARM_MEMSET32_Bulk:
        ; Destination is 128-bit aligned: 4 words per iteration
        subs        R2, R2, #4
        stmge       R0!, {R1, R3, R4, R5}
        bge         ARM_MEMSET32_Bulk
        add         R2, R2, #4                ; Undo the last subtraction: R2 = 0..3

ARM_MEMSET32_Tail:
        ; Up to 3 words are left; each store is skipped once the count runs out
        subs        R2, R2, #1
        strge       R1, [R0], #4
        subs        R2, R2, #1
        strge       R1, [R0], #4
        subs        R2, R2, #1
        strge       R1, [R0], #4

        ldmfd       SP!, {R4, R5}
        bx          LR

;-__arm void arm_memxor(void* pDest, U32 c, U32 NumBytes);
;                           r0         r1     r2
;
;   XOR every one of the NumBytes bytes starting at pDest with the low
;   8 bits of c (read - modify - write in place).
;
;   R0    pDest    (advanced while processing)
;   R1    c        (low byte replicated into all four byte lanes)
;   R2    NumBytes (remaining count; may go negative in the bulk loop,
;                   its low bits still hold the number of trailing bytes)
;   R3, R12        Used for data transfers
;   R4 - R6        Used for data transfers (saved on the stack)
;   R7             Scratch destination of the flag-setting shifts (saved on the stack)
;
;   NOTE(review): no PUBLIC directive for this label is visible in this
;   file - confirm whether it is meant to be callable from other modules.
;-------------------------------------------------------------------------------
arm_memxor:
;-------------------------------------------------------------------------------
        and         R1, R1, #+0xFF              ; Only the low byte is the XOR value; upper bits of c
                                                ; must not leak into the word operations
        orr         R1, R1, R1, LSL #+8         ; Replicate to 16 bits
        orr         R1, R1, R1, LSL #+16        ; Replicate to 32 bits

        cmp         R2, #+3                     ; R2 = NumBytes
        bls         arm_memxor_HandleTrailingBytes        ; Less than one complete word: single byte transfer only

        ands        R3, R0, #+3                 ; R3 = mis-alignment of destination (0..3)
        beq         arm_memxor_DestIsAligned              ; Destination already word aligned ?

;-
; Process as many bytes as necessary to word align the destination address.
; R3 = 1, 2, 3  ->  3, 2, 1 bytes are processed.
;-
        ldrb        R12, [R0]                   ; At least one byte is always needed
        eor         R12, R12, R1
        strb        R12, [R0], #+1

        cmp         R3, #+2                     ; LS: mis-alignment 1 or 2, CC: mis-alignment 1
        add         R2, R2, R3                  ; With the SUB below: NumBytes -= (4 - mis-alignment)

        ldrbls      R3, [R0]                    ; LS -> second byte (R3 is free again after the ADD)
        eorls       R3, R3, R1
        strbls      R3, [R0], #+1

        sub         R2, R2, #+4                 ; Adjust NumBytes (flags are left untouched)

        ldrbcc      R3, [R0]                    ; CC -> third byte
        eorcc       R3, R3, R1
        strbcc      R3, [R0], #+1

;-
; Choose best way to transfer data
;-
arm_memxor_DestIsAligned:                                  ; destination is aligned, use bulk word transfer
;-
; Handle large bulk data in blocks of 8 words (32 bytes)
;-
arm_memxor_HandleBulkWordData:
        stmdb       SP!, {R4, R5, R6, R7}

        subs        R2, R2, #+0x20              ; 32 Bytes = 8 words; bias NumBytes by one block
        bcc         arm_memxor_HandleTrailingWords

arm_memxor_LoopHandleBulkWord:
        ldm         R0,  {R3, R4, R5, R6}       ; First 16 bytes of the block
        eor         R3, R3, R1
        eor         R4, R4, R1
        eor         R5, R5, R1
        eor         R6, R6, R1
        stm         R0!, {R3, R4, R5, R6}

        ldm         R0,  {R3, R4, R5, R6}       ; Second 16 bytes of the block
        eor         R3, R3, R1
        eor         R4, R4, R1
        eor         R5, R5, R1
        eor         R6, R6, R1
        stm         R0!, {R3, R4, R5, R6}

        subs        R2, R2, #+0x20
        bcs         arm_memxor_LoopHandleBulkWord

;-
; Handle trailing 7 words
; R2 is negative here, but its low 5 bits still hold the remaining byte count.
;-
arm_memxor_HandleTrailingWords:
        movs        R7, R2, LSL #28             ; C = bit 4, N = bit 3 of NumBytes

        ldmcs       R0,  {R3, R4, R5, R6}       ; Bit 4 set: process 16 bytes
        eorcs       R3, R3, R1
        eorcs       R4, R4, R1
        eorcs       R5, R5, R1
        eorcs       R6, R6, R1
        stmcs       R0!, {R3, R4, R5, R6}

        ldmmi       R0,  {R3, R4}               ; Bit 3 set: process 8 bytes
        eormi       R3, R3, R1
        eormi       R4, R4, R1
        stmmi       R0!, {R3, R4}

        movs        R7, R2, LSL #+30            ; C = bit 2, Z = (bits 1..0 are both clear)

        ldrcs       R3, [R0]                    ; Bit 2 set: process 4 bytes
        eorcs       R3, R3, R1
        strcs       R3, [R0], #+4

        ldmia       SP!, {R4, R5, R6, R7}       ; Restore saved registers (flags unaffected)
        bxeq        LR                          ; No trailing bytes left: done

;-
; Handle trailing 3 bytes
;
; Byte loads are required here: R0 is not necessarily word aligned (this
; label is also reached directly when NumBytes <= 3) and a word load would
; read beyond the end of the buffer.
;-
arm_memxor_HandleTrailingBytes:
        movs        R2, R2, LSL #+31            ; N = bit 0, C = bit 1 of NumBytes

        ldrbmi      R2, [R0]                    ; Bit 0 set: process 1 byte
        eormi       R2, R2, R1
        strbmi      R2, [R0], #+1

        ldrbcs      R2, [R0]                    ; Bit 1 set: process 2 bytes
        eorcs       R2, R2, R1
        strbcs      R2, [R0], #+1

        ldrbcs      R2, [R0]
        eorcs       R2, R2, R1
        strbcs      R2, [R0], #+1

        bx          LR

;-__arm int arm_memxor8(void* pDest, U32 c, U32 NumBytes);
;                           r0         r1     r2
;
;   XOR every one of the NumBytes bytes starting at pDest with the low
;   8 bits of c (read - modify - write in place).
;
;   R0    pDest    (advanced while processing)
;   R1    c        (low byte replicated into all four byte lanes)
;   R2    NumBytes (remaining count)
;   R3             Used for data transfers
;   R4 - R6        Used for data transfers (saved on the stack)
;
;   NOTE(review): the prototype declares an int return value, but the code
;   never sets one up; R0 is left pointing just past the processed area.
;   NOTE(review): no PUBLIC directive for this label is visible in this
;   file - confirm whether it is meant to be callable from other modules.
;-------------------------------------------------------------------------------
arm_memxor8:
;-------------------------------------------------------------------------------
        stmdb       SP!, {R4, R5, R6}

        and         R1, R1, #+0xFF            ; Only the low byte is the XOR value; upper bits of c
                                              ; must not leak into the halfword/word operations
        orr         R1, R1, R1, LSL #+8       ; Replicate to 16 bits
        orr         R1, R1, R1, LSL #+16      ; Replicate to 32 bits

        cmp         R2, #4
        blt         arm_memxor8_loop3         ; 0..3 bytes: byte operations only

        ; Alignment is unknown
        tst         R0, #1

        ldrbne      R6, [R0]                  ; Odd address: one byte gets us 16-bit aligned
        eorne       R6, R6, R1
        strbne      R6, [R0], #1

        subne       R2, R2, #1

        ; Now we are 16-bit aligned
        tst         R0, #2

        ldrhne      R6, [R0]
        eorne       R6, R6, R1
        strhne      R6, [R0], #2              ; Only the low halfword of the result is stored

        subne       R2, R2, #2

        ; Now we are 32-bit aligned
        cmp         R2, #16
        blt         arm_memxor8_loop2         ; Too few bytes for the 128-bit alignment steps
        tst         R0, #4

        ldrne       R6, [R0]
        eorne       R6, R6, R1
        strne       R6, [R0], #4

        subne       R2, R2, #4

        ; Now we are 64-bit aligned
        tst         R0, #8

        ldmne       R0, {R3, R6}
        eorne       R3, R3, R1
        eorne       R6, R6, R1
        stmne       R0!, {R3, R6}

        subne       R2, R2, #8

        ; Now we are 128-bit aligned
arm_memxor8_loop1:
        ; Process 4 32-bit values per loop iteration
        subs        R2, R2, #16

        ldmge       R0,  {R3, R4, R5, R6}
        eorge       R3, R3, R1
        eorge       R4, R4, R1
        eorge       R5, R5, R1
        eorge       R6, R6, R1
        stmge       R0!, {R3, R4, R5, R6}

        bge         arm_memxor8_loop1         ; EOR without S keeps the flags of the SUBS
        add         R2, R2, #16               ; Undo the last subtraction: R2 = 0..15

arm_memxor8_loop2:
        ; Process up to 3 remaining 32-bit values
        tst         R2, #8

        ldmne       R0, {R3, R4}
        eorne       R3, R3, R1
        eorne       R4, R4, R1
        stmne       R0!, {R3, R4}

        tst         R2, #4

        ldrne       R3, [R0]
        eorne       R3, R3, R1
        strne       R3, [R0], #4

        and         R2, R2, #3

arm_memxor8_loop3:
        ; Process up to 3 remaining bytes. Every step stores the XOR result
        ; in R3 (not the pattern in R1).
        subs        R2, R2, #1

        ldrbge      R3, [R0]
        eorge       R3, R3, R1
        strbge      R3, [R0], #1

        subs        R2, R2, #1

        ldrbge      R3, [R0]
        eorge       R3, R3, R1
        strbge      R3, [R0], #1

        subs        R2, R2, #1

        ldrbge      R3, [R0]
        eorge       R3, R3, R1
        strbge      R3, [R0], #1

        ldmia       SP!, {R4, R5, R6}
        bx          LR

;-__arm int arm_memxor16(void* pDest, U32 c, U32 NumHalfWords);
;                           r0         r1     r2
;
;   XOR every one of the NumHalfWords 16-bit items starting at pDest with
;   the low 16 bits of c (read - modify - write in place). The code uses
;   halfword and word accesses without any byte-wise alignment step, so
;   pDest has to be at least 16-bit aligned.
;
;   R0    pDest        (advanced while processing)
;   R1    c            (low halfword replicated to 32 bits)
;   R2    NumHalfWords (remaining count)
;   R3                 Used for data transfers
;   R4 - R6            Used for data transfers (saved on the stack)
;
;   NOTE(review): the prototype declares an int return value, but the code
;   never sets one up; R0 is left pointing just past the processed area.
;   NOTE(review): no PUBLIC directive for this label is visible in this
;   file - confirm whether it is meant to be callable from other modules.
;-------------------------------------------------------------------------------
arm_memxor16:
;-------------------------------------------------------------------------------
        stmdb       SP!, {R4, R5, R6}

        ; Replicate the low halfword of c to 32 bits. Shifting up and OR-ing
        ; the value shifted back down also discards any upper bits of c, so
        ; word and halfword operations always apply the same pattern.
        mov         R1, R1, LSL #+16
        orr         R1, R1, R1, LSR #+16

        cmp         R2, #2
        blt         arm_memxor16_HandleTrailingHalfWord    ; 1 or 0

        ; Alignment is known to be at least 16-bit
        tst         R0, #2

        ldrhne      R6, [R0]
        eorne       R6, R6, R1
        strhne      R6, [R0], #2              ; xxxx-xx10 --->

        subne       R2, R2, #1                ; xxxx-xx00

        ; Now we are 32-bit aligned
        cmp         R2, #8
        blt         arm_memxor16_HandleTrailingWords       ; 7, 6, ... 0

        tst         R0, #4

        ldrne       R3, [R0]
        eorne       R3, R3, R1
        strne       R3, [R0], #4              ; xxxx-x100 --->

        subne       R2, R2, #2                ; xxxx-x000 --->

        ; Now we are 64-bit aligned
        tst         R0, #8

        ldmne       R0, {R3, R4}
        eorne       R3, R3, R1
        eorne       R4, R4, R1
        stmne       R0!, {R3, R4}             ; xxxx-1000 --->

        subne       R2, R2, #4                ; xxxx-0000 --->

arm_memxor16_HandleBulkWordData:
        ; Now we are 128-bit aligned

arm_memxor16_LoopHandleBulkWord:
        ; Process 4 32-bit values (8 halfwords) per loop iteration
        subs        R2, R2, #8

        ldmge       R0,  {R3, R4, R5, R6}
        eorge       R3, R3, R1
        eorge       R4, R4, R1
        eorge       R5, R5, R1
        eorge       R6, R6, R1
        stmge       R0!, {R3, R4, R5, R6}

        bge         arm_memxor16_LoopHandleBulkWord        ; EOR without S keeps the flags of the SUBS
        add         R2, R2, #8                ; Undo the last subtraction: R2 = 0..7

arm_memxor16_HandleTrailingWords:
        ; Process up to 3 remaining 32-bit values
        tst         R2, #4

        ldmne       R0, {R3, R4}
        eorne       R3, R3, R1
        eorne       R4, R4, R1
        stmne       R0!, {R3, R4}

        tst         R2, #2

        ldrne       R3, [R0]
        eorne       R3, R3, R1
        strne       R3, [R0], #4

        and         R2, R2, #1

arm_memxor16_HandleTrailingHalfWord:
        ; Process up to 1 remaining 16-bit value
        subs        R2, R2, #1

        ldrhge      R3, [R0]
        eorge       R3, R3, R1
        strhge      R3, [R0], #2              ; Only the low halfword of the result is stored

        ldmia       SP!, {R4, R5, R6}
        bx          LR


;-__arm int arm_memxor32(void* pDest, U32 c, U32 NumWords);
;                           r0         r1     r2
;
;   XOR every one of the NumWords 32-bit words starting at pDest with c
;   (read - modify - write in place). Only word and multi-word accesses
;   are used, so pDest has to be at least 32-bit aligned.
;
;   R0    pDest    (advanced while processing)
;   R1    c
;   R2    NumWords (remaining count)
;   R3             Used for data transfers
;   R4 - R6        Used for data transfers (saved on the stack)
;
;   NOTE(review): the prototype declares an int return value, but the code
;   never sets one up; R0 is left pointing just past the processed area.
;   NOTE(review): no PUBLIC directive for this label is visible in this
;   file - confirm whether it is meant to be callable from other modules.
;-------------------------------------------------------------------------------
arm_memxor32:
;-------------------------------------------------------------------------------
        stmdb       SP!, {R4, R5, R6}

        cmp         R2, #4
        blt         arm_memxor32_loop2        ; 0..3 words: no alignment steps needed

        ; Alignment is known to be at least 32-bit, is it 64-bit aligned ?
        tst         R0, #4
        ; No, it is 32-bit aligned
        ldrne       R3, [R0]
        eorne       R3, R3, R1
        strne       R3, [R0], #4
        subne       R2, R2, #1

        ; Now we are 64-bit aligned, is it 128-bit aligned ?
        tst         R0, #8
        ; No, it is 64-bit aligned
        ldmne       R0, {R3, R4}
        eorne       R3, R3, R1
        eorne       R4, R4, R1
        stmne       R0!, {R3, R4}             ; xxxx-1000 --->
        subne       R2, R2, #2

        ; Now we are 128-bit aligned
arm_memxor32_loop1:
        ; Process 4 32-bit values per loop iteration
        subs        R2, R2, #4

        ldmge       R0,  {R3, R4, R5, R6}
        eorge       R3, R3, R1
        eorge       R4, R4, R1
        eorge       R5, R5, R1
        eorge       R6, R6, R1
        stmge       R0!, {R3, R4, R5, R6}

        bge         arm_memxor32_loop1        ; EOR without S keeps the flags of the SUBS
        add         R2, R2, #4                ; Undo the last subtraction: R2 = 0..3

arm_memxor32_loop2:
        ; Process up to 3 remaining 32-bit values; each step is skipped
        ; once the count has run out

        subs        R2, R2, #1
        ldrge       R3, [R0]
        eorge       R3, R3, R1
        strge       R3, [R0], #4

        subs        R2, R2, #1
        ldrge       R3, [R0]
        eorge       R3, R3, R1
        strge       R3, [R0], #4

        subs        R2, R2, #1
        ldrge       R3, [R0]
        eorge       R3, R3, R1
        strge       R3, [R0], #4

        ldmia       SP!, {R4, R5, R6}
        bx          LR


        END

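; Attribution (translated from Chinese): when reposting, please credit the source "听风" (Ting Feng); permalink title of the original article: "Arm memcpy assembly code".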