Browse Source

Merge pull request #216 from FrankBoesing/patch-2

Allow blocksizes 8 and 16
dds
Paul Stoffregen 8 years ago
parent
commit
e1403ae4b8
1 changed file with 59 additions and 28 deletions
  1. +59
    -28
      memcpy_audio.S

+ 59
- 28
memcpy_audio.S View File

@ r1: srcL @ r1: srcL
@ r2: srcR @ r2: srcR


#if AUDIO_BLOCK_SAMPLES > 8
push {r4-r11,r14} push {r4-r11,r14}
add r14,r0,#(AUDIO_BLOCK_SAMPLES*2) add r14,r0,#(AUDIO_BLOCK_SAMPLES*2)
.align 2 .align 2
.loopLR: .loopLR:

.irp offset, 1,2

//Load 2*4 words //Load 2*4 words
ldmia r1!, {r5,r7,r9,r11} //1+4 ldmia r1!, {r5,r7,r9,r11} //1+4
ldmia r2!, {r6,r8,r10,r12} //1+4 ldmia r2!, {r6,r8,r10,r12} //1+4


pkhbt r3,r5,r6,LSL #16 //1 pkhbt r3,r5,r6,LSL #16 //1
pkhtb r4,r6,r5,ASR #16 //1 pkhtb r4,r6,r5,ASR #16 //1
pkhtb r10,r12,r11,ASR #16 //1 pkhtb r10,r12,r11,ASR #16 //1


//Write 8 Words //Write 8 Words
stmia r0!, {r3,r4,r5,r6,r7,r8,r9,r10} //1+8 stmia r0!, {r3,r4,r5,r6,r7,r8,r9,r10} //1+8 -> 5+5+8+9 = 27 Cycles to interleave 32 bytes.

.endr //5+5+8+9 = 27 Cycles to interleave 32 bytes.

cmp r14, r0 cmp r14, r0
bne .loopLR bne .loopLR


pop {r4-r11,r14} pop {r4-r11,r14}
BX lr #elif AUDIO_BLOCK_SAMPLES == 8
push {r4-r8,r14}

ldmia r1!, {r5,r7}
ldmia r2!, {r6,r8}


pkhbt r3,r5,r6,LSL #16
pkhtb r4,r6,r5,ASR #16


pkhbt r5,r7,r8,LSL #16
pkhtb r6,r8,r7,ASR #16

stmia r0!, {r3,r4,r5,r6}
pop {r4-r8,r14}
#endif
BX lr
/* void memcpy_tointerleaveL(short *dst, short *srcL); */ /* void memcpy_tointerleaveL(short *dst, short *srcL); */
.global memcpy_tointerleaveL .global memcpy_tointerleaveL
.thumb_func .thumb_func


@ r0: dst @ r0: dst
@ r1: srcL @ r1: srcL

push {r4-r11}
mov r2, #0 mov r2, #0
#if AUDIO_BLOCK_SAMPLES > 8
push {r4-r11}
add r12,r0,#(AUDIO_BLOCK_SAMPLES*2) add r12,r0,#(AUDIO_BLOCK_SAMPLES*2)
.align 2 .align 2
.loopL: .loopL:


.irp offset, 1,2

//Load 4 words //Load 4 words
ldmia r1!, {r5,r7,r9,r11} //1+4 ldmia r1!, {r5,r7,r9,r11} //1+4


//Write 8 Words //Write 8 Words
stmia r0!, {r3,r4,r5,r6,r7,r8,r9,r10} //1+8 stmia r0!, {r3,r4,r5,r6,r7,r8,r9,r10} //1+8


.endr

cmp r12, r0 cmp r12, r0
bne .loopL bne .loopL


pop {r4-r11} pop {r4-r11}
#elif AUDIO_BLOCK_SAMPLES == 8
push {r4-r7}
ldmia r1!, {r5,r7}

pkhbt r3,r5,r2
pkhtb r4,r2,r5,ASR #16

pkhbt r5,r7,r2 //1
pkhtb r6,r2,r7,ASR #16

stmia r0!, {r3,r4,r5,r6}
pop {r4-r7}
#endif
BX lr BX lr


/* void memcpy_tointerleaveR(short *dst, short *srcR); */ /* void memcpy_tointerleaveR(short *dst, short *srcR); */
.global memcpy_tointerleaveR .global memcpy_tointerleaveR
.thumb_func .thumb_func
@ r0: dst @ r0: dst
@ r1: srcR @ r1: srcR


push {r4-r11}
mov r2, #0 mov r2, #0
#if AUDIO_BLOCK_SAMPLES > 8
push {r4-r11}
add r12,r0,#(AUDIO_BLOCK_SAMPLES*2) add r12,r0,#(AUDIO_BLOCK_SAMPLES*2)
.align 2 .align 2
.loopR: .loopR:


.irp offset, 1,2

//Load 4 words //Load 4 words
ldmia r1!, {r5,r7,r9,r11} ldmia r1!, {r5,r7,r9,r11}


//Write 8 Words //Write 8 Words
stmia r0!, {r3,r4,r5,r6,r7,r8,r9,r10} stmia r0!, {r3,r4,r5,r6,r7,r8,r9,r10}


.endr

cmp r12, r0 cmp r12, r0
bne .loopR bne .loopR


pop {r4-r11} pop {r4-r11}
#elif AUDIO_BLOCK_SAMPLES == 8
push {r4-r7}
ldmia r1!, {r5,r7}

pkhbt r3,r2,r5,LSL #16
pkhtb r4,r5,r2

pkhbt r5,r2,r7,LSL #16
pkhtb r6,r7,r2
stmia r0!, {r3,r4,r5,r6}

pop {r4-r7}

#endif
BX lr BX lr




.align 2 .align 2
.loopQuad: .loopQuad:


.irp offset, 1,2

ldr r5, [r1],4 ldr r5, [r1],4
ldr r6, [r3],4 ldr r6, [r3],4
pkhbt r7,r5,r6,LSL #16 pkhbt r7,r5,r6,LSL #16


stmia r0!, {r7-r10} stmia r0!, {r7-r10}


.endr

cmp r11, r0 cmp r11, r0
bne .loopQuad bne .loopQuad


pop {r4-r11} pop {r4-r11}

BX lr BX lr


.END .END


#endif #endif

Loading…
Cancel
Save