- May 11, 2008
- 21,879
- 1,327
- 126
Since i am a mnemonic-muncher, an assembly geek and an optimizing nerd...
I decided to write a strlen version that is optimized for the 32 bit ARM7TDMI core that i am using. I do have to note that i have not tested it yet.
I wrote 2 strlen versions in C :
Nr1 is the standard version which retrieves 1 byte at a time from memory.
Nr2 is the 32bit wide version that retreives a 32bit word at once and then shifts that 32 bit word around on the core to check at byte size if the "NULL" = 0x00 is present.
Since i do everything on the core, i do not have to worry about wait states and since i do not have as much memory accesses, the memory controller does not have to do as much arbitration between the dma controller and the core if a collision would occur. The StrLen32 version does need a cast from char to >> value = Strlen32((uint32_t *)sz_string);
Here is the C code :
Here is the assembly result after compilation with thew GNU GCC compiler for the arm optimization is -O2 :
Although the strlen32 version that retrieves a word at a time seems larger, it is only the case because i unrolled it a bit in the C code. It is larger but much faster because of less branches to take and less memory access is required.
Any critical thoughts or remarks because i might have overlooked something ?
EDIT:
I just thought of 1 limitation. The string needs to be on a word boundary in memory to avoid data abort exceptions.
That is a problem
.
I decided to write a strlen version that is optimized for the 32 bit ARM7TDMI core that i am using. I do have to note that i have not tested it yet.
I wrote 2 strlen versions in C :
Nr1 is the standard version which retrieves 1 byte at a time from memory.
Nr2 is the 32bit wide version that retreives a 32bit word at once and then shifts that 32 bit word around on the core to check at byte size if the "NULL" = 0x00 is present.
Since i do everything on the core, i do not have to worry about wait states and since i do not have as much memory accesses, the memory controller does not have to do as much arbitration between the dma controller and the core if a collision would occur. The StrLen32 version does need a cast from char to >> value = Strlen32((uint32_t *)sz_string);
Here is the C code :
Code:
// 1 byte at a time version.
uint32_t StrLen(char *uc_pntr)
{
uint32_t ui_cntr = 0;
while(*uc_pntr++ != 0x00)
{
ui_cntr++;
if(ui_cntr > 1024)
{
return 0;
}
}
return (ui_cntr);
}
Code:
// 4 bytes at a time version.
uint32_t StrLen32(uint32_t *ui_pntr)
{
uint32_t ui_cntr;
uint32_t ui_temp;
uint32_t ui_temp2;
ui_cntr = 0;
while(ui_cntr < (128 << 3))
{
ui_temp = *ui_pntr++;
ui_cntr++;
ui_temp2 = ui_temp >> 24;
if((ui_temp2 & 0xFF) == 0)
{
return (ui_cntr);
}
ui_cntr++;
ui_temp2 = ui_temp >> 16;
if((ui_temp2 & 0xFF) == 0)
{
return (ui_cntr);
}
ui_cntr++;
ui_temp2 = ui_temp >> 8;
if((ui_temp2 & 0xFF) == 0)
{
return (ui_cntr);
}
ui_cntr++;
ui_temp2 = ui_temp;
if((ui_temp2 & 0xFF) == 0)
{
return (ui_cntr);
}
}
return (ui_cntr);
}
Here is the assembly result after compilation with thew GNU GCC compiler for the arm optimization is -O2 :
Code:
// standard strlen unction that retreives 1 byte at once.
//uint32_t StrLen(uint32_t *ui_pntr)
mov r3, #1024 // R3 = 1024.
add r3, r3, #1 // Increase with 1.
mov r2, #0 // R2 = 0.
b LB // Jump to LB.
LA: add r2, r2, #1 // R2++.
cmp r2, r3 // R2 - R3.
beq .LC // if R2 == R3 jump to LC.
LB: ldrb r1, [r0, r2] // R1 = memory address R0 points to + R2.
cmp r1, #0 // R1 - 0.
bne .LA // if Z (zeroflag)) is not set, jump to LA.
LC: mov r0, r2 // Return amount of counted characters in R2. R0 holds return value.
bx lr // return.
Code:
// My strlen function that retreives 4 bytes at once. This means 1/4 amount of memory accesses.
//uint32_t StrLen32(uint32_t *ui_pntr)
mov r3, #0 // R3 = 0.
ldr r1, [r0, r3] // R1 = memory address R0 points to + R3. (R3 = 0)
LA: movs r2, r1, lsr #24 // update status flags (S - suffix) R2 = R1 shifted 24 times.
// If R2 is 0x00, Z is set.
str r4, [sp, #-4]! // push r4.
mov ip, r1, lsr #16 // shift value 16 times and store in ip( = R12).
mov r4, r1, lsr #8 // Shift value 8 times and store in R4.
add r2, r3, #1 // R2 = R3 + 1.
beq LB // Jump if Z is set. (if Z is set it means 0x00 is found.)
tst ip, #255 // Do a test (ip & 255).
add r2, r3, #2 // R2 = R3 + 2.
beq LB // Jump if Z is set. (if Z is set it means 0x00 is found.)
tst r4, #255 // Do a test (R4 & 255).
add r2, r3, #3 // R2 = R3 + 3.
beq LB // Jump if Z is set. (if Z is set it means 0x00 is found.)
tst r1, #255 // Do a test (R1 & 255).
add r2, r3, #4 // R2 = R3 + 4.
beq LB // Jump if Z is set. (if Z is set it means 0x00 is found.)
cmp r2, #1024 // compare r2 with 1024.
beq LB // Jump if Z is set. (if Z is set it means 0x00 is found.)
mov r3, r2 // Store R2 inside R3.
ldr r1, [r0, r3] // R1 = memory address R0 points to + R3. (R3 = 1)
bne LA
LB: mov r0, r2 // Return amount of counted characters in R2. R0 holds return value.
ldmfd sp!, {r4} // pop R4.
bx lr // Return
Although the strlen32 version that retrieves a word at a time seems larger, it is only the case because i unrolled it a bit in the C code. It is larger but much faster because of less branches to take and less memory access is required.
Any critical thoughts or remarks because i might have overlooked something ?
EDIT:
I just thought of 1 limitation. The string needs to be on a word boundary in memory to avoid data abort exceptions.
That is a problem
Last edited: