Fun Strlen retrieving 4 bytes at once.

May 11, 2008
21,879
1,327
126
Since i am a mnemonic-muncher, an assembly geek and an optimizing nerd...
I decided to write a strlen version that is optimized for the 32 bit ARM7TDMI core that i am using. I do have to note that i have not tested it yet.

I wrote 2 strlen versions in C :
Nr1 is the standard version which retrieves 1 byte at a time from memory.
Nr2 is the 32bit wide version that retreives a 32bit word at once and then shifts that 32 bit word around on the core to check at byte size if the "NULL" = 0x00 is present.

Since i do everything on the core, i do not have to worry about wait states and since i do not have as much memory accesses, the memory controller does not have to do as much arbitration between the dma controller and the core if a collision would occur. The StrLen32 version does need a cast from char to >> value = Strlen32((uint32_t *)sz_string);

Here is the C code :

Code:
// 1 byte at a time version.
uint32_t StrLen(char *uc_pntr)
{
  uint32_t	ui_cntr = 0;

  while(*uc_pntr++ != 0x00)
  {
    ui_cntr++;
    if(ui_cntr > 1024)
    {
      return 0;
    }
  }	
  return (ui_cntr);
}

Code:
// 4 bytes at a time version. 
uint32_t StrLen32(uint32_t *ui_pntr)
{
  uint32_t	ui_cntr;
  uint32_t	ui_temp;
  uint32_t	ui_temp2;
	
  ui_cntr = 0;
	
  while(ui_cntr < (128 << 3))
  {
    ui_temp = *ui_pntr++;
	
    ui_cntr++;
    ui_temp2 = ui_temp >> 24;
    if((ui_temp2 & 0xFF) == 0)
    {
      return (ui_cntr);
    }	
    ui_cntr++;
    ui_temp2 = ui_temp >> 16;
    if((ui_temp2 & 0xFF) == 0)
    {
      return (ui_cntr);
    }	
    ui_cntr++;
    ui_temp2 = ui_temp >> 8;
    if((ui_temp2 & 0xFF) == 0)
    {
      return (ui_cntr);
    }	

    ui_cntr++;
    ui_temp2 = ui_temp;
    if((ui_temp2 & 0xFF) == 0)
    {
      return (ui_cntr);
    }	
  }
  return (ui_cntr);
}

Here is the assembly result after compilation with thew GNU GCC compiler for the arm optimization is -O2 :

Code:
// standard strlen unction that retreives 1 byte at once.  
//uint32_t StrLen(uint32_t *ui_pntr)

    mov r3, #1024      // R3 = 1024.
    add r3, r3, #1     // Increase with 1. 
    mov r2, #0         // R2 = 0.
    b LB               // Jump to LB.
LA: add r2, r2, #1     // R2++. 
    cmp r2, r3         // R2 - R3.
    beq .LC            // if R2 == R3 jump to LC.
LB: ldrb  r1, [r0, r2] // R1 = memory address R0 points to + R2.
    cmp r1, #0         // R1  - 0.
    bne .LA            // if Z (zeroflag)) is not set, jump to LA.  
LC: mov r0, r2         // Return amount of counted characters in R2. R0 holds return value. 
    bx  lr             // return.

Code:
// My strlen function that retreives 4 bytes at once. This means 1/4 amount of memory accesses. 
//uint32_t StrLen32(uint32_t *ui_pntr)

    mov r3, #0            // R3 = 0.
    ldr r1, [r0, r3]      // R1 = memory address R0 points to + R3. (R3 = 0)  
LA: movs  r2, r1, lsr #24 // update status flags (S - suffix)  R2 = R1 shifted 24 times.
                          // If R2 is 0x00, Z is set. 
    str r4, [sp, #-4]!    //  push r4.  
    mov ip, r1, lsr #16   // shift  value 16 times and store in ip( = R12).
    mov r4, r1, lsr #8    // Shift value 8 times and store in R4.
    add r2, r3, #1        //  R2 = R3 + 1. 
    beq LB                // Jump if Z is set.  (if Z is set it means 0x00 is found.)
    tst ip, #255          // Do a test (ip & 255).  
    add r2, r3, #2        // R2 = R3 + 2. 
    beq LB                // Jump if Z is set.  (if Z is set it means 0x00 is found.)
    tst r4, #255          // Do a test (R4 & 255).
    add r2, r3, #3        // R2 = R3 + 3.
    beq LB                // Jump if Z is set.  (if Z is set it means 0x00 is found.)
    tst r1, #255          // Do a test (R1 & 255).
    add r2, r3, #4        // R2 = R3 + 4. 
    beq LB                // Jump if Z is set. (if Z is set it means 0x00 is found.)  
    cmp r2, #1024         // compare r2 with 1024.  
    beq LB                // Jump if Z is set. (if Z is set it means 0x00 is found.)
    mov r3, r2            // Store R2 inside R3. 
 
    ldr r1, [r0, r3]      // R1 = memory address R0 points to + R3. (R3 = 1) 
    bne LA  

LB: mov r0, r2            // Return amount of counted characters in R2. R0 holds return value. 
    ldmfd sp!, {r4}       // pop R4.
    bx  lr                // Return

Although the strlen32 version that retrieves a word at a time seems larger, it is only the case because i unrolled it a bit in the C code. It is larger but much faster because of less branches to take and less memory access is required.

Any critical thoughts or remarks because i might have overlooked something ?


EDIT:
I just thought of 1 limitation. The string needs to be on a word boundary in memory to avoid data abort exceptions. :(
That is a problem :(.
 
Last edited:
May 11, 2008
21,879
1,327
126
I do not think it works even on a 4 byte boundary.

I optimized it by hand and this should work : :)

Code:
// My hand optimized strlen function that retrieves 4 bytes at once. This means 1/4 amount of memory accesses. 
//It searches through strings with a maximum length of 1024 bytes.
//uint32_t StrLen32(uint32_t *ui_pntr)

    mov r3, #0            // R3 = 0.
    str r4, [sp, #-4]!    //  push r4.  

LA: ldr r1, [r0, r3]      // R1 = memory address R0 points to + R3. (R3 = 0,4,8,12 etcetera..)
    movs  r2, r1, lsr #24 // update status flags (S - suffix)  R2 = R1 shifted 24 times.
                          // If R2 is 0x00, Z is set. 
    mov ip, r1, lsr #16   // shift  value 16 times and store in ip( = R12).
    mov r4, r1, lsr #8    // Shift value 8 times and store in R4.
    add r3, r3, #1        // R3++. 
    beq LB                // Jump if Z is set.  (if Z is set it means 0x00 is found.)
    tst ip, #255          // Do a test (ip & 255).  
    add r3, r3, #1        // R3++. 
    beq LB                // Jump if Z is set.  (if Z is set it means 0x00 is found.)
    tst r4, #255          // Do a test (R4 & 255).
    add r3, r3, #1        // R3++.
    beq LB                // Jump if Z is set.  (if Z is set it means 0x00 is found.)
    tst r1, #255          // Do a test (R1 & 255).
    add r3, r3, #1        // R3++. 
    beq LB                // Jump if Z is set. (if Z is set it means 0x00 is found.)  
    cmp R3, #1024        // compare R3 with 1024.  
    beq LB                // Jump if Z is set. (if Z is set it means 1024 is found.)
    bne LA  

LB: mov r0, r3            // Return amount of counted characters in R3. R0 holds return value. 
    ldmfd sp!, {r4}       // pop R4.
    bx  lr                // Return

EDIT:
I should note that the ARM has a barrel shifter that can be used with the second operand in some instructions. This means that a value can be shifted and if i remember correctly it only takes 1 cpu cycle more.
 
Last edited:

degibson

Golden Member
Mar 21, 2008
1,389
0
0
Be careful of the generality of this optimization. You don't know the alignment of your input, and if your ARM has virtual memory you could run off of the end of a page that your per-byte version wouldn't.

And, depending again on your ARM, an unaligned input might perform worse than the byte version.
 
May 11, 2008
21,879
1,327
126
Indeed these issues can cause serious problems. But it is for an embedded system that at the moment does not have cache. And i had an awkward solution just before i fell a sleep. Since the controller i use has more memory than i use anyway, i wanted to make a sort of mini calloc version together with a mini free. Very simplified but i can make sure that the requested memory always starts on a word boundary by use of an option passed with the arguments. Then whenever a lot of strings needs to be determined, i can use the fast version without any worries because i request a word aligned block of memory. I already changed my startup assembler scripts a few weeks ago to do a double ram test(instead of a single ram test) and fill the ram with a special value prior before the controller and memory is setup to run c compiled code. I have also written a small c function called FindFreeRam that determines the amount of unused ram memory. When i compared it with the *.map file from GNU GCC, it works great because the result of FindFreeRam is always the same as *.map file.
Of course this solution only works with received strings and not with strings placed as constants in the flash memory. I have made sure that my uart rx buffer and my uart tx buffer both are on a word aligned part of ram memory.

Although i can solve the alignment in flash problem too by using a separate c file that produces a separate *.text section and then i notify the linker to align the *.text on a word boundary. The only catch is that all const char strings have to be aligned in the c file prior before compilation. That means padding with 0x00 until a word boundary is reached. I have enough solutions. ^_^


There is one more thing i have to honestly mention, i changed the optimization level to -O1 and GCC created other less optimized code with more loops. And this code was easier to understand.
With respect to the -O2 version, i think the list file that i received with the assembler code was not finished with respect to the end result. I have serious thoughts that the code that i found and have posted is an intermediate result and not the finished version. That may explain why it does not seem to work. Because i do not believe GNU GCC would produce not working code. Everything i have written so far works out of the box and therefore i must conclude that it is something i am not aware of yet and made the wrong conclusion.
 
Last edited:
May 11, 2008
21,879
1,327
126
I should also note that it is quite easy to create a functionality in my W-ARM program where it would search through the special string.c file and look for all const char strings. When it found them, it can automatically sort them to get the most efficient use of memory with the least amount of padding.
It seems the gcc compiler supports even attributes that force alignment removing the option of padding and the possibility of GCC removing excess NULL values found in string...
After all this, GCC can do the magic it is very good at.
 
May 11, 2008
21,879
1,327
126
Or i can create a functionality in W-ARM that convert strings to arrays with hex numbers prior before the compilation starts. Then i do not need to worry about how the GCC compiler treats the strings. That is the most easy way. Then i can optimize for alignment and afterwards add padding by use of 0x00 (NULL). After compilation, everything is word aligned with the minimal amount of lost memory in Flash and RAM. Then i have optimized from all perspectives. Maximum speed because of optimized code and word alignment of data. Minimal loss of free memory because of alignment and optimized placement of string data.

This is the best way IMHO. If i figure out how to add the needed casts. :hmm:
 
May 11, 2008
21,879
1,327
126
I am in luck. I just was looking around in the map file at the .rodata section.
I do not have to do anything. :D

The const char strings are already aligned automatically on a word boundary (0,4,8,C,10,14,18,1C,20....). No matter what size the string is.

A snippet of the map file :
sz_test is made up of nine characters + one NULL character. As you can see it is still a word boundary.

Code:
.rodata         0x00106030       0xac
 .rodata        0x00106030       0x9c ./src/main.o
                0x00106030                sz_bl_startup
                0x00106064                sz_bl_builddate
                0x00106084                sz_os_mem_init
                0x001060ac                sz_test
                0x001060b8                sz_bl_version
 .rodata        0x001060cc       0x10 ./src/semi_stdio.o
I think this is because of the align(4) instruction in the linker file.

If i want to be absolutely sure i can also examine the memory address of the string i want to determine the length from. If it is not word aligned, use the 8bit byte at a time version or return with an error code. If it is word aligned use the 32bit word at a time version. Now that i have enough solutions, it is time for decisions. :)


EDIT:
Here is a snippet from the linker file :

Code:
	.fastcode :	/* All the instruction code in ROM copied to RAM are stored in ROM here */
		{
		__fastcode_load = LOADADDR (.fastcode);
		__fastcode_start = .;
		*isr_asm.o (.text)	/* RAM vectors and assembler parts of the interrupt routines are placed in the isr_asm.s file*/
		. = ALIGN (4);
		*isr.o (.text)			/* All interrupt service routine code is placed in the isr.c file*/
		. = ALIGN (4);
		*fastexec.o (.text)	/* All fast executing code is placed in the fastexec.c file */
		. = ALIGN (4);
		__fastcode_end = .;
		} >RAM AT>ROM

	.text :	/* All the instruction code not copied to RAM are stored in ROM here */
		{
		*commandcontrol.o (.text)
		*inithardware.o (.text)
		*isr_ctrl.o (.text)
		*low_level_init.o (.text)
		*main.o (.text)
		*os_functions.o (.text)
		*sam7s_hardware.o (.text)
		*semi_stdio.o (.text)
		. = ALIGN (4);
		} >ROM
 
Last edited:

Cogman

Lifer
Sep 19, 2000
10,284
138
106
Just a note,

x86 supports a lot of string manipulating instructions. Even though you are taking in multiple bytes in at a time, you are probably still going slower than what can be done with the x86 arch.

Check out the rep modifier http://www.fermi.mn.it/linux/quarta/x86/rep.htm . It only works with certain functions, but where applicable, it is always faster than rolling your own.

In other words, make sure you are surveying the instructions of your Arch before trying to optimize. Quite often, you can't do better than standard library stuff because they have done just that.
 

degibson

Golden Member
Mar 21, 2008
1,389
0
0
It's good that you've thoroughly investigated what your toolchain does on the alignment front; ARM's microarchitecture is optimized for aligned accesses, so this makes sense. But don't forget that a programmer (you) can easily pass a non-aligned pointer to your strlen(), as well.

Code:
const char* FOO = "Foo";  // aligned
char foo[4];
strncpy(foo, FOO, 4);  // on stack, possibly not aligned
int len1 = strlen(foo);
int len2 = strlen(&foo[1]);  // indexed string, possibly not aligned

This re-opens the door for unaligned accesses crossing page boundaries. You haven't mentioned yet if your embedded system uses paging. If it does, be careful.
 
May 11, 2008
21,879
1,327
126
It is ARM architecture. I am not using x86. If that was the case i prefer to use indeed the string manipulation instructions the architecture has to offer. Perhaps newer ARM architectures also support these instructions...

On my embedded system most strings are really constants such as .rodata.
This is much more handy to do. Most of the functions i made that actually do something with strings automatically align on a word boundary. Because the (hidden) buffers always start on a word boundary. For example i have a PrintString function that is similar in functionality although limited as printf (hence the different name to avoid confusion). This function prints to a buffer in memory that is word aligned. If i would need to know the length of the complete string it would not be an issue. My receive buffer is also word aligned but here a problem can indeed arise because the received string itself may not be aligned. Thus i have chosen for the method of memory address checking and returning with an error in StrLen32 if not word aligned. Then i can fall back to the normal StrLen on failure. I could add the complete functionality within StrLen itself but i prefer to keep functions as simple and self explaining as possible. It is even possible to create a combination function which uses StrLen until a word alignment is reached and then switches to StrLen32 to do word aligned searching.

Of course with memory protection, i would need to add a trick. One example can be to give such (part of the OS)functions a special tag. Then on a memory access violation it is possible to see what function caused this and if it is StrLen32 there is no problem restoring the previous state.

Although i do think that when one starts to think of a solution for everything, the solution will be slower then the original optimization was meant for...

Forcing word alignment (and StrLen32 checking for word aligned access ) will do for me. :)

The more modern ARM architectures do support non aligned access but what i understand of it is that multiple accesses are made to compensate for the non alignment. For now i am happy with my solution. Next step is to figure out how to call assembly functions from within C while respecting the calling convention. I do know how to call C functions from within assembly. But not the other way around and with passing parameters. The ARM has a SWI instruction that i could use for that but i do not want to do that.
 
Last edited:

Schmide

Diamond Member
Mar 7, 2002
5,693
934
126
I haven't worked in non x86 asm since college. After looking up a few instructions, wouldn't it be better just to mask and and rather than shift and add?

My untested code.

Code:
//uint32_t StrLen32(uint32_t *ui_pntr)

     mov r3, #0            // R3 = 0.
     str r4, [sp, #-4]!    //  push r4.  

LA:  ldr r1, [r0, r3]         // R1 = memory address R0 points to + R3. (R3 = 0,4,8,12 etcetera..)
     teq r1, 0x000000ff     // test for bits in the low byte
     beq LBE                   // jump out (no extra count add)
     teq r1, 0x0000ff00     // test for bits in the second byte
     beq LB1                      // jump out 1 extra byte
     teq r1, 0x00ff0000     // test for bits in the third byte
     beq LB2                 // jump out 2 extra bytes
     teq r1, 0xff000000   // test for bits in the fourth byte
     beq LB3                 // jump out 3 extra bytes
     add r3, #4             // get to the next dword.
     cmp r3, #1024       // compare R3 with 1024.  
     bne LA
     bal LBE
LB3: add r3, #1                 // add in the extra bytes is there an inc?
LB2: add r3, #1        
LB1: add r3, #1        
LBE: mov r0, r3                 // save the count for return
     ldmfd sp!, {r4}             // pop R4.
     bx  lr                            // Return
 
Last edited:
May 11, 2008
21,879
1,327
126
That can indeed be an option. The limitations of the TST instruction are not quite clear to me. But if i am not mistaken, it is possible to do the shift for the second operand. I need to do the most significant byte first but that is just shifting.

After further examining, (but not tested yet) I think you are great. :thumbsup:
I modified the code a tiny bit and this should work(have not tested it yet).
With your optimization i saved registers and the need to push and to pop R4. i need less registers and instructions. Thank you. :)
I have added the word alignment check as well. It will return 0 on fail.

Code:
//uint32_t StrLen32(uint32_t *ui_pntr)
    mov r2, #0            // R2 = 0.
                          // We have to do a word alignment check.        
    tst r0, #3            // test if BIT0 and BIT1 are cleared. 
    bne LBE               // if Z is set then BIT0 and BIT1 are cleared. We can go to work.
    
LA: ldr r1, [r0, r2]      // R1 = memory address R0 points to + R2. (R2 = 0,4,8,12 etcetera..)
    teq r1, #255, lsl #24  // test for bits in the low byte
    beq LBE               // jump out (no extra count add)
    teq r1, #255, lsl #16  // test for bits in the second byte
    beq LB1               // jump out 1 extra byte
    teq r1, #255, lsl #8   // test for bits in the third byte
    beq LB2               // jump out 2 extra bytes
    teq r1, #255          // test for bits in the fourth byte
    beq LB3               // jump out 3 extra bytes
    add r2, #4            // get to the next dword.
    cmp r2, #1024         // compare R2 with 1024.  
    bne LA
    bal LBE
LB3: add r2, #1           // add in the extra bytes is there an inc?
LB2: add r2, #1        
LB1: add r2, #1        
LBE: mov r0, r2           // save the count for return
     bx  lr

I did test the original StrLen32 code generated by GNU GCC that i have posted above with -O1. It works ! :)
I will test the code with the enhancements you proposed.
 
Last edited:
May 11, 2008
21,879
1,327
126
It seems TST and TEQ indeed expect a 32bit constant (which is internally created by shifting :eek:) The LSL is not allowed. But doing it as you described works. I changed the TEQ to TST again.
This compiles (assembles) and should work... :)

Code:
//uint32_t StrLen32(uint32_t *ui_pntr)
    mov r2, #0            // R2 = 0.
                          // We have to do a word alignment check.        
    tst r0, #3            // test if BIT0 and BIT1 are cleared. 
    bne LBE               // if Z is set then BIT0 and BIT1 are cleared. 
    
LA: ldr r1, [r0, r2]     // R1 = memory address R0 points to + R2. (R2 = 0,4,8,12 etcetera..)
    tst r1, #0xFF000000   // test for bits in the low byte
    beq LBE               // jump out (no extra count add)
    tst r1, #0x00FF0000   // test for bits in the second byte
    beq LB1               // jump out 1 extra byte
    tst r1, #0x0000FF00   // test for bits in the third byte
    beq LB2               // jump out 2 extra bytes
    tst r1, #0x000000FF  // test for bits in the fourth byte
    beq LB3               // jump out 3 extra bytes
    add r2, #4            // get to the next dword.
    cmp r2, #1024         // compare R2 with 1024.  
    bne LA
    bal LBE
LB3: add r2, #1           // add in the extra bytes is there an inc?
LB2: add r2, #1        
LB1: add r2, #1        
LBE: mov r0, r2           // save the count for return
     bx  lr
 
Last edited:

Schmide

Diamond Member
Mar 7, 2002
5,693
934
126
It seems TST and TEQ indeed expect a 32bit constant (which is internally created by shifting :eek:) The LSL is not allowed. But doing it as you described works. I changed the TEQ to TST again.
This compiles (assembles) and should work... :)

Ha my google ref got it wrong lol. (here) So the following reply is wrong and you are right.

tst is a bitwise exclusive or (^)
teq is a bitwise and (&)

r1 ^ 0x000000ff will be non zero for all values except r1=0x000000ff and other bits in the dword will effect the flags.

while

r1 & 0x000000ff will be non zero for all values except r1=0xXXXXXX00 (00 in the byte) and all the other bits in the dword will be logically masked out.

I'm pretty sure you need teq.

It probably doesn't matter but

Code:
    tst r1, #0xFF         // test for bits in the fourth byte

shorting the opcode (0xff vs 0x000000ff) could lead to sign extension or other anomalies especially on byte addressing machines.
 
Last edited:
May 11, 2008
21,879
1,327
126
To be sure i will change the code to 0x000000FF.

It is amazing how much information is present on the internet.
I have the arm technical manual, the arm cookbook and the optimizing manual for the ARM downloaded as pdf. These are all free to download.
I carry them around on my phone even. Yeah :cool:

But here i found all relevant information although my inexperience sometimes cause some confusion. I am still looking for a proper ARM simulator with a nice graphical front end.
 
Last edited:
May 11, 2008
21,879
1,327
126
I reverse engineered the hand optimized code from assembler to c. And i think i have succeeded.
I was thinking how to get the c compiler to use the tst function. I had an idea and it is the right one. Now my c-fu has improved as well.

Here is the code in C.
Code:
uint32_t TestStrLen32(uint32_t *ui_pntr)
{
  uint32_t  ui_cntr;
  uint32_t  ui_temp;

  
  if(((uint32_t)ui_pntr & 0x00000003) != 0)
  {
    return 0;
  }
  
  ui_cntr = 0;
  
  while(ui_cntr < 1024)
  {
    ui_temp = *ui_pntr++;
  
    if((ui_temp &0xFF000000) == 0)
    {
      ui_cntr++;
      return(ui_cntr);
    }
    if((ui_temp &0x00FF0000) == 0)
    {
      ui_cntr+=2;
      return(ui_cntr);
    }
    if((ui_temp &0x0000FF00) == 0)
    {
      ui_cntr+=3;
      return(ui_cntr);
    }
    if((ui_temp &0x000000FF) == 0)
    {
      ui_cntr+=4;
      return(ui_cntr);
    }
    ui_cntr+=4;
  }
  return (ui_cntr);
}

And this is the result in assembler by use of GNU GCC with optimization -O2.

Code:
      ands  r2, r0, #3     // Load with 3 to check if BIT0 and BIT1 are both 0.
      movne r2, #0         // Load R2 with 0.
      beq .L17             // Jump to L17 if Z is set (result is 0).
      b .L12               // Jump to L12.
    
.L13  tst r3, #16711680    // R3 & 0x00FF0000.
      beq .L20             // Jump to L12 if Z is set (result is 0).
      tst r3, #65280       // R3 & 0x0000FF00.
      beq .L21             // Jump to L12 if Z is set (result is 0).
      tst r3, #255         // R3 & 0x000000FF.
      beq .L22             // Jump to L12 if Z is set (result is 0).
      add r2, r2, #4       // R2 = R2 + 4. 
      cmp r2, #1024        // compare (R2 - 1024).
      beq .L12             // Jumpt to L12 if Z is set (result is 0). 

.L17: ldr r3, [r0, r2]     // Load in R3 the value of memory address in R0 + R2.  
      tst r3, #-16777216   // R3 & 0xFF000000.
      bne .L13             // Jump to L13 if Z is not set (result  is not equal to 0). 
  
      add r2, r2, #1       // R2++.  
.L12: mov r0, r2           // R0 = R2.
      bx  lr               // Return.
      
.L20  add r2, r2, #2       // R2 +=2. 
      mov r0, r2           // R0 = R2.
      bx  lr               // Return.
      
.L21  add r2, r2, #3       // R2 +=3. 
      mov r0, r2           // R0 = R2.
      bx  lr               // Return.

.L22  add r2, r2, #4       // R2 +=4. 
      mov r0, r2           // R0 = R2.
      bx  lr               // Return.

It is almost an exact match to the original optimized code you proposed with addition of the world alignment check.

I am happy. I figured out a piece of the puzzle about how to manipulate the compiler to extract maximum performance with hand tweaking in assembly.


EDIT:
Yahooo. I have fixed it. ( I hope...)
 
Last edited:
May 11, 2008
21,879
1,327
126
I have now an almost exact match :

C code :
Code:
uint32_t TestStrLen32V2(uint32_t *ui_pntr)
{
  uint32_t  ui_cntr;
  uint32_t  ui_temp;

  
    if(((uint32_t)ui_pntr & 0x00000003) != 0)
    {
      return 0;
    }
  
    ui_cntr = 0;
  
    while(ui_cntr < 1024)
    {
      ui_temp = *ui_pntr++;
  
      if((ui_temp & 0xFF000000) == 0)
      {
        goto L1;
      }
      if((ui_temp & 0x00FF0000) == 0)
      {
        goto L2;
      }
      if((ui_temp & 0x0000FF00) == 0)
      {
        goto L3;
      }
      if((ui_temp & 0x000000FF) == 0)
      {
        goto L4;
      }
      ui_cntr+=4;
    }

L4: ui_cntr++;
L3: ui_cntr++;
L2: ui_cntr++;
L1: ui_cntr++;

  return (ui_cntr);
}

Assebmler result by GNU GCC optimization = -O2.

Code:
      ands r2, r0, #3      // Load with 3 to check if BIT0 and BIT1 are both 0.
                           // If this is the case, then R2 = 0.
      movne r0, #0         // Load R0 with 0.
      beq .L30             // Jump to L30 if Z is set (result is 0).
      bx  lr               // Return.
    
.L33  tst r3, #16711680    // R3 & 0x00FF0000.  
      beq .L27             // Jump to L27 if Z is set (result is 0).
      tst r3, #65280       // R3 & 0x0000FF00. 
      beq .L28             // Jump to L28 if Z is set (result is 0). 
      tst r3, #255         // R3 & 0x000000FF.
      beq .L29             // Jump to L29 if Z is set (result is 0). 
 
      add r2, r2, #4       // R2+=4;
      cmp r2, #1024        // compare (R2 - 1024).          
      beq .L29             // Jump to L29 if Z is set (result is 0). 
 
.L30: ldr r3, [r0, r2]     // Load in R3 the value of memory address pointed at by use of R0 + R2.
      tst r3, #-16777216   // R3 & 0xFF000000.
      bne .L33             // Jump to L33 if Z is clear (result is not equal to 0). 
.L26: add r0, r2, #1       // R0 = R2 + 1.
      bx  lr               // Return.
.L29: add r2, r2, #1       // R2++.  
.L28: add r2, r2, #1       // R2++.
.L27: add r2, r2, #1       // R2++.
      add r0, r2, #1       // R0 = R2 + 1.
      bx  lr               // Return.

Now i no longer need to create an assembler version.
I have the desired optimized code in C. :thumbsup:
 

Schmide

Diamond Member
Mar 7, 2002
5,693
934
126
The logic isn't correct on that one you're actually counting an extra byte. If the first byte in a string is null it is zero length. Your code jumps out and counts that as string length 1.

Modifying the code.

Code:
uint32_t TestStrLen32V2(uint32_t *ui_pntr)
{
  uint32_t  ui_cntr;

  
    if(((uint32_t)ui_pntr & 0x00000003) != 0)
    {
      return 0;
    }
  
    ui_cntr = 0;
  
    do
    {
      if((*ui_pntr & 0xFF000000) == 0)
      {
        goto L1;
      }
      if((*ui_pntr & 0x00FF0000) == 0)
      {
        goto L2;
      }
      if((*ui_pntr & 0x0000FF00) == 0)
      {
        goto L3;
      }
      if((*ui_pntr & 0x000000FF) == 0)
      {
        goto L4;
      }
      ui_pntr++;
      ui_cntr+=4;
    } while (ui_cntr < 1024)
    gogo L1;

L4: ui_cntr++;
L3: ui_cntr++;
L2: ui_cntr++;
L1: return (ui_cntr);
}
 
May 11, 2008
21,879
1,327
126
You are right. I found that out too.

This is my latest version, i have not plucked the code from the assembler listing yet. I did wonder about one thing : I used a variable to store the data in a register instead of checking with the pointer directly. I had this cautious nature which is not needed with the ARM after thinking about it. Because it is a load & store architecture that has no instructions to directly deal with data in memory, data in memory must be retrieved and placed into a register first. After that the core can work it's magic on the data. Thus indeed i do not have to use the variable ui_temp because the compiler needs to copy the value in memory to a register anyway. It makes sense.

Code:
 uint32_t StrLen32(uint32_t *ui_pntr)
{
  uint32_t  ui_cntr;
  uint32_t  ui_temp;

  
    if(((uint32_t)ui_pntr & 0x00000003) != 0)
    {
      return 0;
    }
  
    ui_cntr = 0;
  
    while(ui_cntr < 1024)
    {
      ui_temp = *ui_pntr++;
  
      if((ui_temp &0xFF000000) == 0)
      {
        goto L1;
      }
      if((ui_temp &0x00FF0000) == 0)
      {
        goto L2;
      }
      if((ui_temp &0x0000FF00) == 0)
      {
        goto L3;
      }
      if((ui_temp &0x000000FF) == 0)
      {
        goto L4;
      }
      ui_cntr+=4;
    }
    goto LE;

L4: ui_cntr++;
L3: ui_cntr++;
L2: ui_cntr++;
L1: ui_cntr++;

LE: return (ui_cntr);
 

Schmide

Diamond Member
Mar 7, 2002
5,693
934
126
Maybe you posted the wrong code. Send the above code a pointer to a null byte and you will get a return of 1.
 
May 11, 2008
21,879
1,327
126
Well, I removed ui_temp.
This is the result :

Code:
uint32_t StrLen32(uint32_t *ui_pntr)
{
  uint32_t  ui_cntr;
  
    if(((uint32_t)ui_pntr & 0x00000003) != 0)
    {
      return 0;
    }
  
    ui_cntr = 0;
  
    while(ui_cntr < 1024)
    {
      if((*ui_pntr & 0xFF000000) == 0)
      {
        goto L1;
      }
      if((*ui_pntr & 0x00FF0000) == 0)
      {
        goto L2;
      }
      if((*ui_pntr & 0x0000FF00) == 0)
      {
        goto L3;
      }
      if((*ui_pntr & 0x000000FF) == 0)
      {
        goto L4;
      }
      ui_pntr++;
      ui_cntr+=4;
    }
    goto LE;

L4: ui_cntr++;
L3: ui_cntr++;
L2: ui_cntr++;
L1: ui_cntr++;

LE: return (ui_cntr);
}

This is the assembler code. It is more difficult to keep track of but it is very fast and almost ideal. Assembler result by GNU GCC optimization = -O2.
Code:
      ands r2, r0, #3      // Load with 3 to check if BIT0 and BIT1 are both 0.
                           // If this is the case, then R2 = 0.
      movne r2, #0         // Load R2 with 0.
      beq .L17             // Jump to L17 if Z is set (result is 0).
      bx  .L12             // Jump to L12 if Z is cleared (result is not equal to 0). 
       
.L20  tst r3, #16711680    // R3 & 0x00FF0000.  
      beq .L14             // Jump to L14 if Z is set (result is 0).
      tst r3, #65280       // R3 & 0x0000FF00. 
      beq .L15             // Jump to L15 if Z is set (result is 0). 
      tst r3, #255         // R3 & 0x000000FF.
      beq .L16             // Jump to L16 if Z is set (result is 0). 

      add r2, r2, #4       // R2+=4;
      cmp r2, #1024        // compare (R2 - 1024).          
      beq .L12             // Jump to L29 if Z is set (result is 0). 

.L17: ldr r3, [r0, r2]     // Load in R3 the value of memory address pointed at by use of R0 + R2.
      tst r3, #-16777216   // R3 & 0xFF000000.
      bne .L20             // Jump to L20 if Z is clear (result is not equal to 0). 
.L13  add r2, r2, #1       // R2++.
.L12  mov r0, r2            // R0 = R2.
      bx lr                      // Return.
.L16: add r2, r2, #1       // R2++.  
.L15: add r2, r2, #1       // R2++.
.L14: add r2, r2, #1       // R2++.
      b .L13
 
Last edited:
May 11, 2008
21,879
1,327
126
I am falling a sleep so i will have to offer this quick and dirty solution :

Code:
//*************************************************************************************************
// Counts the amount of characters in a "NULL" terminated string.
// Return 0 When string is larger then 1024 characters.
// Returns amount of characters in string.
//
uint32_t StrLen32(uint32_t *ui_pntr)
{
  uint32_t  ui_cntr;
  
    ui_cntr = 0;
    
    if(((uint32_t)ui_pntr & 0x00000003) != 0)
    {
      goto LE;
    }

    if((*ui_pntr & 0xFF000000) == 0)
    {
      goto LE;
    }
  
    while(ui_cntr < 1024)
    {
      if((*ui_pntr & 0xFF000000) == 0)
      {
        goto L1;
      }
      if((*ui_pntr & 0x00FF0000) == 0)
      {
        goto L2;
      }
      if((*ui_pntr & 0x0000FF00) == 0)
      {
        goto L3;
      }
      if((*ui_pntr & 0x000000FF) == 0)
      {
        goto L4;
      }
      ui_pntr++;
      ui_cntr+=4;
    }
    goto LE;

L4: ui_cntr++;
L3: ui_cntr++;
L2: ui_cntr++;
L1: ui_cntr++;
LE: return (ui_cntr);
}
 
May 11, 2008
21,879
1,327
126
ZZZZzzzzZZZZZZzzzz my lazzzzt attemptzzz :

Code:
//*************************************************************************************************
// Counts the amount of characters in a "NULL" terminated string.
// Return 0 When string is larger then 1024 characters.
// Returns amount of characters in string.
//
uint32_t StrLen32(uint32_t *ui_pntr)
{
  uint32_t  ui_cntr;
  
    ui_cntr = 0;
    
    if(((uint32_t)ui_pntr & 0x00000003) != 0)
    {
      goto L1;
    }

    while(ui_cntr < 1024)
    {
      if((*ui_pntr & 0xFF000000) == 0)
      {
        goto L1;
      }
      if((*ui_pntr & 0x00FF0000) == 0)
      {
        goto L2;
      }
      if((*ui_pntr & 0x0000FF00) == 0)
      {
        goto L3;
      }
      if((*ui_pntr & 0x000000FF) == 0)
      {
        goto L4;
      }
      ui_pntr++;
      ui_cntr+=4;
    }
    goto L1;

L4: ui_cntr++;
L3: ui_cntr++;
L2: ui_cntr++;
L1: return (ui_cntr);
}

Time to go to sleep.

I will test and compare it tomorrow. Right now i am in a window of sleep. And i must use it or i have to wait another 4 hours.