Fun Strlen retrieving 4 bytes at once.

Page 3 - Seeking answers? Join the AnandTech community: where nearly half-a-million members share solutions and discuss the latest tech.

Schmide

Diamond Member
Mar 7, 2002
5,712
978
126
So I modified some code I had to do most all the variations. Sad to say, it's darn hard to beat the standard routines. I came close often within 10%. The strchr* routine does basically the same thing as the intrinsic routine with the exception of using a simd routine to align data and using a bsf (bit scan forward) to finalize the find.

* I walked into the assembly.

Output:
Code:
Checking Output...

strchr
120 110 99 110 105 104 88 0
Intrinsic
120 110 99 110 105 104 88 0
byte
120 110 99 110 105 104 88 0
DWord
120 110 99 110 105 104 88 0
QWord
120 110 99 110 105 104 88 0

Calculating...

strchr 67.1348 ms
Intrinsic 67.601 ms
Byte 317.552 ms
DWord 1051.8 ms
QWord 729.756 ms

Second run (with pauses)
Code:
strchr 62.0654 ms
Intrinsic 69.4543 ms
Byte 338.806 ms
DWord 1124.12 ms
QWord 784.669 ms
charsearch.h:
Code:
// Check windows
#if _WIN32 || _WIN64
#if _WIN64
#define ENVIRONMENT64
#else
#define ENVIRONMENT32
#endif
#endif

// Check GCC
#if __GNUC__
#if __x86_64__ || __ppc64__
#define ENVIRONMENT64
#else
#define ENVIRONMENT32
#endif
#endif

#define MASK(a) ( ( 1 << (a) ) - 1 )
#define MASKCAST(a,b) ( ( ( (a)1 ) << (b) ) - 1 )
#define ONESHIFT(a) ( 1 << (a) )

#ifdef ENVIRONMENT64
#define TOALIGNED32(a,b) ( ((unsigned long)(-(long)reinterpret_cast<long long>(a))) & MASK(b) )
#else
#define TOALIGNED32(a,b) ( ((unsigned long)(-reinterpret_cast<long>(a))) & MASK(b) )
#endif
#define TOALIGNED64(a,b) ( ((unsigned long long)(-reinterpret_cast<long long>(a))) & MASKCAST(unsigned long long, b) )

#define USE_INTRINSIC
#ifdef USE_INTRINSIC
#include <xmmintrin.h>
#define BYTE_TYPE __m128i
#define BYTE_SET(a) _mm_set1_epi8(a)
#define BYTE_EQ(a,b) _mm_cmpeq_epi8(a,b)
#define BYTE_MOVEMASK(a) _mm_movemask_epi8(a)
#define BYTE_SIZE_POW2 4
#endif

template<typename B, typename I, unsigned char L>           // note L must be size of alignment 2=32 3=64 4=128...
B H_FindCharTemplate(void* buffer, B count, unsigned char c)
{
   if (!count)
      return 0;
   void* currentPosition = buffer;
   B alignedCount = count >> L;
   if ((alignedCount >> 1) && (L > 1) ) {
      unsigned long toAligned = TOALIGNED32(currentPosition, L);
      if (toAligned) {
         unsigned char* charBuffer = (unsigned char *)currentPosition;
         unsigned char* charBufferEnd = charBuffer + toAligned;
         do {
            if (c == *charBuffer)
               return (B)((B)charBuffer - (B)currentPosition);
            else
               charBuffer++;
         } while (charBuffer < charBufferEnd);
         currentPosition = (void *)charBuffer;
         count -= toAligned;
      }
      switch (L)
      {
         case 2: {
            alignedCount = count >> L;
            B* alignedBuffer = (B *)currentPosition;
            B* alignedBufferEnd = alignedBuffer + alignedCount;
            do {
               B charCompares;
               unsigned char *chars = (unsigned char *)&charCompares;
               unsigned char *alignedBufferChars = (unsigned char *)alignedBuffer;
               *(chars) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars))) >> 8);
               *(chars + 1) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 1))) >> 8);
               *(chars + 2) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 2))) >> 8);
               *(chars + 3) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 3))) >> 8);
               if (((B)-1) != charCompares) {
                  chars = (unsigned char *)alignedBuffer;
                  while (c != *chars)
                     chars++;
                  return (B)((B)chars - (B)buffer);
               }
            } while (++alignedBuffer < alignedBufferEnd);
            currentPosition = (void *)alignedBuffer;
            count -= alignedCount << L;
         }
         break;
         case 3: {
            alignedCount = count >> L;
            B* alignedBuffer = (B *)currentPosition;
            B* alignedBufferEnd = alignedBuffer + alignedCount;
            do {
               B charCompares;
               unsigned char *chars = (unsigned char *)&charCompares;
               unsigned char *alignedBufferChars = (unsigned char *)alignedBuffer;
               *(chars) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars))) >> 8);
               *(chars + 1) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 1))) >> 8);
               *(chars + 2) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 2))) >> 8);
               *(chars + 3) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 3))) >> 8);
               *(chars + 4) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 4))) >> 8);
               *(chars + 5) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 5))) >> 8);
               *(chars + 6) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 6))) >> 8);
               *(chars + 7) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 7))) >> 8);
               if (((B)-1) != charCompares) {
                  chars = (unsigned char *)alignedBuffer;
                  while (c != *chars)
                     chars++;
                  return (B)((B)chars - (B)buffer);
               }
            } while (++alignedBuffer < alignedBufferEnd);
            currentPosition = (void *)alignedBuffer;
            count -= alignedCount << L;
         }
         break;
         default: {
            alignedCount = count >> L;
            I* alignedBuffer = (I *)currentPosition;
            I* alignedBufferEnd = alignedBuffer + alignedCount;
            do {
               I charCompares = BYTE_SET(c);
               charCompares = BYTE_EQ(charCompares, *alignedBuffer);
               if (BYTE_MOVEMASK(charCompares)) {
                  unsigned char *chars = (unsigned char *)alignedBuffer;
                  while (c != *chars)
                     chars++;
                  return (B)((B)chars - (B)buffer);
               }
            } while (++alignedBuffer < alignedBufferEnd);
            currentPosition = (void *)alignedBuffer;
            count -= alignedCount << L;
         }
      }
   }
   unsigned char* charBuffer = (unsigned char *)currentPosition;
   unsigned char* charBufferEnd = charBuffer + count;
   while (charBuffer < charBufferEnd)
      if (c == *charBuffer)
         return (B)((B)charBuffer - (B)buffer);
      else
         charBuffer++;
   return (B)((B)charBuffer - (B)buffer);
}
charsearch.cpp:
Code:
// charsearch.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include <algorithm>
#include <string>
#include <iostream>
#include <time.h>  
#include <chrono>
#include <thread>

#if defined WIN32
#include <Windows.h>
#endif

#include "charsearch.h"


int main()
{
   unsigned int indexToFind[8] = { 120, 110, 99 , 110, 105 , 104, 88, 64};
   unsigned char buffer[] = ".......................................................................0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\0";
   int loopCount = 1000000;
   int bufferAlignment = 0;
   int bufferSize = strlen((const char *)buffer) - bufferAlignment;

   printf("Checking Output...\n\n");
   printf("strchr\n");
   for (int i = 0; i < 8; i++) {
      unsigned int foundAt = (unsigned int)strchr((const char *)&buffer[bufferAlignment], buffer[indexToFind[i]]) - (unsigned int)&buffer[bufferAlignment];
      printf("%d ", foundAt);
   }
   printf("\nIntrinsic\n");
   for (int i = 0; i < 8; i++) {
      unsigned int foundAt = H_FindCharTemplate<unsigned long long, BYTE_TYPE, BYTE_SIZE_POW2>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
      printf("%d ", foundAt);
   }

   printf("\nbyte\n");
   for (int i = 0; i < 8; i++) {
      unsigned int foundAt = H_FindCharTemplate<unsigned long, BYTE_TYPE, 1>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
      printf("%d ", foundAt);
   }

   printf("\nDWord\n");
   for (int i = 0; i < 8; i++) {
      unsigned int foundAt = H_FindCharTemplate<unsigned long, BYTE_TYPE, 2>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
      printf("%d ", foundAt);
   }

   printf("\nQWord\n");
   for (int i = 0; i < 8; i++) {
      unsigned int foundAt = H_FindCharTemplate<unsigned long long, BYTE_TYPE, 3>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
      printf("%d ", foundAt);
   }

   clock_t t;
   printf("\n\nCalculating...\n\n");
 
   // strchr
   std::this_thread::sleep_for(std::chrono::seconds(1));
   // running twice to normalize caches
   for (int i = 0; i<8; i++)
      for (int j = 0; j<loopCount; j++)
         volatile int found = (unsigned int)strchr((const char *)&buffer[bufferAlignment], buffer[indexToFind[i]]) - (unsigned int)&buffer[bufferAlignment];
   auto start = std::chrono::high_resolution_clock::now();
   for (int i = 0; i<8; i++)
      for (int j = 0; j<loopCount; j++)
         volatile int found = (unsigned int)strchr((const char *)&buffer[bufferAlignment], buffer[indexToFind[i]]) - (unsigned int)&buffer[bufferAlignment];
   auto end = std::chrono::high_resolution_clock::now();
   std::chrono::duration<double, std::milli> elapsed = end - start;
   std::cout << "strchr " << elapsed.count() << " ms\n";

   // Intrinsic
   std::this_thread::sleep_for(std::chrono::seconds(1));
   // running twice to normalize caches
   for (int i = 0; i<8; i++)
      for (int j = 0; j<loopCount; j++)
         volatile int found = H_FindCharTemplate<unsigned long long, BYTE_TYPE, BYTE_SIZE_POW2>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
   start = std::chrono::high_resolution_clock::now();
   for (int i = 0; i<8; i++)
      for (int j = 0; j<loopCount; j++)
         volatile int found = H_FindCharTemplate<unsigned long long, BYTE_TYPE, BYTE_SIZE_POW2>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
   end = std::chrono::high_resolution_clock::now();
   elapsed = end - start;
   std::cout << "Intrinsic " << elapsed.count() << " ms\n";

   // Byte
   std::this_thread::sleep_for(std::chrono::seconds(1));
   start = std::chrono::high_resolution_clock::now();
   for (int i = 0; i<8; i++)
      for (int j = 0; j<loopCount; j++)
         volatile int found = H_FindCharTemplate<unsigned long, BYTE_TYPE, 1>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
   end = std::chrono::high_resolution_clock::now();
   elapsed = end - start;
   std::cout << "Byte " << elapsed.count() << " ms\n";

   // DWord
   std::this_thread::sleep_for(std::chrono::seconds(1));
   start = std::chrono::high_resolution_clock::now();
   for (int i = 0; i<8; i++)
      for (int j = 0; j<loopCount; j++)
         volatile int found = H_FindCharTemplate<unsigned long, BYTE_TYPE, 2>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
   end = std::chrono::high_resolution_clock::now();
   elapsed = end - start;
   std::cout << "DWord " << elapsed.count() << " ms\n";

   // QWord
   std::this_thread::sleep_for(std::chrono::seconds(1));
   start = std::chrono::high_resolution_clock::now();
   for (int i = 0; i<8; i++)
      for (int j = 0; j<loopCount; j++)
         volatile int found = H_FindCharTemplate<unsigned long long, BYTE_TYPE, 3>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
   end = std::chrono::high_resolution_clock::now();
   elapsed = end - start;
   std::cout << "QWord " << elapsed.count() << " ms\n";

   return 0;
}

executable x64 visual studio 15 release.

charsearch.exe
 
Last edited:
May 11, 2008
22,175
1,402
126
Interesting, i am still digesting it to see what happens sequentially.

I wish i had written down all the values i measured with my logic analyzer for the test i did.
On my 48MHz mcu, counting 30 characters takes between 5.2us and 5.5us with strlen32 because i have to take into account the setting and clearing of the port pin as well.
But that is when it is running form flash with 1 waitstates for 2 cycle flash read access.
If i run it from ram with 0 waitstates, (1 cycle access) it would be even faster.
That is a trick i learned to keep the few interrupt routines in ram for faster execution.
And some code that needs to run as fast as possible.
But it comes at a price of using ram for storage of routines.
 
May 11, 2008
22,175
1,402
126
So I modified some code I had to do most all the variations. Sad to say, it's darn hard to beat the standard routines. I came close often within 10%. The strchr* routine does basically the same thing as the intrinsic routine with the exception of using a simd routine to align data and using a bsf (bit scan forward) to finalize the find.

* I walked into the assembly.

Output:
Code:
Checking Output...

strchr
120 110 99 110 105 104 88 0
Intrinsic
120 110 99 110 105 104 88 0
byte
120 110 99 110 105 104 88 0
DWord
120 110 99 110 105 104 88 0
QWord
120 110 99 110 105 104 88 0

Calculating...

strchr 67.1348 ms
Intrinsic 67.601 ms
Byte 317.552 ms
DWord 1051.8 ms
QWord 729.756 ms

Second run (with pauses)
Code:
strchr 62.0654 ms
Intrinsic 69.4543 ms
Byte 338.806 ms
DWord 1124.12 ms
QWord 784.669 ms
charsearch.h:
Code:
// Check windows
#if _WIN32 || _WIN64
#if _WIN64
#define ENVIRONMENT64
#else
#define ENVIRONMENT32
#endif
#endif

// Check GCC
#if __GNUC__
#if __x86_64__ || __ppc64__
#define ENVIRONMENT64
#else
#define ENVIRONMENT32
#endif
#endif

#define MASK(a) ( ( 1 << (a) ) - 1 )
#define MASKCAST(a,b) ( ( ( (a)1 ) << (b) ) - 1 )
#define ONESHIFT(a) ( 1 << (a) )

#ifdef ENVIRONMENT64
#define TOALIGNED32(a,b) ( ((unsigned long)(-(long)reinterpret_cast<long long>(a))) & MASK(b) )
#else
#define TOALIGNED32(a,b) ( ((unsigned long)(-reinterpret_cast<long>(a))) & MASK(b) )
#endif
#define TOALIGNED64(a,b) ( ((unsigned long long)(-reinterpret_cast<long long>(a))) & MASKCAST(unsigned long long, b) )

#define USE_INTRINSIC
#ifdef USE_INTRINSIC
#include <xmmintrin.h>
#define BYTE_TYPE __m128i
#define BYTE_SET(a) _mm_set1_epi8(a)
#define BYTE_EQ(a,b) _mm_cmpeq_epi8(a,b)
#define BYTE_MOVEMASK(a) _mm_movemask_epi8(a)
#define BYTE_SIZE_POW2 4
#endif

template<typename B, typename I, unsigned char L>           // note L must be size of alignment 2=32 3=64 4=128...
B H_FindCharTemplate(void* buffer, B count, unsigned char c)
{
   if (!count)
      return 0;
   void* currentPosition = buffer;
   B alignedCount = count >> L;
   if ((alignedCount >> 1) && (L > 1) ) {
      unsigned long toAligned = TOALIGNED32(currentPosition, L);
      if (toAligned) {
         unsigned char* charBuffer = (unsigned char *)currentPosition;
         unsigned char* charBufferEnd = charBuffer + toAligned;
         do {
            if (c == *charBuffer)
               return (B)((B)charBuffer - (B)currentPosition);
            else
               charBuffer++;
         } while (charBuffer < charBufferEnd);
         currentPosition = (void *)charBuffer;
         count -= toAligned;
      }
      switch (L)
      {
         case 2: {
            alignedCount = count >> L;
            B* alignedBuffer = (B *)currentPosition;
            B* alignedBufferEnd = alignedBuffer + alignedCount;
            do {
               B charCompares;
               unsigned char *chars = (unsigned char *)&charCompares;
               unsigned char *alignedBufferChars = (unsigned char *)alignedBuffer;
               *(chars) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars))) >> 8);
               *(chars + 1) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 1))) >> 8);
               *(chars + 2) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 2))) >> 8);
               *(chars + 3) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 3))) >> 8);
               if (((B)-1) != charCompares) {
                  chars = (unsigned char *)alignedBuffer;
                  while (c != *chars)
                     chars++;
                  return (B)((B)chars - (B)buffer);
               }
            } while (++alignedBuffer < alignedBufferEnd);
            currentPosition = (void *)alignedBuffer;
            count -= alignedCount << L;
         }
         break;
         case 3: {
            alignedCount = count >> L;
            B* alignedBuffer = (B *)currentPosition;
            B* alignedBufferEnd = alignedBuffer + alignedCount;
            do {
               B charCompares;
               unsigned char *chars = (unsigned char *)&charCompares;
               unsigned char *alignedBufferChars = (unsigned char *)alignedBuffer;
               *(chars) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars))) >> 8);
               *(chars + 1) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 1))) >> 8);
               *(chars + 2) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 2))) >> 8);
               *(chars + 3) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 3))) >> 8);
               *(chars + 4) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 4))) >> 8);
               *(chars + 5) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 5))) >> 8);
               *(chars + 6) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 6))) >> 8);
               *(chars + 7) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 7))) >> 8);
               if (((B)-1) != charCompares) {
                  chars = (unsigned char *)alignedBuffer;
                  while (c != *chars)
                     chars++;
                  return (B)((B)chars - (B)buffer);
               }
            } while (++alignedBuffer < alignedBufferEnd);
            currentPosition = (void *)alignedBuffer;
            count -= alignedCount << L;
         }
         break;
         default: {
            alignedCount = count >> L;
            I* alignedBuffer = (I *)currentPosition;
            I* alignedBufferEnd = alignedBuffer + alignedCount;
            do {
               I charCompares = BYTE_SET(c);
               charCompares = BYTE_EQ(charCompares, *alignedBuffer);
               if (BYTE_MOVEMASK(charCompares)) {
                  unsigned char *chars = (unsigned char *)alignedBuffer;
                  while (c != *chars)
                     chars++;
                  return (B)((B)chars - (B)buffer);
               }
            } while (++alignedBuffer < alignedBufferEnd);
            currentPosition = (void *)alignedBuffer;
            count -= alignedCount << L;
         }
      }
   }
   unsigned char* charBuffer = (unsigned char *)currentPosition;
   unsigned char* charBufferEnd = charBuffer + count;
   while (charBuffer < charBufferEnd)
      if (c == *charBuffer)
         return (B)((B)charBuffer - (B)buffer);
      else
         charBuffer++;
   return (B)((B)charBuffer - (B)buffer);
}
charsearch.cpp:
Code:
// charsearch.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include <algorithm>
#include <string>
#include <iostream>
#include <time.h>
#include <chrono>
#include <thread>

#if defined WIN32
#include <Windows.h>
#endif

#include "charsearch.h"


int main()
{
   unsigned int indexToFind[8] = { 120, 110, 99 , 110, 105 , 104, 88, 64};
   unsigned char buffer[] = ".......................................................................0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\0";
   int loopCount = 1000000;
   int bufferAlignment = 0;
   int bufferSize = strlen((const char *)buffer) - bufferAlignment;

   printf("Checking Output...\n\n");
   printf("strchr\n");
   for (int i = 0; i < 8; i++) {
      unsigned int foundAt = (unsigned int)strchr((const char *)&buffer[bufferAlignment], buffer[indexToFind[i]]) - (unsigned int)&buffer[bufferAlignment];
      printf("%d ", foundAt);
   }
   printf("\nIntrinsic\n");
   for (int i = 0; i < 8; i++) {
      unsigned int foundAt = H_FindCharTemplate<unsigned long long, BYTE_TYPE, BYTE_SIZE_POW2>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
      printf("%d ", foundAt);
   }

   printf("\nbyte\n");
   for (int i = 0; i < 8; i++) {
      unsigned int foundAt = H_FindCharTemplate<unsigned long, BYTE_TYPE, 1>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
      printf("%d ", foundAt);
   }

   printf("\nDWord\n");
   for (int i = 0; i < 8; i++) {
      unsigned int foundAt = H_FindCharTemplate<unsigned long, BYTE_TYPE, 2>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
      printf("%d ", foundAt);
   }

   printf("\nQWord\n");
   for (int i = 0; i < 8; i++) {
      unsigned int foundAt = H_FindCharTemplate<unsigned long long, BYTE_TYPE, 3>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
      printf("%d ", foundAt);
   }

   clock_t t;
   printf("\n\nCalculating...\n\n");
 
   // strchr
   std::this_thread::sleep_for(std::chrono::seconds(1));
   // running twice to normalize caches
   for (int i = 0; i<8; i++)
      for (int j = 0; j<loopCount; j++)
         volatile int found = (unsigned int)strchr((const char *)&buffer[bufferAlignment], buffer[indexToFind[i]]) - (unsigned int)&buffer[bufferAlignment];
   auto start = std::chrono::high_resolution_clock::now();
   for (int i = 0; i<8; i++)
      for (int j = 0; j<loopCount; j++)
         volatile int found = (unsigned int)strchr((const char *)&buffer[bufferAlignment], buffer[indexToFind[i]]) - (unsigned int)&buffer[bufferAlignment];
   auto end = std::chrono::high_resolution_clock::now();
   std::chrono::duration<double, std::milli> elapsed = end - start;
   std::cout << "strchr " << elapsed.count() << " ms\n";

   // Intrinsic
   std::this_thread::sleep_for(std::chrono::seconds(1));
   // running twice to normalize caches
   for (int i = 0; i<8; i++)
      for (int j = 0; j<loopCount; j++)
         volatile int found = H_FindCharTemplate<unsigned long long, BYTE_TYPE, BYTE_SIZE_POW2>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
   start = std::chrono::high_resolution_clock::now();
   for (int i = 0; i<8; i++)
      for (int j = 0; j<loopCount; j++)
         volatile int found = H_FindCharTemplate<unsigned long long, BYTE_TYPE, BYTE_SIZE_POW2>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
   end = std::chrono::high_resolution_clock::now();
   elapsed = end - start;
   std::cout << "Intrinsic " << elapsed.count() << " ms\n";

   // Byte
   std::this_thread::sleep_for(std::chrono::seconds(1));
   start = std::chrono::high_resolution_clock::now();
   for (int i = 0; i<8; i++)
      for (int j = 0; j<loopCount; j++)
         volatile int found = H_FindCharTemplate<unsigned long, BYTE_TYPE, 1>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
   end = std::chrono::high_resolution_clock::now();
   elapsed = end - start;
   std::cout << "Byte " << elapsed.count() << " ms\n";

   // DWord
   std::this_thread::sleep_for(std::chrono::seconds(1));
   start = std::chrono::high_resolution_clock::now();
   for (int i = 0; i<8; i++)
      for (int j = 0; j<loopCount; j++)
         volatile int found = H_FindCharTemplate<unsigned long, BYTE_TYPE, 2>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
   end = std::chrono::high_resolution_clock::now();
   elapsed = end - start;
   std::cout << "DWord " << elapsed.count() << " ms\n";

   // QWord
   std::this_thread::sleep_for(std::chrono::seconds(1));
   start = std::chrono::high_resolution_clock::now();
   for (int i = 0; i<8; i++)
      for (int j = 0; j<loopCount; j++)
         volatile int found = H_FindCharTemplate<unsigned long long, BYTE_TYPE, 3>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
   end = std::chrono::high_resolution_clock::now();
   elapsed = end - start;
   std::cout << "QWord " << elapsed.count() << " ms\n";

   return 0;
}

executable x64 visual studio 15 release.

charsearch.exe

I am starting to grasp what you are doing. I think i do...
This maybe silly.
I was wondering, When using the simd instructions, is it possible to do this ?

Ascii characters range from 0x40 to 0x5A and 0x60 and 0x7A.
I do have to say that i am doing ascii character tests here.

Take for example 8 bytes at once.
Do a if values > 0x40 and store results as bits in byte A
A bit set as 1 when true.
Do a if values < 0x5B and store results as bits in byte B
A bit set as 1 when true.

Do byte A and byte B, result in C.
When C has a bit set, a character has been found.
This depends on endian format.
rotate through bits and increase counter I.

Add I + base address of buffer = address of first character.

Do a if values > 0x60 and store results as bits in byte A
A bit set as 1 when true.
Do a if values < 0x7B and store results as bits in byte B
A bit set as 1 when true.

Do byte A and byte B, result in D.
When C has a bit set, a character has been found.
This depends on endian format.

rotate through bits and increase counter I.

Add I + base address of buffer = address of first character.

I wonder if this would work.
 

Schmide

Diamond Member
Mar 7, 2002
5,712
978
126
Rather than go deeply into implementation.

You are proposing to check the bounds of the char stream to ascii displayable chars first and eliminate outsiders first?

One of the major things I learned from the test code. If you can't do it in a few instructions, it is not worth it.

The simple loop that compared each byte to the target byte was second in execution time more than doubling any non simd multi-byte calculation.

The simd calculation although doing 16 compares at once was only 4x as fast. It could be that if the search was longer this gap could get nearer to 16x. In all cases the simd should win, the question is by how much.

Lets look at the simd loop with actual instructions.

Code:
            do {
               __m128i charCompares = _mm_set1_epi8(c); // loads the char c into each byte of the 16 char operand
               charCompares = _mm_cmpeq_epi8(charCompares, *alignedBuffer); // compares 16 bytes if equal set byte to 0xff
               if (_mm_movemask_epi8(charCompares)) { // takes the high bit of every byte and builds a mask 16bits long
                  (determine first actual char)
               }
            } while (++alignedBuffer < alignedBufferEnd);

The most expensive operation in this loop is set operation. It expands to this.

Code:
movsx       eax,dl  
mov         r10,r8  
shr         r10,4  
mov         r9,r10  
shl         r9,4  
movd        xmm1,eax  
add         r9,rcx  
punpcklbw   xmm1,xmm1  
punpcklwd   xmm1,xmm1  
pshufd      xmm1,xmm1,0

The simd compare (16 bytes at once) Notice it saves the set operation in a register.

Code:
movdqa      xmm0,xmm1  
pcmpeqb     xmm0,xmmword ptr [rcx]

The check if found.

Code:
pmovmskb    eax,xmm0  
test        eax,eax  
jne         (label)

Most of the operation comes down to 5 instructions. They are in order and dependent. Any compare and test is going to take at least this. Doing a bounds check would be at least 11 assuming you do a bitwise or on the resulting flags. Further compares add at least 6.

I do not think you can optimize this further.
 
May 11, 2008
22,175
1,402
126
I agree.

When i looked up strchr, i read it wrong and assumed it had to look for any readable ascii character in a buffer hence my strange example. But strchr looks for a given character that is provided.
That indeed makes a difference.
I noticed the same thing with the strlen32 that the longer the string gets, the more the performance increases. But this is purely from the reduction of the memory accesses that are needed.
The amount of clock cycles needed for every byte access are reduced because of the word access.

In the strchr example with simd, it has also more to do with reducing the amount of memory accesses needed and that is where the speed up comes from.
It is not linear, so longer buffers will have a more profound speed increase just as strlen32 is not linear.
I wonder if you would also run into extra latencies because of a misaligned memory access when doing simd on x86.
(i really want to play in the future with the neon unit from my RPBI3. Just for the fun.)

Because simd checking or using just general instructions, will allways take a certain amount of clock cycles.
With reducing the amount of sequential memory accesses that are needed, the cpu is doing more usefull work in the same time.