Schmide
Diamond Member
- Mar 7, 2002
- 5,712
- 978
- 126
So I modified some code I had to do most all the variations. Sad to say, it's darn hard to beat the standard routines. I came close often within 10%. The strchr* routine does basically the same thing as the intrinsic routine with the exception of using a simd routine to align data and using a bsf (bit scan forward) to finalize the find.
* I walked into the assembly.
Output:
Second run (with pauses)
charsearch.h:
charsearch.cpp:
executable x64 visual studio 15 release.
charsearch.exe
* I walked into the assembly.
Output:
Code:
Checking Output...
strchr
120 110 99 110 105 104 88 0
Intrinsic
120 110 99 110 105 104 88 0
byte
120 110 99 110 105 104 88 0
DWord
120 110 99 110 105 104 88 0
QWord
120 110 99 110 105 104 88 0
Calculating...
strchr 67.1348 ms
Intrinsic 67.601 ms
Byte 317.552 ms
DWord 1051.8 ms
QWord 729.756 ms
Second run (with pauses)
Code:
strchr 62.0654 ms
Intrinsic 69.4543 ms
Byte 338.806 ms
DWord 1124.12 ms
QWord 784.669 ms
Code:
// Check windows
#if _WIN32 || _WIN64
#if _WIN64
#define ENVIRONMENT64
#else
#define ENVIRONMENT32
#endif
#endif
// Check GCC
#if __GNUC__
#if __x86_64__ || __ppc64__
#define ENVIRONMENT64
#else
#define ENVIRONMENT32
#endif
#endif
#define MASK(a) ( ( 1 << (a) ) - 1 )
#define MASKCAST(a,b) ( ( ( (a)1 ) << (b) ) - 1 )
#define ONESHIFT(a) ( 1 << (a) )
#ifdef ENVIRONMENT64
#define TOALIGNED32(a,b) ( ((unsigned long)(-(long)reinterpret_cast<long long>(a))) & MASK(b) )
#else
#define TOALIGNED32(a,b) ( ((unsigned long)(-reinterpret_cast<long>(a))) & MASK(b) )
#endif
#define TOALIGNED64(a,b) ( ((unsigned long long)(-reinterpret_cast<long long>(a))) & MASKCAST(unsigned long long, b) )
#define USE_INTRINSIC
#ifdef USE_INTRINSIC
#include <xmmintrin.h>
#define BYTE_TYPE __m128i
#define BYTE_SET(a) _mm_set1_epi8(a)
#define BYTE_EQ(a,b) _mm_cmpeq_epi8(a,b)
#define BYTE_MOVEMASK(a) _mm_movemask_epi8(a)
#define BYTE_SIZE_POW2 4
#endif
template<typename B, typename I, unsigned char L> // note L must be size of alignment 2=32 3=64 4=128...
B H_FindCharTemplate(void* buffer, B count, unsigned char c)
{
if (!count)
return 0;
void* currentPosition = buffer;
B alignedCount = count >> L;
if ((alignedCount >> 1) && (L > 1) ) {
unsigned long toAligned = TOALIGNED32(currentPosition, L);
if (toAligned) {
unsigned char* charBuffer = (unsigned char *)currentPosition;
unsigned char* charBufferEnd = charBuffer + toAligned;
do {
if (c == *charBuffer)
return (B)((B)charBuffer - (B)currentPosition);
else
charBuffer++;
} while (charBuffer < charBufferEnd);
currentPosition = (void *)charBuffer;
count -= toAligned;
}
switch (L)
{
case 2: {
alignedCount = count >> L;
B* alignedBuffer = (B *)currentPosition;
B* alignedBufferEnd = alignedBuffer + alignedCount;
do {
B charCompares;
unsigned char *chars = (unsigned char *)&charCompares;
unsigned char *alignedBufferChars = (unsigned char *)alignedBuffer;
*(chars) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars))) >> 8);
*(chars + 1) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 1))) >> 8);
*(chars + 2) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 2))) >> 8);
*(chars + 3) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 3))) >> 8);
if (((B)-1) != charCompares) {
chars = (unsigned char *)alignedBuffer;
while (c != *chars)
chars++;
return (B)((B)chars - (B)buffer);
}
} while (++alignedBuffer < alignedBufferEnd);
currentPosition = (void *)alignedBuffer;
count -= alignedCount << L;
}
break;
case 3: {
alignedCount = count >> L;
B* alignedBuffer = (B *)currentPosition;
B* alignedBufferEnd = alignedBuffer + alignedCount;
do {
B charCompares;
unsigned char *chars = (unsigned char *)&charCompares;
unsigned char *alignedBufferChars = (unsigned char *)alignedBuffer;
*(chars) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars))) >> 8);
*(chars + 1) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 1))) >> 8);
*(chars + 2) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 2))) >> 8);
*(chars + 3) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 3))) >> 8);
*(chars + 4) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 4))) >> 8);
*(chars + 5) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 5))) >> 8);
*(chars + 6) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 6))) >> 8);
*(chars + 7) = (unsigned char)((0 - (unsigned char)(c - *(alignedBufferChars + 7))) >> 8);
if (((B)-1) != charCompares) {
chars = (unsigned char *)alignedBuffer;
while (c != *chars)
chars++;
return (B)((B)chars - (B)buffer);
}
} while (++alignedBuffer < alignedBufferEnd);
currentPosition = (void *)alignedBuffer;
count -= alignedCount << L;
}
break;
default: {
alignedCount = count >> L;
I* alignedBuffer = (I *)currentPosition;
I* alignedBufferEnd = alignedBuffer + alignedCount;
do {
I charCompares = BYTE_SET(c);
charCompares = BYTE_EQ(charCompares, *alignedBuffer);
if (BYTE_MOVEMASK(charCompares)) {
unsigned char *chars = (unsigned char *)alignedBuffer;
while (c != *chars)
chars++;
return (B)((B)chars - (B)buffer);
}
} while (++alignedBuffer < alignedBufferEnd);
currentPosition = (void *)alignedBuffer;
count -= alignedCount << L;
}
}
}
unsigned char* charBuffer = (unsigned char *)currentPosition;
unsigned char* charBufferEnd = charBuffer + count;
while (charBuffer < charBufferEnd)
if (c == *charBuffer)
return (B)((B)charBuffer - (B)buffer);
else
charBuffer++;
return (B)((B)charBuffer - (B)buffer);
}
Code:
// charsearch.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include <algorithm>
#include <string>
#include <iostream>
#include <time.h>
#include <chrono>
#include <thread>
#if defined WIN32
#include <Windows.h>
#endif
#include "charsearch.h"
int main()
{
unsigned int indexToFind[8] = { 120, 110, 99 , 110, 105 , 104, 88, 64};
unsigned char buffer[] = ".......................................................................0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\0";
int loopCount = 1000000;
int bufferAlignment = 0;
int bufferSize = strlen((const char *)buffer) - bufferAlignment;
printf("Checking Output...\n\n");
printf("strchr\n");
for (int i = 0; i < 8; i++) {
unsigned int foundAt = (unsigned int)strchr((const char *)&buffer[bufferAlignment], buffer[indexToFind[i]]) - (unsigned int)&buffer[bufferAlignment];
printf("%d ", foundAt);
}
printf("\nIntrinsic\n");
for (int i = 0; i < 8; i++) {
unsigned int foundAt = H_FindCharTemplate<unsigned long long, BYTE_TYPE, BYTE_SIZE_POW2>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
printf("%d ", foundAt);
}
printf("\nbyte\n");
for (int i = 0; i < 8; i++) {
unsigned int foundAt = H_FindCharTemplate<unsigned long, BYTE_TYPE, 1>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
printf("%d ", foundAt);
}
printf("\nDWord\n");
for (int i = 0; i < 8; i++) {
unsigned int foundAt = H_FindCharTemplate<unsigned long, BYTE_TYPE, 2>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
printf("%d ", foundAt);
}
printf("\nQWord\n");
for (int i = 0; i < 8; i++) {
unsigned int foundAt = H_FindCharTemplate<unsigned long long, BYTE_TYPE, 3>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
printf("%d ", foundAt);
}
clock_t t;
printf("\n\nCalculating...\n\n");
// strchr
std::this_thread::sleep_for(std::chrono::seconds(1));
// running twice to normalize caches
for (int i = 0; i<8; i++)
for (int j = 0; j<loopCount; j++)
volatile int found = (unsigned int)strchr((const char *)&buffer[bufferAlignment], buffer[indexToFind[i]]) - (unsigned int)&buffer[bufferAlignment];
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i<8; i++)
for (int j = 0; j<loopCount; j++)
volatile int found = (unsigned int)strchr((const char *)&buffer[bufferAlignment], buffer[indexToFind[i]]) - (unsigned int)&buffer[bufferAlignment];
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> elapsed = end - start;
std::cout << "strchr " << elapsed.count() << " ms\n";
// Intrinsic
std::this_thread::sleep_for(std::chrono::seconds(1));
// running twice to normalize caches
for (int i = 0; i<8; i++)
for (int j = 0; j<loopCount; j++)
volatile int found = H_FindCharTemplate<unsigned long long, BYTE_TYPE, BYTE_SIZE_POW2>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
start = std::chrono::high_resolution_clock::now();
for (int i = 0; i<8; i++)
for (int j = 0; j<loopCount; j++)
volatile int found = H_FindCharTemplate<unsigned long long, BYTE_TYPE, BYTE_SIZE_POW2>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
end = std::chrono::high_resolution_clock::now();
elapsed = end - start;
std::cout << "Intrinsic " << elapsed.count() << " ms\n";
// Byte
std::this_thread::sleep_for(std::chrono::seconds(1));
start = std::chrono::high_resolution_clock::now();
for (int i = 0; i<8; i++)
for (int j = 0; j<loopCount; j++)
volatile int found = H_FindCharTemplate<unsigned long, BYTE_TYPE, 1>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
end = std::chrono::high_resolution_clock::now();
elapsed = end - start;
std::cout << "Byte " << elapsed.count() << " ms\n";
// DWord
std::this_thread::sleep_for(std::chrono::seconds(1));
start = std::chrono::high_resolution_clock::now();
for (int i = 0; i<8; i++)
for (int j = 0; j<loopCount; j++)
volatile int found = H_FindCharTemplate<unsigned long, BYTE_TYPE, 2>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
end = std::chrono::high_resolution_clock::now();
elapsed = end - start;
std::cout << "DWord " << elapsed.count() << " ms\n";
// QWord
std::this_thread::sleep_for(std::chrono::seconds(1));
start = std::chrono::high_resolution_clock::now();
for (int i = 0; i<8; i++)
for (int j = 0; j<loopCount; j++)
volatile int found = H_FindCharTemplate<unsigned long long, BYTE_TYPE, 3>(&buffer[bufferAlignment], bufferSize, buffer[indexToFind[i]]);
end = std::chrono::high_resolution_clock::now();
elapsed = end - start;
std::cout << "QWord " << elapsed.count() << " ms\n";
return 0;
}
executable x64 visual studio 15 release.
charsearch.exe
Last edited: