// 3dpm3.cpp : Defines the entry point for the application.
//
#define _USE_MATH_DEFINES
#include <stdio.h>
#include <time.h>
#include <thread>
#include <chrono>
#include <iostream>
#include <math.h>
#include <algorithm>
#include <string>
#include <bit>
#include <random>
#include <vector>
#include <immintrin.h>
using namespace std;
// Fills 'buffer' with 'count' random normalized 4D vectors (x, y, z, w=1).
void fill_with_random_vectors(double* buffer, size_t count, bool singleThread) {
unsigned int numThreads = singleThread ? 1 : std::thread::hardware_concurrency();
if (numThreads == 0) numThreads = 4;
std::vector<std::thread> threads(singleThread ? 0 : numThreads);
auto worker = [buffer, count, numThreads](unsigned int threadId) {
std::random_device rd;
std::mt19937 gen(rd() + threadId);
std::uniform_real_distribution<double> dist(-1.0, 1.0);
size_t chunkSize = count / numThreads;
size_t start = threadId * chunkSize;
size_t end = (threadId == numThreads - 1) ? count : start + chunkSize;
for (size_t i = start; i < end; ++i) {
double x = dist(gen);
double y = dist(gen);
double z = dist(gen);
double mag = std::sqrt(x * x + y * y + z * z);
if (mag > 1e-8) {
double invMag = 1.0 / mag;
buffer[i * 4 + 0] = x * invMag;
buffer[i * 4 + 1] = y * invMag;
buffer[i * 4 + 2] = z * invMag;
buffer[i * 4 + 3] = 1.0;
} else {
buffer[i * 4 + 0] = 1.0;
buffer[i * 4 + 1] = 0.0;
buffer[i * 4 + 2] = 0.0;
buffer[i * 4 + 3] = 1.0;
}
}
};
if (singleThread) {
worker(0);
} else {
for (unsigned int t = 0; t < numThreads; ++t) {
threads[t] = std::thread(worker, t);
}
for (auto& th : threads) {
th.join();
}
}
}
// AVX-accelerated version: normalization uses AVX intrinsics and reciprocal
void fill_with_random_vectors_avx(double* buffer, size_t count, bool singleThread) {
unsigned int numThreads = singleThread ? 1 : std::thread::hardware_concurrency();
if (numThreads == 0) numThreads = 4;
std::vector<std::thread> threads(singleThread ? 0 : numThreads);
auto worker = [buffer, count, numThreads](unsigned int threadId) {
std::random_device rd;
std::mt19937 gen(rd() + threadId);
std::uniform_real_distribution<double> dist(-1.0, 1.0);
size_t chunkSize = count / numThreads;
size_t start = threadId * chunkSize;
size_t end = (threadId == numThreads - 1) ? count : start + chunkSize;
size_t i = start;
for (; i + 3 < end; i += 4) {
double x[4], y[4], z[4];
for (int k = 0; k < 4; ++k) {
x[k] = dist(gen);
y[k] = dist(gen);
z[k] = dist(gen);
}
__m256d vx = _mm256_loadu_pd(x);
__m256d vy = _mm256_loadu_pd(y);
__m256d vz = _mm256_loadu_pd(z);
__m256d xx = _mm256_mul_pd(vx, vx);
__m256d yy = _mm256_mul_pd(vy, vy);
__m256d zz = _mm256_mul_pd(vz, vz);
__m256d sum = _mm256_add_pd(xx, _mm256_add_pd(yy, zz));
__m256d mag = _mm256_sqrt_pd(sum);
__m256d eps = _mm256_set1_pd(1e-8);
__m256d mask = _mm256_cmp_pd(mag, eps, _CMP_GT_OQ);
__m256d invMag = _mm256_div_pd(_mm256_set1_pd(1.0), mag);
__m256d rx = _mm256_mul_pd(vx, invMag);
__m256d ry = _mm256_mul_pd(vy, invMag);
__m256d rz = _mm256_mul_pd(vz, invMag);
double fallback[4] = { 1.0, 1.0, 1.0, 1.0 };
double zero[4] = { 0.0, 0.0, 0.0, 0.0 };
__m256d fx = _mm256_loadu_pd(fallback);
__m256d fy = _mm256_loadu_pd(zero);
__m256d fz = _mm256_loadu_pd(zero);
rx = _mm256_blendv_pd(fx, rx, mask);
ry = _mm256_blendv_pd(fy, ry, mask);
rz = _mm256_blendv_pd(fz, rz, mask);
double outx[4], outy[4], outz[4];
_mm256_storeu_pd(outx, rx);
_mm256_storeu_pd(outy, ry);
_mm256_storeu_pd(outz, rz);
for (int k = 0; k < 4; ++k) {
buffer[(i + k) * 4 + 0] = outx[k];
buffer[(i + k) * 4 + 1] = outy[k];
buffer[(i + k) * 4 + 2] = outz[k];
buffer[(i + k) * 4 + 3] = 1.0;
}
}
for (; i < end; ++i) {
double x = dist(gen);
double y = dist(gen);
double z = dist(gen);
double mag = std::sqrt(x * x + y * y + z * z);
if (mag > 1e-8) {
double invMag = 1.0 / mag;
buffer[i * 4 + 0] = x * invMag;
buffer[i * 4 + 1] = y * invMag;
buffer[i * 4 + 2] = z * invMag;
buffer[i * 4 + 3] = 1.0;
} else {
buffer[i * 4 + 0] = 1.0;
buffer[i * 4 + 1] = 0.0;
buffer[i * 4 + 2] = 0.0;
buffer[i * 4 + 3] = 1.0;
}
}
};
if (singleThread) {
worker(0);
} else {
for (unsigned int t = 0; t < numThreads; ++t) {
threads[t] = std::thread(worker, t);
}
for (auto& th : threads) {
th.join();
}
}
}
// AVX-512-accelerated version: normalization uses AVX-512 intrinsics and reciprocal
void fill_with_random_vectors_avx512(double* buffer, size_t count, bool singleThread) {
unsigned int numThreads = singleThread ? 1 : std::thread::hardware_concurrency();
if (numThreads == 0) numThreads = 4;
std::vector<std::thread> threads(singleThread ? 0 : numThreads);
auto worker = [buffer, count, numThreads](unsigned int threadId) {
std::random_device rd;
std::mt19937 gen(rd() + threadId);
std::uniform_real_distribution<double> dist(-1.0, 1.0);
size_t chunkSize = count / numThreads;
size_t start = threadId * chunkSize;
size_t end = (threadId == numThreads - 1) ? count : start + chunkSize;
size_t i = start;
for (; i + 7 < end; i += 8) {
double x[8], y[8], z[8];
for (int k = 0; k < 8; ++k) {
x[k] = dist(gen);
y[k] = dist(gen);
z[k] = dist(gen);
}
__m512d vx = _mm512_loadu_pd(x);
__m512d vy = _mm512_loadu_pd(y);
__m512d vz = _mm512_loadu_pd(z);
__m512d xx = _mm512_mul_pd(vx, vx);
__m512d yy = _mm512_mul_pd(vy, vy);
__m512d zz = _mm512_mul_pd(vz, vz);
__m512d sum = _mm512_add_pd(xx, _mm512_add_pd(yy, zz));
__m512d mag = _mm512_sqrt_pd(sum);
__m512d eps = _mm512_set1_pd(1e-8);
__mmask8 mask = _mm512_cmp_pd_mask(mag, eps, _CMP_GT_OQ);
__m512d invMag = _mm512_div_pd(_mm512_set1_pd(1.0), mag);
__m512d rx = _mm512_mul_pd(vx, invMag);
__m512d ry = _mm512_mul_pd(vy, invMag);
__m512d rz = _mm512_mul_pd(vz, invMag);
__m512d fx = _mm512_set1_pd(1.0);
__m512d fy = _mm512_set1_pd(0.0);
__m512d fz = _mm512_set1_pd(0.0);
rx = _mm512_mask_blend_pd(mask, fx, rx);
ry = _mm512_mask_blend_pd(mask, fy, ry);
rz = _mm512_mask_blend_pd(mask, fz, rz);
double outx[8], outy[8], outz[8];
_mm512_storeu_pd(outx, rx);
_mm512_storeu_pd(outy, ry);
_mm512_storeu_pd(outz, rz);
for (int k = 0; k < 8; ++k) {
buffer[(i + k) * 4 + 0] = outx[k];
buffer[(i + k) * 4 + 1] = outy[k];
buffer[(i + k) * 4 + 2] = outz[k];
buffer[(i + k) * 4 + 3] = 1.0;
}
}
for (; i < end; ++i) {
double x = dist(gen);
double y = dist(gen);
double z = dist(gen);
double mag = std::sqrt(x * x + y * y + z * z);
if (mag > 1e-8) {
double invMag = 1.0 / mag;
buffer[i * 4 + 0] = x * invMag;
buffer[i * 4 + 1] = y * invMag;
buffer[i * 4 + 2] = z * invMag;
buffer[i * 4 + 3] = 1.0;
} else {
buffer[i * 4 + 0] = 1.0;
buffer[i * 4 + 1] = 0.0;
buffer[i * 4 + 2] = 0.0;
buffer[i * 4 + 3] = 1.0;
}
}
};
if (singleThread) {
worker(0);
} else {
for (unsigned int t = 0; t < numThreads; ++t) {
threads[t] = std::thread(worker, t);
}
for (auto& th : threads) {
th.join();
}
}
}
// No normalization, just random values
void fill_with_random_vectors_no_normalization(double* buffer, size_t count, bool singleThread) {
unsigned int numThreads = singleThread ? 1 : std::thread::hardware_concurrency();
if (numThreads == 0) numThreads = 4;
std::vector<std::thread> threads(singleThread ? 0 : numThreads);
auto worker = [buffer, count, numThreads](unsigned int threadId) {
std::random_device rd;
std::mt19937 gen(rd() + threadId);
std::uniform_real_distribution<double> dist(-1.0, 1.0);
size_t chunkSize = count / numThreads;
size_t start = threadId * chunkSize;
size_t end = (threadId == numThreads - 1) ? count : start + chunkSize;
for (size_t i = start; i < end; ++i) {
buffer[i * 4 + 0] = dist(gen);
buffer[i * 4 + 1] = dist(gen);
buffer[i * 4 + 2] = dist(gen);
buffer[i * 4 + 3] = 1.0;
}
};
if (singleThread) {
worker(0);
} else {
for (unsigned int t = 0; t < numThreads; ++t) {
threads[t] = std::thread(worker, t);
}
for (auto& th : threads) {
th.join();
}
}
}
// Fixed vector (0, 1, 2, 1)
void fill_with_fixed_vectors(double* buffer, size_t count, bool singleThread) {
unsigned int numThreads = singleThread ? 1 : std::thread::hardware_concurrency();
if (numThreads == 0) numThreads = 4;
std::vector<std::thread> threads(singleThread ? 0 : numThreads);
auto worker = [buffer, count, numThreads](unsigned int threadId) {
size_t chunkSize = count / numThreads;
size_t start = threadId * chunkSize;
size_t end = (threadId == numThreads - 1) ? count : start + chunkSize;
for (size_t i = start; i < end; ++i) {
buffer[i * 4 + 0] = 0.0;
buffer[i * 4 + 1] = 1.0;
buffer[i * 4 + 2] = 2.0;
buffer[i * 4 + 3] = 1.0;
}
};
if (singleThread) {
worker(0);
} else {
for (unsigned int t = 0; t < numThreads; ++t) {
threads[t] = std::thread(worker, t);
}
for (auto& th : threads) {
th.join();
}
}
}
int main()
{
const size_t vectorCount = 1 << 27; // 134,217,728, a power of 2
double* vectors = new double[vectorCount * 4]; // 4 components per vector
// Test all routines in both single-threaded and multi-threaded modes
for (int singleThread = 0; singleThread <= 1; ++singleThread) {
std::cout << (singleThread ? "[Single-threaded]" : "[Multi-threaded]") << std::endl;
auto start = std::chrono::high_resolution_clock::now();
fill_with_random_vectors(vectors, vectorCount, singleThread);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> elapsed = end - start;
std::cout << "[Standard] Generated " << vectorCount << " random normalized vectors in "
<< elapsed.count() << " ms." << std::endl;
for (size_t i = 0; i < std::min(vectorCount, size_t(5)); ++i) {
std::cout << "Vector " << i << ": ("
<< vectors[i * 4 + 0] << ", "
<< vectors[i * 4 + 1] << ", "
<< vectors[i * 4 + 2] << ", "
<< vectors[i * 4 + 3] << ")" << std::endl;
}
start = std::chrono::high_resolution_clock::now();
fill_with_random_vectors_avx(vectors, vectorCount, singleThread);
end = std::chrono::high_resolution_clock::now();
elapsed = end - start;
std::cout << "[AVX] Generated " << vectorCount << " random normalized vectors in "
<< elapsed.count() << " ms." << std::endl;
for (size_t i = 0; i < std::min(vectorCount, size_t(5)); ++i) {
std::cout << "Vector " << i << ": ("
<< vectors[i * 4 + 0] << ", "
<< vectors[i * 4 + 1] << ", "
<< vectors[i * 4 + 2] << ", "
<< vectors[i * 4 + 3] << ")" << std::endl;
}
start = std::chrono::high_resolution_clock::now();
fill_with_random_vectors_avx512(vectors, vectorCount, singleThread);
end = std::chrono::high_resolution_clock::now();
elapsed = end - start;
std::cout << "[AVX-512] Generated " << vectorCount << " random normalized vectors in "
<< elapsed.count() << " ms." << std::endl;
for (size_t i = 0; i < std::min(vectorCount, size_t(5)); ++i) {
std::cout << "Vector " << i << ": ("
<< vectors[i * 4 + 0] << ", "
<< vectors[i * 4 + 1] << ", "
<< vectors[i * 4 + 2] << ", "
<< vectors[i * 4 + 3] << ")" << std::endl;
}
start = std::chrono::high_resolution_clock::now();
fill_with_random_vectors_no_normalization(vectors, vectorCount, singleThread);
end = std::chrono::high_resolution_clock::now();
elapsed = end - start;
std::cout << "[No Normalization] Generated " << vectorCount << " random vectors in "
<< elapsed.count() << " ms." << std::endl;
for (size_t i = 0; i < std::min(vectorCount, size_t(5)); ++i) {
std::cout << "Vector " << i << ": ("
<< vectors[i * 4 + 0] << ", "
<< vectors[i * 4 + 1] << ", "
<< vectors[i * 4 + 2] << ", "
<< vectors[i * 4 + 3] << ")" << std::endl;
}
start = std::chrono::high_resolution_clock::now();
fill_with_fixed_vectors(vectors, vectorCount, singleThread);
end = std::chrono::high_resolution_clock::now();
elapsed = end - start;
std::cout << "[Fixed] Generated " << vectorCount << " fixed vectors in "
<< elapsed.count() << " ms." << std::endl;
for (size_t i = 0; i < std::min(vectorCount, size_t(5)); ++i) {
std::cout << "Vector " << i << ": ("
<< vectors[i * 4 + 0] << ", "
<< vectors[i * 4 + 1] << ", "
<< vectors[i * 4 + 2] << ", "
<< vectors[i * 4 + 3] << ")" << std::endl;
}
}
delete[] vectors;
return 0;
}