Windows 10
Pelles ISO C Compiler, Version 10.00.15
Feeding a 1-million-line file into the .exe built from the code below with Pelles C outputs:
FileSize is 228009172
1000231 ends of the lines are found in 0.106292 seconds
Feeding the same 1-million-line file into the .exe built from the code below with Microsoft VS19 outputs:
FileSize is 228009172
1000000 ends of the lines are found in 0.103677 seconds
If a different million-line file is fed into the Pelles C .exe, the reported count changes but is still incorrect.
// Code below by Robert Wishlaw adapted from
// File mapping code Posted on 2003-06-10 19:30:51 by P2M at
// http://www.asmcommunity.net/forums/topic/?id=13727
// and
// memcount_avx2 code Posted on 2019-08-17 9:14 by powturbo at
// https://stackoverflow.com/questions/54541129/how-to-count-character-occurrences-using-simd/54543009
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <stdio.h>
#include <limits.h>
#include <immintrin.h>
#define INVALID_FILE_SIZE ((DWORD)0xFFFFFFFF)
/*
 * Count the bytes in the n-byte buffer s that are equal to (unsigned char)c.
 *
 * s  start of the buffer; no alignment is required (unaligned loads are used).
 * c  byte value to look for; only its low 8 bits are significant.
 * n  number of bytes to examine.
 *
 * Returns the number of matching bytes.
 *
 * Strategy: _mm256_cmpeq_epi8 yields 0xFF (-1) in every matching byte lane, so
 * subtracting the compare result from an accumulator increments that lane's
 * 8-bit counter.  The 8-bit counters are folded into 64-bit sums with
 * _mm256_sad_epu8 before any of them can exceed 255.
 */
#if defined(__GNUC__)
/* Lets GCC/Clang build this one function with AVX2 without -mavx2 globally. */
__attribute__((target("avx2")))
#endif
size_t memcount_avx2(const void* s, int c, size_t n)
{
    enum {
        VEC_BYTES   = 32,   /* bytes per 256-bit vector                      */
        UNROLL      = 4,    /* independent accumulators per inner iteration  */
        INNER_ITERS = 63,   /* per-lane counter reaches at most 63 per block */
        BLOCK_BYTES = VEC_BYTES * UNROLL * INNER_ITERS  /* 8064 == 252 * 32  */
    };

    /* Work on unsigned bytes so the scalar tail agrees with the SIMD compare
       for values >= 0x80 regardless of whether plain char is signed. */
    const unsigned char target = (unsigned char)c;
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *const end = p + n;
    const unsigned char *const bulk_end = p + (n - n % BLOCK_BYTES);

    const __m256i cv = _mm256_set1_epi8((char)target);
    const __m256i zv = _mm256_setzero_si256();
    __m256i sum = zv;

    /* Bulk: whole 8064-byte blocks, four vectors per inner iteration. */
    while (p != bulk_end) {
        const unsigned char *const block_end = p + BLOCK_BYTES;
        __m256i acr0 = zv, acr1 = zv, acr2 = zv, acr3 = zv;

        for (; p != block_end; p += VEC_BYTES * UNROLL) {
            acr0 = _mm256_sub_epi8(acr0, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i *)p)));
            acr1 = _mm256_sub_epi8(acr1, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i *)(p + 32))));
            acr2 = _mm256_sub_epi8(acr2, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i *)(p + 64))));
            acr3 = _mm256_sub_epi8(acr3, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i *)(p + 96))));
        }
        sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr0, zv));
        sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr1, zv));
        sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr2, zv));
        sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr3, zv));
    }

    /* Remainder (< 8064 bytes): one vector at a time, at most 251 iterations,
       so the 8-bit lane counters still cannot overflow.  The bound is checked
       on the remaining length so no out-of-range pointer is ever formed. */
    {
        __m256i acr = zv;

        for (; (size_t)(end - p) >= VEC_BYTES; p += VEC_BYTES)
            acr = _mm256_sub_epi8(acr, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i *)p)));
        sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr, zv));
    }

    /* Horizontal add of the four 64-bit partial sums.  A plain store is used
       instead of _mm256_extract_epi64, which exists only on 64-bit targets. */
    unsigned long long lanes[4];
    _mm256_storeu_si256((__m256i *)lanes, sum);
    size_t count = (size_t)(lanes[0] + lanes[1] + lanes[2] + lanes[3]);

    /* Last 0..31 bytes, one at a time. */
    for (; p != end; ++p)
        count += (*p == target);

    return count;
}
/*
 * Entry point: count the '\n' bytes in the file named by argv[1] and report
 * the total together with the elapsed wall-clock time.
 *
 * The file is read through a read-only file mapping that is viewed in windows
 * of at most DefMapSize bytes; each window is scanned with memcount_avx2() and
 * the per-window counts are added together.
 *
 * Returns 0 on success and 1 on any failure (missing argument, no
 * high-resolution timer, empty file, or a failing Win32 call).
 */
int main(int argc, char* argv[])
{
    /* 1.75 GiB per view.  This is a multiple of 64 KiB, so every view offset
       stays aligned to the usual allocation granularity that MapViewOfFile
       requires of its offset argument. */
    const LONGLONG DefMapSize = 7LL * 256 * 1024 * 1024;
    LARGE_INTEGER qpf, qpc1, qpc2;
    LARGE_INTEGER FileSize, MapOffset;
    HANDLE hFile = INVALID_HANDLE_VALUE;
    HANDLE hFMap = NULL;
    LPVOID pFile = NULL;
    SIZE_T EolCount = 0;
    int exit_code = 1;

    if (!QueryPerformanceFrequency(&qpf))
    {
        printf("%s\n", "This computer does not have a high-performance timer.");
        return exit_code;
    }
    QueryPerformanceCounter(&qpc1);

    if (argc <= 1)
    {
        printf("Usage %s filename\n", (argc > 0 && argv[0] != NULL) ? argv[0] : "program");
        goto cleanup;
    }

    /* argv holds narrow strings, so call the ANSI entry point explicitly;
       the CreateFile macro would pick the wide version in UNICODE builds. */
    hFile = CreateFileA(argv[1], GENERIC_READ, FILE_SHARE_READ, NULL,
                        OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (INVALID_HANDLE_VALUE == hFile)
    {
        printf("CreateFile('%s') failed %lu\n", argv[1], (unsigned long)GetLastError());
        goto cleanup;
    }

    /* GetFileSizeEx returns the full 64-bit size and an unambiguous status,
       unlike GetFileSize whose 0xFFFFFFFF low part may be a valid size. */
    FileSize.QuadPart = 0;
    if (!GetFileSizeEx(hFile, &FileSize))
    {
        printf("GetFileSize failed %lu\n", (unsigned long)GetLastError());
        goto cleanup;
    }
    if (0 == FileSize.QuadPart)
    {
        printf("File is empty\n");
        goto cleanup;
    }
    printf("FileSize is %lld\n", (long long)FileSize.QuadPart);

    hFMap = CreateFileMapping(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
    if (NULL == hFMap)
    {
        printf("CreateFileMapping failed %lu\n", (unsigned long)GetLastError());
        goto cleanup;
    }

    /* Walk the file one view at a time; FileSize counts the bytes still to do. */
    MapOffset.QuadPart = 0;
    while (0 != FileSize.QuadPart)
    {
        const SIZE_T MapSize =
            (SIZE_T)(FileSize.QuadPart < DefMapSize ? FileSize.QuadPart : DefMapSize);

        pFile = MapViewOfFile(hFMap, FILE_MAP_READ,
                              (DWORD)MapOffset.HighPart, MapOffset.LowPart, MapSize);
        if (NULL == pFile)
        {
            printf("MapViewOfFile failed %lu\n", (unsigned long)GetLastError());
            goto cleanup;
        }

        /* Accumulate: a file larger than one view spans several iterations. */
        EolCount += memcount_avx2(pFile, '\n', MapSize);

        UnmapViewOfFile(pFile);
        pFile = NULL;
        FileSize.QuadPart -= (LONGLONG)MapSize;
        MapOffset.QuadPart += (LONGLONG)MapSize;
    }
    CloseHandle(hFMap);
    hFMap = NULL;

    QueryPerformanceCounter(&qpc2);
    printf("%s\n", " ");
    printf("%llu ends of the lines are found in %f seconds\n",
           (unsigned long long)EolCount,
           (double)(qpc2.QuadPart - qpc1.QuadPart) / (double)qpf.QuadPart);
    exit_code = 0;

cleanup:
    /* Release whatever is still held, in reverse order of acquisition. */
    if (NULL != pFile) UnmapViewOfFile(pFile);
    if (NULL != hFMap) CloseHandle(hFMap);
    if (INVALID_HANDLE_VALUE != hFile) CloseHandle(hFile);
    return exit_code;
}