Windows 10
Pelles ISO C Compiler, Version 10.00.15
Feeding a 1-million-line file into the .exe built from the code below with Pelles C produces this output:
Quote
FileSize is 228009172
1000231 ends of the lines are found in 0.106292 seconds
Feeding the same 1-million-line file into the .exe built from the code below with Microsoft VS19 produces this output:
Quote
FileSize is 228009172
1000000 ends of the lines are found in 0.103677 seconds
If a different million-line file is fed into the Pelles C .exe, the reported count is different but still incorrect.
// Code below by Robert Wishlaw adapted from
// File mapping code Posted on 2003-06-10 19:30:51 by P2M at
// http://www.asmcommunity.net/forums/topic/?id=13727
// and
// memcount_avx2 code Posted on 2019-08-17 9:14 by powturbo at
// https://stackoverflow.com/questions/54541129/how-to-count-character-occurrences-using-simd/54543009
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <stdio.h>
#include <limits.h>
#include <immintrin.h>
#define INVALID_FILE_SIZE ((DWORD)0xFFFFFFFF)
/*
 * Count the bytes in the n-byte buffer at s that equal (unsigned char)c,
 * using AVX2.
 *
 *   s  start of the buffer (no alignment requirement; unaligned loads are used)
 *   c  byte value to look for; only the low 8 bits are significant
 *   n  number of bytes to scan
 *
 * Returns the number of matching bytes.
 *
 * Method: _mm256_cmpeq_epi8 yields 0xFF (-1) in every matching byte lane, so
 * subtracting the compare result from an accumulator adds 1 per match. The
 * per-byte accumulators are 8 bits wide, so they are flushed into 64-bit
 * partial sums (via _mm256_sad_epu8 against zero) before they can wrap.
 */
#if defined(__GNUC__) || defined(__clang__)
/* Lets GCC/Clang build this function without -mavx2 on the command line. */
__attribute__((target("avx2")))
#endif
size_t memcount_avx2(const void* s, int c, size_t n)
{
  const __m256i cv = _mm256_set1_epi8((char)c);
  const __m256i zv = _mm256_setzero_si256();
  __m256i sum = zv;
  __m256i acr0, acr1, acr2, acr3;
  /* unsigned char so that byte values above 0x7F compare correctly below */
  const unsigned char* p = (const unsigned char*)s;
  const unsigned char* const end = p + n;
  /* End of the part that is a whole number of 252*32-byte super-blocks. */
  const unsigned char* const bulk_end = p + (n - (n % (252 * 32)));
  const unsigned char* pe;
  unsigned long long lanes[4];
  size_t count;

  /* Bulk: 252*32 bytes per outer pass, 128 bytes (4 vectors) per inner pass.
     That is 63 inner passes, so no 8-bit lane counter exceeds 63. */
  while (p != bulk_end)
  {
    acr0 = acr1 = acr2 = acr3 = zv;
    for (pe = p + 252 * 32; p != pe; p += 128)
    {
      acr0 = _mm256_sub_epi8(acr0, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i*)p)));
      acr1 = _mm256_sub_epi8(acr1, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i*)(p + 32))));
      acr2 = _mm256_sub_epi8(acr2, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i*)(p + 64))));
      acr3 = _mm256_sub_epi8(acr3, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i*)(p + 96))));
    }
    sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr0, zv));
    sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr1, zv));
    sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr2, zv));
    sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr3, zv));
  }

  /* Remaining whole 32-byte vectors. Fewer than 252*32 bytes are left, so at
     most 251 iterations run and the 8-bit lane counters cannot wrap. The
     bound is written as a remaining-length test so no pointer beyond the end
     of the buffer is ever formed. */
  acr0 = zv;
  for (; (size_t)(end - p) >= 32; p += 32)
    acr0 = _mm256_sub_epi8(acr0, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i*)p)));
  sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr0, zv));

  /* Horizontal add of the four 64-bit partial sums. One store plus scalar
     adds is used instead of _mm256_extract_epi64, which is not available
     when targeting 32-bit x86. */
  _mm256_storeu_si256((__m256i*)lanes, sum);
  count = (size_t)(lanes[0] + lanes[1] + lanes[2] + lanes[3]);

  /* Scalar tail: the last 0..31 bytes. Compare as unsigned bytes so the
     result agrees with the vector path for values above 0x7F. */
  for (; p != end; ++p)
    count += (*p == (unsigned char)c);

  return count;
}
int main(int argc, char* argv[])
{
  static LARGE_INTEGER qpf;
  static LARGE_INTEGER qpc1;
  static LARGE_INTEGER qpc2;
  static int     RetVal;
  HANDLE    hFile = INVALID_HANDLE_VALUE;
  HANDLE    hFMap = NULL;
  LPVOID    pFile = NULL;
  char* pStart = NULL;
  char* pFind = NULL;
  LONGLONG  DefMapSize = 7 * (256 * 1024 * 1024);
  LARGE_INTEGER FileSize, MapOffset;
  SIZE_T       EolCount;
  SIZE_T        MapSize;
  RetVal = QueryPerformanceFrequency(&qpf);
  if (RetVal == 0)
  {
    printf("%s\n", "This computer does not have a high-performance timer.");
    fflush(stdout);
    ExitProcess(0);
  }
   QueryPerformanceCounter(&qpc1);
  if (1 >= argc)
  {
    printf("Usage %s filename\n", argv[0]);
    goto cleanup;
  }
  hFile = CreateFile(argv[1], GENERIC_READ, FILE_SHARE_READ, NULL,
    OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
  if (INVALID_HANDLE_VALUE == hFile)
  {
    printf("CreateFile('%s') failed %d\n", argv[1], GetLastError());
    goto cleanup;
  }
  FileSize.QuadPart = 0;
  FileSize.LowPart = GetFileSize(hFile, (DWORD*)&FileSize.HighPart);
  if (INVALID_FILE_SIZE == FileSize.LowPart)
  {
    printf("GetFileSize failed %d\n", GetLastError());
    goto cleanup;
  }
  if (0 == FileSize.QuadPart)
  {
    printf("File is empty\n");
    goto cleanup;
  }
  printf("FileSize is %llu\n", FileSize.QuadPart);
  hFMap = CreateFileMapping(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
  if (NULL == hFMap)
  {
    printf("CreateFileMapping failed %d\n", GetLastError());
    goto cleanup;
  }
  MapOffset.QuadPart = 0;
  EolCount = 0;
  while (0 != FileSize.QuadPart)
  {
    MapSize = (SIZE_T)min(DefMapSize, FileSize.QuadPart);
    pFile = MapViewOfFile(hFMap, FILE_MAP_READ,
      MapOffset.HighPart, MapOffset.LowPart, MapSize);
    if (NULL == pFile)
    {
      printf("MapViewOfFile failed %d\n", GetLastError());
      goto cleanup;
    }
    FileSize.QuadPart -= MapSize;
    MapOffset.QuadPart += MapSize;
    pStart = pFind = (char*)pFile;
    EolCount = memcount_avx2(pStart, '\n', MapSize);
    UnmapViewOfFile(pFile); pFile = NULL;
  }
  CloseHandle(hFMap); hFMap = NULL;
  QueryPerformanceCounter(&qpc2);
  printf("%s\n", " ");
  printf("%llu ends of the lines are found in %f seconds\n", EolCount, (double)(qpc2.QuadPart - qpc1.QuadPart) / qpf.QuadPart);
cleanup:
  if (NULL != pFile) UnmapViewOfFile(pFile);
  if (NULL != hFMap) CloseHandle(hFMap);
  if (INVALID_HANDLE_VALUE != hFile) CloseHandle(hFile);
  return 0;
}
				Oh, I can count. I just can't extract one of the four resulting dwords (I guess 75% of the result isn't enough?). Details...