frequency of ints in huge array

Started by czerny, July 03, 2014, 01:29:15 PM


czerny

I have a little problem. Maybe there is an intelligent solution I don't know?

I have a huge array (typically 20000000 elements) of unsigned long ints, and I have to find the int with the highest frequency.

My idea so far is to use some sort of binary tree with something like


struct node {
  unsigned int x;
  unsigned int freq;
  struct node *left;
  struct node *right;
};


as a node. But this way I have five times as much memory to allocate.
Is there a cheaper way to do this?
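For clarity, the kind of counting insert I have in mind would be roughly this (untested sketch, assuming <stdlib.h> and the node struct above):

/* one node per distinct value; a repeated value only bumps freq */
struct node *insert(struct node *root, unsigned int x)
{
  if (root == NULL) {
    struct node *n = malloc(sizeof *n);
    if (n) {
      n->x = x;
      n->freq = 1;
      n->left = n->right = NULL;
    }
    return n;
  }
  if (x == root->x)
    root->freq++;
  else if (x < root->x)
    root->left = insert(root->left, x);
  else
    root->right = insert(root->right, x);
  return root;
}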


jj2007

Have you tried sorting the array? On my trusty AMD, that takes three seconds. Afterwards, you just go through the array and check how often each value repeats, and if its repeat count is higher than the previous champion's, you take that value.
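The scan after the sort would look roughly like this (just a sketch, not my actual code; a is the sorted copy and n its length):

/* after sorting, equal values are adjacent, so the most frequent
   value is simply the longest run */
unsigned long int best = a[0], current = a[0];
unsigned long int bestCount = 1, count = 1;
for (unsigned long int i = 1; i < n; i++) {
  if (a[i] == current) {
    count++;
  } else {
    if (count > bestCount) { bestCount = count; best = current; }
    current = a[i];
    count = 1;
  }
}
if (count > bestCount) { bestCount = count; best = current; } /* last run */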

Bitbeisser

Quote from: jj2007 on July 03, 2014, 02:41:49 PM
Have you tried sorting the array? On my trusty AMD, that takes three seconds. Afterwards, you just go through the array and check how often each value repeats, and if its repeat count is higher than the previous champion's, you take that value.
Well, that would only work if the original order of the array doesn't matter, and czerny unfortunately didn't say whether that is a restriction or not.
He also didn't mention whether time is an issue; otherwise I would suggest just starting to count from the top down, which could be reasonably fast depending on the distribution of the values within the array.
Might also want to look into a modification of a binary search over the array...

Ralf

jj2007

Quote from: Bitbeisser on July 03, 2014, 07:40:25 PM
I would suggest just starting to count from the top down

That sounds straightforward, can you post an example?

czerny

Quote from: jj2007 on July 03, 2014, 02:41:49 PM
Have you tried sorting the array? On my trusty AMD, that takes three seconds. Afterwards, you just go through the array and check how often each value repeats, and if its repeat count is higher than the previous champion's, you take that value.
I do not want to change the original array. So I would have to make a copy. :(

czerny

Quote from: Bitbeisser on July 03, 2014, 07:40:25 PM
I would suggest just starting to count from the top down, ...
Do you mean something like this?

maxcount = 0;
for (i=0; i<MAX; i++) {
  count = 0;
  for (k=i; k<MAX; k++)
    if (a[k] == a[i]) count++;
  if (count > maxcount) {
    maxcount = count;
    index = i;
  }
}

This is very slow for huge arrays! O(n²)

jj2007

Quote from: czerny on July 04, 2014, 09:59:06 AM
This is very slow for huge arrays! O(n²)

Indeed. If you have enough RAM, sorting a copy is a lot faster. Quicksort does it in O(n log n), and a radix sort can be even faster, depending on the data. Finding the most frequent element after the sort takes negligible time; see below.

AMD Athlon(tm) Dual Core Processor 4450B (MMX, SSE, SSE2, SSE3)
2596 ms for sorting
77 ms for finding
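For reference, a byte-wise LSD radix sort over 32-bit values would look roughly like this (just a sketch, not the routine I actually timed; assumes 32-bit unsigned long and <stdlib.h>):

/* LSD radix sort, one byte per pass; after the four passes the
   sorted data ends up back in src */
static int radix_sort_u32(unsigned long int *src, size_t n)
{
    unsigned long int *tmp = malloc(n * sizeof *tmp);
    if (!tmp) return 0;

    unsigned long int *from = src, *to = tmp;
    for (int shift = 0; shift < 32; shift += 8) {
        size_t count[256] = {0};
        for (size_t i = 0; i < n; i++)      /* histogram of this byte */
            count[(from[i] >> shift) & 0xFF]++;
        size_t pos = 0;
        for (int b = 0; b < 256; b++) {     /* prefix sums -> start offsets */
            size_t c = count[b];
            count[b] = pos;
            pos += c;
        }
        for (size_t i = 0; i < n; i++)      /* stable scatter into 'to' */
            to[count[(from[i] >> shift) & 0xFF]++] = from[i];
        unsigned long int *swap = from; from = to; to = swap;
    }
    /* an even number of swaps, so 'from' points at src again here */
    free(tmp);
    return 1;
}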

czerny

Quote from: jj2007 on July 04, 2014, 11:03:16 AM
Indeed. If you have enough RAM, sorting a copy is a lot faster. Quicksort does it in O(n log n), and a radix sort can be even faster, depending on the data. Finding the most frequent element after the sort takes negligible time; see below.

AMD Athlon(tm) Dual Core Processor 4450B (MMX, SSE, SSE2, SSE3)
2596 ms for sorting
77 ms for finding

Plus the time to copy the array (which is small), and twice the memory.
But still better than the tree.

czerny

Quote from: czerny on July 04, 2014, 09:59:06 AM
This is very slow for huge arrays! O(n²)
Maybe this can be optimized: the outer loop can stop once maxcount >= MAX - i, because at that point no remaining value can occur more often than the current champion. But whether that helps depends on the data.
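In code, the early exit would look roughly like this (just a sketch, same variables as above):

/* at index i only MAX - i elements are left to count, so once
   maxcount >= MAX - i nothing can beat the current champion */
maxcount = 0;
for (i = 0; i < MAX && maxcount < MAX - i; i++) {
  count = 0;
  for (k = i; k < MAX; k++)
    if (a[k] == a[i]) count++;
  if (count > maxcount) {
    maxcount = count;
    index = i;
  }
}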

JohnF

czerny, see if this is ok, it takes 92 msecs here.


#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>

long long __cdecl StartTimer(void)
{
    long long t1;
    QueryPerformanceCounter((LARGE_INTEGER *)&t1);
    return t1;
}

long long __cdecl StopTimer(long long t1)
{
    long long t2;
    long long frequency;
    QueryPerformanceCounter((LARGE_INTEGER *)&t2);
    QueryPerformanceFrequency((LARGE_INTEGER *)&frequency);
    long long elapsedTime = (t2 - t1) * 1000 / frequency;
    return elapsedTime;
}

int __cdecl main(void)
{
#define SIZE 20000000
    unsigned long int i;
    unsigned long int *arry;

    arry = malloc(SIZE * sizeof(unsigned long int));
    if (arry == NULL) {
        printf("Error, no mem!");
        return 0;
    }

    for (i = 0; i < SIZE; i++)
    {
        arry[i] = rand() % 10000;
    }

    long long t = StartTimer();
    unsigned long int maximum;
    int freq = 1;
    maximum = arry[0];

    for (i = 0; i < SIZE; i++)
    {
        if (arry[i] > maximum) {
            maximum = arry[i];
            freq = 1;
        } else if (arry[i] == maximum) {
            freq++;
        }
    }

    printf("msecs %lld\n", StopTimer(t));
    printf("maximum %lu has occurred %d times\n", maximum, freq);
    free(arry);
    return 0;
}


EDIT: freq = 1;

John

czerny

Quote from: JohnF on July 04, 2014, 11:55:32 AM
czerny, see if this is ok, it takes 92 msecs here.

Try this with: SIZE = 3; arry = {5, 3, 3};

I want the result: maximum == 3; freq == 2;

jj2007

Quote from: czerny on July 04, 2014, 11:25:58 AM
Plus the time to copy the array (which is small), and twice the memory.

199 ms for copying
5800 ms for sorting
127 ms for finding


"copying" includes allocating and freeing the copy. The biggest chunk is clearly the sorting. What kind of data do you have? Does it spread over the whole 32-bit range?

czerny

Quote from: jj2007 on July 04, 2014, 01:28:58 PM
What kind of data do you have? Does it spread over the whole 32-bit range?
Yes, there is no known subset.

JohnF

czerny, sorry about my post, I should read more carefully.

This code takes just over 4 secs here. I found the main body of the code on the web.


#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

long long __cdecl StartTimer(void)
{
    long long t1;
    QueryPerformanceCounter((LARGE_INTEGER *)&t1);
    return t1;
}

long long __cdecl StopTimer(long long t1)
{
    long long t2;
    long long frequency;
    QueryPerformanceCounter((LARGE_INTEGER *)&t2);
    QueryPerformanceFrequency((LARGE_INTEGER *)&frequency);
    long long elapsedTime = (t2 - t1) * 1000 / frequency;
    return elapsedTime;
}

int __cdecl CompareInt(const void *param1, const void *param2)
{
    unsigned long int *pInt1 = (unsigned long int *)param1;
    unsigned long int *pInt2 = (unsigned long int *)param2;

    if (*pInt1 < *pInt2)
        return 1;
    else if (*pInt1 > *pInt2)
        return -1;

    return 0;
}

int __cdecl main(void)
{
#define SIZE 20000000
    unsigned long int i;
    unsigned long int *arry, *arry1;

    arry = malloc(SIZE * sizeof(unsigned long int));
    if (arry == NULL) {
        printf("Error, no mem!");
        return 0;
    }
    arry1 = malloc(SIZE * sizeof(unsigned long int));
    if (arry1 == NULL) {
        printf("Error, no mem!");
        return 0;
    }

    for (i = 0; i < SIZE; i++)
    {
        arry[i] = rand() % 10000;
    }

    long long t = StartTimer();

    /* sort the copy so the original array stays untouched */
    memcpy(arry1, arry, SIZE * sizeof(unsigned long int));

    qsort(arry1, SIZE, sizeof(unsigned long int), CompareInt);

    unsigned long int previous = arry1[0];
    unsigned long int popular = arry1[0];
    int count = 1;
    int maxCount = 1;

    for (i = 1; i < SIZE; i++) {
        if (arry1[i] == previous)
            count++;
        else {
            if (count > maxCount) {
                popular = arry1[i - 1];
                maxCount = count;
            }
            previous = arry1[i];
            count = 1;
        }
    }
    if (count > maxCount) {   /* don't miss the final run */
        popular = previous;
        maxCount = count;
    }

    printf("msecs %lld\n", StopTimer(t));
    printf("most frequent value %lu occurred %d times\n", popular, maxCount);

    free(arry);
    free(arry1);
    return 0;
}


If it's no good I'll stop. :)

John

frankie

I would propose a different approach that is very efficient in both memory usage and time if the distribution is narrow (many repetitions), because we scan the main array only once and the 'reduced' array many times.
If the values are expected to be essentially random over the whole range (gaussian), this algorithm is a nightmare like the others ;D


#include <windows.h>
#include <stdio.h>
#include <stdlib.h>

#define _DEBUG
#ifdef _DEBUG
#define PDEBUG(...) printf(__VA_ARGS__)
#else
#define PDEBUG(...)
#endif
#define ARRAY_SIZE 20000000
#define STEP    (sizeof(array)/(100*sizeof(DWORD))*sizeof(EVAL))

typedef struct
{
    DWORD val;
    int Count;
} EVAL, *LPEVAL;

DWORD array[ARRAY_SIZE];

BOOL ArrayMode(DWORD a[], int arraySize, DWORD *Val, int *freq)
{
    DWORD CurrSize = STEP; //Start with a small percentage
    DWORD Idx      = 0;    //Current free element
    LPEVAL pE = malloc(CurrSize);
    if (!pE)
        return FALSE; //Memory fail

    for (int i = 0; i < arraySize; i++)
    {
        DWORD j;
        for (j = 0; j < Idx; j++)
            if (pE[j].val == a[i])
            {
                pE[j].Count++;
                break;
            }
        if (j == Idx) //Not found, we have to add the new element
        {
            //First check if we have room in the array
            if (Idx >= CurrSize / sizeof(EVAL))
            { //We have to expand the array
                PDEBUG("Expanding array, Idx=%u\n", Idx);
                void *p = realloc(pE, CurrSize + STEP);
                if (!p)
                {
                    free(pE);
                    return FALSE;
                }
                pE = p;
                CurrSize += STEP;
            }
            //Add the new element
            pE[Idx].val = a[i];
            pE[Idx].Count = 1;
            PDEBUG("Adding element, val[%u]=%u\n", Idx, pE[Idx].val);
            Idx++;
        }
    }
    //Now search the mode of the reduced array
    DWORD IdxMax = 0;

    for (DWORD i = 0; i < Idx; i++)
        if (pE[i].Count > pE[IdxMax].Count)
            IdxMax = i;

    //Copy results
    *Val  = pE[IdxMax].val;
    *freq = pE[IdxMax].Count;

    //Free memory
    free(pE);

    return TRUE;
}

void test(void)
{
    DWORD val;
    int freq;

    //Fill a test array
    for (int i = 0; i < ARRAY_SIZE; i++)
#if 1
        if (i < 1000)
            array[i] = 1;
        else if (i < 100000)
            array[i] = 2;
        else
            array[i] = 3;
#else
        array[i] = rand();
#endif
    ArrayMode(array, ARRAY_SIZE, &val, &freq);
    printf("Array mode: value=%u, freq=%d\n", val, freq);
}
"It is better to be hated for what you are than to be loved for what you are not." - Andre Gide