Recording a histogram in a tree exhibits strange best case


The task is to record a histogram from a streaming data source.

One data point is, say, a 16 bit integer. The maximum multiple of one data point before the stream ends, is < 2^32. The main restriction is that there isn’t enough memory to have an array of counters which completely covers the range of the data points (65536 counters of 0-2^32), much less record all single data points and bulk-process them afterwards. In the case where the dispersion of the data source covers the complete possible 16-bit range, the result is not of interest, i.e. this marks a histogram type which is not processed further. Valid histograms consist of only a few clusters where the majority of points are found. Data is roughly continuous which has a very valuable consequence: new outliers can, without loss of detection capability) supersede old outliers (i.e. those with the lowest count) so that new clusters can enter the histogram even when the available memory is full. This is very rare however, the data source is in a way nice to track; just the number and location of the clusters is unpredictable. The following is a try of mine with a binary tree recording the multitude of each data point (I didn’t implement the substitution, just the insert). The special property of the tree is that it tries to move points with a high occurrence number close to the root, in hope for a short search path in the majority cases. The boolean predicate is the function exchange_p which compares the weight of a root to its left or right child. In case the predicate is true, an exchange is executed, possibly invoking further exchanges lateron.

The idea is that it is favorable to not exchange tree nodes at every insert but to let the node count run up to a certain multiple of its parent before an exchange is initiated, thereby avoiding unnecessary oscillations of nodes. Things worked out very differently, tho.

The problem exhibits a strange behaviour, with QUOTA around 0.8 (i.e. exchange if root < 0.8*child) for the overall minimum number of operations. Is there some theory which amalgates O-notation with stochastics for special input distributions?

You can run the below example (it has real world data in it), the file is completely self-contained and should run out of the box.

#include <cstdio> #include <cstdint> #include <cstdlib>   int exchg; int ins; int tree_ins;  float QUOTA = 1.0f;  void error_func (const char *s) {   printf ("%s", s);   exit (1); }  typedef enum { LEFT_CHILD, RIGHT_CHILD, NO_CHILD } child_side_t;    typedef int16_t element_t; typedef uint32_t counting_t;   struct bin_tree_counting_node_t {   bin_tree_counting_node_t *left, *right;   counting_t cnt;   element_t element; };   void print_node (bin_tree_counting_node_t* root) {   printf ("%i\t%u\n", root->element, root->cnt); }  void print_tree (bin_tree_counting_node_t* root, int lvl) {   if (root == NULL)     return;   print_tree (root->left, lvl+1);   // printf ("Level %d: ", lvl);   print_node (root);   print_tree (root->right, lvl+1); }     bool exchange_p (bin_tree_counting_node_t* root,         bin_tree_counting_node_t* child,         child_side_t side) {   return (root->cnt < QUOTA * child->cnt); }  bin_tree_counting_node_t* tree_insert_tree (bin_tree_counting_node_t* tree_a,          bin_tree_counting_node_t* tree_b) {   tree_ins++;      if (tree_a == NULL)     return tree_b;   else if (tree_b == NULL)     return tree_a;   else if (exchange_p (tree_a, tree_b, NO_CHILD)) {     if (tree_a->element < tree_b->element)        tree_b->left = tree_insert_tree (tree_a, tree_b->left);     else       tree_b->right = tree_insert_tree (tree_a, tree_b->right);     return tree_b;   }   // Case for a > b coincides with a ~ b   // else if (exchange_p (tree_b, tree_a, NO_CHILD)) {   //   if (tree_a->element < tree_b->element)    //     tree_a->right = tree_insert_tree (tree_a->right, tree_b);   //   else   //     tree_a->left = tree_insert_tree (tree_a->left, tree_b);   //   return tree_a;   // }   else {     if (tree_a->element < tree_b->element)        tree_a->right = tree_insert_tree (tree_a->right, tree_b);     else       tree_a->left = tree_insert_tree (tree_a->left, tree_b);     return tree_a;   }       }  bin_tree_counting_node_t* exchange (bin_tree_counting_node_t* root,      child_side_t side) {   exchg++;      // Exchange root with its child   bin_tree_counting_node_t* child;   if (side == LEFT_CHILD) {     child = root->left;     root->left = NULL;     child->right = tree_insert_tree (root, child->right);   }   else {     child = root->right;     root->right = NULL;     child->left = tree_insert_tree (root, child->left);   }   return child; }  bin_tree_counting_node_t* tree_insert (bin_tree_counting_node_t* root,         element_t elem) {   ins++;      if (root == NULL) {     bin_tree_counting_node_t* new_node = (bin_tree_counting_node_t*)malloc (sizeof (bin_tree_counting_node_t));     if (new_node == NULL) { error_func ("Memory exhausted!"); }     new_node->element = elem;     new_node->cnt = 1;     new_node->right = new_node->left = NULL;     return new_node;   }      if (elem == root->element) {     root->cnt++;     if (root->cnt == 0) { error_func ("Counting overflow! Very large amount of inserts >2^32!"); }   }   else if (elem < root->element) {     root->left = tree_insert (root->left, elem);     if (exchange_p (root, root->left, LEFT_CHILD))       return exchange (root, LEFT_CHILD);   }   else {     root->right = tree_insert (root->right, elem);     if (exchange_p (root, root->right, RIGHT_CHILD))       return exchange (root, RIGHT_CHILD);   }   return root; }  void free_tree(bin_tree_counting_node_t* root) {   if (root == NULL)     return;   free_tree (root->left);   free_tree (root->right);   free (root); }  int main (void) {   element_t sample[] =     {       17060,17076,17076,17060,17092,17028,17076,17060,17060,17076,17060,17076,17060,17060,17140,17140,17124,17124,       17124,17140,17108,17140,17124,17124,17108,17108,17124,17124,17140,17092,17124,17124,17140,17140,17124,17156,       17140,17108,17156,17140,17124,17124,17124,17108,17124,17124,17124,17092,17140,17092,       17124,17108,17156,17124,17156,17140,17124,17172,17124,17108,17124,17108,17108,17108,17108,17124,17108,17124,       17108,17108,17124,17124,17140,17124,17108,17092,17108,17108,17092,17124,17108,17108,17124,17108,17124,17108,       17124,17140,17156,17124,17108,17108,17124,17124,17124,17140,17092,17140,17124,       17108,17124,17124,17124,17124,17108,17124,17108,17124,17108,17108,17108,17124,17108,17124,17124,17108,17108,       17124,17108,17124,17140,17124,17124,17108,17108,17140,17140,17124,17108,17140,17124,16291,16339,16339,16307,       16323,16259,16275,16275,16259,16388,16388,16355,15795,15731,16195,16179,16179,       15715,14467,14643,14851,17284,17012,16147,15155,15203,16131,15331,14691,14739,14739,14755,14723,14739,14707,       14771,14739,14707,14691,14531,14787,14563,15587,15907,15907,15923,15907,13123,13107,13091,13267,13091,13187,       13091,14643,15875,15907,16964,16404,16227,15219,14771,14771,14803,14787,14803,       14787,14803,14803,14755,14755,14771,14755,14723,14723,14739,14739,14963,16884,16868,15827,13075,13123,13091,       12979,13043,13219,13075,13059,13075,13123,15779,15875,16916,17028,15155,14675,14707,14691,14707,14691,14691,       14691,14707,14691,14691,14691,14675,14707,14691,14675,14595,14595,14595,14563,       14547,14579,14611,14547,14515,14611,14595,14611,14595,14659,14659,14627,14643,14643,14659,14643,14643,14643,       14659,14659,14643,14643,14611,14659,14611,14659,14659,14659,14643,14643,14643,14643,14611,14643,14627,14675,       14627,14643,14531,14531,14499,15187,16884,16932,15891,15923,13107,13091,13043,       13059,13075,13027,13027,13059,13059,13587,13059,15763,16900,16932,14899,14595,14627,14659,14691,14691,14691,       14691,14659,14675,14675,14675,14675,14691,14675,14691,14675,14611,14643,15107,16788,16964,14835,13363,13059,       13075,13075,13043,13043,13043,13075,13059,13059,15475,15715,16884,16932,16964,       14707,14643,14675,14643,14611,14611,14611,14611,14627,14611,14611,14627,14627,14611,14595,14595,14595,14563,       15027,16756,16932,14595,13059,13075,13059,13059,13011,13027,13059,13059,13043,13059,15683,16852,16948,16067,       15267,14771,15875,16964,16996,16980,17012,17012,17028,17028,17076,17060,17044,       17060,17076,17076,17092,17076,17092,17092,17108,17124,17092,17108,17124,17108,17124,17124,17108,17140,17124,       17108,17124,17108,17028,17124,17124,17092,17124,17108,17124,17108,17124,17092,17108,17108,17108,17124,17124,       17140,17108,17124,17108,17124,17108,17108,17156,17124,17108,17092,17108,17092,       17108,17108,17124,17124,17108,17108,17108,17124,17108,17124,17108,17124,17124,17108,17124,17124,17124,17124,       17124,17092,17140,17108,17124,17124,17140,17028,17124,17028,17108,17108,17124,17124,17124,17124,17108,17108,       17124,17140,17044,17108,17028,17124,17124,17124,17108,17124,17124,17108,17124,       17092,17124,17124,17124,17108,17124,17124,17140,17124,17140,17140,17140,17124,17044,17124,17092,17140,17124,       17140,17124,17124,17124,17108,17124,17124,17124,17124,17124,17140,17076,17140,17140,17108,17140,17124,17140,       17140,17108,17108,17124,17108,17124,17124,17124,17108,17140,17124,17140,17060,       17124,17124,17188,17108,17124,17124,17140,     };    for (QUOTA = 0.1; QUOTA < 3; QUOTA += 0.1) {     ins = tree_ins = exchg = 0;          bin_tree_counting_node_t *tree = NULL;      for (int i=0; i<sizeof sample/sizeof sample[0]; i++) {       tree = tree_insert (tree, sample[i]);     }     printf ("Q:\t%f\tins:\t%d\t tree_ins:\t%d\t exchg:\t%d\t sum:\t%d\n", QUOTA, ins, tree_ins, exchg, ins + tree_ins + exchg);     //print_tree (tree, 0);     //printf ("\n");     free_tree (tree);   }   return 0; }