drankinatty

Hashtable for Strings using LH_strhash() Hash Function.

Sep 4th, 2020 (edited)
394
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 11.29 KB | None | 0 0
  1. /* reads words, one-per line into a dynamically allocated and sized hashtable
  2.  * storage for words is dynamically allocated to exact length, table is
  3.  * re-hashed when load-factor exceeds 0.6. The header, source and example
  4.  * files are shown below, but can be compiled together in this single exmple.
  5.  * when separated into separate files, the hastable struct and list struct
  6.  * are opaque to the example file with typedefs and forward declarations
  7.  * being all that is visible to the user if compiled into a library.
  8.  * openSSL LH_strhash() function is used for hashing.
  9.  */
  10.  
  11. /* =========================== wordhash_tbl.h =========================== */
  12.  
  13. #ifndef _wordhash_tbl_h_
  14. #define _wordhash_tbl_h_  1
  15.  
  16. #include <stdio.h>
  17.  
  18. #define HTSZ     256                    /* default number of buckets for new hashtable */
  19. #define LOADFACT 0.6F                   /* never > .7  (buckets filled / total) */
  20.  
  21. typedef struct _list_t list_t;              /* typedef for opaque hashtable list */
  22. typedef struct _hashtable_t hashtable_t;    /* typedef for opaque hashtable struct */
  23.  
  24. int ht_size (hashtable_t *ht);          /* size (total no. of buckets) */
  25. int ht_filled (hashtable_t *ht);        /* buckets filled (used) */
  26. size_t ht_words (hashtable_t *ht);      /* total words in hashtable */
  27. float ht_load (hashtable_t *ht);        /* current load-factor (filled / total) */
  28.  
  29. hashtable_t *ht_create (size_t size);   /* create new hashtable (if 0 HTSZ used) */
  30.  
  31. /* lookup functions */
  32. list_t *ht_lookup_str (hashtable_t *ht, const char *s);
  33. /* takes hash to avoid re-hash of same word if hash already computed */
  34. list_t *ht_lookup_hash (hashtable_t *ht, const char *s, unsigned int hval);
  35.  
  36. /* insert string, delete string and free hashtable funcitons */
  37. int ht_insert_str (hashtable_t **ht, const char *s);
  38. void ht_delete_str (hashtable_t *ht, const char *s);
  39. void ht_free (hashtable_t *ht);
  40.  
  41. #endif
  42.  
  43.  
  44.  
  45. /* =========================== wordhash_tbl.c =========================== */
  46.  
  47. #include <stdio.h>
  48. #include <stdlib.h>
  49. #include <string.h>
  50. #include <limits.h>
  51. #include <stdint.h>
  52.  
  53. #include "wordhash_tbl.h"
  54.  
  55. struct _list_t {        /* opaque struct for hashtable list */
  56.     char *word;
  57.     struct _list_t *next;
  58. };
  59.  
  60. struct _hashtable_t {   /* opaque struct for hashtable */
  61.     int size;
  62.     int filled;
  63.     size_t words;
  64.     list_t **table;
  65. };
  66.  
  67. int ht_size (hashtable_t *ht)
  68. {
  69.     return ht->size;
  70. }
  71.  
  72. int ht_filled (hashtable_t *ht)
  73. {
  74.     return ht->filled;
  75. }
  76.  
  77. size_t ht_words (hashtable_t *ht)
  78. {
  79.     return ht->words;
  80. }
  81.  
  82. /** create hashtable of size 'size', return pointer to hashtable.
  83.  *  allocate size buckets and validate, initializing all
  84.  *  pointers NULL, setting ht->size to 'size' and filled = 0.
  85.  *  returns pointer to allocated hashtable on success, NULL otherwise.
  86.  */
  87. hashtable_t *ht_create (size_t size)
  88. {
  89.     hashtable_t *ht;
  90.  
  91.     if (size < 1) return NULL;
  92.  
  93.     /* allocate hashtable */
  94.     if (!(ht = calloc (1, sizeof *ht)))
  95.         return NULL;
  96.  
  97.     /* allocate & INITIALIZE element pointers NULL */
  98.     if (!(ht->table = calloc (size, sizeof *(ht->table))))
  99.         return NULL;
  100.  
  101.     ht->size = size;
  102.     ht->filled = 0;
  103.     ht->words = 0;
  104.  
  105.     return ht;
  106. }
  107.  
  108. /** OPENSSL_LH_strhash
  109.  *  The following hash seems to work very well on normal text words no
  110.  *  collisions on /usr/dict/words and it distributes on %2^n quite well, not
  111.  *  as good as MD5, but still good.
  112.  */
  113. uint32_t lh_strhash (const char *s)
  114. {
  115.     uint64_t ret = 0, v;
  116.     int64_t n = 0x100;
  117.     int32_t r;
  118.  
  119.     if (!s || !*s) return (ret);
  120.  
  121.     for (; *s; s++) {
  122.         v = n | (*s);
  123.         n += 0x100;
  124.         r = (int32_t)((v >> 2) ^ v) & 0x0f;
  125.         ret = (ret << r) | (ret >> (32 - r));
  126.         ret &= 0xFFFFFFFFL;
  127.         ret ^= v * v;
  128.     }
  129.     return ((ret >> 16) ^ ret);
  130. }
  131.  
  132. /** wrapper for lh_strhash returning key (index), 0 - (ht->size - 1).
  133.  *  modulo ht-size insures hash is coverted to key that
  134.  *  corresponds to an index within the allocated hashtable size.
  135.  */
  136. uint32_t hash (hashtable_t *ht, const char *s)
  137. {
  138.     return lh_strhash (s) % ht->size;
  139. }
  140.  
  141. list_t *ht_lookup_str (hashtable_t *ht, const char *s)
  142. {
  143.     if (!ht || !s) return NULL;
  144.  
  145.     list_t *list;
  146.     uint32_t hashval = hash (ht, s);
  147.  
  148.     /* go to the correct list based on the hash value and see if str is
  149.      * in the list.  if found, return return a pointer to the list element,
  150.      * otherwise, return NULL.
  151.      */
  152.     for (list = (ht->table)[hashval]; list; list = list->next) {
  153.         if (strcmp (s, list->word) == 0) return list;
  154.     }
  155.  
  156.     return NULL;
  157. }
  158.  
  159. /** lookup prevents hashing twice when called from within a function */
  160. list_t *ht_lookup_hash (hashtable_t *ht, const char *s, unsigned int hval)
  161. {
  162.     if (!ht) return NULL;
  163.  
  164.     for (list_t *list = (ht->table)[hval]; list; list = list->next) {
  165.         if (strcmp (s, list->word) == 0) return list;
  166.     }
  167.  
  168.     return NULL;
  169. }
  170.  
  171. /** resize hashtable increase by 2x current, rehashing all keys.
  172.  *  double the current number of buckets, moving current list nodes
  173.  *  to new buckets/lists within the resized hashtable, free existing
  174.  *  ht->table returning pointer to resized hashtable on success,
  175.  *  NULL otherwise.
  176.  */
  177. hashtable_t *ht_rehash (hashtable_t *ht)
  178. {
  179.     size_t htsz = ht->size * 2;     /* double current hashtable size */
  180.     hashtable_t *nht = NULL;
  181.  
  182.     if (!(nht = ht_create (htsz)))  /* allocate new hashtable, validate */
  183.         return NULL;
  184.  
  185.     nht->size = htsz;                               /* assign new size     */
  186.  
  187.     for (int i = 0; i < ht->size; i++)              /* for each old bucket */
  188.         for (list_t *l = (ht->table)[i]; l;) {      /* for each list entry */
  189.             list_t *nextlp = l->next;               /* save next list ptr  */
  190.             uint32_t nhash = hash (nht, l->word);   /* compute new hash    */
  191.             l->next = (nht->table)[nhash];          /* set next to index   */
  192.             if (!(nht->table)[nhash])               /* set filled buckets  */
  193.                 (nht->filled)++;
  194.             (nht->table)[nhash] = l;                /* set index to list   */
  195.             l = nextlp;                      /* assign next old to iterate */
  196.         }
  197.  
  198.     nht->words = ht->words; /* assign no. of words */
  199.     free (ht->table);       /* free old list ptrs */
  200.     free (ht);              /* free old hashtable */
  201.  
  202.     return nht;
  203. }
  204.  
  205. /** insert string into hashtable, check load, resize/rehash.
  206.  *  returns 0 if string placed in new bucket 1 if string exists,
  207.  *  otherwise, return -1 on invalid parameter or allocation failure,
  208.  *  -2 on resize/rehash failure.
  209.  */
  210. int ht_insert_str (hashtable_t **ht, const char *s)
  211. {
  212.     if (!ht || !s) return -1;
  213.  
  214.     if (!*ht)                           /* if hashtable NULL */
  215.         if (!(*ht = ht_create (HTSZ)))  /* create empty hashtable & validate */
  216.             return -1;
  217.  
  218.     list_t *new_list, *current_list;
  219.     uint32_t hashval = hash (*ht, s);
  220.     size_t len = 0;
  221.  
  222.     /* Does item already exist? */
  223.     if ((current_list = ht_lookup_hash (*ht, s, hashval))) return 1;
  224.  
  225.     /* allocate / validate new list node */
  226.     if (!(new_list = malloc (sizeof *new_list))) return -1;
  227.  
  228.     if (!((*ht)->table)[hashval]) (*ht)->filled++;  /* no. of filled buckets */
  229.  
  230.     /* Insert into list */
  231.     len = strlen(s);                            /* get length of string */
  232.     new_list->word = malloc (len + 1);          /* allocate string storage */
  233.     if (!new_list->word) {                      /* validate allocation */
  234.         perror ("malloc-new_list->word");
  235.         return -1;
  236.     }
  237.     memcpy (new_list->word, s, len + 1);        /* copy string to new storage */
  238.     (*ht)->words++;                             /* increment word count */
  239.     new_list->next = ((*ht)->table)[hashval];   /* set next to key addr (ptr is NULL) */
  240.     ((*ht)->table)[hashval] = new_list;         /* set key to list */
  241.  
  242.     while (ht_load (*ht) > LOADFACT) {          /* check load factor, resize/rehash */
  243. #ifdef DEBUG
  244.     printf (" ht_load : %.2f = %d / %d => %d\n",
  245.             ht_load (*ht), (*ht)->filled, (*ht)->size, (*ht)->size * 2);
  246. #endif
  247.         hashtable_t *tmp = ht_rehash (*ht);
  248.         if (!tmp)
  249.             return -2;
  250.         *ht = tmp;
  251.     }
  252.  
  253.     return 0;
  254. }
  255.  
  256. /** custom free for each type of list_t.
  257.  *  (if not more than one member, just handle in ht_free)
  258.  */
  259. /*
  260. void ht_free_node (list_t *lp)
  261. {
  262.     free (lp->word);
  263.     free (lp->defn);
  264.     free (lp);
  265. }
  266. */
  267.  
  268. void ht_delete_str (hashtable_t *ht, const char *s)
  269. {
  270.     if (!ht || !s) return;
  271.  
  272.     list_t *list, *victim, **pplist;
  273.     uint32_t hashval = hash (ht, s);
  274.  
  275.     /* lookup element, bail if it does not exist */
  276.     if (!(victim = ht_lookup_hash (ht, s, hashval))) return;
  277.  
  278.     list = ht->table[hashval];
  279.     pplist = &list;
  280.    
  281.     for (; list; pplist = &list->next, list = list->next)
  282.         if (list == victim) {
  283.             *pplist = list->next;
  284.             free (victim->word);
  285.             free (victim);
  286.             ht->words--;
  287.             break;
  288.         }
  289. }
  290.  
  291. void ht_free (hashtable_t *ht)
  292. {
  293.     int i;
  294.     list_t *list, *victim;
  295.  
  296.     if (!ht) return;
  297.  
  298.     /* Free the memory for every item in the table, including the
  299.      * words themselves.
  300.      */
  301.     for (i = 0; i < ht->size; i++) {
  302.         list = (ht->table)[i];
  303.         while (list) {
  304.             victim = list;
  305.             list = list->next;
  306.             free (victim->word);
  307.             free (victim);
  308.         }
  309.     }
  310.  
  311.     /* Free the table itself */
  312.     free (ht->table);
  313.     free (ht);
  314. }
  315.  
  316. float ht_load (hashtable_t *ht)
  317. {
  318.     return ht ? (float)ht->filled/ht->size : -1.0F;
  319. }
  320.  
  321.  
  322. /* =========================== wordhash_tst.c =========================== */
  323.  
  324. #include "wordhash_tbl.h"
  325.  
  326. #define MAXC     64         /* max characters in read buffer for words */
  327. #define TBLSIZE 256         /* initial number of buckets in table */
  328.  
  329. int main (int argc, char **argv) {
  330.  
  331.     hashtable_t *ht = NULL;             /* pointer to hashtable */
  332.     char buf[MAXC];                     /* buffer for reading from file */
  333.     /* use filename provided as 1st argument (stdin by default) */
  334.     FILE *fp = argc > 1 ? fopen (argv[1], "r") : stdin;
  335.  
  336.     if (!fp) {  /* validate file open for reading */
  337.         perror ("file open failed");
  338.         return 1;
  339.     }
  340.  
  341.     if (!(ht = ht_create (TBLSIZE))) {  /* create/validate hashtable */
  342.         perror ("ht_create (TBLSIZE)");
  343.         exit (EXIT_FAILURE);
  344.     }
  345.  
  346.     while (fgets (buf, MAXC, fp)) {
  347.         int rtn;                        /* variable for insert return */
  348.         buf[strcspn(buf, "\n")] = 0;    /* trim trailing '\n' from buf */
  349.         /* insert word in hashtable/validate */
  350.         if ((rtn = ht_insert_str (&ht, buf)) < 0) {
  351.             if (rtn == -1)
  352.                 fputs ("error: bad parameter or allocation failure\n", stderr);
  353.             else
  354.                 fputs ("error: resize/rehash failure\n", stderr);
  355.             exit (EXIT_FAILURE);
  356.         }
  357.     }
  358.  
  359.     if (fp != stdin)   /* close file if not stdin */
  360.         fclose (fp);
  361.  
  362.     printf ("hast table size : %d\n"
  363.             "buckets filled  : %d\n"
  364.             "load factor     : %.2f\n"
  365.             "words in table  : %zu\n",
  366.             ht_size(ht), ht_filled(ht), ht_load(ht), ht_words(ht));
  367.  
  368.     ht_free (ht);
  369. }
  370.  
Add Comment
Please, Sign In to add comment