Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /* SPDX-License-Identifier: GPL-2.0 */
- /*
- * Find the duplication words in different sentenses.
- *
- * Possible ways to extend:
- * - support of input from file
- * - track and collect current sentense number
- * - better exit handling (perhaps atexit() implementation)
- * - support locales (call setlocale() and other stuff)
- * - support Unicode (characters should be multi-byte sequences)
- * - use hash instead of dumb list with O(n^2) time of execution
- * - hash will allow to digest huge files using sliding window approach
- * - alternative is word grep algorithm
- * - morphem based search (for example, it, its — can be considered same)
- */
- #include <ctype.h>
- #include <err.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <stdint.h>
- #include <stdbool.h>
- #include <string.h>
- /* Double linked list implementation */
- struct list_head {
- struct list_head *prev;
- struct list_head *next;
- };
- #define LIST_HEAD(name) \
- struct list_head name = { &(name), &(name) }
- static inline void INIT_LIST_HEAD(struct list_head *list)
- {
- list->next = list;
- list->prev = list;
- }
- static void list_del(struct list_head *entry)
- {
- struct list_head *prev = entry->prev;
- struct list_head *next = entry->next;
- next->prev = prev;
- prev->next = next;
- }
- static void list_add(struct list_head *new, struct list_head *head)
- {
- struct list_head *prev = head->prev;
- struct list_head *next = head;
- next->prev = new;
- new->next = next;
- new->prev = prev;
- prev->next = new;
- }
- static int list_empty(const struct list_head *head)
- {
- return head->next == head;
- }
- static inline int list_is_singular(const struct list_head *head)
- {
- return !list_empty(head) && (head->next == head->prev);
- }
- /* The member type of list_head MUST be first in a structure */
- #define list_entry(ptr, type, member) ((type *)(ptr))
- #define list_first_entry(ptr, type, member) list_entry((ptr)->next, type, member)
- #define list_next_entry(pos, member) list_entry((pos)->member.next, typeof(*(pos)), member)
- #define list_last_entry(ptr, type, member) list_entry((ptr)->prev, type, member)
- #define list_for_each_entry_safe(pos, n, head, member) \
- for (pos = list_first_entry(head, typeof(*pos), member), \
- n = list_next_entry(pos, member); \
- &pos->member != (head); \
- pos = n, n = list_next_entry(n, member))
- /*************************************/
- struct off {
- struct list_head node; /* MUST be first member */
- int offset;
- };
- struct word {
- struct list_head node; /* MUST be first member */
- struct list_head off;
- const char *w;
- };
- static void offset_free(struct list_head *head)
- {
- struct off *o, *_o;
- list_for_each_entry_safe(o, _o, head, node) {
- list_del(&o->node);
- free(o);
- }
- }
- static int offset_add(struct list_head *head, int offset)
- {
- struct off *o;
- o = malloc(sizeof(*o));
- if (!o)
- return -1;
- o->offset = offset;
- list_add(&o->node, head);
- return 0;
- }
- static void word_free(struct list_head *head)
- {
- struct word *w, *_w;
- list_for_each_entry_safe(w, _w, head, node) {
- offset_free(&w->off);
- list_del(&w->node);
- free(w);
- }
- }
- static int word_add(struct list_head *head, const char *word, int offset)
- {
- struct word *w;
- int ret;
- w = malloc(sizeof(*w));
- if (!w)
- return -1;
- INIT_LIST_HEAD(&w->off);
- ret = offset_add(&w->off, offset);
- if (ret) {
- free(w);
- return ret;
- }
- w->w = word;
- list_add(&w->node, head);
- return 0;
- }
- static struct word *word_find(struct list_head *head, const char *word)
- {
- struct word *w, *_w;
- list_for_each_entry_safe(w, _w, head, node) {
- if (!strcasecmp(w->w, word))
- return w;
- }
- return NULL;
- }
- static void word_dump(struct list_head *head, bool dups)
- {
- struct word *w, *_w;
- printf("Dump %s:\n", dups ? "dups only" : "entire list");
- list_for_each_entry_safe(w, _w, head, node) {
- struct off *o, *_o;
- if (dups && list_is_singular(&w->off))
- continue;
- printf("%s:", w->w);
- list_for_each_entry_safe(o, _o, &w->off, node) {
- printf(" %d", o->offset);
- }
- printf("\n");
- }
- printf("\n");
- }
- int tokenize(struct list_head *head, char *data)
- {
- char *p = data, *q;
- int current = 0;
- struct word *w;
- int ret;
- while ((q = strsep(&p, " \t\n"))) {
- int offset = (uintptr_t)q - (uintptr_t)data;
- char *r = q + strlen(q) - 1;
- bool end;
- /* Skip whitespaces */
- if (*q == '\0')
- continue;
- /* End of sentense? */
- end = strchr(".!?", *r) != NULL;
- while (r >= q && !isalnum(*r))
- *r-- = '\0';
- /* Skip malformed sentenses */
- if (*q == '\0')
- continue;
- w = word_find(head, q);
- if (w) {
- /* We have it in our list */
- struct off *last = list_last_entry(&w->off, struct off, node);
- if (last->offset < current) {
- ret = offset_add(&w->off, offset);
- if (ret) {
- word_free(head);
- return ret;
- }
- }
- } else {
- /* New word */
- ret = word_add(head, q, offset);
- if (ret) {
- word_free(head);
- return ret;
- }
- }
- /* Track current sentense by word offset */
- current = end ? (uintptr_t)r - (uintptr_t)data : current;
- }
- return 0;
- }
- static const char *input =
- "\n"
- "What the heck? It's easy to count 33 cows. Easy to\n"
- "implement. A computer is dumb, user is slave!\n";
- int main(int argc, char *argv[])
- {
- char *data = strdup(input);
- LIST_HEAD(words);
- int ret;
- ret = tokenize(&words, data);
- if (ret) {
- free(data);
- err(1, "tokenize()");
- }
- word_dump(&words, false);
- word_dump(&words, true);
- word_free(&words);
- free(data);
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement