occurence.c

/* SPDX-License-Identifier: GPL-2.0 */

/*
 * Find the duplication words in different sentenses.
 *
 * Possible ways to extend:
 *   - support of input from file
 *   - track and collect current sentense number
 *   - better exit handling (perhaps atexit() implementation)
 *   - support locales (call setlocale() and other stuff)
 *   - support Unicode (characters should be multi-byte sequences)
 *   - use hash instead of dumb list with O(n^2) time of execution
 *   - hash will allow to digest huge files using sliding window approach
 *   - alternative is word grep algorithm
 *   - morphem based search (for example, it, its — can be considered same)
 */

#include <ctype.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>

/* Double linked list implementation */

struct list_head {
        struct list_head *prev;
        struct list_head *next;
};

#define LIST_HEAD(name) \
        struct list_head name = { &(name), &(name) }

static inline void INIT_LIST_HEAD(struct list_head *list)
{
        list->next = list;
        list->prev = list;
}

static void list_del(struct list_head *entry)
{
        struct list_head *prev = entry->prev;
        struct list_head *next = entry->next;

        next->prev = prev;
        prev->next = next;
}

static void list_add(struct list_head *new, struct list_head *head)
{
        struct list_head *prev = head->prev;
        struct list_head *next = head;

        next->prev = new;
        new->next = next;
        new->prev = prev;
        prev->next = new;
}

static int list_empty(const struct list_head *head)
{
        return head->next == head;
}

static inline int list_is_singular(const struct list_head *head)
{
        return !list_empty(head) && (head->next == head->prev);
}

/* The member type of list_head MUST be first in a structure */
#define list_entry(ptr, type, member)           ((type *)(ptr))

#define list_first_entry(ptr, type, member)     list_entry((ptr)->next, type, member)

#define list_next_entry(pos, member)            list_entry((pos)->member.next, typeof(*(pos)), member)

#define list_last_entry(ptr, type, member)      list_entry((ptr)->prev, type, member)

#define list_for_each_entry_safe(pos, n, head, member)                  \
        for (pos = list_first_entry(head, typeof(*pos), member),        \
               n = list_next_entry(pos, member);                        \
             &pos->member != (head);                                    \
             pos = n, n = list_next_entry(n, member))

/*************************************/

struct off {
        struct list_head node;  /* MUST be first member */
        int offset;
};

struct word {
        struct list_head node;  /* MUST be first member */
        struct list_head off;
        const char *w;
};

static void offset_free(struct list_head *head)
{
        struct off *o, *_o;

        list_for_each_entry_safe(o, _o, head, node) {
                list_del(&o->node);
                free(o);
        }
}

static int offset_add(struct list_head *head, int offset)
{
        struct off *o;

        o = malloc(sizeof(*o));
        if (!o)
                return -1;

        o->offset = offset;

        list_add(&o->node, head);
        return 0;
}

static void word_free(struct list_head *head)
{
        struct word *w, *_w;

        list_for_each_entry_safe(w, _w, head, node) {
                offset_free(&w->off);
                list_del(&w->node);
                free(w);
        }
}

static int word_add(struct list_head *head, const char *word, int offset)
{
        struct word *w;
        int ret;

        w = malloc(sizeof(*w));
        if (!w)
                return -1;

        INIT_LIST_HEAD(&w->off);
        ret = offset_add(&w->off, offset);
        if (ret) {
                free(w);
                return ret;
        }

        w->w = word;

        list_add(&w->node, head);
        return 0;
}

static struct word *word_find(struct list_head *head, const char *word)
{
        struct word *w, *_w;

        list_for_each_entry_safe(w, _w, head, node) {
                if (!strcasecmp(w->w, word))
                        return w;
        }
        return NULL;
}

static void word_dump(struct list_head *head, bool dups)
{
        struct word *w, *_w;

        printf("Dump %s:\n", dups ? "dups only" : "entire list");
        list_for_each_entry_safe(w, _w, head, node) {
                struct off *o, *_o;

                if (dups && list_is_singular(&w->off))
                        continue;

                printf("%s:", w->w);
                list_for_each_entry_safe(o, _o, &w->off, node) {
                        printf(" %d", o->offset);
                }
                printf("\n");
        }
        printf("\n");
}

int tokenize(struct list_head *head, char *data)
{
        char *p = data, *q;
        int current = 0;
        struct word *w;
        int ret;

        while ((q = strsep(&p, " \t\n"))) {
                int offset = (uintptr_t)q - (uintptr_t)data;
                char *r = q + strlen(q) - 1;
                bool end;

                /* Skip whitespaces */
                if (*q == '\0')
                        continue;

                /* End of sentense? */
                end = strchr(".!?", *r) != NULL;
                while (r >= q && !isalnum(*r))
                        *r-- = '\0';

                /* Skip malformed sentenses */
                if (*q == '\0')
                        continue;

                w = word_find(head, q);
                if (w) {
                        /* We have it in our list */
                        struct off *last = list_last_entry(&w->off, struct off, node);

                        if (last->offset < current) {
                                ret = offset_add(&w->off, offset);
                                if (ret) {
                                        word_free(head);
                                        return ret;
                                }
                        }
                } else {
                        /* New word */
                        ret = word_add(head, q, offset);
                        if (ret) {
                                word_free(head);
                                return ret;
                        }
                }

                /* Track current sentense by word offset */
                current = end ? (uintptr_t)r - (uintptr_t)data : current;
        }
        return 0;
}

static const char *input =
        "\n"
        "What the heck? It's easy to count 33 cows. Easy to\n"
        "implement. A computer is dumb, user is slave!\n";

int main(int argc, char *argv[])
{
        char *data = strdup(input);
        LIST_HEAD(words);
        int ret;

        ret = tokenize(&words, data);
        if (ret) {
                free(data);
                err(1, "tokenize()");
        }

        word_dump(&words, false);
        word_dump(&words, true);

        word_free(&words);
        free(data);
        return 0;
}