K&R Exercise 6.1

/* K&R 6-1: "Our version of getword() does not properly handle
   underscores, string constants, or preprocessor control lines.
   Write a better version."

   This is intended to be a solution to K&R 6-1 in "category 0" as
   defined by the official rules given on Richard Heathfield's "The C
   Programming Language Answers To Exercises" page, found at
   http://users.powernet.co.uk/eton/kandr2/index.html.

   For more information on the language for which this is a lexical
   analyzer, please see the comment preceding getword() below.

   Note that there is a small modification to ungetch() as defined by
   K&R.  Hopefully this lies within the rules. */

/* knr61.c - answer to K&R2 exercise 6-1.
   Copyright (C) 2000 Ben Pfaff <[email protected]>.

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
   02111-1307, USA. */
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Tokens.  Other non-whitespace characters self-represent themselves
   as tokens. */
enum token
  {
    TOK_ID = UCHAR_MAX + 1,     /* Identifier. */
    TOK_STRING,                 /* String constant. */
    TOK_CHAR,                   /* Character constant. */
    TOK_EOF                     /* End of file. */
  };

enum token getword (char *word, int lim);

static int skipws (void);
static int getstelem (char **, int *, int);

static int getch (void);
static void ungetch (int);
static void putch (char **, int *, int);

/* Main program for testing. */
int
main (void)
{
  ungetch ('\n');

  for (;;)
    {
      char word[64];
      enum token token;

      /* Get token. */
      token = getword (word, sizeof word);

      /* Print token type. */
      switch (token)
        {
        case TOK_ID:
          printf ("id");
          break;

        case TOK_STRING:
          printf ("string");
          break;

        case TOK_CHAR:
          printf ("char");
          break;

        case TOK_EOF:
          printf ("eof\n");
          return 0;

        default:
          printf ("other");
          word[0] = token;
          word[1] = '\0';
          break;
        }

      /* Print token value more or less unambiguously. */
      {
        const char *s;

        printf ("\t'");
        for (s = word; *s != '\0'; s++)
          if (isprint (*s) && *s != '\'')
            putchar (*s);
          else if (*s == '\'')
            printf ("\\'");
          else
            /* Potentially wrong. */
            printf ("\\x%02x", *s);
        printf ("'\n");
      }
    }
}

/* Parses C-like tokens from stdin:

        - Parses C identifiers and string and character constants.

        - Other characters, such as operators, punctuation, and digits
          not part of identifiers are considered as tokens in
          themselves.

        - Skip comments and preprocessor control lines.

   Does not handle trigraphs, line continuation with \, or numerous
   other special C features.

   Returns a token type.  This is either one of TOK_* above, or a single
   character in the range 0...UCHAR_MAX.

   If TOK_ID, TOK_STRING, or TOK_CHAR is returned, WORD[] is filled
   with the identifier or string value, truncated at LIM - 1
   characters and terminated with '\0'.

   For other returned token types, WORD[] is indeterminate. */
enum token
getword (char *word, int lim)
{
  int beg_line, c;

  for (;;)
    {
      beg_line = skipws ();
      c = getch ();

      if (!beg_line || c != '#')
        break;

      /* Skip preprocessor directive. */
      do
        {
          c = getch ();
          if (c == EOF)
            return TOK_EOF;
        }
      while (c != '\n');
      ungetch ('\n');
    }

  if (c == EOF)
    return TOK_EOF;
  else if (c == '_' || isalpha ((unsigned char) c))
    {
      do
        {
          putch (&word, &lim, c);
          c = getch ();
        }
      while (isalnum ((unsigned char) c) || c == '_');

      ungetch (c);
      return TOK_ID;
    }
  else if (c == '\'' || c == '"')
    {
      int quote = c;
      word[0] = '\0';
      while (getstelem (&word, &lim, quote))
        ;
      return quote == '\'' ? TOK_CHAR : TOK_STRING;
    }
  else
    return (unsigned char) c;
}

/* Skips whitespace and comments read from stdin.
   Returns nonzero if a newline was encountered, indicating that we're
   at the beginning of a line. */
static int
skipws (void)
{
  /* Classification of an input character. */
  enum class
    {
      CLS_WS = 0,               /* Whitespace. */
      CLS_BEG_CMT,              /* Slash-star beginning a comment. */
      CLS_END_CMT,              /* Star-slash ending a comment. */
      CLS_OTHER,                /* None of the above. */

      CLS_IN_CMT = 4            /* Combined with one of the above,
                                   indicates we're inside a comment. */
    };

  /* Either 0, if we're not inside a comment,
     or CLS_IN_CMT, if we are inside a comment. */
  enum class in_comment = 0;

  /* Have we encountered a newline outside a comment? */
  int beg_line = 0;

  for (;;)
    {
      int c;                    /* Input character. */
      enum class class;         /* Classification of `c'. */

      /* Get an input character and determine its classification. */
      c = getch ();
      switch (c)
        {
        case '\n':
          if (!in_comment)
            beg_line = 1;
          /* Fall through. */

        case ' ': case '\f': case '\r': case '\t': case '\v':
          class = CLS_WS;
          break;

        case '/':
          /* Outside a comment, slash-star begins a comment. */
          if (!in_comment)
            {
              c = getch ();
              if (c == '*')
                class = CLS_BEG_CMT;
              else
                {
                  ungetch (c);
                  c = '/';
                  class = CLS_OTHER;
                }
            }
          else
            class = CLS_OTHER;
          break;

        case '*':
          /* Inside a comment, star-slash ends the comment. */
          if (in_comment)
            {
              c = getch ();
              if (c == '/')
                class = CLS_END_CMT;
              else
                {
                  ungetch (c);
                  class = CLS_OTHER;
                }
            }
          else
            class = CLS_OTHER;
          break;

        default:
          /* Other characters. */
          if (c == EOF)
            return 0;
          class = CLS_OTHER;
        }

      /* Handle character `c' according to its classification
         and whether we're inside a comment. */
      switch (class | in_comment)
        {
        case CLS_WS:
        case CLS_WS | CLS_IN_CMT:
        case CLS_OTHER | CLS_IN_CMT:
          break;

        case CLS_BEG_CMT:
          in_comment = CLS_IN_CMT;
          break;

        case CLS_OTHER:
          ungetch (c);
          return beg_line;

        case CLS_END_CMT | CLS_IN_CMT:
          in_comment = 0;
          break;

        case CLS_BEG_CMT | CLS_IN_CMT:
        case CLS_END_CMT:
        default:
          printf ("can't happen\n");
          break;
        }
    }
}

/* Get a character inside a quoted string or character constant.
   QUOTE is ' for a character constant or " for a quoted string.
   *WORDP points to a string being constructed that has *LIMP bytes
   available. */
static int
getstelem (char **wordp, int *limp, int quote)
{
  int c;

  /* Handle end-of-quote and EOF. */
  c = getch ();
  if (c == quote || c == EOF)
    return 0;

  /* Handle ordinary string characters. */
  if (c != '\\')
    {
      putch (wordp, limp, c);
      return 1;
    }

  /* We're in a \ escape sequence.
     Get the second character. */
  c = getch ();
  if (c == EOF)
    return 0;

  /* Handle simple single-character escapes. */
  {
    static const char escapes[] = {"''??\"\"\\\\a\ab\bf\fn\nr\rt\tv\v"};
    const char *cp = strchr (escapes, c);
    if (cp != NULL)
      {
        putch (wordp, limp, cp[1]);
        return 1;
      }
  }

  /* Handle hexadecimal and octal escapes.
     This also handles invalid escapes by default,
     doing nothing useful with them.
     That's okay because invalid escapes generate undefined behavior. */
  {
    unsigned char v = 0;

    if (c == 'x' || c == 'X')
      for (;;)
        {
          static const char hexits[] = "0123456789abcdef";
          const char *p;

          c = getch ();
          p = strchr (hexits, tolower ((unsigned char) c));
          if (p == NULL)
            break;
          v = v * 16 + (p - hexits);
        }
    else
      {
        int i;

        for (i = 0; i < 3; i++)
          {
            v = v * 8 + (c - '0');
            c = getch ();
            if (c < '0' || c > '7')
              break;
          }
      }

    putch (wordp, limp, v);
    ungetch (c);
  }

  return 1;
}

/* Capacity of putback buffer. */
#define BUFSIZE 100

/* Putback buffer. */
char buf[BUFSIZE];

/* Number of characters in putback buffer. */
int bufp = 0;

/* Retrieves and returns a character from stdin or from the putback
   buffer.
   Returns EOF if end of file is encountered. */
int
getch (void)
{
  return bufp > 0 ? buf[--bufp] : getchar ();
}

/* Stuffs character C into the putback buffer.
   From the caller's perspective, fails silently if the putback buffer
   is full. */
void
ungetch (int c)
{
  if (c == EOF)
    return;

  if (bufp >= BUFSIZE)
    printf ("ungetch: too many characters\n");
  else
    buf[bufp++] = c;
}

/* Stuffs character C into buffer *WORDP, which has *LIMP bytes
   available.
   Advances *WORDP and reduces *LIMP as appropriate.
   Drops the character on the floor if it would overflow the buffer.
   Ensures that *WORDP is null terminated if possible. */
static void
putch (char **wordp, int *limp, int c)
{
  if (*limp > 1)
    {
      *(*wordp)++ = c;
      (*limp)--;
    }
  if (*limp > 0)
    **wordp = '\0';
}

/*
   Local variables:
   compile-command: "checkergcc -W -Wall -ansi -pedantic knr61.c -o knr61"
   End:
*/