Advertisement
Twissel

K&R Exercise 6.1

Apr 24th, 2016
862
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 10.88 KB | None | 0 0
  1. /* K&R 6-1: "Our version of getword() does not properly handle
  2.    underscores, string constants, or preprocessor control lines.
  3.    Write a better version."
  4.  
  5.    This is intended to be a solution to K&R 6-1 in "category 0" as
  6.    defined by the official rules given on Richard Heathfield's "The C
  7.    Programming Language Answers To Exercises" page, found at
  8.    http://users.powernet.co.uk/eton/kandr2/index.html.
  9.  
  10.    For more information on the language for which this is a lexical
  11.    analyzer, please see the comment preceding getword() below.
  12.  
  13.    Note that there is a small modification to ungetch() as defined by
  14.    K&R.  Hopefully this lies within the rules. */
  15.  
  16. /* knr61.c - answer to K&R2 exercise 6-1.
  17.    Copyright (C) 2000 Ben Pfaff <blp@gnu.org>.
  18.  
  19.    This program is free software; you can redistribute it and/or
  20.    modify it under the terms of the GNU General Public License as
  21.    published by the Free Software Foundation; either version 2 of the
  22.    License, or (at your option) any later version.
  23.  
  24.    This program is distributed in the hope that it will be useful, but
  25.    WITHOUT ANY WARRANTY; without even the implied warranty of
  26.    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  27.    General Public License for more details.
  28.  
  29.    You should have received a copy of the GNU General Public License
  30.    along with this program; if not, write to the Free Software
  31.    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  32.    02111-1307, USA. */
  33. #include <ctype.h>
  34. #include <limits.h>
  35. #include <stdio.h>
  36. #include <stdlib.h>
  37. #include <string.h>
  38.  
  39. /* Tokens.  Other non-whitespace characters self-represent themselves
  40.    as tokens. */
  41. enum token
  42.   {
  43.     TOK_ID = UCHAR_MAX + 1,     /* Identifier. */
  44.     TOK_STRING,                 /* String constant. */
  45.     TOK_CHAR,                   /* Character constant. */
  46.     TOK_EOF                     /* End of file. */
  47.   };
  48.  
  49. enum token getword (char *word, int lim);
  50.  
  51. static int skipws (void);
  52. static int getstelem (char **, int *, int);
  53.    
  54. static int getch (void);
  55. static void ungetch (int);
  56. static void putch (char **, int *, int);
  57.  
  58. /* Main program for testing. */
  59. int
  60. main (void)
  61. {
  62.   ungetch ('\n');
  63.  
  64.   for (;;)
  65.     {
  66.       char word[64];
  67.       enum token token;
  68.  
  69.       /* Get token. */
  70.       token = getword (word, sizeof word);
  71.  
  72.       /* Print token type. */
  73.       switch (token)
  74.         {
  75.         case TOK_ID:
  76.           printf ("id");
  77.           break;
  78.  
  79.         case TOK_STRING:
  80.           printf ("string");
  81.           break;
  82.  
  83.         case TOK_CHAR:
  84.           printf ("char");
  85.           break;
  86.  
  87.         case TOK_EOF:
  88.           printf ("eof\n");
  89.           return 0;
  90.  
  91.         default:
  92.           printf ("other");
  93.           word[0] = token;
  94.           word[1] = '\0';
  95.           break;
  96.         }
  97.  
  98.       /* Print token value more or less unambiguously. */
  99.       {
  100.         const char *s;
  101.  
  102.         printf ("\t'");
  103.         for (s = word; *s != '\0'; s++)
  104.           if (isprint (*s) && *s != '\'')
  105.             putchar (*s);
  106.           else if (*s == '\'')
  107.             printf ("\\'");
  108.           else
  109.             /* Potentially wrong. */
  110.             printf ("\\x%02x", *s);
  111.         printf ("'\n");
  112.       }
  113.     }
  114. }
  115.  
  116. /* Parses C-like tokens from stdin:
  117.  
  118.         - Parses C identifiers and string and character constants.
  119.  
  120.         - Other characters, such as operators, punctuation, and digits
  121.           not part of identifiers are considered as tokens in
  122.           themselves.
  123.  
  124.         - Skip comments and preprocessor control lines.
  125.  
  126.    Does not handle trigraphs, line continuation with \, or numerous
  127.    other special C features.
  128.  
  129.    Returns a token type.  This is either one of TOK_* above, or a single
  130.    character in the range 0...UCHAR_MAX.
  131.  
  132.    If TOK_ID, TOK_STRING, or TOK_CHAR is returned, WORD[] is filled
  133.    with the identifier or string value, truncated at LIM - 1
  134.    characters and terminated with '\0'.
  135.  
  136.    For other returned token types, WORD[] is indeterminate. */
  137. enum token
  138. getword (char *word, int lim)
  139. {
  140.   int beg_line, c;
  141.  
  142.   for (;;)
  143.     {
  144.       beg_line = skipws ();
  145.       c = getch ();
  146.  
  147.       if (!beg_line || c != '#')
  148.         break;
  149.      
  150.       /* Skip preprocessor directive. */
  151.       do
  152.         {
  153.           c = getch ();
  154.           if (c == EOF)
  155.             return TOK_EOF;
  156.         }
  157.       while (c != '\n');
  158.       ungetch ('\n');
  159.     }
  160.  
  161.   if (c == EOF)
  162.     return TOK_EOF;
  163.   else if (c == '_' || isalpha ((unsigned char) c))
  164.     {
  165.       do
  166.         {
  167.           putch (&word, &lim, c);
  168.           c = getch ();
  169.         }
  170.       while (isalnum ((unsigned char) c) || c == '_');
  171.  
  172.       ungetch (c);
  173.       return TOK_ID;
  174.     }
  175.   else if (c == '\'' || c == '"')
  176.     {
  177.       int quote = c;
  178.       word[0] = '\0';
  179.       while (getstelem (&word, &lim, quote))
  180.         ;
  181.       return quote == '\'' ? TOK_CHAR : TOK_STRING;
  182.     }
  183.   else
  184.     return (unsigned char) c;
  185. }
  186.  
  187. /* Skips whitespace and comments read from stdin.
  188.    Returns nonzero if a newline was encountered, indicating that we're
  189.    at the beginning of a line. */
  190. static int
  191. skipws (void)
  192. {
  193.   /* Classification of an input character. */
  194.   enum class
  195.     {
  196.       CLS_WS = 0,               /* Whitespace. */
  197.       CLS_BEG_CMT,              /* Slash-star beginning a comment. */
  198.       CLS_END_CMT,              /* Star-slash ending a comment. */
  199.       CLS_OTHER,                /* None of the above. */
  200.  
  201.       CLS_IN_CMT = 4            /* Combined with one of the above,
  202.                                    indicates we're inside a comment. */
  203.     };
  204.  
  205.   /* Either 0, if we're not inside a comment,
  206.      or CLS_IN_CMT, if we are inside a comment. */
  207.   enum class in_comment = 0;
  208.  
  209.   /* Have we encountered a newline outside a comment? */
  210.   int beg_line = 0;
  211.  
  212.   for (;;)
  213.     {
  214.       int c;                    /* Input character. */
  215.       enum class class;         /* Classification of `c'. */
  216.  
  217.       /* Get an input character and determine its classification. */
  218.       c = getch ();
  219.       switch (c)
  220.         {
  221.         case '\n':
  222.           if (!in_comment)
  223.             beg_line = 1;
  224.           /* Fall through. */
  225.          
  226.         case ' ': case '\f': case '\r': case '\t': case '\v':
  227.           class = CLS_WS;
  228.           break;
  229.  
  230.         case '/':
  231.           /* Outside a comment, slash-star begins a comment. */
  232.           if (!in_comment)
  233.             {
  234.               c = getch ();
  235.               if (c == '*')
  236.                 class = CLS_BEG_CMT;
  237.               else
  238.                 {
  239.                   ungetch (c);
  240.                   c = '/';
  241.                   class = CLS_OTHER;
  242.                 }
  243.             }
  244.           else
  245.             class = CLS_OTHER;
  246.           break;
  247.  
  248.         case '*':
  249.           /* Inside a comment, star-slash ends the comment. */
  250.           if (in_comment)
  251.             {
  252.               c = getch ();
  253.               if (c == '/')
  254.                 class = CLS_END_CMT;
  255.               else
  256.                 {
  257.                   ungetch (c);
  258.                   class = CLS_OTHER;
  259.                 }
  260.             }
  261.           else
  262.             class = CLS_OTHER;
  263.           break;
  264.  
  265.         default:
  266.           /* Other characters. */
  267.           if (c == EOF)
  268.             return 0;
  269.           class = CLS_OTHER;
  270.         }
  271.  
  272.       /* Handle character `c' according to its classification
  273.          and whether we're inside a comment. */
  274.       switch (class | in_comment)
  275.         {
  276.         case CLS_WS:
  277.         case CLS_WS | CLS_IN_CMT:
  278.         case CLS_OTHER | CLS_IN_CMT:
  279.           break;
  280.  
  281.         case CLS_BEG_CMT:
  282.           in_comment = CLS_IN_CMT;
  283.           break;
  284.  
  285.         case CLS_OTHER:
  286.           ungetch (c);
  287.           return beg_line;
  288.  
  289.         case CLS_END_CMT | CLS_IN_CMT:
  290.           in_comment = 0;
  291.           break;
  292.  
  293.         case CLS_BEG_CMT | CLS_IN_CMT:
  294.         case CLS_END_CMT:
  295.         default:
  296.           printf ("can't happen\n");
  297.           break;
  298.         }
  299.     }
  300. }
  301.  
  302. /* Get a character inside a quoted string or character constant.
  303.    QUOTE is ' for a character constant or " for a quoted string.
  304.    *WORDP points to a string being constructed that has *LIMP bytes
  305.    available. */
  306. static int
  307. getstelem (char **wordp, int *limp, int quote)
  308. {
  309.   int c;
  310.  
  311.   /* Handle end-of-quote and EOF. */
  312.   c = getch ();
  313.   if (c == quote || c == EOF)
  314.     return 0;
  315.  
  316.   /* Handle ordinary string characters. */
  317.   if (c != '\\')
  318.     {
  319.       putch (wordp, limp, c);
  320.       return 1;
  321.     }
  322.  
  323.   /* We're in a \ escape sequence.
  324.      Get the second character. */
  325.   c = getch ();
  326.   if (c == EOF)
  327.     return 0;
  328.  
  329.   /* Handle simple single-character escapes. */
  330.   {
  331.     static const char escapes[] = {"''??\"\"\\\\a\ab\bf\fn\nr\rt\tv\v"};
  332.     const char *cp = strchr (escapes, c);
  333.     if (cp != NULL)
  334.       {
  335.         putch (wordp, limp, cp[1]);
  336.         return 1;
  337.       }
  338.   }
  339.  
  340.   /* Handle hexadecimal and octal escapes.
  341.      This also handles invalid escapes by default,
  342.      doing nothing useful with them.
  343.      That's okay because invalid escapes generate undefined behavior. */
  344.   {
  345.     unsigned char v = 0;
  346.  
  347.     if (c == 'x' || c == 'X')
  348.       for (;;)
  349.         {
  350.           static const char hexits[] = "0123456789abcdef";
  351.           const char *p;
  352.  
  353.           c = getch ();
  354.           p = strchr (hexits, tolower ((unsigned char) c));
  355.           if (p == NULL)
  356.             break;
  357.           v = v * 16 + (p - hexits);
  358.         }
  359.     else
  360.       {
  361.         int i;
  362.  
  363.         for (i = 0; i < 3; i++)
  364.           {
  365.             v = v * 8 + (c - '0');
  366.             c = getch ();
  367.             if (c < '0' || c > '7')
  368.               break;
  369.           }
  370.       }
  371.        
  372.     putch (wordp, limp, v);
  373.     ungetch (c);
  374.   }
  375.  
  376.   return 1;
  377. }
  378.  
  379. /* Capacity of putback buffer. */
  380. #define BUFSIZE 100
  381.  
  382. /* Putback buffer. */
  383. char buf[BUFSIZE];
  384.  
  385. /* Number of characters in putback buffer. */
  386. int bufp = 0;
  387.  
  388. /* Retrieves and returns a character from stdin or from the putback
  389.    buffer.
  390.    Returns EOF if end of file is encountered. */
  391. int
  392. getch (void)
  393. {
  394.   return bufp > 0 ? buf[--bufp] : getchar ();
  395. }
  396.  
  397. /* Stuffs character C into the putback buffer.
  398.    From the caller's perspective, fails silently if the putback buffer
  399.    is full. */
  400. void
  401. ungetch (int c)
  402. {
  403.   if (c == EOF)
  404.     return;
  405.  
  406.   if (bufp >= BUFSIZE)
  407.     printf ("ungetch: too many characters\n");
  408.   else
  409.     buf[bufp++] = c;
  410. }
  411.  
  412. /* Stuffs character C into buffer *WORDP, which has *LIMP bytes
  413.    available.
  414.    Advances *WORDP and reduces *LIMP as appropriate.
  415.    Drops the character on the floor if it would overflow the buffer.
  416.    Ensures that *WORDP is null terminated if possible. */
  417. static void
  418. putch (char **wordp, int *limp, int c)
  419. {
  420.   if (*limp > 1)
  421.     {
  422.       *(*wordp)++ = c;
  423.       (*limp)--;
  424.     }
  425.   if (*limp > 0)
  426.     **wordp = '\0';
  427. }
  428.  
  429. /*
  430.    Local variables:
  431.    compile-command: "checkergcc -W -Wall -ansi -pedantic knr61.c -o knr61"
  432.    End:
  433. */
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement