/* Untitled */

#include <stdio.h>      /* for fopen(), fgetc() etc. */
#include <stdlib.h>     /* for EXIT_SUCCESS etc. */
#include <string.h>     /* for memset() */
#include <limits.h>     /* for UCHAR_MAX */
#include <stdint.h>     /* for SIZE_MAX */
#include <ctype.h>      /* for islower() */

/* Name of the file the words are read from; main() opens it for reading. */
static const char *input_file_name = "5_com_five.txt";
/* Name of the file the selected words are written to; main() opens it
   for writing. */
static const char *output_file_name = "5_test.txt";
/* The set of letters whose repetitions are counted within each word. */
static const char *alphabet = "abcdefghijklmnopqrstuvwxyz";


/*
 * Compile-time guard: prevent the program compiling if UCHAR_MAX+1 is
 * not representable as size_t.
 *
 * When UCHAR_MAX >= SIZE_MAX the array below is declared with a
 * negative size, which is a constraint violation the compiler must
 * diagnose.  Otherwise the function declares a one-element array and
 * does nothing.  It takes no arguments and has no run-time effect.
 */
static void require_uchar_max_plus_1_does_not_overflow(void)
{
  char guard[UCHAR_MAX >= SIZE_MAX ? -1 : 1];
  (void)guard;  /* silence the unused-variable warning */
}

/*
 * Check that membership in `alphabet` agrees with islower() for every
 * value in the range 0..UCHAR_MAX.
 *
 * Returns 0 when the two agree everywhere.  On the first disagreement
 * a diagnostic is written to stderr and -1 is returned.
 */
static int self_test(void)
{
  for (int ch = 0; ch <= UCHAR_MAX; ++ch)
    {
      const int expected = islower(ch) ? 1 : 0;
      /* strchr() also matches the terminating NUL of `alphabet`, so
         character 0 has to be excluded explicitly. */
      const int got = (ch != 0 && strchr(alphabet, (char)ch) != NULL) ? 1 : 0;
      if (expected == got)
        {
          continue;
        }

      fprintf(stderr, "expected=%d, got=%d\n", expected, got);
      /* Only echo the character itself when it has a visible glyph. */
      if (isgraph(ch))
        {
          fprintf(stderr, "self-test failed for character %c (%d)\n", ch, ch);
        }
      else
        {
          fprintf(stderr, "self-test failed for non-printing character %d\n", ch);
        }
      fprintf(stderr, "If you didn't expect a self-test failure, perhaps your locale settings are unexpected; try running with LC_ALL=C\n");
      return -1;
    }
  return 0;
}


/* The `want` and `freq` arrays indicate respectively whether this is
 * a letter we care about and how many of them we have seen in this
 * word.  Both are indexed by the character value converted to
 * unsigned char.
 *
 * Since main() is not re-entrant, it will not matter from a
 * correctness point of view whether we allocate these on the stack or
 * not.  But since we may be executing the code on a platform with
 * limited stack, we avoid the stack.  This is not likely to be a
 * problem in practice however, because platforms where this is likely
 * to be a problem (e.g. systems with CHAR_BIT>8 and a small stack) are
 * also unlikely to have a working fopen() function.
 */
static int want[UCHAR_MAX + 1];
/* Assume unibyte input, i.e. that all letters are represented
   by a single char in the input. */
static size_t freq[UCHAR_MAX + 1];

/*
 * Copy one line from `fin` to `fout`.
 *
 * The read position of `fin` is first moved to `pos`; characters are
 * then copied to `fout` up to and including the next '\n', or until
 * end-of-file if no newline is found.  When a newline was copied,
 * `fin` is left positioned just after it.
 *
 * Returns 0 on success.  On any failure (repositioning, reading or
 * writing) a message is printed with perror() and -1 is returned, so
 * the caller decides how to terminate.
 */
static int print_word(FILE *fin, const fpos_t* pos, FILE *fout)
{
  if (0 != fsetpos(fin, pos))
    {
      perror("fsetpos");
      /* Report through the return value like every other failure in
         this function instead of terminating the process here. */
      return -1;
    }

  int ch;
  while ((ch = fgetc(fin)) != EOF)
    {
      if (fputc(ch, fout) == EOF)
        {
          perror(output_file_name);
          return -1;
        }
      if (ch == '\n')
        {
          break;
        }
    }

  /* fgetc() returns EOF both at end-of-file and on error; tell them
     apart here. */
  if (ferror(fin))
    {
      perror(input_file_name);
      return -1;
    }
  return 0;
}


/*
 * Read `input_file_name` one line (word) at a time and copy to
 * `output_file_name` every line in which some letter of `alphabet`
 * occurs more than once.
 *
 * The command-line arguments are ignored.
 *
 * Returns EXIT_SUCCESS when the whole input was processed and the
 * output was closed without error; otherwise a diagnostic is printed
 * to stderr and EXIT_FAILURE is returned.
 */
int main (int argc, char *argv[]) {
  (void) argc;
  (void) argv;
  (void) require_uchar_max_plus_1_does_not_overflow; /* avoid unused-function warning */

  if (self_test() < 0)
    {
      return EXIT_FAILURE;
    }

  /* Open the input before the output: opening with "w" creates or
     truncates the output file, which should not happen when there is
     no input to process. */
  FILE *input = fopen(input_file_name, "r");
  if (input == NULL)
    {
      perror(input_file_name);
      return EXIT_FAILURE;
    }
  FILE *output = fopen(output_file_name, "w");
  if (output == NULL)
    {
      /* perror() first: fclose() may change errno. */
      perror(output_file_name);
      (void) fclose(input);
      return EXIT_FAILURE;
    }

  memset(want, 0, sizeof(want));
  for (const char *p = alphabet; *p; ++p)
    {
      want[(unsigned char)*p] = 1;
    }

  int status = EXIT_SUCCESS;
  int finished = 0;
  while (!finished && status == EXIT_SUCCESS)
    {
      /* Zero out the frequency histogram.  Reducing the number of
       * times we execute this loop is the primary motivation for the
       * existence of `alphabet` (and `self_test()`): without it we
       * could simply use islower() from the standard library.  This
       * is almost certainly a premature optimization, and may not
       * necessarily even be faster, depending on the behaviour of CPU
       * cache.  Certainly I wouldn't choose to (initially at least)
       * implement things this way in a professional context.  The
       * only way to tell for sure is to benchmark it, but for the
       * current implementation, I/O will likely dominate anyway.
       */
      for (const char *p = alphabet; *p; ++p)
        {
          freq[(unsigned char)*p] = 0;
        }

      /* Remember where this word started. */
      fpos_t word_start;
      if (0 != fgetpos(input, &word_start))
        {
          perror("fgetpos");
          status = EXIT_FAILURE;
          break;
        }

      /* Read this word/line, character by character. */
      for (;;)
        {
          int ch = fgetc(input);
          if (ch == EOF)
            {
              /* End of file or read error; ferror() below tells
                 which. */
              finished = 1;
              break;
            }

          if (ch == '\n')
            {
              /* If our word contained a repeated character we already
                 printed it, so there is nothing to do here. */
              break;
            }

          /* We don't need to worry about overflow in freq[] as the
             value never gets higher than 2. */
          if (want[(unsigned char)ch] && ++freq[(unsigned char)ch] > 1)
            {
              if (print_word(input, &word_start, output) < 0)
                {
                  /* print_word() already printed the error message. */
                  status = EXIT_FAILURE;
                }
              /* We have printed the word, and this leaves us at the
                 end of the line.  So, break out of the inner loop in
                 order to process the next word (or to stop, if
                 printing failed). */
              break;
            }
        }
    }

  if (status == EXIT_SUCCESS && ferror(input))
    {
      perror(input_file_name);
      status = EXIT_FAILURE;
    }

  /* Nothing is lost if closing a stream we only read from fails. */
  (void) fclose(input);

  /* fclose() flushes buffered output, so a failure here means the
     output file may be incomplete and must be reported. */
  if (fclose(output) != 0)
    {
      perror(output_file_name);
      status = EXIT_FAILURE;
    }
  return status;
}