Advertisement
Woobinda

Pattern "Template Method"

Dec 28th, 2018
178
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.80 KB | None | 0 0
  1. import abc
  2. import html.parser
  3. import os
  4. import re
  5. import sys
  6.  
  7.  
  8. def main():
  9.     if len(sys.argv) == 1 or sys.argv[1] in {"-h", "--help"}:
  10.         print("usage: {} <files>".format(os.path.basename(sys.argv[0])))
  11.         sys.exit(1)
  12.     count_words_in_files(sys.argv[1:])
  13.  
  14.  
  15. def count_words_in_files(files):
  16.     total = 0
  17.     for filename in files:
  18.         count = count_words(filename)
  19.         if count is not None:
  20.             total += count
  21.             print("{:9,} words in {}".format(count, filename))
  22.     print("total: {:,} words".format(total))
  23.  
  24.  
  25. def count_words(filename):
  26.     for wordCounter in (PlainTextWordCounter, HtmlWordCounter):
  27.         if wordCounter.can_count(filename):
  28.             return wordCounter.count(filename)
  29.  
  30.  
  31. class AbstractWordCounter(
  32.         metaclass=abc.ABCMeta):
  33.  
  34.     @staticmethod
  35.     @abc.abstractmethod
  36.     def can_count(filename):
  37.         pass
  38.  
  39.  
  40.     @staticmethod
  41.     @abc.abstractmethod
  42.     def count(filename):
  43.         pass
  44.  
  45.  
  46. class PlainTextWordCounter(AbstractWordCounter):
  47.  
  48.     @staticmethod
  49.     def can_count(filename):
  50.         return filename.lower().endswith(".txt")
  51.  
  52.  
  53.     @staticmethod
  54.     def count(filename):
  55.         if not PlainTextWordCounter.can_count(filename):
  56.             return 0
  57.         regex = re.compile(r"\w+")
  58.         total = 0
  59.         with open(filename, encoding="utf-8") as file:
  60.             for line in file:
  61.                 for _ in regex.finditer(line):
  62.                     total += 1
  63.         return total
  64.  
  65.  
  66. class HtmlWordCounter(AbstractWordCounter):
  67.  
  68.     class __HtmlParser(html.parser.HTMLParser):
  69.  
  70.         def __init__(self):
  71.             super().__init__()
  72.             self.regex = re.compile(r"\w+")
  73.             self.inText = True
  74.             self.text = []
  75.             self.count = 0
  76.  
  77.  
  78.         def handle_starttag(self, tag, attrs):
  79.             if tag in {"script", "style"}:
  80.                 self.inText = False
  81.  
  82.  
  83.         def handle_endtag(self, tag):
  84.             if tag in {"script", "style"}:
  85.                 self.inText = True
  86.             else:
  87.                 for _ in self.regex.finditer(" ".join(self.text)):
  88.                     self.count += 1
  89.                 self.text = []
  90.  
  91.  
  92.         def handle_data(self, text):
  93.             if self.inText:
  94.                 text = text.rstrip()
  95.                 if text:
  96.                     self.text.append(text)
  97.  
  98.  
  99.     @staticmethod
  100.     def can_count(filename):
  101.         return filename.lower().endswith((".htm", ".html"))
  102.  
  103.  
  104.     @staticmethod
  105.     def count(filename):
  106.         if not HtmlWordCounter.can_count(filename):
  107.             return 0
  108.         parser = HtmlWordCounter.__HtmlParser()
  109.         with open(filename, encoding="utf-8") as file:
  110.             parser.feed(file.read())
  111.         return parser.count
  112.  
  113.  
  114. if __name__ == "__main__":
  115.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement