Advertisement
Guest User

Untitled

a guest
Oct 25th, 2016
58
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.15 KB | None | 0 0
  1. # coding=utf-8
  2. import os
  3. import sys
  4. import re
  5. import codecs
  6.  
  7.  
  8. def processFile(filepath):
  9.     fp = codecs.open(filepath, 'rU', 'iso-8859-2')
  10.     content = fp.read()
  11.  
  12.     filepath = re.findall("\d*.html", filepath)[0]
  13.     author = re.findall(r'<META NAME=\"AUTOR\" CONTENT=\"(.*)\">', content, re.UNICODE)[0]
  14.     dzial = re.findall(r'<META NAME=\"DZIAL\" CONTENT=\"(.*)\">', content, re.UNICODE)[0]
  15.     key_words = re.findall(r'<META NAME=\"KLUCZOWE_\d*\" CONTENT=\"(.*)\">', content, re.UNICODE)
  16.     key_words = ', '.join(key_words)
  17.  
  18.  
  19.     article = re.findall(r'<[pP]>\s?(.*)\s?</[pP]>', content, re.UNICODE)
  20.     article = ' '.join(article)
  21.     article = re.sub(r'<[^>]*>', '',article, re.UNICODE)
  22.  
  23.  
  24.     abb = len(re.findall('\s+([a-zA-Z]{1,3}\.)\s+[^A-Z]', article, re.UNICODE))
  25.  
  26.     article = re.sub('(\s+)([a-zA-Z]{1,3}\.)(\s+[^A-Z])', r'\3', article, re.UNICODE)
  27.  
  28.     dates = len(re.findall(r'((\d{4})(?P<delimiter>[-\/:\.])([0-2][0-9])(?P=delimiter)(0[0-9]|1[0-2])'
  29.                      r'|(\d{4})(?P<delimiter2>[-\/:\.])(3[0-1])(?P=delimiter2)(01|0[3-9]|1[0-2])|'
  30.                     r'([0-2][0-9])(?P<delimiter3>[-\/:\.])(0[0-9]|1[0-2])(?P=delimiter3)(\d{4})|'
  31.                     r'(3[0-1])(?P<delimiter4>[-\/:\.])(01|0[3-9]|1[0-2])(?P=delimiter4)(\d{4}))(?![\s\S]*\1)',
  32.                      content, re.UNICODE))
  33.  
  34.     sentences = len(re.findall(r'([A-Z][^\.!?]*[\.!?]+)', article, re.UNICODE))
  35.  
  36.  
  37.     #(yourPatternHere)(?![\s\S]*\1)
  38.  
  39.     integers = len(re.findall(r'([+-]?(?<!\.)\b[0-9]{1,4}\b(?!\.[0-9])|'
  40.                               r'[+-]?(?<!\.)\b[1-2][0-9]{4}\b(?!\.[0-9])|'
  41.                               r'[+-]?(?<!\.)\b[3][1][0-9]{3}\b(?!\.[0-9])|'
  42.                               r'[+-]?(?<!\.)\b[3][2][0-6][0-9]{2}\b(?!\.[0-9])|'
  43.                               r'[+-]?(?<!\.)\b[3][2][7][0-5][0-9]\b(?!\.[0-9])|'
  44.                               r'[+-]?(?<!\.)\b[3][2][7][6][0-7]\b(?!\.[0-9])|'
  45.                               r'-(?<!\.)\b[3][2][7][6][8]\b(?!\.[0-9]))(?![\s\S]*\1)', content, re.UNICODE))
  46.  
  47.  
  48.     floats = len(re.findall(r'(\b([-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)\b)(?![\s\S]*\1)', content, re.UNICODE))
  49.  
  50.     mails = len(re.findall(r'([a-z0-a9!#$%&\'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+\/=?^_`{|}~-]+)*'
  51.                            r'@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)(?![\s\S]*\1)', content, re.UNICODE))
  52.  
  53.     fp.close()
  54.  
  55.  
  56.  
  57.  
  58.     print("nazwa pliku: "+ filepath)
  59.     print("autor: " + author)
  60.     print("dzial: " + dzial)
  61.     print("slowa kluczowe: " + key_words)
  62.     print("liczba skrotow: " + str(abb))
  63.     print("liczba zdan: " + str(sentences))
  64.     print("liczba liczb calkowitych z zakresu int: " + str(integers))
  65.     print("liczba liczb zmiennoprzecinkowych: " + str(floats))
  66.     print("liczba dat: " + str(dates))
  67.     print("liczba adresow email: " + str(mails))
  68.     print("\n")
  69.  
  70.  
  71.  
  72. try:
  73.     path = sys.argv[1]
  74. except Exception:
  75.     print("Brak podanej nazwy katalogu")
  76.     sys.exit(0)
  77.  
  78. tree = os.walk(path)
  79.  
  80. for root, dirs, files in tree:
  81.     for f in files:
  82.         if f.endswith(".html"):
  83.             filepath = os.path.join(root, f)
  84.  
  85.             processFile(filepath)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement