Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # coding=utf-8
- import os
- import sys
- import re
- import codecs
# Encoding the processed HTML documents are stored in.
_ENCODING = 'iso-8859-2'

# All counting patterns below end in "(?![\s\S]*\1)": a match is rejected when
# the same text shows up again later in the document, so only the last
# occurrence of each distinct value is counted.
# NOTE(review): the back-reference is a plain substring test, so a value that
# later appears inside a longer token (e.g. "12" before "123") is not counted
# at all. Left unchanged because fixing it would alter the reported numbers.

# Four-digit year, a two-digit field up to 31 and a two-digit field up to 12
# (or the same fields in reverse order), joined by one repeated separator
# out of "-", "/", ":" and ".".
_DATE_RE = re.compile(
    r'((\d{4})(?P<delimiter>[-\/:\.])([0-2][0-9])(?P=delimiter)(0[0-9]|1[0-2])'
    r'|(\d{4})(?P<delimiter2>[-\/:\.])(3[0-1])(?P=delimiter2)(01|0[3-9]|1[0-2])|'
    r'([0-2][0-9])(?P<delimiter3>[-\/:\.])(0[0-9]|1[0-2])(?P=delimiter3)(\d{4})|'
    r'(3[0-1])(?P<delimiter4>[-\/:\.])(01|0[3-9]|1[0-2])(?P=delimiter4)(\d{4}))(?![\s\S]*\1)',
    re.UNICODE)

# Whole numbers from -32768 to 32767 that are not part of a decimal number.
# The third alternative used to be "[3][1]...", which skipped 30000-30999.
_INT_RE = re.compile(
    r'([+-]?(?<!\.)\b[0-9]{1,4}\b(?!\.[0-9])|'
    r'[+-]?(?<!\.)\b[1-2][0-9]{4}\b(?!\.[0-9])|'
    r'[+-]?(?<!\.)\b[3][0-1][0-9]{3}\b(?!\.[0-9])|'
    r'[+-]?(?<!\.)\b[3][2][0-6][0-9]{2}\b(?!\.[0-9])|'
    r'[+-]?(?<!\.)\b[3][2][7][0-5][0-9]\b(?!\.[0-9])|'
    r'[+-]?(?<!\.)\b[3][2][7][6][0-7]\b(?!\.[0-9])|'
    r'-(?<!\.)\b[3][2][7][6][8]\b(?!\.[0-9]))(?![\s\S]*\1)',
    re.UNICODE)

# Decimal numbers with an optional exponent. The decimal point is optional in
# the pattern, so plain digit runs match as well.
_FLOAT_RE = re.compile(
    r'(\b([-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)\b)(?![\s\S]*\1)',
    re.UNICODE)

# Lower-case e-mail addresses. The first character class used to contain the
# typo "0-a9", a range that also admitted "@", "<", ">", ":" and capitals.
_MAIL_RE = re.compile(
    r'([a-z0-9!#$%&\'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+\/=?^_`{|}~-]+)*'
    r'@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)(?![\s\S]*\1)',
    re.UNICODE)

# An abbreviation: whitespace, one to three letters and a dot, whitespace,
# then a character that is not an ASCII capital.
_ABBREVIATION_RE = re.compile(r'(\s+)([a-zA-Z]{1,3}\.)(\s+[^A-Z])', re.UNICODE)

# A sentence: an ASCII capital up to the next run of ".", "!" or "?".
_SENTENCE_RE = re.compile(r'([A-Z][^\.!?]*[\.!?]+)', re.UNICODE)

_PARAGRAPH_RE = re.compile(r'<[pP]>\s?(.*)\s?</[pP]>', re.UNICODE)
_TAG_RE = re.compile(r'<[^>]*>', re.UNICODE)


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or ''."""
    match = re.search(pattern, text, re.UNICODE)
    return match.group(1) if match else ''


def processFile(filepath):
    """Print a statistics report for one HTML document.

    The file is decoded as ISO-8859-2. The report (printed to standard
    output, one item per line, followed by blank lines) contains the file
    name, the AUTOR and DZIAL meta values, the KLUCZOWE_<n> meta values
    joined by ", ", and the number of abbreviations, sentences, 16-bit
    integers, floating point numbers, dates and e-mail addresses.

    Abbreviations and sentences are counted in the text of the <p> elements
    with markup removed; the remaining counts are taken over the raw file
    content.

    A missing AUTOR or DZIAL tag is reported as an empty value instead of
    raising, so a single incomplete document does not abort a batch run.

    Args:
        filepath: Path of the HTML file to analyse.

    Returns:
        None.
    """
    # codecs.open always reads the underlying file in binary mode, so the
    # former 'U' flag never changed anything, and Python 3.11+ rejects it.
    # The with-block closes the file even if the read fails.
    with codecs.open(filepath, 'r', _ENCODING) as fp:
        content = fp.read()

    # Report the "<digits>.html" part of the path; fall back to the base name
    # for paths that do not contain ".html".
    name_match = re.search(r"\d*\.html", filepath)
    filename = name_match.group(0) if name_match else os.path.basename(filepath)

    author = _first_group(r'<META NAME=\"AUTOR\" CONTENT=\"(.*)\">', content)
    dzial = _first_group(r'<META NAME=\"DZIAL\" CONTENT=\"(.*)\">', content)
    key_words = ', '.join(
        re.findall(r'<META NAME=\"KLUCZOWE_\d*\" CONTENT=\"(.*)\">', content, re.UNICODE))

    # Plain article text: contents of every <p> element with inline tags removed.
    # (The original passed re.UNICODE as the positional *count* of re.sub,
    # which capped the removal at 32 replacements.)
    article = ' '.join(_PARAGRAPH_RE.findall(content))
    article = _TAG_RE.sub('', article)

    # Count abbreviations, then drop them so that their dots are not mistaken
    # for sentence ends.
    abb = len(_ABBREVIATION_RE.findall(article))
    article = _ABBREVIATION_RE.sub(r'\3', article)
    sentences = len(_SENTENCE_RE.findall(article))

    dates = len(_DATE_RE.findall(content))
    integers = len(_INT_RE.findall(content))
    floats = len(_FLOAT_RE.findall(content))
    mails = len(_MAIL_RE.findall(content))

    print("nazwa pliku: " + filename)
    print("autor: " + author)
    print("dzial: " + dzial)
    print("slowa kluczowe: " + key_words)
    print("liczba skrotow: " + str(abb))
    print("liczba zdan: " + str(sentences))
    print("liczba liczb calkowitych z zakresu int: " + str(integers))
    print("liczba liczb zmiennoprzecinkowych: " + str(floats))
    print("liczba dat: " + str(dates))
    print("liczba adresow email: " + str(mails))
    print("\n")
def main():
    """Report on every ".html" file below the directory named on the command line.

    The directory is taken from the first command-line argument and walked
    recursively; each file whose name ends in ".html" is passed to
    processFile. When no argument is given, a message is printed and the
    process exits with status 1.
    """
    try:
        path = sys.argv[1]
    except IndexError:
        # Only a missing argument is expected here; signal it as a failure
        # to the calling shell instead of exiting with status 0.
        print("Brak podanej nazwy katalogu")
        sys.exit(1)

    for root, _dirs, files in os.walk(path):
        for name in files:
            if name.endswith(".html"):
                processFile(os.path.join(root, name))


# Run the walk only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement