Advertisement
Guest User

Untitled

a guest
Jun 23rd, 2017
72
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.39 KB | None | 0 0
  1. import re
  2. import os
  3. import codecs
  4. import csv
  5. from os import walk
  6.  
  7. f = []
  8. for (dirpath, dirnames, filenames) in walk(os.path.dirname(__file__)+"/news"):
  9.     f.extend(filenames)
  10.     break
  11.  
  12. se_pattern = re.compile(r"(?<=<se>)[\s\S\w\W]*?(?=</se>)")
  13.  
  14.  
  15. file_toWrite = open("Sentence_number.txt", 'w', encoding='utf-8')
  16. for path in f:
  17.     file = codecs.open(os.path.dirname(__file__)+"/news/"+path, 'r', encoding='windows-1251')
  18.     text = file.read()
  19.  
  20.     words_number = se_pattern.findall(text)
  21.     file_toWrite.write(path + "\t" + str(len(words_number)) + "\n")
  22.  
  23. file_toWrite.close()
  24.  
  25.  
  26. #Second task
  27. with open('authors.csv', 'w',  encoding='utf-8') as csvfile:
  28.     fieldnames = ['Название файла', 'Автор', 'Тематика текста']
  29.  
  30.     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
  31.     writer.writeheader()
  32.  
  33.     author_pattern = re.compile(r"(?<=meta content=\").*?(?=\" name=\"author\")")
  34.     topic_pattern = re.compile(r"(?<=meta content=\").*?(?=\" name=\"topic\")")
  35.  
  36.     for path in f:
  37.         file = open(os.path.dirname(__file__) + "/news/" + path, 'r', encoding='windows-1251')
  38.         text = file.read()
  39.  
  40.         author = author_pattern.search(text).group()
  41.         topic = topic_pattern.search(text).group()
  42.         writer.writerow({'Название файла' : path, 'Автор':author, 'Тематика текста':topic})
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement