Advertisement
Guest User

Untitled

a guest
Apr 1st, 2020
128
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.13 KB | None | 0 0
  1. import framler
  2. import re
  3. import os
  4. import feedparser
  5. dt = framler.NewspapersParser("vnexpress")
  6. rss_url = input("Link RSS: ")
  7. feed = feedparser.parse(rss_url)
  8. items = feed["items"]
  9. for item in items:
  10. url = item["link"]
  11. article = dt.parse(url)
  12. sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', article.text)
  13. # url.split("/")[3] trả về loại bài báo
  14. directory_save = "data/" + url.split("/")[3]
  15. # nếu folder chưa tồn tại thì tạo mới
  16. if not os.path.exists(directory_save):
  17. os.makedirs(directory_save,exist_ok=True)
  18. # tách thành các câu
  19. sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', article.text)
  20. print("Num sentences: " , len(sentences))
  21. if(len(sentences) >= 14 and len(sentences) < 20):
  22. # ghi các câu này thành dữ liệu
  23. f = open(directory_save + "/" +"data.txt","w",encoding='utf-8')
  24. # ghi url vào dòng đầu
  25. f.write(url+"\n")
  26. for sentence in sentences:
  27. f.write(sentence+ "\n")
  28. f.close()
  29. print("Saved")
  30. print(url)
  31. break
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement