Advertisement
Guest User

Untitled

a guest
Apr 1st, 2020
142
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.24 KB | None | 0 0
  1. import framler
  2. import re
  3. import os
  4. import feedparser
  5. dt = framler.NewspapersParser("vnexpress")
  6. while True:
  7. rss_url = input("Link RSS: ")
  8. feed = feedparser.parse(rss_url)
  9. items = feed["items"]
  10. for item in items:
  11. url = item["link"]
  12. article = dt.parse(url)
  13. sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', article.text)
  14. # url.split("/")[3] trả về loại bài báo
  15. directory_save = "data/" + url.split("/")[3]
  16. # nếu folder chưa tồn tại thì tạo mới
  17. if not os.path.exists(directory_save):
  18. os.makedirs(directory_save,exist_ok=True)
  19. # tách thành các câu
  20. sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', article.text)
  21. print("Num sentences: " , len(sentences))
  22. if(len(sentences) >= 14 and len(sentences) < 20):
  23. # ghi các câu này thành dữ liệu
  24. f = open(directory_save + "/" +"data.txt","w",encoding='utf-8')
  25. # ghi url vào dòng đầu
  26. f.write(url+"\n")
  27. for sentence in sentences:
  28. f.write(sentence+ "\n")
  29. f.close()
  30. print("Saved")
  31. print(url)
  32. break
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement