Advertisement
Guest User

Untitled

a guest
Jun 20th, 2018
65
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.68 KB | None | 0 0
  1. import xml.etree.ElementTree as ET
  2.  
  3.  
  4. def get_tree(url):
  5. with open(url) as f:
  6. root = ET.fromstring(f.read())
  7. return root[1]
  8.  
  9. def with_paragraf(data):
  10. table = []
  11. for p in data:
  12. paragraf = []
  13. for sentence in p:
  14. sentence_string = ''
  15. for word in sentence:
  16. if word[0].tail is not None:
  17. merge = word[0].tail + word.tail.replace('\n', '')
  18. print(merge)
  19. sentence_string += merge + ' '
  20. paragraf.append(sentence_string)
  21. table.append(paragraf)
  22. return table
  23.  
# Script entry: runs at module level, so it also executes on import.
# Parse the local file 'itartass4.html' and take the root's second child.
data = get_tree('itartass4.html')
# Build the nested list of sentence strings (tokens are printed as a side effect).
bits_list = with_paragraf(data)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement