Guest User

Untitled

a guest
Nov 24th, 2017
68
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.00 KB | None | 0 0
  1. import xml.etree.ElementTree as ET
  2. import re
  3.  
  4. newFile = open('using_element_tree.xml', 'w', encoding="utf8")
  5.  
  6. file = "newscor.xml"
  7. context = ET.iterparse(file, events=("start", "end"))
  8. context = iter(context)
  9.  
  10.  
  11. for event, elem in context:
  12. tag = elem.tag
  13. tag = re.sub('{http://www.xml-ces.org/schema}', '', tag)
  14.  
  15. if event == 'start' and (tag == 's' or tag == 'q'):
  16. value = elem.text
  17. if value:
  18. value = value.strip("&, <, >")
  19. value = value.strip()
  20. newFile.write(value)
  21. print(value)
  22.  
  23. elem.clear()
  24.  
  25. from bs4 import BeautifulSoup
  26. from bs4 import SoupStrainer
  27.  
  28. newFile = open('using_bs4.xml', 'w', encoding="utf-8")
  29.  
  30.  
  31. def only_s_and_q_tags():
  32. return "s" or "q"
  33.  
  34. s_and_q_tags = SoupStrainer(only_s_and_q_tags())
  35.  
  36. with open("newscor.xml", encoding="utf-8") as fp:
  37. soup = BeautifulSoup(fp, "xml", parse_only=s_and_q_tags)
  38.  
  39.  
  40. for string in soup.strings:
  41. if string not in ['n', 'rn']:
  42. print(repr(string))
  43. newFile.write(string)
Add Comment
Please, Sign In to add comment