Advertisement
Try95th

studentinfo_code [edited copy - for so_q_74567311]

Dec 12th, 2022
139
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. ## for https://stackoverflow.com/q/74567311/6146136
  2. #### links to outputs:
  3. ###### studentinfo_output.xml at https://pastebin.com/bPFg8TBL
  4. ###### studentinfo_op_unpretty.xml at https://pastebin.com/6NgFsYU0
  5. ###### studentinfo_op_regpretty.xml at https://pastebin.com/5FQxLzCC
  6.  
  7.  
  8. from bs4 import BeautifulSoup
  9.  
  10. with open('studentinfo.xml', 'r') as f:
  11.     xhtml = f.read() # [I just don't like "file" as a variable name]
  12. soup = BeautifulSoup(xhtml, 'lxml') ##  'xml' parser ---> lose namespaces
  13.  
  14. enumTags = ['st:name']
  15. for d in [c for c in soup.descendants if c.name]:
  16.     for name in enumTags:
  17.         for i, t in enumerate(d.find_all(name, recursive=False)):
  18.             t.name = f'{t.name}{i}'
  19.  
  20.             # print(soup.prettify)
  21.             # prettify is a method so it should be prettify()
  22.             # if you print inside loop, it'll print with every edit
  23.  
  24. # to save the new xml [3 different options]
  25. with open('studentinfo_output.xml', 'wb') as f:
  26.     f.write(soup.prettify('utf-8')) # option 1
  27. with open('studentinfo_op_unpretty.xml', 'wb') as f:
  28.     f.write(str(soup).encode('utf-8')) # option 2
  29.  
  30. import re
  31. rpx = re.sub('[^> ]\s*\n\s*<', '<', soup.prettify())
  32. rpx = re.sub('>\s*\n\s*[^< ]', '>', rpx)
  33. with open('studentinfo_op_regpretty.xml', 'wb') as f:
  34.     f.write(rpx.encode('utf-8')) # option 3
  35.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement