Advertisement
Typhoon

Text to XML Parser

Apr 19th, 2015
338
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.61 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3.  
  4. import re
  5. import datetime
  6. import codecs
  7. import xml.etree.ElementTree as ET
  8. from bs4 import BeautifulSoup
  9.  
  10.  
  # Wall-clock start; read again at the end of the script to report run time.
  start = datetime.datetime.now()

  # The input is decoded as Windows-1250. The context manager closes the
  # handle even on error and avoids shadowing the Python 2 builtin ``file``.
  with codecs.open('150326.tjd', 'r', 'Windows-1250') as infile:
      source = infile.read()

  # Single-argument print(...) behaves the same under Python 2 and Python 3.
  print("Starting parser....")

  # Escape XML-special characters in the raw text BEFORE any markup is
  # inserted, so only the tags generated below are treated as markup.
  # '&' must be handled first, otherwise the entities produced for '<' and
  # '>' would themselves be escaped again.
  xml = source.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

  # -NAZOV- starts a record: it opens <article> and becomes the <name> element.
  # Group 1 deliberately includes the line's trailing newline.
  xml = re.sub(r'-NAZOV-(.*?\n)', r'<article>\n<name>\1</name>\n', xml)

  # Remaining single-line header markers and the element each one becomes.
  # They are applied one after another, in this order.
  for marker, tag in (
      ('ZDROJ', 'source'),
      ('CISLO', 'number'),
      ('STRANA', 'page'),
      ('DATUM', 'date'),
      ('AUTOR', 'author'),
      ('POZN', 'description'),
      ('PRILOHA', 'attachment'),
      ('REGION', 'region'),
  ):
      xml = re.sub(r'-%s-(.*?\n)' % marker, r'<%s>\1</%s>\n' % (tag, tag), xml)

  # Anything else on the -TEXT- marker line is discarded; the article body is
  # whatever follows until the -END- marker.
  xml = re.sub(r'-TEXT-.*?\n', r'<text>', xml)

  # -END- closes the record. End-of-input is accepted as well as a newline so
  # that a file without a final line break still closes its last article.
  xml = re.sub(r'-END-.*?(?:\n|\Z)', r'</text>\n</article>', xml)

  # Wrap all articles in a single <import> root element.
  xml = "<?xml version=\"1.0\"?>\n<import>\n" + xml + "</import>"

  print("Finishing parser....")

  # Let BeautifulSoup's XML parser normalise the markup, then write the
  # pretty-printed document out as UTF-8.
  doc = BeautifulSoup(xml, 'xml')
  with codecs.open('xmlout.xml', 'w', 'UTF-8') as output:
      output.write(doc.prettify())

  print("Working on parsing\n")

  # Re-read the generated file; ``root`` is the <import> element whose
  # children are the individual <article> elements used further below.
  tree = ET.parse('xmlout.xml')
  root = tree.getroot()
  43.  
  # (printed label, child tag) for every article field, in output order.
  # Labels are kept exactly as they were printed before.
  _ARTICLE_FIELDS = (
      ("Name: ", "name"),
      ("Source: ", "source"),
      ("Number: ", "number"),
      ("Page: ", "page"),
      ("Date: ", "date"),
      ("Author: ", "author"),
      ("Description:", "description"),
      ("Attachment: ", "attachment"),
      ("Region: ", "region"),
      ("Text: ", "text"),
  )


  def get_values(num, articles=None):
      """Print every known field of the article at index ``num``.

      Fields are looked up by tag name rather than by position, so an article
      with a missing or reordered field still prints the right value under
      each label. A missing or empty field is printed as an empty value
      instead of raising.

      Args:
          num: Zero-based index of the article inside ``articles``.
          articles: Element whose children are the articles. Defaults to the
              module-level ``root`` so existing ``get_values(num)`` calls
              keep working.

      Raises:
          IndexError: If ``num`` is not a valid child index of ``articles``.
      """
      if articles is None:
          articles = root
      article = articles[num]

      for label, tag in _ARTICLE_FIELDS:
          node = article.find(tag)
          # find() returns None for a missing child, and .text is None for an
          # empty element; both are shown as an empty value.
          value = node.text if node is not None and node.text is not None else ''
          if str is bytes:
              # Python 2 only: encode to UTF-8 bytes so printing non-ASCII
              # text does not depend on the stdout encoding.
              value = value.encode('UTF-8')
          # One pre-joined string keeps the output identical to the former
          # ``print label, value`` statement on both Python 2 and Python 3.
          print(label + " " + value)
  65.  
  # Walk the articles in document order. enumerate() supplies the zero-based
  # index that get_values() expects, replacing a hand-maintained counter.
  for num, _article in enumerate(root):
      # Shown one-based. The format strings reproduce the spacing of the
      # former multi-argument print statements exactly.
      print("Article number from source file:  %d" % (num + 1))
      get_values(num)

  # len() of an element is the number of its direct children.
  print("Number of articles :  %d" % len(root))

  # Elapsed wall-clock time since ``start`` was taken at the top of the script.
  stop = datetime.datetime.now()
  took = stop - start

  print("All tasks finished. It took:  %d  milliseconds"
        % int(took.total_seconds() * 1000))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement