Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- import re
- import datetime
- import codecs
- import xml.etree.ElementTree as ET
- from bs4 import BeautifulSoup
# Wall-clock start time; the end of the script reports the elapsed time.
start = datetime.datetime.now()

# Read the whole export in one go; it is decoded as Windows-1250.
with codecs.open('150326.tjd', 'r', 'Windows-1250') as infile:
    source = infile.read()

print("Starting parser....")

# Escape bare ampersands before any tags are inserted, otherwise the
# generated document is not well-formed XML.
xml = source.replace('&', '&amp;')

# (marker pattern, XML replacement) pairs, applied in this order.
# Each field marker consumes the rest of its line (newline included) and
# wraps it in the matching element.  -NAZOV- also opens a new <article>.
# -TEXT- discards the rest of its line and just opens <text>; -END- discards
# the rest of its line and closes both <text> and <article>.
MARKER_RULES = (
    (r'-NAZOV-(.*?(\n))', r'<article>\n<name>\1</name>\n'),
    (r'-ZDROJ-(.*?(\n))', r'<source>\1</source>\n'),
    (r'-CISLO-(.*?(\n))', r'<number>\1</number>\n'),
    (r'-STRANA-(.*?(\n))', r'<page>\1</page>\n'),
    (r'-DATUM-(.*?(\n))', r'<date>\1</date>\n'),
    (r'-AUTOR-(.*?(\n))', r'<author>\1</author>\n'),
    (r'-POZN-(.*?(\n))', r'<description>\1</description>\n'),
    (r'-PRILOHA-(.*?(\n))', r'<attachment>\1</attachment>\n'),
    (r'-REGION-(.*?(\n))', r'<region>\1</region>\n'),
    (r'-TEXT-(.*?(\n))', r'<text>'),
    (r'-END-(.*?(\n))', r'</text>\n</article>'),
)
for pattern, replacement in MARKER_RULES:
    xml = re.sub(pattern, replacement, xml)

# Wrap all articles in a single <import> root element.
xml = "<?xml version=\"1.0\"?>\n<import>\n" + xml + "</import>"
print("Finishing parser....")

# Round-trip through BeautifulSoup's XML parser and write the prettified
# result as UTF-8; the `with` block guarantees the file is flushed and closed
# before ElementTree reads it back.
doc = BeautifulSoup(xml, 'xml')
with codecs.open('xmlout.xml', 'w', 'UTF-8') as output:
    output.write(doc.prettify())

print("Working on parsing\n")
tree = ET.parse('xmlout.xml')
root = tree.getroot()
# (printed label, child element tag) for every field of an <article>.
# Labels are kept exactly as printed before, including the missing trailing
# space in "Description:".
_ARTICLE_FIELDS = (
    ("Name: ", "name"),
    ("Source: ", "source"),
    ("Number: ", "number"),
    ("Page: ", "page"),
    ("Date: ", "date"),
    ("Author: ", "author"),
    ("Description:", "description"),
    ("Attachment: ", "attachment"),
    ("Region: ", "region"),
    ("Text: ", "text"),
)


def get_values(num, articles=None):
    """Print every field of the article at position ``num``.

    Args:
        num: Zero-based index of the article among the children of the
            root element.
        articles: Element whose children are the articles.  Defaults to
            the module-level ``root``.

    Fields are looked up by tag name, so a missing or empty element prints
    an empty value instead of raising or shifting the labels.

    Raises:
        IndexError: If ``num`` is outside the range of available articles.
    """
    if articles is None:
        articles = root
    article = articles[num]

    for label, tag in _ARTICLE_FIELDS:
        # findtext() yields the default for a missing element and an empty
        # string for an element without text, so value is never None.
        value = article.findtext(tag, default=u'')
        # On Python 2 the text may be a unicode object: encode it to UTF-8
        # bytes for printing.  A native str is printed as it is.
        if not isinstance(value, str):
            value = value.encode('UTF-8')
        # "%s %s" reproduces the single space that the print statement
        # inserted between the label and the value.
        print("%s %s" % (label, value))
# Print every article, numbering them from 1 for the reader while
# get_values() takes the zero-based position.
for index, _article in enumerate(root):
    print("%s %s" % ("Article number from source file: ", index + 1))
    get_values(index)

# The direct children of the root element are the articles.
print("%s %s" % ("Number of articles : ", len(root)))

# Report the total run time, measured from `start`, in whole milliseconds.
stop = datetime.datetime.now()
took = stop - start
print("%s %s %s" % ("All tasks finished. It took: ",
                    int(took.total_seconds() * 1000),
                    " milliseconds"))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement