Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- from lxml import etree
- import re
- import gc
- import codecs
# Scanner state for the dump-reading loop: whether the current line lies
# inside a <page> element / inside its <text> element.
inPage = inText = False

# Fields collected for the page currently being scanned.
title = text = ns = None

# Output edge list; `process` writes one "title<TAB>link" line per link here.
f = codecs.open('mygraph2.csv', mode='w', encoding='utf-8')
def process(title, text, out=None):
    """Write one ``title<TAB>target`` line for every wiki link in *text*.

    A link is a ``[[target]]`` or ``[[target|label]]`` span; only the part
    before the first ``|`` is kept.  Nothing is written when *text* is
    empty/``None`` or contains ``#REDIRECT``.  Targets containing ``:``
    (namespace-prefixed links) and targets of four characters or fewer are
    skipped.

    Args:
        title: Title of the page the links were found on.
        text: Raw wiki markup of the page, or ``None``.
        out: Object with a ``write`` method that receives the output lines.
            Defaults to the module-level output file ``f``.
    """
    if out is None:
        out = f  # module-level output file opened near the top of the script
    if not text or u"#REDIRECT" in text:
        return

    # Pad the brackets with spaces so that "[[" and "]]" always become
    # stand-alone tokens, however tightly they hug the surrounding words.
    words = text.replace('[[', ' [[ ').replace(']]', ' ]] ').split()

    link_words = []
    in_link = False
    for word in words:
        if word.startswith('[['):
            in_link = True
        if in_link:
            link_words.append(word)
        if word.endswith(']]'):
            in_link = False
            # Drop the enclosing brackets, keep the part before any "|", and
            # strip the padding added above so it neither ends up in the
            # output nor counts towards the length threshold.
            target = " ".join(link_words)[2:-2].split("|")[0].strip()
            if len(target) > 4 and ':' not in target:
                out.write(title + "\t" + target + "\n")
            link_words = []
# Scan the dump line by line.  Each tag of interest (<page>, <title>, <ns>,
# <text ...>) is recognised by comparing the stripped line against a fixed
# prefix, so no XML parser is involved.  For every page whose <ns> is '0',
# the collected title and text are handed to `process`.
text_parts = []  # lines of the current <text> element, joined once per page
try:
    with codecs.open('ruwiki-latest-pages-articles.xml', 'r', 'utf-8') as dump:
        for line in dump:
            line = line.strip()
            if line == '<page>':
                # New page: reset everything collected for the previous one.
                inPage = True
                inText = False
                title = None
                ns = None
                text_parts = []
            if line == '</page>':
                inPage = False
                # Join with a separator so the last word of one line is not
                # glued to the first word of the next.
                text = "\n".join(text_parts)
                if ns == '0':
                    process(title, text)
            if inPage:
                if line.startswith('<title>'):
                    title = line[7:-8]  # cut '<title>' and '</title>'
                if line.startswith('<ns>'):
                    ns = line[4:-5]  # cut '<ns>' and '</ns>'
                if line.startswith('<text '):
                    inText = True
                if inText:
                    text_parts.append(line)
                    # The element ends on its closing tag, or immediately if
                    # the opening tag is self-closing (<text ... />).
                    if '</text>' in line or (
                            line.startswith('<text ') and line.endswith('/>')):
                        inText = False
finally:
    # Flush and release the output file even if scanning fails part-way.
    f.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement