Advertisement
Guest User

Untitled

a guest
Feb 25th, 2013
227
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.39 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2.  
  3. from lxml import etree
  4. import re
  5. import gc
  6. import codecs
  7.  
  8.  
  9. inPage = False
  10. title = None
  11. text = None
  12. ns = None
  13. inText = False
  14.  
  15. f = codecs.open('mygraph2.csv', 'w', 'utf-8')
  16.  
  17.  
  18. def process(title,text):
  19.         if text is None: text = ""
  20.  
  21.         if text.find(u"#REDIRECT") != -1: return
  22.  
  23.         words = text.replace('[[',' [[ ').replace(']]',' ]] ').split();
  24.  
  25.         linkname = []
  26.         islink = False
  27.  
  28.         for word in words:
  29.             if word.startswith('[['):
  30.                 islink = True
  31.  
  32.             if islink: linkname.append(word)
  33.  
  34.             if word.endswith(']]'):
  35.                 islink = False
  36.                 linkname2 = " ".join(linkname)
  37.  
  38.                 linkname2 = linkname2[2:-2].split("|")[0]
  39.  
  40.                 if len(linkname2) > 4 and linkname2.find(':')==-1 :                    
  41.                     f.write(title)
  42.                     f.write("\t")
  43.                     f.write(linkname2)
  44.                     f.write("\n")
  45.  
  46.                 del linkname2
  47.                 del linkname               
  48.                 linkname = []
  49.             del word
  50.  
  51.  
  52.  
  53.  
  54. for line in codecs.open('ruwiki-latest-pages-articles.xml','r', 'utf-8'):
  55.  
  56.     line = line.strip()
  57.     if line == '<page>':
  58.         inPage = True
  59.         inText = False
  60.         title = None
  61.         text = ""
  62.         ns = None
  63.  
  64.     if line == '</page>':
  65.         inPage = False
  66.         if ns == '0': process(title,text)
  67.  
  68.     if inPage:
  69.         if line.startswith('<title>'): title = line[7:-8]
  70.         if line.startswith('<text '): inText = True
  71.         if line.startswith('<ns>'): ns = line[4:-5]
  72.         if inText: text = text+line
  73.         if line.find('</text>') != -1: inText = False
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement