Not a member of Pastebin yet? Sign up — it unlocks many cool features!
import cPickle
import re
import sys

import BeautifulSoup as bs
def main():
    """Build a page -> outgoing-link graph from a MediaWiki XML dump.

    Streams ``wiki.xml`` in 16 KiB chunks so the whole dump never has to
    fit in memory, parses one batch of complete ``<page>`` elements at a
    time with BeautifulSoup, assigns each page title a numeric id, and
    records the ids of every ``[[wikilink]]`` target found in each page's
    wikitext.  A single-line progress display (overwritten via ``\r``)
    shows the current title, percent of the file consumed, and the
    recent cache-hit ratio (fraction of links whose target already had
    an id).

    Writes two pickles:
      links.l  -- dict: page id -> list of linked page ids
      links.id -- dict: page title -> numeric id
    """
    ids = {}        # title -> numeric id
    pages = set()   # titles that already have an id
    idcounter = 1
    links = {}      # page id -> list of linked page ids
    # [[target]] or [[target|label]] -- group 1 is the link target
    finder = re.compile(r'\[\[(.*?)(\||\]\])')

    f = open("wiki.xml", 'rb')
    f.seek(0, 2)    # measure the file size once, for the progress display
    length = f.tell()
    f.seek(0)

    s = ['']        # buffered chunks not yet handed to the parser
    stop = True
    # Floats on purpose: cachehit/total must not be integer division (py2).
    cachehit = 0.0  # links whose target already had an id
    total = 1.0     # links examined since the last counter reset (never 0)
    wrap = 0        # pages seen since the last counter reset
    while stop:
        # Read until the buffer holds at least one complete <page>.
        while '</page>' not in s[-1]:
            s.append(f.read(16384))
            if s[-1] == '':   # EOF: parse what is buffered, then finish
                stop = False
                del s[-1]
                break
        # Keep everything up to the first closing tag in the last chunk;
        # carry the remainder (start of the next page) over to next round.
        split = s[-1].split('</page>', 1)
        s[-1] = split[0] + '</page>'
        leftover = split[1] if len(split) == 2 else ''
        soup = bs.BeautifulSoup(''.join(s))
        s = [leftover]
        for page in soup.findAll('page'):
            title = page.find('title').text
            if title not in pages:  # first sighting: assign an id
                pages.add(title)
                ids[title] = idcounter
                idcounter += 1
            dd = "Currently on " + str(title.encode('ascii', "ignore"))
            if len(dd) < 45:  # skip over-long titles so the line never wraps
                if total == 0:
                    total = 1
                # Fraction -> percent (original printed the raw fraction
                # next to a '%' sign).
                sys.stdout.write(
                    dd + (' ' * (55 - len(dd)))
                    + "Done %f" % (100.0 * f.tell() / length) + '% '
                    + "%f" % (cachehit / total) + '\r')
            links[ids[title]] = []
            text = (page.findAll('text')[0]).text
            wrap += 1
            if wrap % 1000 == 0:  # periodic reset keeps the display readable
                # Original reset a never-read `yay` instead of cachehit,
                # letting the displayed ratio drift above 1.
                cachehit = 0.0
                total = 0.0
                wrap = 0
            for link in finder.findall(text):
                target = link[0]
                if target in pages:
                    links[ids[title]].append(ids[target])
                    cachehit += 1
                else:
                    # First time this target is seen: give it an id AND
                    # record the edge (the original dropped this edge).
                    pages.add(target)
                    ids[target] = idcounter
                    idcounter += 1
                    links[ids[title]].append(ids[target])
                total += 1
    f.close()
    with open(r'links.l', 'wb') as out:
        cPickle.dump(links, out)
    with open(r"links.id", 'wb') as out:
        cPickle.dump(ids, out)
# Run the extraction only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please sign in to add a comment.