# Wikipedia: builds a page-link graph from a wiki.xml dump and pickles it.

import re,BeautifulSoup as bs,cPickle

def _iter_page_batches(f, chunk_size=16384):
    """Yield successive strings read from *f*, each ending in '</page>'.

    The file is consumed in ``chunk_size`` pieces.  A batch normally holds
    one page, but holds several when a '</page>' tag straddles two chunks
    (the tag is only looked for inside the most recent chunk).  Whatever
    follows the first '</page>' of the last chunk is carried over to the
    next batch.  The final batch is whatever trails the last page, with a
    '</page>' appended to it.
    """
    pending = ['']
    at_eof = False
    while not at_eof:
        while '</page>' not in pending[-1]:  # read until a page end shows up
            data = f.read(chunk_size)
            if not data:
                at_eof = True
                break
            pending.append(data)

        # Keep everything up to the first closing tag; the remainder belongs
        # to the following page(s).
        head, _sep, leftover = pending[-1].partition('</page>')
        pending[-1] = head + '</page>'
        yield ''.join(pending)
        pending = [leftover]


def main():
    """Build a link graph from ``wiki.xml`` and pickle it.

    Reads ``wiki.xml`` from the current directory, and for every ``<page>``
    collects the ``[[target]]`` / ``[[target|label]]`` links found in its
    ``<text>``.  Two files are written when the whole dump has been read:

    * ``links.l``  -- dict mapping a page id to the list of ids it links to.
    * ``links.id`` -- dict mapping every title seen (as a page or as a link
      target) to its id; ids are handed out sequentially starting at 1.

    A progress line (current title, percentage of the file read, and the
    share of recent links whose target already had an id) is rewritten in
    place on stdout.
    """
    import sys  # local import: only the progress line needs it

    ids = {}      # title -> numeric id
    links = {}    # page id -> ids of the pages it links to
    next_id = 1

    finder = re.compile(r'\[\[(.*?)(\||\]\])')

    # Counters behind the "already known target" ratio in the progress line.
    hits = 0
    total = 0
    pages_since_reset = 0

    with open("wiki.xml", 'rb') as f:
        f.seek(0, 2)
        length = f.tell()
        f.seek(0)

        for batch in _iter_page_batches(f):
            soup = bs.BeautifulSoup(batch)

            for page in soup.findAll('page'):
                title_tag = page.find('title')
                if title_tag is None:
                    continue  # nothing to key this page on
                title = title_tag.text

                if title not in ids:  # add to the title-to-number dict
                    ids[title] = next_id
                    next_id += 1
                page_id = ids[title]

                label = "Currently on " + str(title.encode('ascii', "ignore"))
                if len(label) < 45:  # long titles would spill past the columns
                    # length is non-zero here: a page was parsed from the file.
                    percent = 100.0 * f.tell() / length
                    hit_ratio = float(hits) / total if total else 0.0
                    sys.stdout.write(label + ' ' * (55 - len(label))
                                     + "Done %f" % percent + '% '
                                     + "%f" % hit_ratio + '\r')
                    # No newline is written, so flush to make the line appear.
                    sys.stdout.flush()

                links[page_id] = []
                text_tag = page.find('text')
                text = text_tag.text if text_tag is not None else ''

                # Restart the ratio every 1000 pages so it reflects recent
                # pages and does not change too fast to be read.
                pages_since_reset += 1
                if pages_since_reset == 1000:
                    hits = 0
                    total = 0
                    pages_since_reset = 0

                for match in finder.findall(text):
                    target = match[0]
                    if target in ids:
                        hits += 1
                    else:
                        ids[target] = next_id
                        next_id += 1
                    # Record the edge whether or not the target was already
                    # known, so the first reference to a page is not lost.
                    links[page_id].append(ids[target])
                    total += 1

    with open(r'links.l', 'wb') as out:
        cPickle.dump(links, out)
    with open(r"links.id", 'wb') as out:
        cPickle.dump(ids, out)

# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()