import re
import urllib2
from urlparse import urlparse
import os
# Matches an <a ...>...</a> element and captures the value of its href
# attribute; the value may be wrapped in double quotes, single quotes or
# backticks.  Matching is case-insensitive.
link = re.compile(
    r'''<a.*?href=["'`](.+?)["'`].*?>.*?</a>''',
    flags=re.I,
)
def get_links(code):
    """Return the href value of every anchor element found in *code*.

    *code* is the HTML text to scan.  The result is a list of the raw
    attribute strings captured by the module-level ``link`` pattern, in
    document order.  Duplicates are kept and no URL normalisation is done.
    """
    # ``link`` has exactly one capture group, so findall() already yields the
    # href strings themselves; no per-match post-processing is needed.
    return link.findall(code)
def get_html(url):
    """Download *url* and return the response body.

    The request is sent with a ``Mozilla/5.0`` ``User-agent`` header.  When
    the server answers with an HTTP error status, the body of that error
    response is returned instead of raising.  Any other failure raised while
    opening the URL is not handled here and propagates to the caller.
    """
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    try:
        response = opener.open(url)
    except urllib2.HTTPError as error:
        # An HTTPError doubles as a response object, so its body is readable.
        response = error
    try:
        return response.read()
    finally:
        # Release the connection even when read() fails.
        response.close()
def fix(links, host, path):
    """Remove unwanted links and turn the rest into absolute same-site URLs.

    Parameters:
        links: list of raw href strings; it is rewritten in place.
        host:  network location of the page the links were taken from.
        path:  URL path of that page, used to resolve relative links.

    Dropped links:
        * media links (extension png, gif or svg),
        * links with a scheme other than http/https (``mailto:``,
          ``javascript:`` and the like),
        * links that name a different host,
        * links whose path contains a colon.

    Every remaining link is rewritten as ``'http://' + host + <path>``, so
    query strings and fragments are discarded.  A relative path is resolved
    against the directory of *path*; a link with neither host nor path
    (for example ``#section``) resolves to the current page.

    Returns the same ``links`` list object that was passed in.
    """
    media_extensions = ('png', 'gif', 'svg')
    # Directory part of the current page, e.g. '/wiki/' for '/wiki/Goku'.
    base_dir = path[:path.rfind('/') + 1] or '/'

    kept = []
    for href in links:
        # check for media
        extension = os.path.splitext(href)[1][1:].lower()
        if extension in media_extensions:
            continue

        url = urlparse(href)
        # Only web pages can be crawled; without this check 'mailto:a@b.c'
        # would be glued onto the host as if 'a@b.c' were a path.
        if url.scheme not in ('', 'http', 'https'):
            continue
        # check for external links
        if url.netloc != '' and url.netloc != host:
            continue
        if ':' in url.path:
            continue

        # probably a link to the same site
        target = url.path
        if url.netloc == '' and not target.startswith('/'):
            if target:
                # Relative path: resolve it next to the current page.
                target = base_dir + target
            else:
                # Same-document reference (query or fragment only).
                target = path or '/'
        kept.append('http://' + host + target)

    # Write back through the slice so callers holding ``links`` see the
    # result, as they did with the previous in-place deletion.
    links[:] = kept
    return links
def valid_links(url):
    """Fetch *url* and return the distinct links on it that survive fix().

    The page is downloaded with get_html(), its hrefs are extracted with
    get_links() and cleaned by fix() using the host and path of *url*.
    Duplicates are removed through a set, so the order of the returned list
    is not defined.
    """
    location = urlparse(url)
    hrefs = get_links(get_html(url))
    cleaned = fix(hrefs, location.netloc, location.path)
    return list(set(cleaned))
class tree(list):
    """A tree node: a list of child ``tree`` nodes plus a value.

    Attributes:
        value:  payload stored on this node.
        parent: the node this one was appended to, or None for a root.

    Because the class derives from ``list``, ``==`` and ``!=`` compare the
    children structurally; use ``is`` to test whether two names refer to the
    same node.
    """

    def __init__(self, value=None, children=()):
        """Create a node holding *value* with the given initial *children*.

        Each child is added through append(), so plain values are wrapped in
        nodes and every child gets its ``parent`` link set.
        """
        list.__init__(self)
        self.value = value
        self.parent = None
        for child in children:
            self.append(child)

    def __repr__(self):
        return 'tree(%r,%s)' % (self.value, list.__repr__(self))

    def append(self, node):
        """Add *node* as the last child, wrapping non-tree values in a node."""
        # isinstance() also accepts subclasses of tree, which an exact type
        # comparison would wrap a second time.
        if not isinstance(node, tree):
            node = tree(node)
        node.parent = self
        list.append(self, node)

    def _iter_nodes(self):
        """Yield this node and all of its descendants in pre-order."""
        # An explicit stack keeps deep trees clear of the recursion limit.
        pending = [self]
        while pending:
            node = pending.pop()
            yield node
            # Reversed so that the leftmost child is popped first.
            pending.extend(reversed(node))

    def leaves(self):
        """Return every childless node of the tree, from left to right.

        A node without children is its own single leaf.
        """
        return [node for node in self._iter_nodes() if len(node) == 0]

    def unique(self, values):
        """Remove from *values* every entry already stored in the tree.

        All nodes are considered, this one included.  *values* is modified
        in place and the same list object is returned.
        """
        tree_values = [node.value for node in self._iter_nodes()]
        try:
            lookup = set(tree_values)
            kept = [v for v in values if v not in lookup]
        except TypeError:
            # Unhashable values cannot go through a set; compare by equality.
            kept = [v for v in values if v not in tree_values]
        values[:] = kept
        return values
def parse_until(start, end):
    """Search for a chain of links leading from page *start* to page *end*.

    Pages are explored level by level starting at *start*, so the first
    chain found uses the fewest hops.  Links that are already stored in the
    search tree are not queued again.

    Returns the list of URLs from *start* to *end* inclusive, or an empty
    list when every reachable page has been explored without finding *end*.
    """
    links = tree(start)
    # Nodes whose pages have not been downloaded yet.
    frontier = [links]
    while frontier:
        next_frontier = []
        for leaf in frontier:
            page = valid_links(leaf.value)
            if end in page:
                # Climb the parent links back to the root to rebuild the path.
                trace = [end, leaf.value]
                current = leaf
                # Identity test: "!=" on tree nodes would compare their
                # child lists structurally instead.
                while current is not links:
                    current = current.parent
                    trace.append(current.value)
                trace.reverse()
                return trace
            # unique() strips already-known pages from ``page`` in place.
            links.unique(page)
            for p in page:
                leaf.append(p)
            # Only freshly added nodes are fetched on the next level, so
            # dead-end pages are never downloaded twice.
            next_frontier.extend(leaf)
        frontier = next_frontier
    return []
def format(links):
    """Render a list of article URLs as ``'Title A > Title B > ...'``.

    For each URL the path is taken, a leading ``/wiki/`` is removed when
    present, and underscores are replaced by spaces.  The pieces are joined
    with ``' > '``.  An empty list gives an empty string.
    """
    prefix = '/wiki/'
    titles = []
    for url in links:
        path = urlparse(url).path
        # Strip the prefix only when it is really there, so that other
        # paths are shown whole rather than losing their first characters.
        if path.startswith(prefix):
            path = path[len(prefix):]
        titles.append(path.replace('_', ' '))
    return ' > '.join(titles)
# Script entry: search for a chain of links from the Goku article to the Dojo
# article and print the titles along it.
print(format(parse_until(
    'http://en.wikipedia.org/wiki/Goku',
    'http://en.wikipedia.org/wiki/Dojo',
)))