Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import re

# Pull (url, link text) pairs out of anchor tags.
#
# The pattern is deliberately NOT wrapped in leading/trailing `.*`: with
# those greedy wildcards each match swallowed the whole line, so findall()
# could report at most one link per line (the last one).
#   href="([^"]*)"  - the URL, stopping at the closing quote
#   [^>]*>          - skip any further attributes up to the end of the tag
#   (.*?)</a>       - the link text, lazily, up to the nearest closing tag
r = re.compile(r'href="([^"]*)"[^>]*>(.*?)</a>')

# Sample markup: one list item containing a single news link.
html = '''<ul><li><span><font class="hui">已浏览:1612</font></span><a rel="nofollow" target="_blank" href="http://today.hitwh.edu.cn/news_show.asp?id=10887">2012-2013学年秋季学期公共课考试时间安排</a><font color="#3a6399"> 2012-10-09</font></ul>'''

# Expected result for the sample (the originally recorded session displayed
# the same title as escaped UTF-8 bytes):
# [('http://today.hitwh.edu.cn/news_show.asp?id=10887',
#   '2012-2013学年秋季学期公共课考试时间安排')]
r.findall(html)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement