Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Traceback (most recent call last):
- File "scrapewaybackblog.py", line 17, in <module>
- daypos = byline.find(re.compile("[A-Z][a-z]*s"))
- TypeError: expected a character buffer object
# Scrape archived Americans for Prosperity blog pages from the Wayback Machine
# and save each post (byline + body text) to its own file under Data/.
#
# NOTE(review): the pasted copy of this script lost its indentation and its
# backslashes ("\n", "\s", ...). The nesting below is reconstructed from the
# statement order -- confirm against the original script.
import os  # imported here because the script's header imports are not visible

output_files_pathname = 'Data/'  # path where output will go
if not os.path.isdir(output_files_pathname):
    # open(..., 'w') does not create missing directories.
    os.makedirs(output_files_pathname)

# str.find() only accepts a plain substring; handing it a compiled pattern is
# what raised "TypeError: expected a character buffer object". Use the regex
# engine directly instead, and pull all three date parts out in one match.
# Assumes the byline contains a date shaped like "February 4, 2009"
# (month name, day, optional comma, 4-digit year) -- TODO confirm against a
# real byline from the archived page.
date_pattern = re.compile(r"([A-Z][a-z]+)\s+(\d{1,2}),?\s+(\d{4})")

# xrange(3, 1, -1) yields 3 and 2 only; the stop value 1 is exclusive.
for i in xrange(3, 1, -1):
    page = urllib2.urlopen("http://web.archive.org/web/20090204221349/http://www.americansforprosperity.org/nationalblog?page={}".format(i))
    try:
        soup = BeautifulSoup(page.read())
    finally:
        # Release the HTTP connection even if reading or parsing fails.
        page.close()

    for div in soup.find_all('div', attrs={'class': 'blog-box'}):
        date_div = div.find('div', attrs={'class': 'date'})
        body_div = div.find('div', attrs={'class': 'right-box'})
        if date_div is None or body_div is None:
            # Not a complete post box; .text on None would raise AttributeError.
            continue

        byline = date_div.text.encode('utf-8')
        text = body_div.text.encode('utf-8')

        match = date_pattern.search(byline)
        if match is None:
            # Without a date there is no file name to write to.
            print("skipping post with unrecognised byline: {!r}".format(byline))
            continue
        month, day, year = match.groups()

        # NOTE(review): two posts with the same date map to the same file name
        # and the later one overwrites the earlier one (mode 'w').
        new_filename = year + month + day + ".txt"
        with open(output_files_pathname + new_filename, 'w') as outfile:
            # The original wrote an undefined name `date` here; the byline is
            # the only date string in scope.
            outfile.write(byline)
            outfile.write("\n")
            outfile.write(text)

        # Single-argument print(...) behaves the same as the Python 2 print
        # statement the script used.
        print("finished another url from page {}".format(i))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement