Don't like ads? PRO users don't see any ads ;-)
Guest

Scrape Daily Sangram

By: a guest on Apr 30th, 2012  |  syntax: Python  |  size: 1.09 KB  |  hits: 27  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. import codecs
  2. from urllib import urlopen
  3. from BeautifulSoup import BeautifulSoup
  4.  
  5.  
  6. def scrape(folder, start, end):
  7.     print '-*-*-*-*-*-*-*-*- Scarping Starts -*-*-*-*-*-*-*-*-*-'
  8.     for I in range(start, end+1):
  9.         url = 'http://www.dailysangram.com/news_details.php?news_id=' + str(I)
  10.         print url , ' : '
  11.         wp = urlopen(url)
  12.         soup =BeautifulSoup(wp)
  13.         if soup:
  14.             doc = ''
  15.             texts = soup.findAll(True, {'class':'news-details'})
  16.             if texts:
  17.                 for text in texts:
  18.                     doc += text.getText() + '\n'
  19.         if doc.strip()=='':
  20.             continue
  21.         print doc
  22. #        confirm = raw_input('Write in document (y/n)? ')
  23. #        if confirm != 'n':
  24.         fp = codecs.open(folder+'/'+str(I)+'.dat', 'w', encoding='utf-8')
  25.         fp.writelines(doc)
  26.         fp.close()
  27.         print '-------------- end -----------------\n\n'
  28.     print '-*-*-*-*-*-*-*-*-*- Done All Scraping -*-*-*-*-*-*-*-*-*-*-*-'
  29.  
  30.  
  31. "------------------------------------------------------"
  32.  
  33. scrape('test', 24700, 24800)