# file_scraper.py
# Posted on Pastebin by szabozoltan69, Apr 11th, 2019.
import json
import urllib2

from bs4 import BeautifulSoup
  3. dirname = '/var/www/cron/file_scrapper/'
  4.  
  5. def makelist(table):
  6.   result = []
  7.   allrows = table.findAll('tr')
  8.   for row in allrows:
  9.     url=''
  10.     for a in row.find_all('a', href=True):
  11.         url = a['href']
  12.     result.append([url])
  13.     allcols = row.findAll('td')
  14.     for col in allcols:
  15.       thestrings = [unicode(s).strip() for s in col.findAll(text=True)]
  16.       thetext = ''.join(thestrings)
  17.       result[-1].append(thetext)
  18.   return result
  19.  
  20. print 'Getting appeals list'
  21. url = "https://proxy.hxlstandard.org/data.json?strip-headers=on&filter03=merge&merge-url03=https%3A//docs.google.com/spreadsheets/d/1rVAE8b3uC_XIqU-eapUGLU7orIzYSUmvlPm9tI0bCbU/edit%23gid%3D0&clean-date-tags01=%23date&filter02=select&merge-keys03=%23meta%2Bid&filter04=replace-map&force=on&filter05=merge&merge-tags03=%23meta%2Bcoverage%2C%23meta%2Bfunding&select-query02-01=%23date%2Bend%3E2016-10-11&cut-include-tags06=%23meta%2Bid&merge-keys05=%23country%2Bname&merge-tags05=%23country%2Bcode&filter01=clean&replace-map-url04=https%3A//docs.google.com/spreadsheets/d/1hTE0U3V8x18homc5KxfA7IIrv1Y9F1oulhJt0Z4z3zo/edit%3Fusp%3Dsharing&filter06=cut&merge-url05=https%3A//docs.google.com/spreadsheets/d/1GugpfyzridvfezFcDsl6dNlpZDqI8TQJw-Jx52obny8/edit%3Fusp%3Dsharing&url=https%3A//docs.google.com/spreadsheets/d/19pBx2NpbgcLFeWoJGdCqECT2kw9O9_WmcZ3O41Sj4hU/edit%23gid%3D0"
  22. response = urllib2.urlopen(url)
  23. data = json.loads(response.read())
  24. print 'Getting file URLs'
  25. output = [['#meta+url','#country','#meta+id','#meta+appealname','#meta+documentname','#date']];
  26. i=0
  27. for d in data:
  28.     if i>0:
  29.         response = urllib2.urlopen('http://www.ifrc.org/en/publications-and-reports/appeals/?ac='+d[0]+'&at=0&c=&co=&dt=1&f=&re=&t=&ti=&zo=')
  30.         soup = BeautifulSoup(response.read(), "lxml")
  31.         div = soup.find('div', id='cw_content')
  32.         for t in div.findAll('tbody'):
  33.             print makelist(t);
  34.             output = output + makelist(t)
  35.     i=i+1
  36.  
  37. import gspread
  38. from oauth2client.client import SignedJwtAssertionCredentials
  39.  
  40. json_key = json.load(open(dirname+'python_scraper-d4dd99f5d776.json'))
  41. scope = ['https://spreadsheets.google.com/feeds']
  42.  
  43. credentials = SignedJwtAssertionCredentials(json_key['client_email'], json_key['private_key'], scope)
  44.  
  45. gc = gspread.authorize(credentials)
  46.  
  47. sh = gc.open("Current Operation files")
  48. worksheet = sh.sheet1
  49. print "Clearing Spreadsheet"
  50. cell_list = worksheet.range('A1:F600')
  51.  
  52. for c in cell_list:  #gives us a tuple of an index and value
  53.     c.value = ''    #use the index on cell_list and the val from cell_values
  54.  
  55. worksheet.update_cells(cell_list)
  56.  
  57. print "Writing new content"
  58.  
  59. cell_list = worksheet.range('A1:F2000')
  60.  
  61. i=0
  62. for row in output:
  63.     for c in row:
  64.         cell_list[i].value = c
  65.         i=i+1
  66. worksheet.update_cells(cell_list)
# End of file_scraper.py