# file_scraper.py

import urllib2, json
from bs4 import BeautifulSoup
# Directory prefix used when opening the Google service-account key file below.
# NOTE(review): the 'file_scrapper' spelling presumably matches the deployed
# directory name — verify before "correcting" it.
dirname = '/var/www/cron/file_scrapper/'

def makelist(table):
    """Flatten the rows of an HTML table into lists of cell texts.

    Args:
        table: A BeautifulSoup element whose ``<tr>`` descendants are read.

    Returns:
        One list per ``<tr>``. The first item is the ``href`` of the last
        link found in the row ('' when the row contains no link); it is
        followed by one string per ``<td>``, built by stripping every text
        node of the cell and joining the pieces without a separator.
    """
    result = []
    for row in table.find_all('tr'):
        # When a row holds several links, the last one wins.
        links = row.find_all('a', href=True)
        record = [links[-1]['href'] if links else '']
        for col in row.find_all('td'):
            # Text nodes are already text strings, so they are stripped
            # directly instead of going through the Python-2-only
            # ``unicode`` builtin.
            record.append(''.join(s.strip() for s in col.find_all(text=True)))
        result.append(record)
    return result

# Download the appeals list as JSON from the HXL proxy.
print('Getting appeals list')
url = "https://proxy.hxlstandard.org/data.json?strip-headers=on&filter03=merge&merge-url03=https%3A//docs.google.com/spreadsheets/d/1rVAE8b3uC_XIqU-eapUGLU7orIzYSUmvlPm9tI0bCbU/edit%23gid%3D0&clean-date-tags01=%23date&filter02=select&merge-keys03=%23meta%2Bid&filter04=replace-map&force=on&filter05=merge&merge-tags03=%23meta%2Bcoverage%2C%23meta%2Bfunding&select-query02-01=%23date%2Bend%3E2016-10-11&cut-include-tags06=%23meta%2Bid&merge-keys05=%23country%2Bname&merge-tags05=%23country%2Bcode&filter01=clean&replace-map-url04=https%3A//docs.google.com/spreadsheets/d/1hTE0U3V8x18homc5KxfA7IIrv1Y9F1oulhJt0Z4z3zo/edit%3Fusp%3Dsharing&filter06=cut&merge-url05=https%3A//docs.google.com/spreadsheets/d/1GugpfyzridvfezFcDsl6dNlpZDqI8TQJw-Jx52obny8/edit%3Fusp%3Dsharing&url=https%3A//docs.google.com/spreadsheets/d/19pBx2NpbgcLFeWoJGdCqECT2kw9O9_WmcZ3O41Sj4hU/edit%23gid%3D0"
response = urllib2.urlopen(url)
try:
    data = json.loads(response.read())
finally:
    # Release the connection even when the payload is not valid JSON.
    response.close()
# Scrape the IFRC appeals page of every appeal and collect its document rows.
print('Getting file URLs')
# The first output row is the hashtag header written to the spreadsheet.
output = [['#meta+url', '#country', '#meta+id', '#meta+appealname', '#meta+documentname', '#date']]
# The first entry of `data` is skipped, exactly as the original counter-based
# loop did (presumably it is the hashtag header row — confirm).
for d in data[1:]:
    # d[0] is used as the `ac` query parameter of the appeals page.
    response = urllib2.urlopen('http://www.ifrc.org/en/publications-and-reports/appeals/?ac='+d[0]+'&at=0&c=&co=&dt=1&f=&re=&t=&ti=&zo=')
    try:
        soup = BeautifulSoup(response.read(), "lxml")
    finally:
        response.close()
    div = soup.find('div', id='cw_content')
    if div is None:
        # Without the content container there is nothing to scrape; report
        # it and carry on instead of dying with an AttributeError.
        print('No cw_content section found for appeal %r, skipping' % (d[0],))
        continue
    for t in div.find_all('tbody'):
        # Parse each table once and reuse the result for logging and output.
        rows = makelist(t)
        print(rows)
        output.extend(rows)

import gspread
from oauth2client.client import SignedJwtAssertionCredentials

# Authorise against Google Sheets with the service-account key and open the
# first worksheet of the target spreadsheet.
# NOTE(review): SignedJwtAssertionCredentials only exists in oauth2client
# releases before 2.0 — confirm the version pinned on the server.
with open(dirname+'python_scraper-d4dd99f5d776.json') as key_file:
    # The context manager closes the key file as soon as it has been parsed.
    json_key = json.load(key_file)
scope = ['https://spreadsheets.google.com/feeds']

credentials = SignedJwtAssertionCredentials(json_key['client_email'], json_key['private_key'], scope)

gc = gspread.authorize(credentials)

sh = gc.open("Current Operation files")
worksheet = sh.sheet1
# Replace the contents of the sheet with the scraped rows.
#
# The same A1:F2000 window is blanked and refilled, then sent in a single
# update. Blanking the whole window removes stale rows below row 600 that the
# previous, shorter clearing range left behind.
column_count = 6  # columns A..F, matching the six header hashtags
cell_list = worksheet.range('A1:F2000')
max_rows = len(cell_list) // column_count
if len(output) > max_rows:
    # Fail before touching the sheet rather than part-way through the write.
    raise ValueError('%d rows scraped but the sheet range only holds %d'
                     % (len(output), max_rows))

print("Clearing Spreadsheet")
for cell in cell_list:
    cell.value = ''

print("Writing new content")
# Cells are addressed as row * column_count + column (assumes `range` returns
# them row by row, as the original sequential fill also did). A scraped row
# with fewer or more than six items therefore no longer shifts the rows after
# it; items beyond column F are dropped.
for row_index, row in enumerate(output):
    first_cell = row_index * column_count
    for column_index, value in enumerate(row[:column_count]):
        cell_list[first_cell + column_index].value = value

worksheet.update_cells(cell_list)