import urllib2, json
from bs4 import BeautifulSoup

dirname = '/var/www/cron/file_scrapper/'

def makelist(table):
    """Turn an HTML table into a list of rows: [link URL, td text, td text, ...]."""
    result = []
    allrows = table.findAll('tr')
    for row in allrows:
        # Keep the last link found in the row (the document URL).
        url = ''
        for a in row.find_all('a', href=True):
            url = a['href']
        result.append([url])
        # Append the plain text of every cell after the URL.
        allcols = row.findAll('td')
        for col in allcols:
            thestrings = [unicode(s).strip() for s in col.findAll(text=True)]
            thetext = ''.join(thestrings)
            result[-1].append(thetext)
    return result
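# A hypothetical illustration of the shape makelist() returns (the HTML below
# is invented, not taken from the IFRC site): each <tr> becomes one list, with
# the row's last link URL first and the text of each <td> after it.
#
#   sample = BeautifulSoup('<table><tbody><tr>'
#                          '<td><a href="http://example.org/doc.pdf">Appeal doc</a></td>'
#                          '<td>12/10/2016</td>'
#                          '</tr></tbody></table>', 'lxml')
#   makelist(sample.tbody)
#   # -> [[u'http://example.org/doc.pdf', u'Appeal doc', u'12/10/2016']]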
print 'Getting appeals list'
url = "https://proxy.hxlstandard.org/data.json?strip-headers=on&filter03=merge&merge-url03=https%3A//docs.google.com/spreadsheets/d/1rVAE8b3uC_XIqU-eapUGLU7orIzYSUmvlPm9tI0bCbU/edit%23gid%3D0&clean-date-tags01=%23date&filter02=select&merge-keys03=%23meta%2Bid&filter04=replace-map&force=on&filter05=merge&merge-tags03=%23meta%2Bcoverage%2C%23meta%2Bfunding&select-query02-01=%23date%2Bend%3E2016-10-11&cut-include-tags06=%23meta%2Bid&merge-keys05=%23country%2Bname&merge-tags05=%23country%2Bcode&filter01=clean&replace-map-url04=https%3A//docs.google.com/spreadsheets/d/1hTE0U3V8x18homc5KxfA7IIrv1Y9F1oulhJt0Z4z3zo/edit%3Fusp%3Dsharing&filter06=cut&merge-url05=https%3A//docs.google.com/spreadsheets/d/1GugpfyzridvfezFcDsl6dNlpZDqI8TQJw-Jx52obny8/edit%3Fusp%3Dsharing&url=https%3A//docs.google.com/spreadsheets/d/19pBx2NpbgcLFeWoJGdCqECT2kw9O9_WmcZ3O41Sj4hU/edit%23gid%3D0"
response = urllib2.urlopen(url)
data = json.loads(response.read())

print 'Getting file URLs'
output = [['#meta+url', '#country', '#meta+id', '#meta+appealname', '#meta+documentname', '#date']]
i = 0
for d in data:
    if i > 0:  # skip the first (header) row
        # Fetch the IFRC appeals page for the appeal code in d[0].
        response = urllib2.urlopen('http://www.ifrc.org/en/publications-and-reports/appeals/?ac=' + d[0] + '&at=0&c=&co=&dt=1&f=&re=&t=&ti=&zo=')
        soup = BeautifulSoup(response.read(), "lxml")
        div = soup.find('div', id='cw_content')
        for t in div.findAll('tbody'):
            print makelist(t)
            output = output + makelist(t)
    i = i + 1
import gspread
from oauth2client.client import SignedJwtAssertionCredentials

# Authorise against Google Sheets with a service-account key file.
json_key = json.load(open(dirname + 'python_scraper-d4dd99f5d776.json'))
scope = ['https://spreadsheets.google.com/feeds']
credentials = SignedJwtAssertionCredentials(json_key['client_email'], json_key['private_key'], scope)
gc = gspread.authorize(credentials)
sh = gc.open("Current Operation files")
worksheet = sh.sheet1
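# Note: SignedJwtAssertionCredentials was removed in oauth2client 2.0. On a
# newer oauth2client, a minimal sketch of the equivalent service-account
# authorisation (same key file and scope assumed) would be:
#
#   from oauth2client.service_account import ServiceAccountCredentials
#   credentials = ServiceAccountCredentials.from_json_keyfile_name(
#       dirname + 'python_scraper-d4dd99f5d776.json', scope)
#   gc = gspread.authorize(credentials)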
- print "Clearing Spreadsheet"
- cell_list = worksheet.range('A1:F600')
- for c in cell_list: #gives us a tuple of an index and value
- c.value = '' #use the index on cell_list and the val from cell_values
- worksheet.update_cells(cell_list)
- print "Writing new content"
- cell_list = worksheet.range('A1:F2000')
- i=0
- for row in output:
- for c in row:
- cell_list[i].value = c
- i=i+1
- worksheet.update_cells(cell_list)