Advertisement
cr_sharat

PostScript for splitting the CSV

May 27th, 2015
264
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.38 KB | None | 0 0
  1. from spider.models import Run,Dataset
  2. from gevent.pool import Pool
  3. pool = Pool(500)
  4. import os
  5. from django.core.files import File
  6. import shutil
  7. import unicodecsv as csv
  8. from dropbox import client
  9. import zipfile
  10. from traceback import format_exc
  11. access_token  = 'your dropbox api key'
  12. client = client.DropboxClient(access_token)
  13. def data_change(data):
  14.     if data['hotel_url'] is not None:
  15.         state = data['state']
  16.         del data['state']
  17.         del data['id']
  18.         try:
  19.             data['required_at_checkin'] = ','.join([str(i) for i in data['required_at_checkin']]) if data['required_at_checkin'] else ''
  20.             csvfieldnames = ['hotel_name','hotel_address','required_at_checkin','checkin_age','policies','hotel_url']
  21.             filename = state.lower().replace(' ','_')+".csv"
  22.             writefile = open("../665_hotels_com/"+filename,'a')
  23.             mycsv = csv.DictWriter(writefile, fieldnames=csvfieldnames)
  24.             mycsv.writerow(data)
  25.             writefile.close()
  26.         except:
  27.             print "**************************"
  28.             trace = format_exc()
  29.             print data
  30.             print "-->",state
  31.             print trace
  32.  
  33.  
  34. def zipdir(source, destination):
  35.     folder = os.path.abspath(source)
  36.     with zipfile.ZipFile(destination, 'w') as zipf:
  37.         for root, dirs, files in os.walk(folder):
  38.             path = os.path.relpath(root, folder)
  39.             for filename in files:
  40.                 relname = os.path.join(path, filename)
  41.                 absname = os.path.join(root, filename)
  42.                 if not filename.endswith(".pyc"):
  43.                     zipf.write(absname, relname, zipfile.ZIP_DEFLATED)
  44.  
  45. def process(run_id):
  46.     run = Run.objects.get(id=run_id)
  47.     dataset = Dataset.objects.get(run=run, item__name='hotel')
  48.     state_names = [
  49.                 'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California','Colorado','Connecticut', 'Delaware',
  50.                 'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
  51.                 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
  52.                 'Mississippi', 'Missouri','Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
  53.                 'New York', 'North Carolina', 'North Dakota','Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island',
  54.                 'South Carolina', 'South Dakota','Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
  55.                 'West Virginia', 'Wisconsin', 'Wyoming'
  56.               ]
  57.  
  58.     csvfieldnames = ['hotel_name','hotel_address','required_at_checkin','checkin_age','policies','hotel_url']
  59.    
  60.     if not os.path.exists('../%s'%(str(run.spider.name))):
  61.         os.mkdir('../%s'%str(run.spider.name))
  62.    
  63.     for i in state_names:
  64.         filename = i.replace(' ','_').lower()+".csv"
  65.         newfile = open('../%s/%s'%(str(run.spider.name),filename),'w')
  66.         mycsv = csv.DictWriter(newfile, fieldnames=csvfieldnames)
  67.         mycsv.writeheader()
  68.         newfile.close()
  69.     pool.map(data_change, dataset.scan())
  70.  
  71.  
  72.     try:
  73.         response = client.file_create_folder('/nest/')
  74.     except:
  75.         pass
  76.     try:
  77.         response = client.file_create_folder('/nest/%s'%(str(run.spider.name)))
  78.     except:
  79.         pass
  80.     try:
  81.         response = client.file_create_folder('/nest/%s/%s'%(str(run.spider.name),str(run.id)))
  82.     except:
  83.         pass
  84.     for i in state_names:
  85.         filename = i.replace(' ','_').lower()+".csv"
  86.         print filename
  87.         csvfile = list(csv.DictReader(open('../%s/%s'%(str(run.spider.name),filename),'r')))
  88.         if len(csvfile)>0:
  89.             print "ok -->",filename
  90.             file_to = File(open('../%s/%s'%(str(run.spider.name),filename),'r'))
  91.             client.put_file("/nest/%s/%s/%s"%(str(run.spider.name),str(run.id),filename),file_to)
  92.         else:
  93.             os.remove('../%s/%s'%(str(run.spider.name),filename))
  94.             print "file removed from ../%s/%s"%(str(run.spider.name),filename)
  95.     zipfilename = str(run.spider.name) + ".zip"
  96.     zipf = zipfile.ZipFile(zipfilename,'w')
  97.     zipdir('../%s/'%(str(run.spider.name)), zipfilename)
  98.     file_to = File(open(zipfilename,"r"))
  99.     client.put_file("/nest/%s/%s"%(str(run.spider.name),zipfilename),file_to)
  100.     os.remove(zipfilename)  
  101.     shutil.rmtree("../%s"%(str(run.spider.name)))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement