Advertisement
cr_sharat

Script to upload run data to S3 and post a notification to Slack

May 27th, 2015
263
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.74 KB | None | 0 0
  1. from spider.models import Run,Dataset
  2. from gevent.pool import Pool
  3. import os
  4. from django.core.files import File
  5. import shutil
  6. from dropbox import client
  7. import unicodecsv as csv
  8. import datetime
  9. from subprocess import call
# S3 configuration. NOTE(review): this dict shadows the `client` name
# imported from dropbox above, making that import unusable -- confirm the
# dropbox import is dead code before relying on it.
# NOTE(review): credentials are hard-coded placeholders; they should be
# loaded from environment variables or a config file, not source.
client = {
  "secret_access_key": "access key",
  "bucket": "bucket_name",
  "access_key_id": "access_id"
}
import boto  # kept mid-file to match original layout; ideally moved to the top
# Open a classic-boto S3 connection and resolve the target bucket at import time.
s3 = boto.connect_s3(client['access_key_id'],client['secret_access_key'])
bucket = s3.get_bucket(client['bucket'])  # bucket names must be unique
  18.  
  19. def data_change(data): # mannual function
  20.     data['L1'] = data.pop('l1')
  21.     data['L2'] = data.pop('l2')
  22.     data['L3'] = data.pop('l3')
  23.     data['T1'] = data.pop('t1')
  24.     data['T2'] = data.pop('t2')
  25.     data['T3'] = data.pop('t3')
  26.     del data['id']
  27.     return data
  28.  
  29. def process(run_id):
  30.     run = Run.objects.get(id=run_id)
  31.  
  32.     parent_dataset = Dataset.objects.get(run=run, item__name='parent')
  33.     child_dataset  = Dataset.objects.get(run=run, item__name='child')
  34.  
  35.     if not os.path.exists('../%s'%(str(run.spider.name))):
  36.         os.mkdir('../%s'%str(run.spider.name))
  37.     else:
  38.         pass
  39.     #parent_key and #child_key are the order of the field that the client need in csv
  40.     parent_keys = ["no","store","brand","brand_status","title","breadcrumb","L1","L2","L3","keywords","T1","T2","T3","desc_p","desc_b","url","status","hash","brand_sku","store_sku","currency","availability","colors","regular_price","sale_price", "images","visit_time","update_time","shipping","return","petite","tall","available_sizes"]
  41.     child_keys = ["no","store_sku","parent_hash","regular_price","sale_price","color_code","color_name","color_group_tr","color_group_sr","color_group","pattern","availability","size","image","swatch_url","size_order"]
  42.    
  43.     parent_file_name = str(run.spider.name) + '_' + str(parent_dataset.item.name) + '_' +str(run.id) + '.csv'
  44.     write_parent_file = open('../%s/%s'%(str(run.spider.name),parent_file_name),'wb')
  45.  
  46.     child_file_name = str(run.spider.name) + '_' +str(child_dataset.item.name) + '_' +str(run.id) + '.csv'
  47.     write_child_file = open('../%s/%s'%(str(run.spider.name),child_file_name),'wb')
  48.  
  49.     parent_csv = csv.DictWriter(write_parent_file, fieldnames=parent_keys, quoting=csv.QUOTE_MINIMAL)
  50.     parent_csv.writeheader()
  51.     for i in parent_dataset.scan():
  52.         row = data_change(i)
  53.         parent_csv.writerow(row)
  54.     write_parent_file.close()
  55.  
  56.     child_csv = csv.DictWriter(write_child_file, fieldnames=child_keys, quoting=csv.QUOTE_MINIMAL)
  57.     child_csv.writeheader()
  58.     for i in child_dataset.scan():
  59.         del i['id']
  60.         child_csv.writerow(i)
  61.     write_child_file.close()
  62.  
  63.     date = datetime.datetime.now()
  64.    
  65.     week_no = int(date.strftime("%U"))+1
  66.     s3_parent_filename = "Week_" + str(week_no) +'/'+ str(parent_dataset.item.name) + "/" + str(run.spider.name) + '_' +str(parent_dataset.item.name) +'_' + date.strftime('%Y-%m-%d') + '.csv'
  67.     s3_child_filename  = "Week_" + str(week_no) +'/'+ str(child_dataset.item.name) + "/" + str(run.spider.name) + '_' +str(child_dataset.item.name) +'_' + date.strftime('%Y-%m-%d') + '.csv'
  68.    
  69.     key = bucket.new_key(s3_parent_filename)
  70.     key.set_contents_from_filename("../%s/%s"%(str(run.spider.name),parent_file_name))
  71.     key = bucket.new_key(s3_child_filename)
  72.     key.set_contents_from_filename("../%s/%s"%(str(run.spider.name),child_file_name))
  73.     payload = None
  74.     url = 'https://hooks.slack.com/services/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'
  75.     #url is the channel where we post the message when the data is uploaded to s3. This will be available in slack integration option
  76.     if bucket.get_key(s3_parent_filename).exists:
  77.         text = " <@"+str(run.spider.owner.username) + "> new Data s3://schero/"+ s3_parent_filename +" uploaded to " + str(run.spider.customer.user.get_full_name()) + " S3"
  78.         if "rijesh" not in text:
  79.             text = " <@rijesh> " + text
  80.         if "sharat" not in text:
  81.             text = " <@cr.sharat> " + text
  82.         payload = '{"text":"'+text+'", "username":"'+str(run.spider.name)+'"}'
  83.         call(['curl','-XPOST',url,'-d',payload])
  84.     if bucket.get_key(s3_child_filename).exists:
  85.         text = " <@"+str(run.spider.owner.username) + "> new Data s3://schero/"+ s3_child_filename +" uploaded to " + str(run.spider.customer.user.get_full_name()) + " S3"
  86.         if "rijesh" not in text:
  87.             text = " <@rijesh> " + text
  88.         if "sharat" not in text:
  89.             text = " <@cr.sharat> " + text
  90.         payload = '{"text":"'+text+'", "username":"'+str(run.spider.name)+'"}'
  91.         call(['curl','-XPOST',url,'-d',payload])
  92.     #you need to delete every thing that u created see below
  93.     shutil.rmtree("../%s"%(str(run.spider.name)))
  94.     del parent_dataset
  95.     del child_dataset
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement