Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import datetime
import json
import os
import shutil
from subprocess import call

from dropbox import client
from gevent.pool import Pool
import unicodecsv as csv

from django.core.files import File

from spider.models import Run, Dataset
# S3 connection settings for the export bucket.
# NOTE(review): credentials are hard-coded in source — move them to settings or
# environment variables before this leaks further. This assignment also shadows
# the `client` name imported from dropbox above, so that import is unusable here.
client = {
    "secret_access_key": "access key",
    "bucket": "bucket_name",
    "access_key_id": "access_id"
}
import boto
# Connect once at import time; `s3`/`bucket` are module-level globals shared by
# process() below. NOTE(review): any import of this module opens a network
# connection — consider lazy initialisation.
s3 = boto.connect_s3(client['access_key_id'],client['secret_access_key'])
bucket = s3.get_bucket(client['bucket'])  # bucket names must be unique
def data_change(data):
    """Rename the scraped lowercase columns to their delivery names and drop 'id'.

    Mutates *data* in place: 'l1'..'l3' become 'L1'..'L3', 't1'..'t3' become
    'T1'..'T3', and the internal 'id' field (not delivered to the client) is
    removed when present.

    Args:
        data: row dict from the parent dataset scan; must contain the six
            lowercase keys (KeyError otherwise, same as before).

    Returns:
        The same dict object, renamed in place.
    """
    # Table-driven rename replaces six copy-pasted pop/assign pairs.
    for old_key, new_key in (('l1', 'L1'), ('l2', 'L2'), ('l3', 'L3'),
                             ('t1', 'T1'), ('t2', 'T2'), ('t3', 'T3')):
        data[new_key] = data.pop(old_key)
    # Tolerate rows without an 'id' instead of raising KeyError.
    data.pop('id', None)
    return data
def _write_dataset_csv(path, fieldnames, rows):
    """Write *rows* (dicts keyed by *fieldnames*) to a CSV at *path*, header first."""
    # `with` guarantees the file is closed even if a row fails to serialize;
    # the old code leaked the handle on any exception.
    with open(path, 'wb') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames,
                                quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)


def _strip_ids(rows):
    """Yield each row with its internal 'id' field removed (not delivered)."""
    for row in rows:
        del row['id']
        yield row


def _upload_to_s3(s3_filename, local_path):
    """Upload *local_path* to the shared module-level bucket under *s3_filename*."""
    bucket.new_key(s3_filename).set_contents_from_filename(local_path)


def _notify_slack(run, s3_filename, webhook_url):
    """Post an upload notice to the Slack webhook, CC'ing rijesh/sharat if absent."""
    text = (" <@" + str(run.spider.owner.username) + "> new Data s3://schero/" +
            s3_filename + " uploaded to " +
            str(run.spider.customer.user.get_full_name()) + " S3")
    if "rijesh" not in text:
        text = " <@rijesh> " + text
    if "sharat" not in text:
        text = " <@cr.sharat> " + text
    # json.dumps escapes quotes/newlines; the old hand-concatenated JSON string
    # produced an invalid payload whenever a name contained a quote.
    payload = json.dumps({"text": text, "username": str(run.spider.name)})
    call(['curl', '-XPOST', webhook_url, '-d', payload])


def process(run_id):
    """Export a run's parent/child datasets to CSV, upload both to S3, notify Slack.

    Args:
        run_id: primary key of the spider Run to export.

    Side effects: creates and then deletes a scratch directory next to the
    working directory, writes two objects to the shared S3 bucket, and posts
    to the Slack webhook for each successful upload.
    """
    run = Run.objects.get(id=run_id)
    parent_dataset = Dataset.objects.get(run=run, item__name='parent')
    child_dataset = Dataset.objects.get(run=run, item__name='child')

    spider_name = str(run.spider.name)
    work_dir = '../%s' % spider_name
    if not os.path.exists(work_dir):
        os.mkdir(work_dir)

    # Column order required by the client in the delivered CSVs.
    parent_keys = ["no", "store", "brand", "brand_status", "title", "breadcrumb",
                   "L1", "L2", "L3", "keywords", "T1", "T2", "T3", "desc_p",
                   "desc_b", "url", "status", "hash", "brand_sku", "store_sku",
                   "currency", "availability", "colors", "regular_price",
                   "sale_price", "images", "visit_time", "update_time",
                   "shipping", "return", "petite", "tall", "available_sizes"]
    child_keys = ["no", "store_sku", "parent_hash", "regular_price", "sale_price",
                  "color_code", "color_name", "color_group_tr", "color_group_sr",
                  "color_group", "pattern", "availability", "size", "image",
                  "swatch_url", "size_order"]

    parent_file_name = spider_name + '_' + str(parent_dataset.item.name) + '_' + str(run.id) + '.csv'
    child_file_name = spider_name + '_' + str(child_dataset.item.name) + '_' + str(run.id) + '.csv'
    parent_path = '%s/%s' % (work_dir, parent_file_name)
    child_path = '%s/%s' % (work_dir, child_file_name)

    # Parent rows go through data_change() (column rename + id strip);
    # child rows only lose their internal 'id'.
    _write_dataset_csv(parent_path, parent_keys,
                       (data_change(row) for row in parent_dataset.scan()))
    _write_dataset_csv(child_path, child_keys, _strip_ids(child_dataset.scan()))

    date = datetime.datetime.now()
    week_no = int(date.strftime("%U")) + 1  # 1-based delivery week number
    s3_parent_filename = ("Week_" + str(week_no) + '/' + str(parent_dataset.item.name) + "/" +
                          spider_name + '_' + str(parent_dataset.item.name) + '_' +
                          date.strftime('%Y-%m-%d') + '.csv')
    s3_child_filename = ("Week_" + str(week_no) + '/' + str(child_dataset.item.name) + "/" +
                         spider_name + '_' + str(child_dataset.item.name) + '_' +
                         date.strftime('%Y-%m-%d') + '.csv')
    _upload_to_s3(s3_parent_filename, parent_path)
    _upload_to_s3(s3_child_filename, child_path)

    # Slack incoming-webhook URL (from the Slack integration settings) where we
    # announce that the data reached S3.
    url = 'https://hooks.slack.com/services/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'
    # BUGFIX: the old check read `bucket.get_key(name).exists` — `exists` is a
    # bound method in boto (always truthy, never called), and get_key() returns
    # None for a missing key, which raised AttributeError. `is not None` is the
    # real existence test.
    if bucket.get_key(s3_parent_filename) is not None:
        _notify_slack(run, s3_parent_filename, url)
    if bucket.get_key(s3_child_filename) is not None:
        _notify_slack(run, s3_child_filename, url)

    # Remove everything created locally for this run.
    shutil.rmtree(work_dir)
    del parent_dataset
    del child_dataset
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement