Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import os
import os.path
import re
import webbrowser

import xlsxwriter
import xlwt
from grab import Grab
#from xlutils.copy import copy
# --- Site and output files --------------------------------------------------
BASE_URL = "http://www.svetlux.ru/"
CAT_FILE = "categories.xls"
SUBCAT_FILE = "subcategories.xls"
SERIES_FILE = "series.txt"
#SERIES_FILE = "cat2/myfile.txt"
ITEMS_FILE = "items.xls"
ATTRIBS_NAMES_FILE = "attribs.txt"

# --- Header rows written as the first line of each exported sheet -----------
CAT_HEADER = ['name', 'cat_id', 'parent_id']
SUBCAT_HEADER = ['name', 'cat_id', 'parent_id']
SERIES_HEADER = ['name', 'cat_id', 'parent_id', 'img_url']
ITEMS_HEADER = [
    "product_id", "name(en)", "name(ru)", "categories",
    "sku", "upc", "ean", "jan", "isbn", "mpn",
    "location", "quantity", "model", "manufacturer", "image_name",
    "shipping", "price", "points",
    "date_added", "date_modified", "date_available",
    "weight", "weight_unit", "length", "width", "height", "length_unit",
    "status", "tax_class_id", "seo_keyword",
    "description(en)", "description(ru)",
    "meta_title(en)", "meta_title(ru)",
    "meta_description(en)", "meta_description(ru)",
    "meta_keywords(en)", "meta_keywords(ru)",
    "stock_status_id", "store_ids", "layout", "related_ids",
    "tags(en)", "tags(ru)", "sort_order", "subtract", "minimum",
]

FIRST_ITEM_NUMBER = 5616  # ID of the first product
document_charset = 'Windows-1251'  # charset passed to every Grab request

# --- Mutable state shared by the scraping functions -------------------------
categories = {}     # category url -> (name, cat_id, parent_id)
subcategories = {}  # subcategory url -> (name, cat_id, parent_id, url)
series = {}         # category page url -> list of [series url part, title]
series_parsed = 0
item_attrs_names = []
items = []          # batch of finished rows, one row per product of a series
myset = []          # attribute names seen so far, in first-seen order
items_count = 0     # number of products parsed so far; used for numbering
serie_id = 1667     # start line in SERIES_FILE and number of the next output file
def check_url(url, base_url=None):
    """Return *url* as an absolute URL under the site root.

    Args:
        url: An absolute URL that already starts with the site root, or a
            path relative to it (with or without a leading ``/``).
        base_url: Site root to resolve against. Defaults to the module-level
            ``BASE_URL``; exposed so other hosts can be used.

    Returns:
        The absolute URL. A trailing ``/`` is appended unless the URL already
        ends with ``/`` or ``.jpg``.
    """
    if base_url is None:
        base_url = BASE_URL
    if url.startswith(base_url):
        result_url = url
    else:
        # Join with exactly one slash: the previous plain concatenation
        # produced "host//path" for root-relative paths.
        result_url = base_url.rstrip('/') + '/' + url.lstrip('/')
    if not result_url.endswith(('/', '.jpg')):
        result_url += '/'
    return result_url
def get_categories():  # TESTED : OK
    """Fetch the site's front page and record every top-level category.

    Each ``class="category"`` link found on ``BASE_URL`` is stored in the
    global ``categories`` dict as ``url -> (name, cat_id, parent_id)``.
    Ids are handed out sequentially starting at 1; ``parent_id`` is always 0.
    """
    page = Grab()
    page.go(url=BASE_URL, document_charset=document_charset)
    links = page.doc.select('//*[@class="category"]/a')
    parent_id = 0
    # NOTE(review): `exists` is referenced without being called here; if it
    # is a method (as in the `selection` package) this guard never fires.
    if not links.exists:
        return
    for cat_id, link in enumerate(links, start=1):
        markup = str(link.html())
        # The href value starts at offset 10 of the serialized <a> tag and
        # ends two characters before the `style` attribute.
        href_end = markup.index('style') - 2
        categories[markup[10:href_end]] = link.text(), cat_id, parent_id
def get_subcategories(cat_url, next_cat_num):
    """Fetch one category page and record its subcategories.

    Every ``class="subcat"`` link is stored in the global ``subcategories``
    dict as ``url -> (name, cat_id, parent_id, url)``.

    Args:
        cat_url: Key of the parent category in ``categories``; also the path
            appended to ``BASE_URL`` to fetch the page.
        next_cat_num: Id given to the first subcategory found; later ones get
            consecutive ids.

    Raises:
        KeyError: If *cat_url* is not a known category.
        ValueError: If a link's markup contains no ``style`` attribute.
    """
    g = Grab()
    g.go(url=BASE_URL + cat_url, document_charset=document_charset)
    found = g.doc.select('//*[@class="subcat"]/a')
    category = categories[cat_url]
    print(category)
    # A category is stored as (name, cat_id, parent_id). The parent id of a
    # subcategory is the category's id (index 1), not its name (index 0).
    parent_id = category[1]
    for cat_id, elem in enumerate(found, start=next_cat_num):
        print(parent_id)
        raw = str(elem.html())
        url = raw[10:raw.index('style') - 2]
        subcategories[url] = elem.text(), cat_id, parent_id, url
def get_series(url):  # get all series of category
    """Collect the series listed on one category page.

    For every ``class="zagolovok"`` heading inside a ``class="saleitem"``
    block, a ``[series_url_part, title]`` pair is added to the global
    ``series`` dict under the normalized page URL. A series whose URL part is
    already recorded for that page is not added again.

    Args:
        url: Category page URL or site-relative path.
    """
    url = check_url(url)
    g = Grab()
    g.go(url=url, document_charset=document_charset)
    for block in g.doc.select('//*[@class="saleitem"]'):
        for heading in block.select('./div[*]/div[@class="zagolovok"]'):
            markup = heading.html()
            try:
                # The link target starts at offset 32 of the heading markup
                # and runs up to the closing `/">` of the href.
                value = markup[32:markup.index('/">')]
            except ValueError:
                # Heading without the expected link markup: skip it.
                continue
            key = heading.text()
            entries = series.setdefault(url, [])
            # Entries are [value, key] pairs, so compare against the first
            # element of each pair rather than against the pair itself.
            if all(entry[0] != value for entry in entries):
                entries.append([value, key])
def get_urls(raw, dest=None):
    """Extract the URL part from each serialized link and collect them.

    Args:
        raw: Iterable of strings; each must contain the substring ``style``.
            The URL is taken from offset 9 up to three characters before
            ``style``.
        dest: List to append to. A new list is created when omitted.

    Returns:
        The list the URLs were appended to (``dest`` itself when given).

    Raises:
        ValueError: If an element does not contain ``style``.
    """
    if dest is None:
        dest = []
    for fragment in raw:
        dest.append(str(fragment[9:fragment.index('style') - 3]))
    return dest
def series_txt_item_parse():  # feeds the series URLs from SERIES_FILE to get_items
    """Parse every series listed in ``SERIES_FILE`` from line ``serie_id`` on.

    Each line is expected to hold at least four ``|``-separated fields. The
    fourth is the series URL; the first three, in reverse order and joined
    with commas, form the id passed to ``get_items``.

    Prints ``File not exists`` and returns if ``SERIES_FILE`` is missing.
    Lines with fewer than four fields are reported and skipped.
    """
    try:
        with open(SERIES_FILE) as series_file:
            lines = series_file.read().splitlines()
    except FileNotFoundError:
        print('File not exists')
        return
    # Snapshot the start index: export_items() advances the global serie_id
    # while this loop is running.
    start = serie_id
    for line in lines[start:]:
        print(line)
        fields = line.split('|')
        if len(fields) < 4:
            print('Skipping malformed line: ' + line)
            continue
        serie_url = fields[3]
        ser_id = fields[2] + ',' + fields[1] + ',' + fields[0]
        get_items(serie_url, ser_id)  # hand the URL and id over for parsing
def get_items(url, ser_id):  # parses every product of the series at `url`
    """Scrape all products of one series page and export them.

    For every ``class="fleft name2p"`` element, the product link, title and
    image path are cut out of its markup. The price and attribute table are
    fetched from the product page and the result is queued through
    ``attrs_to_row``. After the loop the queued rows are written out with
    ``export_items``.

    Args:
        url: Series page URL or site-relative path.
        ser_id: Value stored as the product's ``item.series`` (its category
            column).
    """
    global items_count
    url = check_url(url)
    page = Grab()
    page.go(url=url, document_charset=document_charset)
    products = page.doc.select('//*[@class="fleft name2p"]')
    # NOTE(review): `exists` is referenced, not called; if it is a method the
    # guard is always truthy, so an empty series still produces an export.
    if products.exists:
        print('\n' + url + ' serie completed OK' + '\n')
        for node in products:
            markup = node.html()
            title = node.text()
            link_start = markup.index('><a href="') + 11
            link = markup[link_start:markup.index('"><div')]
            img_start = markup.index(": url('/") + 8
            image = markup[img_start:markup.index("') no-")]

            product_id = FIRST_ITEM_NUMBER + items_count
            items_count += 1
            price = get_price(link)
            print(link)
            description = get_item_table(BASE_URL + link)

            attrs_to_row({
                'product_id': product_id,
                'item_name': title,
                'item_url': link,
                # TODO: series name; for now the caller-supplied id is stored
                'item.series': ser_id,
                'image_name': image,
                'price': price,
                'description': description,
            })
        print('\n' + url + ' serie completed OK')
        # Instead of the series name, use the leading digits of the URL
        # (the part between "catalog/" and the first "_").
        name_start = url.index('catalog/') + 8
        serie_name = url[name_start:url.index('_')]
        export_items(serie_name)  # series is parsed, ready to export
        print('\n' + url + ' serie saved to ' + serie_name + '.xls \n')
def _parse_attr_table(table_html):
    """Turn the markup of a name/value table into a ``{name: value}`` dict."""
    attrs = {}
    table_html = table_html.replace('\n', '')
    for row in re.findall(r'<tr[^>]*>(.*?)</tr>', table_html, re.S | re.I):
        cells = [
            re.sub(r'<[^>]+>', '', cell).strip()  # drop <b> and other tags
            for cell in re.findall(r'<t[dh][^>]*>(.*?)</t[dh]>', row,
                                   re.S | re.I)
        ]
        # First cell is the attribute name; rows without a name are skipped.
        if cells and cells[0]:
            attrs[cells[0]] = ' '.join(cells[1:])
    return attrs


def get_item_table(item_url):  # product characteristics for the description
    """Fetch a product page and return its attribute table as a dict.

    Rows and cells are parsed by their ``<tr>``/``<td>`` tags, so attribute
    names and values that contain spaces are kept intact.

    Args:
        item_url: Product page URL or site-relative path.

    Returns:
        ``{attribute name: value}``; an empty dict when the page has no
        attribute table.
    """
    item_url = check_url(item_url)
    g = Grab()
    g.go(url=item_url, document_charset=document_charset)
    found = g.doc.select('//*[@id="main"]/div[1]/div[2]/div[1]/table')
    # Only the first matching table is used.
    for table in found:
        return _parse_attr_table(table.html())
    return {}
def attrs_to_row(item_attrs):  # build the export row for one product
    """Convert scraped product attributes into one export row.

    The row has one cell per ``ITEMS_HEADER`` column and is appended to the
    global ``items`` list. Attribute names from the product's description
    that are not yet in the global ``myset`` list are appended to it.

    Args:
        item_attrs: Dict with the keys ``product_id``, ``item_name``,
            ``item.series``, ``description`` (a dict of attribute name to
            value), ``image_name`` and ``price``.
    """
    product_id = item_attrs['product_id']  # FIRST_ITEM_NUMBER for the first product
    title = item_attrs['item_name']
    series_ref = item_attrs['item.series']  # TODO: series name, not its id/url
    specs = item_attrs['description']
    sku = specs['Артикул'] if 'Артикул' in specs.keys() else ""
    maker = specs['Производитель'] if 'Производитель' in specs.keys() else ""
    image = item_attrs['image_name'].replace('images/pics', 'catalog')
    price = item_attrs['price'].replace(' ', '')
    stamp = "2015-01-01 00:00:00"

    item_row = [
        product_id,   # product_id
        "",           # name(en)
        title,        # name(ru)
        series_ref,   # categories
        sku,          # sku
        "",           # upc
        "",           # ean
        "",           # jan
        "",           # isbn
        "",           # mpn
        "",           # location
        "999",        # quantity
        sku,          # model (same value as sku)
        maker,        # manufacturer
        image,        # image_name
        "yes",        # shipping
        price,        # price
        "0",          # points
        stamp,        # date_added
        stamp,        # date_modified
        stamp,        # date_available
        0,            # weight
        'kg',         # weight_unit
        0,            # length
        0,            # width
        0,            # height
        "cm",         # length_unit
        "true",       # status
        "0",          # tax_class_id
        "",           # seo_keyword
        "",           # description(en)
        specs,        # description(ru): the whole attribute dict
        "",           # meta_title(en)
        title,        # meta_title(ru)
        "",           # meta_description(en)
        "",           # meta_description(ru)
        "",           # meta_keywords(en)
        "",           # meta_keywords(ru)
        "6",          # stock_status_id
        "0",          # store_ids
        "",           # layout
        "",           # related_ids
        "",           # tags(en)
        "",           # tags(ru)
        "1",          # sort_order
        "true",       # subtract
        "1",          # minimum
    ]

    # Remember every attribute name seen, in first-seen order.
    for attr_name in specs.keys():
        if attr_name not in myset:
            myset.append(attr_name)
    items.append(item_row)
def get_price(url):
    """Fetch a product page and return its price text.

    Args:
        url: Product page URL or site-relative path.

    Returns:
        The text of the last element matched by the price XPath, or an empty
        string when the page has no such element. The empty string keeps the
        later ``.replace`` call on the price working.
    """
    url = check_url(url)
    g = Grab()
    g.go(url=url, document_charset=document_charset)
    found = g.doc.select('/html/body/div[6]/div/div[6]/div[3]/div[1]/div[2]/div[2]/div[2]/div/div[1]/span[1] ')
    price = ''
    for node in found:
        price = node.text()
    return price
def export(level, sheet_name, row):  # level is one of 'cat', 'subcat', 'series', 'items'
    """Export one of the scraped collections to its spreadsheet file.

    Args:
        level: ``'cat'``, ``'subcat'`` or ``'series'`` selects the collection,
            destination file and header row. ``'items'`` delegates to
            ``export_items`` and returns.
        sheet_name: Name of the worksheet to create. For ``'items'`` it is
            passed on to ``export_items``.
        row: First data row to write when the destination file already
            exists; otherwise the header goes to row 0 and data starts at 1.

    Raises:
        NameError: If *level* is not one of the supported values.
    """
    if level == 'items':
        # Items have their own exporter. Previously this branch referenced an
        # undefined name and then fell through to unbound locals.
        export_items(sheet_name)
        return
    if level == 'cat':
        source, dest_file, header = categories, CAT_FILE, CAT_HEADER
    elif level == 'subcat':
        source, dest_file, header = subcategories, SUBCAT_FILE, SUBCAT_HEADER
    elif level == 'series':
        source, dest_file, header = series, SERIES_FILE, SERIES_HEADER
    else:
        raise NameError('unknown export level: %r' % (level,))

    wb = xlwt.Workbook(encoding='utf-8')
    ws = wb.add_sheet(sheet_name)
    if not os.path.isfile(dest_file):
        # No file yet: write the header row, data starts right below it.
        for col, attr in enumerate(header):
            ws.write(0, col, attr)
        row = 1
    else:
        print('File is exist')
    fill_table(source, dest_file, wb, ws, row, level)
def fill_table(source, dest_file, wb, ws, row, level):
    """Write every entry of *source* to the worksheet and save the workbook.

    Args:
        source: Mapping whose values are sequences of cell values; one row is
            written per entry, in the mapping's iteration order.
        dest_file: Path the workbook is saved to.
        wb: Workbook object providing ``save(path)``.
        ws: Worksheet object providing ``write(row, col, value)``.
        row: Index of the first row to write.
        level: Export level name; accepted for interface compatibility and
            not used.
    """
    # Iterate the collection that was passed in. The previous loop walked the
    # unrelated global `items` list and called an undefined helper.
    for elem in source.items():
        for col, attr in enumerate(elem[1]):
            ws.write(row, col, str(attr))
        row += 1
    wb.save(dest_file)
def export_items(serie_name):
    """Write all queued product rows to ``sr/<serie_id>.xls`` and empty the queue.

    The workbook gets a ``Products`` sheet (header from ``ITEMS_HEADER`` plus
    one row per queued product, last queued first) and a
    ``ProductAttributes`` sheet listing every attribute name collected in
    ``myset``. The global ``serie_id`` is advanced by one on every call.

    Args:
        serie_name: Accepted for interface compatibility; the output file is
            named after ``serie_id``, not after this value.
    """
    global serie_id
    dest_file = 'sr/' + str(serie_id) + '.xls'
    serie_id += 1
    # Create the output directory up front so a missing directory does not
    # lose the scraped batch when the workbook is closed.
    os.makedirs(os.path.dirname(dest_file), exist_ok=True)

    # NOTE(review): xlsxwriter produces the .xlsx format although the file is
    # named .xls - confirm what the importing tool expects.
    wb = xlsxwriter.Workbook(dest_file)
    ws = wb.add_worksheet('Products')
    for col, attr in enumerate(ITEMS_HEADER):  # fill in the header
        ws.write(0, col, attr)
    print(dest_file + ' is created. Header added')

    row = 1
    while items:
        for col, elem in enumerate(items.pop()):
            print(elem)
            ws.write(row, col, str(elem))
        row += 1

    ws = wb.add_worksheet('ProductAttributes')
    ws.write(0, 0, 'attribute_group')
    ws.write(0, 1, 'ProductAttributes')
    ws.write(1, 0, '1')
    print(item_attrs_names)
    # Strip the brackets from the list's text form. Slicing the list itself
    # (as before) silently dropped the first and last attribute names.
    ws.write(1, 1, str(myset)[1:-1])
    wb.close()
    print(dest_file + ' is saved')
# Script entry point: parse every series listed in SERIES_FILE.
if __name__ == "__main__":
    series_txt_item_parse()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement