Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from grab import Grab #from grab.tools import rex #import re
- """ STRUCT of CATALOG
- category ( 'Люстры и бра. Классика ')
- subcategory (catalog/144_ODEON_Italiya/) named by vendor
- series {catalog/144_ODEON_Italiya/ , ['/catalog/1648_seriya_Alicante', 'серия Alicante'], ['/catalog/1732_seriya_Palacio1', 'серия Palacio1']
- items [attrib1 : value1, \
- attrib2 : value2 ]
- """
- BASE_URL = "http://www.svetlux.ru"
- document_charset='Windows-1251'
- items = {}
- categories_urls = []
- categories_raw = []
- categories_text = []
- subcategories = {}
- subcategories_urls = []
- subcategories_raw = []
- subcategories_text = []
- series = {} # key = subcategoryurl : [[url, title],[url, title]] items = {}
def check_url(url):
    """Normalise *url* for requests against BASE_URL.

    Prefixes BASE_URL when it is missing, inserts a '/' separator
    between BASE_URL and a relative path, and appends a trailing '/'
    unless the url already ends in '/' or points at a '.jpg'.
    """
    normalized = url
    if not (url.startswith('/') or BASE_URL.endswith('/')):
        url = '/' + url
    if BASE_URL not in url:
        normalized = BASE_URL + url
    if not url.endswith(('/', '.jpg')):
        normalized += '/'
    return normalized
def get_categories_raw(url):
    """Fetch *url* and collect the top-level category links.

    Appends the raw HTML of every ``//*[@class="category"]/a`` node to
    ``categories_raw`` and its visible text to ``categories_text``.
    """
    page = Grab()
    page.go(url=check_url(url), document_charset=document_charset)
    links = page.doc.select('//*[@class="category"]/a')
    if links.exists:
        for link in links:
            categories_raw.append(link.html())   # keep the HTML for get_urls()
            categories_text.append(link.text())  # keep the clean label
def get_subcategories_raw(url):
    """Fetch *url* and collect the subcategory (vendor) links.

    Appends the raw HTML of every ``//*[@class="subcat"]/a`` node to
    ``subcategories_raw`` and its visible text to ``subcategories_text``.
    """
    page = Grab()
    page.go(url=check_url(url), document_charset=document_charset)
    links = page.doc.select('//*[@class="subcat"]/a')
    if links.exists:
        # TODO: replace the parallel lists with the ``subcategories``
        # dict, e.g. subcategories[url] = [[value, key], ...]
        for link in links:
            subcategories_raw.append(link.html())   # keep the HTML for get_urls()
            subcategories_text.append(link.text())  # keep the clean label
def get_series(url):
    """Collect every series link of one subcategory page into ``series``.

    For each ``saleitem`` block the header node's HTML is sliced between
    a fixed 32-char prefix and the closing ``/">`` to get the series url,
    and the node text supplies the title.  Pairs are stored as
    ``series[url] -> [[series_url, title], ...]``.

    BUG FIXES vs. the original:
    - the duplicate check compared a string against [url, title] pairs
      (never true) and the ``else`` branch clobbered an existing list;
      ``setdefault`` + a real membership test fixes both;
    - the bare ``except:`` with a no-op ``ValueError`` expression
      silently swallowed every error; only the expected ValueError from
      ``str.index`` is handled now;
    - removed the unused ``img`` selection.
    """
    url = check_url(url)
    g = Grab()
    g.go(url=url, document_charset=document_charset)
    found = g.doc.select('//*[@class="saleitem"]')
    if not found.exists:
        return
    for ser in found:
        headers = ser.select('./div[*]/div[@class="zagolovok"]')
        for elem in headers:
            html = elem.html()
            try:
                value = html[32:html.index('/">')]
            except ValueError:
                continue  # header without the expected '/">' marker
            key = elem.text()
            entries = series.setdefault(url, [])
            if value not in (v for v, _ in entries):
                entries.append([value, key])
def get_urls(raw, dest):
    """Extract the href path from each raw ``<a ...>`` snippet in *raw*.

    Each snippet is expected to look like ``<a href="PATH/" style=...``:
    the slice drops the 9-char ``<a href="`` prefix and the 3 chars
    (``/" ``) preceding ``style``, appending the bare path to *dest*.

    BUG FIXES vs. the original: removed the per-element / final debug
    prints, and snippets without a ``style`` attribute are now skipped
    instead of raising ValueError from ``str.index``.
    """
    for elem in raw:
        try:
            dest.append(str(elem[9:elem.index('style') - 3]))
        except ValueError:
            continue  # no style attribute -- not a link we can slice
def get_items(url, verbose=None):
    """Scrape one series page and record each item into ``items``.

    For every ``fleft name2p`` node, the item url and image url are
    sliced out of the node's HTML and stored together with the title and
    the price from get_price(), merged into ``items[item_url]``.
    Returns the attribute dict of the last item parsed (``{}`` when the
    page has none), matching the original contract.

    BUG FIX vs. the original: the merge path called ``.append({})`` on a
    dict, raising AttributeError whenever an item url repeated; a plain
    ``setdefault(...).update(...)`` merge replaces it.
    """
    url = check_url(url)
    g = Grab()
    g.go(url=url, document_charset=document_charset)
    found = g.doc.select('//*[@class="fleft name2p"]')
    item_buffer = {}
    if found.exists:
        for ser in found:
            html = ser.html()
            # url sits between '><a href="' and '"><div'
            item_url = html[html.index('><a href="') + 10:html.index('"><div')]
            # image path sits between ": url('/" and "') no-"
            img_url = html[html.index(": url('/") + 8:html.index("') no-")]
            item_buffer = {
                'item_title': ser.text(),
                'item_url': item_url,
                'img_url': img_url,
                'price': get_price(item_url),
            }
            items.setdefault(item_url, {}).update(item_buffer)
            if verbose:
                print(item_buffer)
    return item_buffer
def get_all_items():
    """Grab the first attribute page of every known series.

    BUG FIX vs. the original: iterating ``series`` (a dict) yields key
    strings, so ``url[1]`` was the *second character* of the key rather
    than a url.  We now walk the stored [series_url, title] pairs and
    fetch each series url.
    """
    for entries in series.values():
        for series_url, _title in entries:
            get_items(series_url + '/')
    print(items)
def get_item_table(item_url, verbose=None):
    """Parse the attribute table of one item page into ``items[item_url]``.

    The table HTML is flattened by stripping structural tags, turning
    row ends into separators, and splitting; each remaining chunk is
    partitioned at its first space into an attribute-name / value pair.

    NOTE(review): the separator strings below come from a
    whitespace-mangled paste -- confirm whether '</tr>' should map to
    one or two spaces before trusting row splitting on multi-word
    attribute names.

    BUG FIX vs. the original: removed the unused ``item_buffer`` local
    and merged the three cleanup passes into one loop (same order of
    replacements, identical result).
    """
    item_url = check_url(item_url)
    g = Grab()
    g.go(url=item_url, document_charset=document_charset)
    found = g.doc.select('//*[@id="main"]/div[1]/div[2]/div[1]/table')
    if found.exists:
        text = found.html()
        # structural tags carry no data -- drop them
        for tag in ['</table>', '<table>', '<b>', '</b>', '\n', '<tr>']:
            text = text.replace(tag, '')
        # row end becomes the split separator, cell divider a space
        text = text.replace('</tr>', ' ')
        result = []
        for chunk in text.split(' '):
            chunk = chunk.replace('</td><td>', ' ')
            chunk = chunk.replace('<td>', '')
            chunk = chunk.replace('</td>', '')
            result.append(chunk)
        if verbose:
            print(result)  # dump the raw table rows when verbose != None
        # first word of each chunk is the attribute name, the rest its value
        item_params = {}
        for row in result:
            name, _, value = row.partition(' ')
            item_params[name] = value
        items[item_url] = item_params
        print(items)
def get_price(url):
    """Return the price text of the item page at *url*, or None.

    Selects the price span by its absolute xpath and returns the text of
    the first match; returns None explicitly when nothing matches.

    BUG FIX vs. the original: ``check_url(url)`` was called twice with
    the first result discarded -- one call is enough.
    """
    url = check_url(url)
    g = Grab()
    g.go(url=url, document_charset=document_charset)
    found = g.doc.select('/html/body/div[6]/div/div[6]/div[3]/div[1]/div[2]/div[2]/div[2]/div/div[1]/span[1] ')
    for node in found:
        return node.text()  # first match wins
    return None
def view(source_list):
    """Dump *source_list* to stdout.

    Handy for inspecting the module accumulators, e.g.
    subcategories_urls, subcategories_raw, subcategories_text.
    """
    contents = source_list
    print(contents)
def list_series(url_list):
    """Run get_series() once for every subcategory url in *url_list*."""
    for subcategory_url in url_list:
        get_series(subcategory_url)
if __name__ == '__main__':
    get_categories_raw(BASE_URL)                      # tested OK
    get_urls(categories_raw, categories_urls)         # tested OK
    get_subcategories_raw(BASE_URL)                   # tested OK
    get_urls(subcategories_raw, subcategories_urls)   # tested OK
    #get_series(BASE_URL + '/catalog/144_ODEON_Italiya/')  # tested OK
    #get_items(BASE_URL + '/catalog/1980_seriya_Valle/', 1)  # tested OK
    #get_item_table('http://www.svetlux.ru/catalog/1980_seriya_Valle/8460_Nastolnaya_lampa_Valle/')
    # BUG FIX: get_series() expects a single url; passing the whole list
    # crashed inside check_url().  list_series() iterates it properly.
    list_series(subcategories_urls)
    view(series)
    # get_all_items()  # NOTE(review): ``items`` stays empty unless this runs
    # BUG FIX: iterating a dict yields key strings, so i[0] was only the
    # first character of each item url.
    for item_url in items:
        get_item_table(item_url)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement