from grab import Grab

""" STRUCTURE OF THE CATALOG
category      ('Люстры и бра. Классика', i.e. "Chandeliers and sconces. Classic")
subcategory   (catalog/144_ODEON_Italiya/), named after the vendor
series        {catalog/144_ODEON_Italiya/: [['/catalog/1648_seriya_Alicante', 'серия Alicante'],
                                            ['/catalog/1732_seriya_Palacio1', 'серия Palacio1']]}
items         [attrib1: value1,
               attrib2: value2]
"""

BASE_URL = "http://www.svetlux.ru"
document_charset = 'Windows-1251'

items = {}
categories_urls = []
categories_raw = []
categories_text = []
subcategories = {}
subcategories_urls = []
subcategories_raw = []
subcategories_text = []
series = {}  # key = subcategory URL, value = [[series_url, series_title], ...]

def check_url(url):
    """Prepend BASE_URL to url if it is missing.
    Also inserts a '/' between BASE_URL and url and appends a trailing '/'.
    """
    result_url = url
    if not url.startswith('/') and not BASE_URL.endswith('/'):
        url = '/' + url
    if BASE_URL not in url:
        result_url = BASE_URL + url
    if not url.endswith('/') and not url.endswith('.jpg'):
        result_url += '/'
    return result_url
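
# A quick sanity check of check_url() behaviour (a sketch, assuming BASE_URL as set above;
# the .jpg filename is a placeholder):
#   check_url('catalog/144_ODEON_Italiya')
#       -> 'http://www.svetlux.ru/catalog/144_ODEON_Italiya/'
#   check_url('http://www.svetlux.ru/img/picture.jpg')
#       -> 'http://www.svetlux.ru/img/picture.jpg'   (already absolute, '.jpg' gets no trailing '/')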

def get_categories_raw(url):
    url = check_url(url)
    g = Grab()
    g.go(url=url, document_charset=document_charset)
    found = g.doc.select('//*[@class="category"]/a')
    if found.exists():
        for elem in found:
            categories_raw.append(elem.html())   # raw HTML of the category link
            categories_text.append(elem.text())  # clean link text

def get_subcategories_raw(url):
    url = check_url(url)
    g = Grab()
    g.go(url=url, document_charset=document_charset)
    found = g.doc.select('//*[@class="subcat"]/a')
    if found.exists():
        for elem in found:  # TODO: store into the subcategories dict as {url: [[value, key], ...]}
            subcategories_raw.append(elem.html())   # raw HTML of the subcategory link
            subcategories_text.append(elem.text())  # clean link text

def get_series(url):
    """Collect all series of a subcategory page into the global `series` dict."""
    url = check_url(url)
    g = Grab()
    g.go(url=url, document_charset=document_charset)
    found = g.doc.select('//*[@class="saleitem"]')
    if found.exists():
        for ser in found:
            k = ser.select('./div[*]/div[@class="zagolovok"]')
            img = ser.select('//*[@class="img lazy"]')  # image nodes (currently unused)
            for elem in k:
                try:
                    # the 32-char offset presumably skips the leading
                    # '<div class="zagolovok"><a href="' markup
                    value = elem.html()[32:elem.html().index('/">')]
                    key = elem.text()
                    if url not in series:
                        series[url] = []
                    if [value, key] not in series[url]:
                        series[url].append([value, key])
                except ValueError:
                    pass

def get_urls(raw, dest):
    """Extract the href value from each raw link snippet in `raw` into `dest`."""
    for elem in raw:
        print(elem)
        # the slice presumably strips the leading '<a href="' and the quote before 'style'
        dest.append(str(elem[9:elem.index('style') - 3]))
    print(dest)

def get_items(url, verbose=None):
    """Collect title, URL, image URL and price of every item on a series page."""
    url = check_url(url)
    g = Grab()
    g.go(url=url, document_charset=document_charset)
    found = g.doc.select('//*[@class="fleft name2p"]')
    item_buffer = {}
    if found.exists():
        for ser in found:
            raw = ser.html()
            item_title = ser.text()
            # slice the item URL and the background-image URL out of the raw markup
            item_url = raw[raw.index('><a href="') + 10:raw.index('"><div')]
            img_url = raw[raw.index(": url('/") + 8:raw.index("') no-")]
            item_buffer['item_title'] = item_title
            item_buffer['item_url'] = item_url
            item_buffer['img_url'] = img_url
            item_buffer['price'] = get_price(item_url)
            # merge this item's attributes into the global items dict
            items.setdefault(item_url, {}).update(item_buffer)
    if verbose:
        print(item_buffer)
    return item_buffer
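
# Illustrative only: the shape of the buffer get_items() returns for one item
# (the URL is the example used further below; the other values are placeholders):
#   {'item_title': '<item name>',
#    'item_url': '/catalog/1980_seriya_Valle/8460_Nastolnaya_lampa_Valle',
#    'img_url': 'img/<picture>.jpg',
#    'price': '<price text>'}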

def get_all_items():
    """Grab the first page of items for every series collected so far."""
    for series_list in series.values():
        for series_url, _series_title in series_list:
            get_items(series_url + '/')
    print(items)

def get_item_table(item_url, verbose=None):
    """Parse the attribute table of a single item page into the global `items` dict."""
    item_url = check_url(item_url)
    g = Grab()
    g.go(url=item_url, document_charset=document_charset)
    found = g.doc.select('//*[@id="main"]/div[1]/div[2]/div[1]/table')
    if found.exists():
        result = []
        text = found.html()
        new_text = []
        # strip the table markup: rows become chunks separated by three spaces,
        # cells inside a row are then separated by four spaces
        for x in ['</table>', '<table>', '<b>', '</b>', '\n', '<tr>']:
            text = text.replace(x, '')
        text = text.replace('</tr>', '   ')
        text = text.split('   ')
        for a in text:
            new_text.append(a.replace('</td><td>', '    '))
        for a in new_text:
            a = a.replace('<td>', '')
            a = a.replace('</td>', '')
            result.append(a)
        if verbose:
            print(result)  # print the raw table rows when verbose is not None
        item_params = {}
        for i in result:
            item_params[i.partition('    ')[0]] = i.partition('    ')[2]
        items[item_url] = item_params
        print(items)
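
# A minimal alternative sketch (defined but not called anywhere): the same table could be
# walked row by row with the selector API instead of string replacement. The XPath is the
# one get_item_table() already uses; the helper name is hypothetical.
def get_item_table_rows(item_url):
    g = Grab()
    g.go(url=check_url(item_url), document_charset=document_charset)
    params = {}
    for row in g.doc.select('//*[@id="main"]/div[1]/div[2]/div[1]/table//tr'):
        cells = [cell.text() for cell in row.select('./td')]
        if len(cells) == 2:  # attribute name in the first cell, its value in the second
            params[cells[0]] = cells[1]
    return params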

def get_price(url):
    """Return the price text of an item page, or None if the price node is missing."""
    url = check_url(url)
    g = Grab()
    g.go(url=url, document_charset=document_charset)
    found = g.doc.select('/html/body/div[6]/div/div[6]/div[3]/div[1]/div[2]/div[2]/div[2]/div/div[1]/span[1]')
    price = None
    for k in found:
        price = k.text()
    return price

def view(source_list):
    """Print a collection, e.g. subcategories_urls, subcategories_raw or subcategories_text."""
    print(source_list)


def list_series(url_list):
    """Run get_series() over every URL in url_list."""
    for elem in url_list:
        get_series(elem)


if __name__ == '__main__':
    get_categories_raw(BASE_URL)                       # tested OK
    get_urls(categories_raw, categories_urls)          # tested OK
    get_subcategories_raw(BASE_URL)                    # tested OK
    get_urls(subcategories_raw, subcategories_urls)    # tested OK

    # get_series(BASE_URL + '/catalog/144_ODEON_Italiya/')    # tested OK
    # get_items(BASE_URL + '/catalog/1980_seriya_Valle/', 1)  # tested OK
    # print(items)
    # get_item_table('http://www.svetlux.ru/catalog/1980_seriya_Valle/8460_Nastolnaya_lampa_Valle/')
    list_series(subcategories_urls)  # get_series() takes a single URL, so walk the list
    # view(categories_urls)
    view(series)
    # get_all_items()
    for item_url in items:
        get_item_table(item_url)