SHARE
TWEET

Untitled

a guest Nov 19th, 2019 85 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import requests
  2. import lxml.html
  3. import re
  4.  
  5.  
  6. def page_parser(base_url, urls_list=[]):
  7.     status = requests.get(base_url).status_code
  8.     count = 1
  9.     print(status)
  10.  
  11.     html_page = requests.get(base_url).content
  12.     print('html page is gotten')
  13.  
  14.     html_tree = lxml.html.fromstring(html_page)
  15.     print('html tree is built')
  16.  
  17.     a = html_tree.xpath(".//a")
  18.     print('object of <a>_list is gotten')
  19.  
  20.     # urls_list = []
  21.     print('loop is started')
  22.     for url in a:
  23.         # global count
  24.         print('---------')
  25.         print(count)
  26.         count += 1
  27.         print('list contains ' + str(len(urls_list)))
  28.         url = url.get("href")
  29.         print('url is gotten')
  30.         if not isinstance(url, str) or url == '':
  31.             continue
  32.         url = url.rsplit('?', 1)[0]
  33.         print(url)
  34.         # status_url = requests.get(url).status_code
  35.         if status != 200:
  36.             print('HTTP ' + str(status))
  37.             continue
  38.         elif url in urls_list:
  39.             print('Url is already in the list')
  40.             continue
  41.         else:
  42.             urls_list.append(url)
  43.             print('Url is added to the list')
  44.         # else:
  45.         #     print('url does not satisfy requirements')
  46.         #     continue
  47.     print('loop is finished')
  48.     return urls_list
  49.  
  50.  
  51. def site_parser(page_parser_fn, links_list):
  52.     for url in links_list:
  53.         new_list = page_parser_fn(url)
  54.         for i in new_list:
  55.             if i not in links_list:
  56.                 links_list.append(i)
  57.     return links_list
  58.  
  59.  
# Root of the site to crawl; collected links are filtered to those
# containing this prefix (see the base_url check in crawl()).
site_url = 'https://www.olx.ua/'
# parsed_urls_list = site_parser(page_parser, page_parser(site_url))

# # with open(r'C:\Users\Igor\PycharmProjects\untitled\Olx_links.txt', 'w') as file:
  64.  
  65. def crawl(url, indexed=[], collected=[], base_url=''):
  66.     new_urls = list(set(page_parser(url)))
  67.     indexed.append(url)
  68.     for x in new_urls:
  69.         x = x.rsplit('?', 1)[0]
  70.         if x not in collected and base_url in x:
  71.             collected.append(x)
  72.     print("Collected: " + str(len(collected)))
  73.     print("Indexed: " + str(len(indexed)))
  74.  
  75.     for x in (y for y in collected if y not in indexed):
  76.         return crawl(x, indexed, collected, base_url)
  77.     return collected
  78.  
# Entry point: crawl the whole site starting at its root.  Performs live
# network I/O via page_parser and may run for a long time on a real site.
urls = crawl(site_url, base_url=site_url)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top