Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- import lxml.html
- import re
def page_parser(base_url, urls_list=None):
    """Collect the links found on a single page.

    Downloads ``base_url``, reads the ``href`` of every ``<a>`` element,
    drops everything after the last ``?`` and appends each link that is not
    already known to ``urls_list``. Progress is reported with ``print``.

    Args:
        base_url: Address of the page to download.
        urls_list: Optional list to extend in place. When omitted a fresh
            list is created, so independent calls never share results.

    Returns:
        ``urls_list`` (the list passed in, or the newly created one) with the
        new links appended in the order the anchors were returned. The list
        is returned unchanged when the page does not answer with HTTP 200.
    """
    # A mutable default would be shared by every call that omits the argument.
    if urls_list is None:
        urls_list = []

    # Fetch once and reuse the response for both the status and the body.
    response = requests.get(base_url)
    status = response.status_code
    print(status)
    if status != 200:
        # The status describes the page itself, not its links, so there is
        # nothing to gain from parsing the body or checking it per link.
        print('HTTP ' + str(status))
        return urls_list

    html_page = response.content
    print('html page is gotten')
    html_tree = lxml.html.fromstring(html_page)
    print('html tree is built')
    anchors = html_tree.xpath(".//a")
    print('object of <a>_list is gotten')

    # Set mirror of urls_list so duplicate checks do not rescan the list.
    known = set(urls_list)
    print('loop is started')
    for count, anchor in enumerate(anchors, start=1):
        print('---------')
        print(count)
        print('list contains ' + str(len(urls_list)))
        url = anchor.get("href")
        print('url is gotten')
        if not isinstance(url, str):
            # <a> without an href attribute.
            continue
        url = url.rsplit('?', 1)[0]
        if not url:
            # Empty href, or an href that consisted only of a query string.
            continue
        print(url)
        if url in known:
            print('Url is already in the list')
            continue
        known.add(url)
        urls_list.append(url)
        print('Url is added to the list')
    print('loop is finished')
    return urls_list
def site_parser(page_parser_fn, links_list):
    """Grow ``links_list`` in place with the links reachable from its entries.

    Each entry of ``links_list`` is handed to ``page_parser_fn``; every link
    it returns that is not yet present in ``links_list`` is appended.

    Args:
        page_parser_fn: Callable that takes one link and returns an iterable
            of links.
        links_list: Seed links; extended in place.

    Returns:
        The same ``links_list`` object that was passed in.
    """
    # Iterating the list while appending to it means the entries added below
    # are visited by this same loop later on.
    for current in links_list:
        found = page_parser_fn(current)
        for candidate in found:
            if candidate in links_list:
                continue
            links_list.append(candidate)
    return links_list
# Page the crawl starts from.
site_url = "https://www.olx.ua/"
# Disabled alternative that went through site_parser and wrote to a file:
# parsed_urls_list = site_parser(page_parser, page_parser(site_url))
# # with open(r'C:\Users\Igor\PycharmProjects\untitled\Olx_links.txt', 'w') as file:
def crawl(url, indexed=None, collected=None, base_url=''):
    """Crawl outward from ``url`` and return every matching link discovered.

    Starting with ``url``, each page is passed to ``page_parser``. Every
    returned link has everything after its last ``?`` removed and is added to
    ``collected`` when it is new and contains ``base_url`` as a substring.
    The next page visited is the first entry of ``collected`` that is not yet
    in ``indexed``; the crawl ends when no such entry is left. The totals are
    printed after every page.

    Args:
        url: Page to start from. It is recorded in ``indexed`` but only ends
            up in ``collected`` if some page links to it.
        indexed: Optional list of pages already visited; extended in place.
        collected: Optional list of links already gathered; extended in place.
        base_url: Substring a link must contain to be collected. The default
            ``''`` accepts every link.

    Returns:
        ``collected`` (the list passed in, or a newly created one).
    """
    # Mutable defaults would leak state from one crawl into the next.
    if indexed is None:
        indexed = []
    if collected is None:
        collected = []

    # Set mirrors of both lists keep membership tests cheap on large sites.
    indexed_seen = set(indexed)
    collected_seen = set(collected)

    # Everything in collected before this index is known to be indexed.
    # Pages are never un-indexed, so the cursor only moves forward.
    cursor = 0
    current = url
    # A loop rather than one recursive call per page, so the crawl depth is
    # not capped by the interpreter's recursion limit.
    while True:
        indexed.append(current)
        indexed_seen.add(current)
        for link in page_parser(current):
            link = link.rsplit('?', 1)[0]
            if link not in collected_seen and base_url in link:
                collected_seen.add(link)
                collected.append(link)
        print("Collected: " + str(len(collected)))
        print("Indexed: " + str(len(indexed)))

        # Advance to the first collected link that has not been visited yet.
        while cursor < len(collected) and collected[cursor] in indexed_seen:
            cursor += 1
        if cursor == len(collected):
            return collected
        current = collected[cursor]
# Executed when the module is loaded: crawl from site_url and collect only
# links that contain site_url.
urls = crawl(url=site_url, base_url=site_url)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement