import requests
import lxml.html
from urllib.parse import urljoin


def page_parser(base_url, urls_list=None):
    # A mutable default argument ([]) would be shared between calls,
    # so use a None sentinel instead.
    if urls_list is None:
        urls_list = []

    # Fetch the page once, rather than once for the status code and
    # again for the body.
    response = requests.get(base_url)
    print(response.status_code)
    if response.status_code != 200:
        # Nothing useful to parse on an error page.
        print('HTTP ' + str(response.status_code))
        return urls_list
    print('html page is fetched')

    html_tree = lxml.html.fromstring(response.content)
    print('html tree is built')

    anchors = html_tree.xpath(".//a")
    print('list of <a> elements is built')

    print('loop is started')
    for count, anchor in enumerate(anchors, start=1):
        print('---------')
        print(count)
        print('list contains ' + str(len(urls_list)))
        url = anchor.get("href")
        if not isinstance(url, str) or url == '':
            continue
        # Resolve relative hrefs against the page URL, then drop the
        # query string so one page is not collected many times.
        url = urljoin(base_url, url).rsplit('?', 1)[0]
        print(url)
        if url in urls_list:
            print('Url is already in the list')
        else:
            urls_list.append(url)
            print('Url is added to the list')
    print('loop is finished')
    return urls_list


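# A quick standalone check of page_parser, left commented out like the
# other example calls in this script; it fetches the live start page of
# the site, so it needs network access:
# start_links = page_parser('https://www.olx.ua/')
# print(str(len(start_links)) + ' links collected from the start page')

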
def site_parser(page_parser_fn, links_list):
    # Appending to links_list while iterating over it is deliberate:
    # the for loop also visits the links appended during the walk, so
    # the whole site is traversed in order of discovery.
    for url in links_list:
        new_list = page_parser_fn(url)
        for i in new_list:
            if i not in links_list:
                links_list.append(i)
    return links_list


site_url = 'https://www.olx.ua/'
# parsed_urls_list = site_parser(page_parser, page_parser(site_url))

# # with open(r'C:\Users\Igor\PycharmProjects\untitled\Olx_links.txt', 'w') as file:

def crawl(url, indexed=None, collected=None, base_url=''):
    # None sentinels again avoid the shared mutable default pitfall.
    if indexed is None:
        indexed = []
    if collected is None:
        collected = []

    # Iterate instead of tail-recursing so a large site cannot hit
    # Python's recursion limit.
    while url is not None:
        new_urls = list(set(page_parser(url)))
        indexed.append(url)
        for x in new_urls:
            x = x.rsplit('?', 1)[0]
            # Keep only URLs that belong to the crawled site.
            if x not in collected and base_url in x:
                collected.append(x)
        print("Collected: " + str(len(collected)))
        print("Indexed: " + str(len(indexed)))

        # Move on to the first collected URL that has not been indexed
        # yet; when none is left, the crawl is finished.
        url = next((y for y in collected if y not in indexed), None)
    return collected


urls = crawl(site_url, base_url=site_url)
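
# The commented-out "with open(...)" line earlier suggests the collected
# links were meant to be written to Olx_links.txt; a minimal sketch of
# that step (the relative file name is an assumption, since the original
# path was machine-specific):
with open('Olx_links.txt', 'w') as file:
    for link in urls:
        file.write(link + '\n')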