Advertisement
Guest User

Untitled

a guest
Jan 20th, 2019
98
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.87 KB | None | 0 0
  def add_external_links(bs_obj, scheme, exclude_url, title):
      """Store the external links found on a parsed page in ``external_links``.

      Every ``<a>`` tag whose ``href`` starts with ``https``, ``http``, ``www``
      or ``//`` and does not contain ``exclude_url`` is considered. One trailing
      slash is stripped from the ``href`` (the tag itself is updated in place),
      protocol-relative links (``//host/...``) are made absolute with
      ``scheme``, and the link is inserted unless an identical ``href`` is
      already stored. Each insert is committed immediately.

      Uses the module-level ``cur`` and ``conn`` objects, which are defined
      outside this function.
      NOTE(review): ``cur.execute`` is assumed to return the number of matching
      rows (as MySQLdb/PyMySQL cursors do) - confirm against the driver in use.

      Args:
          bs_obj: Parsed document exposing ``find_all`` (presumably a
              BeautifulSoup object - verify against caller).
          scheme: URL scheme of the page, used to complete ``//`` links.
          exclude_url: Text (the page's own host) that must not occur in a
              link for it to be treated as external.
          title: Title stored alongside every inserted link.
      """
      # Escape the host so its dots (and any other regex metacharacters) are
      # matched literally inside the negative lookahead.
      external_href = re.compile(
          "^(https|http|www|//)((?!" + re.escape(exclude_url) + ").)*$"
      )

      for link in bs_obj.find_all("a", href=external_href):
          href = link.attrs["href"]
          if href.endswith("/"):
              href = href[:-1]
              link.attrs["href"] = href

          # Protocol-relative links inherit the scheme of the page they were
          # found on.
          if href.startswith("//"):
              href = scheme + "://" + href[2:]

          # Look up the value that will actually be stored. The previous code
          # searched for the raw "//..." form while inserting "scheme://...",
          # so those links were never recognised as duplicates. It also tested
          # a second, never-assigned variable, which raised NameError.
          matching_rows = cur.execute(
              "SELECT * FROM external_links WHERE href=%s;", (href,)
          )
          if matching_rows != 0:
              continue

          # NOTE(review): links that already carried a scheme used to be written
          # to ``internal_links``. Only external links reach this point and the
          # duplicate check and the random picker both read ``external_links``,
          # so this is treated as a copy-paste slip - confirm.
          cur.execute(
              "INSERT INTO external_links (idx, href, title, contents) VALUES (%s, %s, %s, %s);",
              (0, href, title, "Temp contents"),
          )
          conn.commit()
  21.  
  22.  
  def split_address(addr):
      """Split an http(s) URL into its host and path segments.

      The scheme prefix is removed and the remainder is split on ``/``, so the
      first element is the host. A leading ``www.`` is stripped from the host.

      Args:
          addr: URL string.

      Returns:
          A list ``[host, segment, ...]`` when ``addr`` starts with
          ``https://`` or ``http://``; ``None`` otherwise.
      """
      for prefix in ("https://", "http://"):
          # Anchor on the start of the URL: a substring test also fires on
          # URLs that merely contain "https" further along (for example in a
          # query string) and then strips the wrong part.
          if addr.startswith(prefix):
              address_parts = addr[len(prefix):].split("/")
              # Rewrite the host in place. Replacing the whole list with the
              # host string made ``split_address(url)[0]`` return only the
              # first character of the host for "www." addresses.
              if address_parts[0].startswith("www."):
                  address_parts[0] = address_parts[0][len("www."):]
              return address_parts

      return None
  36.  
  37.  
  def get_random_external_links(starting_page):
      """Harvest the external links of a page and return one stored link at random.

      The page is fetched with ``urlopen`` and parsed, its external links are
      handed to ``add_external_links`` together with the page's scheme, host
      and title, and one ``href`` is then selected at random from the
      ``external_links`` table through the module-level ``cur``.

      Args:
          starting_page: URL of the page to fetch.

      Returns:
          The selected ``href`` as a ``str``, or ``-1`` when the page cannot
          be parsed or the table holds no rows.

      Errors raised by ``urlopen`` itself are not caught here.
      """
      html = urlopen(starting_page)

      try:
          bs_obj = BeautifulSoup(html, "html.parser")
      except AttributeError:
          return -1

      # A page without a <title> tag used to raise AttributeError here; store
      # an empty title instead.
      title_tag = bs_obj.find("title")
      title = title_tag.get_text() if title_tag is not None else ""

      # Pass the scheme, host and title of the page to add_external_links().
      add_external_links(
          bs_obj,
          urlparse(starting_page).scheme,
          split_address(starting_page)[0],
          title,
      )

      cur.execute("SELECT href FROM external_links ORDER BY RAND() LIMIT 1;")
      rows = cur.fetchall()
      if not rows:
          # Nothing stored yet: report failure rather than raising IndexError.
          return -1

      href = rows[0][0]
      # The value may come back as bytes (the schema listing in this file shows
      # ``href`` as a blob column). Decode it properly instead of trimming the
      # "b'...'" repr, which breaks on quotes and non-ASCII bytes.
      if isinstance(href, (bytes, bytearray)):
          return href.decode("utf-8", errors="replace")
      return str(href)
  62.  
  63.  
  def find_random_link(url):
      """Keep hopping to randomly selected stored links until a hop fails.

      Starting at ``url``, each step passes the current address to
      ``get_random_external_links`` and continues from the address it returns.

      Args:
          url: Address of the first page to visit.

      Returns:
          ``-1`` once ``get_random_external_links`` reports failure. There is
          no other exit condition.
      """
      # The original called ``get_random_external_link`` (no trailing "s"),
      # which is not defined in this file. It also recursed once per hop, so a
      # long walk would end in RecursionError; a loop has the same result
      # without the depth limit.
      next_url = url
      while True:
          next_url = get_random_external_links(next_url)
          if next_url == -1:
              return -1
  71.  
  72. +----------+--------------+------+-----+---------+----------------+
  73. | Field | Type | Null | Key | Default | Extra |
  74. +----------+--------------+------+-----+---------+----------------+
  75. | idx | int(11) | NO | PRI | <null> | auto_increment |
  76. | href | blob | NO | | <null> | |
  77. | title | varchar(255) | NO | | <null> | |
  78. | contents | blob | NO | | <null> | |
  79. +----------+--------------+------+-----+---------+----------------+
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement