Advertisement
Guest User

Untitled

a guest
Jun 25th, 2016
73
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.69 KB | None | 0 0
  1. import sys
  2. import getpass
  3. import mysql.connector
  4.  
  5. class WordNotFoundError(Exception):
  6. def __init__(self, word):
  7. self.word = word
  8. def __str__(self):
  9. return self.word + " was not found."
  10.  
  11.  
  12. class PageIdNotFoundError(Exception):
  13. def __init__(self, pid):
  14. self.pid = pid
  15. def __str(self):
  16. return self.pid + " was not found."
  17.  
  18.  
  19. class LinkNotFoundError(Exception):
  20. def __init__(self, msg):
  21. self.msg = msg
  22. def __str__(self):
  23. return self.msg
  24.  
  25.  
  26. def get_pageid(c, w):
  27. c.execute("SELECT page_id FROM page WHERE page_namespace=0 AND page_title=%s", (w,))
  28. result = c.fetchone()
  29. if result is None:
  30. raise WordNotFoundError(w)
  31. return result[0]
  32.  
  33.  
  34. def get_title(c, pid):
  35. c.execute("SELECT page_title FROM page WHERE page_id=%s", (pid,))
  36. result = c.fetchone()
  37. if result is None:
  38. raise PageIdNotFoundError(pid)
  39. return result[0].decode("utf-8")
  40.  
  41.  
  42. def get_linkfrom(c, w):
  43. c.execute("SELECT pl_from FROM pagelinks WHERE pl_from_namespace=0 AND pl_namespace=0 AND pl_title=%s", (w,))
  44. result = c.fetchall()
  45. if result:
  46. return [t[0] for t in result]
  47. else:
  48. return []
  49.  
  50.  
  51. def wp_hops(c, w_from, w_to):
  52. # Raise WordNotFoundError if input w_from, w_to are not in Wikipedia.
  53. w_to_pid = get_pageid(c, w_to)
  54. target = get_pageid(c, w_from)
  55.  
  56. title_list = [w_to]
  57. links = {}
  58. pids = set()
  59. n_link = 0 # for debug purpose.
  60.  
  61. while 1:
  62. next_title_list = []
  63. for title in title_list:
  64. print(n_link, title)
  65. linkfrom = get_linkfrom(c, title)
  66. if target in linkfrom:
  67. result = [w_from, title]
  68. t = title
  69. while t != w_to:
  70. t = links[t]
  71. result.append(t)
  72. return result
  73.  
  74. for lf in linkfrom:
  75. if lf not in pids:
  76. try:
  77. t = get_title(c, lf)
  78. except PageIdNotFoundError:
  79. pass
  80. else:
  81. links[t] = title
  82. pids.add(lf)
  83. next_title_list.append(t)
  84. title_list = next_title_list
  85. n_link += 1
  86.  
  87.  
  88. if __name__ == "__main__":
  89. if len(sys.argv) == 4:
  90. user = sys.argv[1]
  91. pw = getpass.getpass()
  92. w_from = sys.argv[2]
  93. w_to = sys.argv[3]
  94. elif len(sys.argv) == 5:
  95. user = sys.argv[1]
  96. pw = sys.argv[2]
  97. w_from = sys.argv[3]
  98. w_to = sys.argv[4]
  99. conn = mysql.connector.Connect(user=user, password=pw, db="jawiki", charset="utf8")
  100. c = conn.cursor()
  101. print(wp_hops(c, w_from, w_to))
  102. c.close()
  103. conn.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement