Advertisement
collinsanele

latest

Sep 5th, 2019
191
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.30 KB | None | 0 0
  1. from lxml import html
  2. from bs4 import BeautifulSoup
  3. import json
  4.  
  5.  
  6. """pip install bs4"""
  7.  
  8.  
  9. def html2json(path):
  10.  
  11. #NB: Works for any html file that has 5397124106352075058.htm and 1414701776531394236.htm format
  12.  
  13. with open(path) as f:
  14. content = f.read()
  15.  
  16. soup = BeautifulSoup(content, "lxml")
  17.  
  18. paras = soup.findAll("p", {"align":"JUSTIFY"})
  19. text = soup.text
  20.  
  21. with open(path) as f:
  22. content = f.read()
  23.  
  24.  
  25. soup = BeautifulSoup(content, "lxml")
  26. text = soup.text.strip()
  27.  
  28. if "Ans" in text.strip():
  29. qnums = []
  30. c_options = []
  31. results = []
  32. solutions = []
  33. container = []
  34. str_container = []
  35.  
  36. with open(path) as f:
  37. content = f.read()
  38.  
  39. soup = BeautifulSoup(content, "lxml")
  40. paras = soup.findAll("p", {"align":"JUSTIFY"})
  41. indexes = []
  42. for i,item in enumerate(paras):
  43. if "Sol" in item.text:
  44. indexes.append(i)
  45.  
  46. for i,item in enumerate(indexes):
  47. try:
  48. container.append(paras[indexes[i]:indexes[i+1]])
  49. except IndexError:
  50. pass
  51.  
  52. container1 = []
  53.  
  54. for item in container:
  55. for sub in item:
  56. if "Ans" in sub.text:
  57. container1.append(item)
  58.  
  59.  
  60. for item in container1:
  61. for sub in item:
  62. if "Sol" in sub.text:
  63. qn = sub.text.split()[0]
  64. qnums.append(qn)
  65.  
  66. if "Ans" in sub.text:
  67. c_option = sub.text
  68. c_options.append(c_option)
  69.  
  70.  
  71.  
  72. c_options = [item.replace("Ans.", "").replace("Ans", "").strip() for item in c_options]
  73.  
  74.  
  75. str_container = [str(item).replace("[", "").replace('<p align="JUSTIFY">', "")
  76. .replace("</p>", "").replace(",", "")
  77. .replace("]", "").replace("IND", "NEL").strip() for item in container1]
  78.  
  79.  
  80. str_container = [item.replace("<b>Ans. (A)", "")
  81. .replace("<b>Ans. (B)", "")
  82. .replace("<b>Ans. (C)", "").replace("<b>Ans. (D)", "")
  83. .replace("<b>", "").replace("</b>", "").strip() for item in str_container]
  84.  
  85.  
  86.  
  87. for x in range(len(qnums)):
  88. obj = {"Question Number":"", "Correct Option":"", "Solution":""}
  89. obj["Question Number"] = qnums[x]
  90. obj["Correct Option"] = c_options[x]
  91. obj["Solution"] = str_container[x]
  92.  
  93. results.append(obj)
  94.  
  95. final = {"results":results}
  96.  
  97. if "\\" in path:
  98. result_name = path.split("\\")[-1].split(".")[0]
  99.  
  100. else:
  101. result_name = path.split("/")[-1].split(".")[0]
  102. #Saves result as json in the current working directory
  103. with open(f"{result_name}.json", "w") as f:
  104. json.dump(final, f)
  105. print("Done!")
  106.  
  107. elif "Sol" in text.strip():
  108. results = []
  109. container = []
  110. with open(path) as f:
  111. content = f.read()
  112.  
  113. soup = BeautifulSoup(content, "lxml")
  114. paras = soup.findAll("p", {"align":"JUSTIFY"})
  115. indexes = []
  116. for i,item in enumerate(paras):
  117. if "Sol" in item.text:
  118. indexes.append(i)
  119.  
  120. for i,item in enumerate(indexes):
  121. try:
  122. container.append(paras[indexes[i]:indexes[i+1]])
  123. except IndexError:
  124. pass
  125.  
  126.  
  127. qnums = []
  128. c_options = []
  129. str_container = []
  130.  
  131. for i, item in enumerate(container):
  132. if not item[0].findAll("b"):
  133. del container[i]
  134.  
  135.  
  136. for item in container:
  137. try:
  138. target = item[0].findAll("b")
  139. except Exception as e:
  140. print(e)
  141. pass
  142.  
  143. try:
  144. qn = target[0].text.split()[0].strip()
  145. qnums.append(qn)
  146. except Exception:
  147. pass
  148.  
  149. try:
  150. if len(target) == 2:
  151. c_option = target[1].text
  152. c_options.append(c_option.strip())
  153. elif len(target) == 1:
  154. c_option = target[0].text.split()[1]
  155. c_options.append(c_option.strip())
  156.  
  157. except IndexError:
  158. c_option = " "
  159. c_options.append(c_option)
  160. pass
  161.  
  162.  
  163. #container
  164. for item in container:
  165. try:
  166. trash = item[0].find("b").decompose()
  167. except:
  168. pass
  169.  
  170.  
  171. str_container = [str(item).replace("[", "").replace('<p align="JUSTIFY">', "")
  172. .replace("</p>", "").replace(",", "")
  173. .replace("]", "").replace("IND", "NEL").strip() for item in container]
  174.  
  175. for x in range(len(qnums)):
  176. obj = {"Question Number":"", "Correct Option":"", "Solution":""}
  177. obj["Question Number"] = qnums[x]
  178. obj["Correct Option"] = c_options[x]
  179. obj["Solution"] = str_container[x]
  180.  
  181. results.append(obj)
  182.  
  183. final = {"results":results}
  184.  
  185.  
  186. if "\\" in path:
  187. result_name = path.split("\\")[-1].split(".")[0]
  188.  
  189. else:
  190. result_name = path.split("/")[-1].split(".")[0]
  191.  
  192. #Saves result as json in the current working directory
  193. with open(f"{result_name}.json", "w") as f:
  194. json.dump(final, f)
  195. print("Done!")
  196.  
  197.  
  198.  
  199.  
  200. def html2json2(path):
  201. #NB: Works for any html file that has 1414701776531394236.htm and 315022854031645383.htm format
  202. qnums = []
  203. results = []
  204. container = []
  205. qnum = ''
  206. tank = []
  207. str_container = []
  208.  
  209. with open(path) as f:
  210. content = f.read()
  211.  
  212. soup = BeautifulSoup(content, "html5lib")
  213. paras = soup.findAll("p", {"align":"JUSTIFY"})
  214. indexes = []
  215.  
  216. a = [f"{x}." for x in range(1, 340)]
  217. b = [f"{x}" for x in range(1, 340)]
  218.  
  219. for i,item in enumerate(paras):
  220. try:
  221. if item.find("b").text.strip() in a or item.find("b").text.strip() in b:
  222. indexes.append(i)
  223.  
  224. except Exception as e:
  225. pass
  226.  
  227.  
  228. for i,item in enumerate(indexes):
  229. try:
  230. container.append(paras[indexes[i]:indexes[i+1]])
  231. except IndexError:
  232. pass
  233.  
  234.  
  235.  
  236. for item in container:
  237. for sub in item:
  238. try:
  239. if sub.find("b"):
  240. content = sub.find("b").text.strip()
  241. except:
  242. pass
  243. qnums.append(content)
  244.  
  245. str_container = [str(item).replace("[", "").replace('<p align="JUSTIFY">', "")
  246. .replace("</p>", "").replace(",", "")
  247. .replace("]", "").replace("IND", "NEL").strip() for item in container]
  248.  
  249.  
  250. for x in range(len(qnums)):
  251. obj = {"Question Number":"", "Correct Option":"", "Solution":""}
  252. obj["Question Number"] = qnums[x]
  253. obj["Correct Option"] = ""
  254. obj["Solution"] = str_container[x]
  255.  
  256. results.append(obj)
  257.  
  258. final = {"results":results}
  259.  
  260.  
  261. if "\\" in path:
  262. result_name = path.split("\\")[-1].split(".")[0]
  263.  
  264. else:
  265. result_name = path.split("/")[-1].split(".")[0]
  266.  
  267. #Saves result as json in the current working directory
  268. with open(f"{result_name}.json", "w") as f:
  269. json.dump(final, f)
  270. print("Done!")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement