Advertisement
collinsanele

html_to_json1

Sep 9th, 2019
145
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.64 KB | None | 0 0
  1. from lxml import html
  2. from bs4 import BeautifulSoup
  3. import json
  4. import os
  5.  
  6.  
  7. """pip install bs4"""
  8.  
  9.  
  10. def html2json(path):
  11.  
  12. #NB: Works for any html file that has 5397124106352075058.htm and 1259815981046145990.htm format
  13. #NB path is the path to folder
  14. #NB The python script should be run in the folder where the html files are located
  15.  
  16. for path_, dirs, files in os.walk(path):
  17. for file in files:
  18. if file.endswith("htm") or file.endswith("html"):
  19. with open(os.path.join(path_, file)) as f:
  20. content = f.read()
  21.  
  22. soup = BeautifulSoup(content, "lxml")
  23.  
  24. paras = soup.findAll("p", {"align":"JUSTIFY"})
  25. text = soup.text
  26.  
  27.  
  28. if "Ans" in text.strip():
  29. qnums = []
  30. c_options = []
  31. results = []
  32. solutions = []
  33. container = []
  34. str_container = []
  35.  
  36. indexes = []
  37. for i,item in enumerate(paras):
  38. if "Sol" in item.text:
  39. indexes.append(i)
  40.  
  41. for i,item in enumerate(indexes):
  42. try:
  43. container.append(paras[indexes[i]:indexes[i+1]])
  44. except IndexError:
  45. pass
  46.  
  47. container1 = []
  48.  
  49. for item in container:
  50. for sub in item:
  51. if "Ans" in sub.text:
  52. container1.append(item)
  53.  
  54.  
  55. for item in container1:
  56. for sub in item:
  57. if "Sol" in sub.text:
  58. qn = sub.text.split()[0]
  59. qnums.append(qn)
  60.  
  61. if "Ans" in sub.text:
  62. c_option = sub.text
  63. c_options.append(c_option)
  64.  
  65.  
  66.  
  67. c_options = [item.replace("Ans.", "").replace("Ans", "").strip() for item in c_options]
  68.  
  69.  
  70. str_container = [str(item).replace("[", "").replace('<p align="JUSTIFY">', "")
  71. .replace("</p>", "").replace(",", "")
  72. .replace("]", "").replace("IND", "NEL").strip() for item in container1]
  73.  
  74.  
  75. str_container = [item.replace("<b>Ans. (A)", "")
  76. .replace("<b>Ans. (B)", "")
  77. .replace("<b>Ans. (C)", "").replace("<b>Ans. (D)", "")
  78. .replace("<b>", "").replace("</b>", "").strip() for item in str_container]
  79.  
  80.  
  81.  
  82. for x in range(len(qnums)):
  83. obj = {"Question Number":"", "Correct Option":"", "Solution":""}
  84. obj["Question Number"] = qnums[x]
  85. obj["Correct Option"] = c_options[x]
  86. obj["Solution"] = str_container[x]
  87.  
  88. results.append(obj)
  89.  
  90. final = {"results":results}
  91.  
  92. result_name = file.split(".")[0]
  93.  
  94. #Saves result as json in the current working directory
  95. with open(f"{result_name}.json", "w") as f:
  96. json.dump(final, f)
  97. print("Done!")
  98.  
  99. elif "Sol" in text.strip():
  100. results = []
  101. container = []
  102.  
  103. soup = BeautifulSoup(content, "lxml")
  104. paras = soup.findAll("p", {"align":"JUSTIFY"})
  105. indexes = []
  106. for i,item in enumerate(paras):
  107. if "Sol" in item.text:
  108. indexes.append(i)
  109.  
  110. for i,item in enumerate(indexes):
  111. try:
  112. container.append(paras[indexes[i]:indexes[i+1]])
  113. except IndexError:
  114. pass
  115.  
  116.  
  117. qnums = []
  118. c_options = []
  119. str_container = []
  120.  
  121. for i, item in enumerate(container):
  122. if not item[0].findAll("b"):
  123. del container[i]
  124.  
  125.  
  126. for item in container:
  127. try:
  128. target = item[0].findAll("b")
  129. except Exception as e:
  130. print(e)
  131. pass
  132.  
  133. try:
  134. qn = target[0].text.split()[0].strip()
  135. qnums.append(qn)
  136. except Exception:
  137. pass
  138.  
  139. try:
  140. if len(target) == 2:
  141. c_option = target[1].text
  142. c_options.append(c_option.strip())
  143. elif len(target) == 1:
  144. c_option = target[0].text.split()[1]
  145. c_options.append(c_option.strip())
  146.  
  147. except IndexError:
  148. c_option = " "
  149. c_options.append(c_option)
  150. pass
  151.  
  152. for item in container:
  153. try:
  154. trash = item[0].find("b").decompose()
  155. except:
  156. pass
  157.  
  158.  
  159. str_container = [str(item).replace("[", "").replace('<p align="JUSTIFY">', "")
  160. .replace("</p>", "").replace(",", "")
  161. .replace("]", "").replace("IND", "NEL").strip() for item in container]
  162.  
  163. for x in range(len(qnums)):
  164. obj = {"Question Number":"", "Correct Option":"", "Solution":""}
  165. obj["Question Number"] = qnums[x]
  166. obj["Correct Option"] = c_options[x]
  167. obj["Solution"] = str_container[x]
  168.  
  169. results.append(obj)
  170.  
  171. final = {"results":results}
  172.  
  173.  
  174. result_name = file.split(".")[0]
  175.  
  176.  
  177. #Saves result as json in the current working directory
  178. with open(f"{result_name}.json", "w") as f:
  179. json.dump(final, f)
  180. print("Done!")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement