Advertisement
collinsanele

html_to_json2

Sep 9th, 2019
226
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.78 KB | None | 0 0
  1. from lxml import html
  2. from bs4 import BeautifulSoup
  3. import json
  4. import os
  5.  
  6.  
  7. """pip install bs4 html5lib"""
  8.  
  9.  
  10. def html2json2(path):
  11. #NB: Works for any html file that has 1414701776531394236.htm and 315022854031645383.htm format
  12. #NB: Works for any html file that has 5397124106352075058.htm and 1259815981046145990.htm format
  13. #NB path is the path to folder
  14. #NB The python script should be run in the folder where the html files are located
  15.  
  16. qnums = []
  17. results = []
  18. container = []
  19. qnum = ''
  20. tank = []
  21. str_container = []
  22.  
  23. for path_, dirs, files in os.walk(path):
  24. for file in files:
  25. if file.endswith("htm") or file.endswith("html"):
  26. with open(os.path.join(path_, file)) as f:
  27. content = f.read()
  28.  
  29.  
  30. soup = BeautifulSoup(content, "html5lib")
  31. paras = soup.findAll("p", {"align":"JUSTIFY"})
  32. indexes = []
  33.  
  34. a = [f"{x}." for x in range(1, 340)]
  35. b = [f"{x}" for x in range(1, 340)]
  36.  
  37. for i,item in enumerate(paras):
  38. try:
  39. if item.find("b").text.strip() in a or item.find("b").text.strip() in b:
  40. indexes.append(i)
  41.  
  42. except Exception as e:
  43. pass
  44.  
  45.  
  46. for i,item in enumerate(indexes):
  47. try:
  48. container.append(paras[indexes[i]:indexes[i+1]])
  49. except IndexError:
  50. pass
  51.  
  52.  
  53.  
  54. for item in container:
  55. for sub in item:
  56. try:
  57. if sub.find("b"):
  58. content = sub.find("b").text.strip()
  59. except:
  60. pass
  61. qnums.append(content)
  62.  
  63. str_container = [str(item).replace("[", "").replace('<p align="JUSTIFY">', "")
  64. .replace("</p>", "").replace(",", "")
  65. .replace("]", "").replace("IND", "NEL").strip() for item in container]
  66.  
  67.  
  68. for x in range(len(str_container)):
  69. obj = {"Question Number":"", "Correct Option":"", "Solution":""}
  70. obj["Question Number"] = qnums[x]
  71. obj["Correct Option"] = ""
  72.  
  73. try:
  74. obj["Solution"] = str_container[x]
  75. except IndexError:
  76. pass
  77.  
  78. results.append(obj)
  79.  
  80. final = {"results":results}
  81.  
  82.  
  83. result_name = file.split(".")[0]
  84.  
  85. #Saves result as json in the current working directory
  86. with open(f"{result_name}.json", "w") as f:
  87. json.dump(final, f)
  88. print("Done!")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement