Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from lxml import html
- from bs4 import BeautifulSoup
- import json
- import os
- """Requires: pip install bs4 html5lib lxml (the "html5lib" parser used below is a separate package)."""
def _bold_text(paragraph):
    """Return the stripped text of the first ``<b>`` in *paragraph*, or None if it has none."""
    bold = paragraph.find("b")
    if bold is None:
        return None
    return bold.text.strip()


def _split_into_questions(paragraphs, markers):
    """Group *paragraphs* into ``(question number, paragraphs)`` pairs.

    A group starts at every paragraph whose first bold text is in *markers*
    and runs up to the next such paragraph; the final group runs to the end
    of *paragraphs* so the last question is not lost.
    """
    starts = [
        position
        for position, paragraph in enumerate(paragraphs)
        if _bold_text(paragraph) in markers
    ]
    ends = starts[1:] + [len(paragraphs)]
    return [
        (_bold_text(paragraphs[start]), paragraphs[start:end])
        for start, end in zip(starts, ends)
    ]


def _solution_html(paragraphs):
    """Return the inner HTML of *paragraphs* joined by single spaces."""
    # Joining the paragraph contents directly (instead of stringifying the
    # list and stripping "[", "]" and ",") keeps brackets and commas that
    # belong to the solution text itself.
    joined = " ".join(paragraph.decode_contents() for paragraph in paragraphs)
    # The "IND" -> "NEL" rewrite is carried over unchanged from the original
    # script; its purpose is not evident from this file.
    return joined.replace("IND", "NEL").strip()


def html2json2(path):
    """Convert every ``.htm``/``.html`` file under *path* into a JSON file.

    Each HTML file is parsed with BeautifulSoup's "html5lib" parser and its
    ``<p align="JUSTIFY">`` paragraphs are collected. A paragraph whose first
    bold text is a number from 1 to 339, with or without a trailing dot,
    starts a new question; the paragraphs up to the next such marker form
    that question's solution.

    For every HTML file a ``<name>.json`` file is written to the current
    working directory, holding ``{"results": [...]}`` where each entry has
    the keys "Question Number", "Correct Option" (always an empty string)
    and "Solution". Files with the same base name in different sub-folders
    therefore overwrite each other's output.

    The original author noted that this was written for pages laid out like
    1414701776531394236.htm, 315022854031645383.htm, 5397124106352075058.htm
    and 1259815981046145990.htm.

    Args:
        path: Folder that is walked recursively for HTML files.

    Returns:
        None. The results are written to disk and "Done!" is printed once
        the walk has finished.
    """
    # Built once: membership tests against a set are O(1) and the accepted
    # markers do not depend on the file being processed.
    markers = {f"{number}{suffix}" for number in range(1, 340) for suffix in ("", ".")}

    for folder, _dirs, files in os.walk(path):
        for file_name in files:
            # Require a real extension; a bare endswith("htm") would also
            # pick up names such as "nothtm".
            if not file_name.endswith((".htm", ".html")):
                continue

            # Hand raw bytes to BeautifulSoup so it can detect the document
            # encoding instead of relying on the platform default.
            with open(os.path.join(folder, file_name), "rb") as source:
                soup = BeautifulSoup(source.read(), "html5lib")

            paragraphs = soup.find_all("p", {"align": "JUSTIFY"})

            # All per-file state is rebuilt here so one file's questions
            # never leak into the JSON written for the next file.
            results = [
                {
                    "Question Number": number,
                    "Correct Option": "",
                    "Solution": _solution_html(group),
                }
                for number, group in _split_into_questions(paragraphs, markers)
            ]

            # splitext keeps dots that are part of the base name.
            result_name = os.path.splitext(file_name)[0]
            # Saves the result as JSON in the current working directory.
            with open(f"{result_name}.json", "w") as target:
                json.dump({"results": results}, target)

    print("Done!")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement