"""Convert question/solution HTML pages into JSON files.

Requires ``pip install bs4``; the "html5lib" parser requested below must be
installed as well.
"""
import json
import os

from bs4 import BeautifulSoup
from lxml import html

# Highest question number recognised as a question marker.
_MAX_QUESTION_NUMBER = 339

# Accepted marker texts: "1" .. "339" and "1." .. "339.".
_QUESTION_LABELS = frozenset(
    label
    for number in range(1, _MAX_QUESTION_NUMBER + 1)
    for label in (f"{number}", f"{number}.")
)

# Characters deleted from every rendered solution.
# NOTE: this also removes brackets and commas that belong to the solution
# text itself, not only list punctuation; kept so existing output is unchanged.
_STRIPPED_CHARS = str.maketrans("", "", "[],\n\r")


def _question_label(para):
    """Return the question label held in the first <b> of *para*, else None."""
    bold = para.find("b")
    if bold is None:
        return None
    label = bold.text.strip()
    return label if label in _QUESTION_LABELS else None


def _split_questions(paras):
    """Group *paras* into ``(label, paragraphs)`` pairs, one per question.

    A question starts at a paragraph whose first <b> is a question label and
    runs up to (not including) the next such paragraph. The last question
    runs to the end of *paras*, so it is not dropped.
    """
    starts = []
    for position, para in enumerate(paras):
        label = _question_label(para)
        if label is not None:
            starts.append((position, label))

    # End of each question = start of the next one; the last ends at len(paras).
    ends = [position for position, _ in starts[1:]] + [len(paras)]
    return [
        (label, paras[start:end])
        for (start, label), end in zip(starts, ends)
    ]


def _render_solution(segment):
    """Serialise the paragraphs of one question into a single cleaned string."""
    joined = " ".join(str(para) for para in segment)
    return joined.translate(_STRIPPED_CHARS).replace("IND", "NEL").strip()


def html2json2(path):
    """Write one ``<name>.json`` per HTML file found under *path*.

    NB: written for pages laid out like 1414701776531394236.htm,
    315022854031645383.htm, 5397124106352075058.htm and
    1259815981046145990.htm.

    *path* is a folder; it is walked recursively and every file whose name
    ends with "htm" or "html" is processed. For each file, the
    ``<p align="JUSTIFY">`` paragraphs are split into questions and saved as
    ``{"results": [{"Question Number", "Correct Option", "Solution"}, ...]}``.
    "Correct Option" is always written as an empty string.

    The JSON file is created in the current working directory and is named
    after the part of the file name before the first ".".
    """
    for folder, _dirs, files in os.walk(path):
        for file in files:
            if not file.endswith(("htm", "html")):
                continue

            with open(os.path.join(folder, file)) as source:
                soup = BeautifulSoup(source.read(), "html5lib")
            paras = soup.find_all("p", {"align": "JUSTIFY"})

            # Built fresh for every file so one file's questions never leak
            # into (or get duplicated in) another file's output.
            results = [
                {
                    "Question Number": label,
                    "Correct Option": "",
                    "Solution": _render_solution(segment),
                }
                for label, segment in _split_questions(paras)
            ]

            # Saves result as json in the current working directory
            result_name = file.split(".")[0]
            with open(f"{result_name}.json", "w") as out:
                json.dump({"results": results}, out)

    print("Done!")