import json
import os

from bs4 import BeautifulSoup
from lxml import html  # not referenced by html2json; kept because other code may import it from here

# Setup: pip install bs4 (the "lxml" parser requested below also needs the lxml package)

# Answer labels removed from the solution markup of "Ans"-style documents.
_ANSWER_LABELS = ("Ans. (A)", "Ans. (B)", "Ans. (C)", "Ans. (D)")

# Characters deleted from the stringified paragraph list: the list brackets,
# the separating commas and newlines.
_LIST_NOISE = str.maketrans("", "", "[],\n")


def _split_segments(paras):
    """Group paragraphs into segments that each start at a "Sol" paragraph.

    A segment runs from one paragraph containing "Sol" up to (not including)
    the next one, so only the first paragraph of a segment contains "Sol".

    NOTE(review): as in the original code, the paragraphs that follow the
    *last* "Sol" paragraph are not returned, i.e. the final solution of a
    document is never exported. This looks unintended -- confirm against real
    input before changing it, since it alters the generated JSON.
    """
    starts = [i for i, para in enumerate(paras) if "Sol" in para.text]
    return [list(paras[begin:end]) for begin, end in zip(starts, starts[1:])]


def _segment_markup(segment):
    """Return the markup of *segment* as one cleaned-up string.

    The paragraph list is stringified as a whole and then stripped of
    brackets, commas and newlines; this also removes those characters from
    the solution text itself, which matches the original output format.

    NOTE(review): the source additionally chained ``.replace("", "")`` calls
    with empty search strings. Those leave the string unchanged and are
    omitted here; presumably their search strings were lost -- confirm.
    """
    flattened = str(segment).translate(_LIST_NOISE)
    return flattened.replace("IND", "NEL").strip()


def _make_record(question, option, solution):
    """Build one result entry with the key order used in the JSON output."""
    return {
        "Question Number": question,
        "Correct Option": option,
        "Solution": solution,
    }


def _records_with_answers(paras):
    """Extract records from documents that carry explicit "Ans" paragraphs.

    Only segments containing at least one "Ans" paragraph are exported. Each
    segment yields exactly one record, so question number, option and
    solution always belong together; when a segment holds several "Ans"
    paragraphs the first one supplies the correct option.
    """
    records = []
    for segment in _split_segments(paras):
        answers = [para.text for para in segment if "Ans" in para.text]
        if not answers:
            continue
        # segment[0] contains "Sol", so its text has at least one token.
        question = segment[0].text.split()[0]
        option = answers[0].replace("Ans.", "").replace("Ans", "").strip()
        solution = _segment_markup(segment)
        for label in _ANSWER_LABELS:
            solution = solution.replace(label, "")
        records.append(_make_record(question, option, solution.strip()))
    return records


def _records_without_answers(paras):
    """Extract records from documents that only mark up "Sol" paragraphs.

    The question number (and possibly the option) is read from the ``<b>``
    tags of the first paragraph of each segment. Segments whose first
    paragraph has no ``<b>`` tag, or whose first ``<b>`` tag is empty, are
    skipped entirely so the three fields of a record cannot get out of step.
    """
    records = []
    for segment in _split_segments(paras):
        bold = segment[0].findAll("b")
        if not bold:
            continue
        words = bold[0].text.split()
        if not words:
            continue
        question = words[0]
        if len(bold) == 2:
            # Second bold tag holds the option.
            option = bold[1].text.strip()
        elif len(bold) == 1 and len(words) > 1:
            # Single bold tag: "<number> <option>".
            option = words[1]
        else:
            # No recognisable option; keep the original " " placeholder.
            option = " "
        # Drop the leading bold tag so it is not repeated in the solution.
        bold[0].decompose()
        records.append(_make_record(question, option, _segment_markup(segment)))
    return records


def _write_results(file_name, records):
    """Dump *records* to ``<name>.json`` in the current working directory."""
    result_name = file_name.split(".")[0]
    with open(f"{result_name}.json", "w") as out:
        json.dump({"results": records}, out)
    print("Done!")


def html2json(path):
    """Convert every HTML solution file below *path* into a JSON file.

    Works for HTML files laid out like 5397124106352075058.htm and
    1259815981046145990.htm: the content sits in ``<p align="JUSTIFY">``
    paragraphs and each solution starts at a paragraph containing "Sol".

    Args:
        path: Folder that is walked recursively for files ending in
            "htm" or "html".

    Returns:
        None. For each matching file whose text contains "Ans" or "Sol", a
        ``<name>.json`` file of the form ``{"results": [...]}`` is written to
        the current working directory and "Done!" is printed. Files with
        neither marker are ignored.
    """
    for folder, _dirs, files in os.walk(path):
        for file in files:
            if not file.endswith(("htm", "html")):
                continue
            # Read raw bytes and let BeautifulSoup detect the encoding; text
            # mode would decode with the platform default and can abort the
            # whole run on a single undecodable file.
            with open(os.path.join(folder, file), "rb") as f:
                content = f.read()
            soup = BeautifulSoup(content, "lxml")
            paras = soup.findAll("p", {"align": "JUSTIFY"})
            text = soup.text
            if "Ans" in text:
                records = _records_with_answers(paras)
            elif "Sol" in text:
                records = _records_without_answers(paras)
            else:
                continue
            _write_results(file, records)