Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from lxml import html
- from bs4 import BeautifulSoup
- import json
- """pip install bs4"""
def _segment_text(segment):
    """Flatten a list of ``<p>`` tags into the stored "Solution" string.

    The list is stringified as a whole and then the list brackets, commas,
    and paragraph tags are stripped out; "IND" is rewritten to "NEL".
    NOTE(review): this also removes every ``[``, ``]`` and ``,`` that occurs
    inside the solution text itself. Kept as-is for output compatibility.
    """
    return (
        str(segment)
        .replace("[", "")
        .replace('<p align="JUSTIFY">', "")
        .replace("</p>", "")
        .replace(",", "")
        .replace("]", "")
        .replace("IND", "NEL")
        .strip()
    )


def _ans_records(segments):
    """Build result records for files whose text contains "Ans" paragraphs."""
    answer_markup = (
        "<b>Ans. (A)", "<b>Ans. (B)", "<b>Ans. (C)", "<b>Ans. (D)",
        "<b>", "</b>",
    )
    records = []
    for segment in segments:
        # Only segments that carry an answer paragraph are exported. The
        # first such paragraph is used, so one segment yields one record.
        answer = next((p.text for p in segment if "Ans" in p.text), None)
        if answer is None:
            continue
        solution = _segment_text(segment)
        for marker in answer_markup:
            solution = solution.replace(marker, "")
        records.append({
            # segment[0] is the "Sol" paragraph that opened the segment, so
            # its text is never blank and split()[0] is safe.
            "Question Number": segment[0].text.split()[0],
            "Correct Option": answer.replace("Ans.", "").replace("Ans", "").strip(),
            "Solution": solution.strip(),
        })
    return records


def _sol_records(segments):
    """Build result records for files that only have bold "Sol" headers."""
    records = []
    for segment in segments:
        bold = segment[0].find_all("b")
        if not bold:
            # No bold header means there is nothing to read a number from.
            continue
        words = bold[0].text.split()
        number = words[0] if words else ""
        if len(bold) >= 2:
            # The option lives in its own bold tag right after the number.
            option = bold[1].text.strip()
        elif len(words) >= 2:
            # Number and option share a single bold tag.
            option = words[1]
        else:
            option = " "  # placeholder the original code used for "unknown"
        # Drop the bold header so it does not leak into the solution text.
        bold[0].decompose()
        records.append({
            "Question Number": number,
            "Correct Option": option,
            "Solution": _segment_text(segment),
        })
    return records


def _json_name(path):
    """Return ``<name>.json`` where name is the file name up to its first dot."""
    filename = path.replace("\\", "/").rsplit("/", 1)[-1]
    return filename.split(".")[0] + ".json"


def html2json(path):
    """Extract question numbers, correct options and solutions into a JSON file.

    NB: per the original author, this works for HTML files in the
    5397124106352075058.htm and 1414701776531394236.htm formats.

    The file is parsed with the "lxml" parser. All ``<p align="JUSTIFY">``
    paragraphs are split into segments, each starting at a paragraph whose
    text contains "Sol" and running up to the next such paragraph (the last
    segment runs to the final paragraph). If the page text contains "Ans",
    segments with an "Ans" paragraph are exported; otherwise, if it contains
    "Sol", segments whose first paragraph has a bold header are exported.
    If neither marker is present nothing is written.

    The result, ``{"results": [...]}``, is saved in the current working
    directory as ``<input file name up to its first dot>.json``.

    Args:
        path: Path to the HTML file; ``/`` and ``\\`` separators are accepted.

    Returns:
        None.
    """
    with open(path) as f:
        soup = BeautifulSoup(f.read(), "lxml")

    text = soup.text.strip()
    if "Ans" in text:
        build_records = _ans_records
    elif "Sol" in text:
        build_records = _sol_records
    else:
        return

    paras = soup.find_all("p", {"align": "JUSTIFY"})
    starts = [i for i, para in enumerate(paras) if "Sol" in para.text]
    # Pair every start with the next one; the last segment ends at len(paras).
    segments = [
        paras[start:end]
        for start, end in zip(starts, starts[1:] + [len(paras)])
    ]

    final = {"results": build_records(segments)}

    # Saves result as json in the current working directory
    with open(_json_name(path), "w") as f:
        json.dump(final, f)
    print("Done!")
def html2json2(path):
    """Extract numbered questions and their solution text into a JSON file.

    NB: per the original author, this works for HTML files in the
    1414701776531394236.htm and 315022854031645383.htm formats.

    The file is parsed with the "html5lib" parser. A ``<p align="JUSTIFY">``
    paragraph starts a new segment when its first ``<b>`` tag holds a number
    from 1 to 339, optionally followed by a dot. Each segment runs up to the
    next such paragraph (the last one runs to the final paragraph) and
    becomes one record. "Correct Option" is always left empty.

    The result, ``{"results": [...]}``, is saved in the current working
    directory as ``<input file name up to its first dot>.json``.

    Args:
        path: Path to the HTML file; ``/`` and ``\\`` separators are accepted.

    Returns:
        None.
    """
    with open(path) as f:
        soup = BeautifulSoup(f.read(), "html5lib")
    paras = soup.find_all("p", {"align": "JUSTIFY"})

    # Accepted question labels: "1".."339" and "1.".."339.".
    plain = {str(n) for n in range(1, 340)}
    labels = plain | {f"{n}." for n in plain}

    def bold_label(para):
        """Return the stripped text of the paragraph's first <b>, or None."""
        bold = para.find("b")
        return bold.text.strip() if bold is not None else None

    starts = [i for i, para in enumerate(paras) if bold_label(para) in labels]
    # Pair every start with the next one; the last segment ends at len(paras).
    segments = [
        paras[start:end]
        for start, end in zip(starts, starts[1:] + [len(paras)])
    ]

    results = []
    for segment in segments:
        # NOTE(review): the whole list is stringified and then brackets,
        # commas and paragraph tags are stripped, which also removes those
        # characters from the solution text itself. Kept for compatibility.
        solution = (
            str(segment)
            .replace("[", "")
            .replace('<p align="JUSTIFY">', "")
            .replace("</p>", "")
            .replace(",", "")
            .replace("]", "")
            .replace("IND", "NEL")
            .strip()
        )
        results.append({
            # The number comes from the paragraph that opened the segment,
            # not from bold text appearing later in the solution.
            "Question Number": bold_label(segment[0]),
            "Correct Option": "",
            "Solution": solution,
        })
    final = {"results": results}

    filename = path.replace("\\", "/").rsplit("/", 1)[-1]
    result_name = filename.split(".")[0]
    # Saves result as json in the current working directory
    with open(f"{result_name}.json", "w") as f:
        json.dump(final, f)
    print("Done!")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement