Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from lxml import html
- from bs4 import BeautifulSoup
- import json
- """ NB: works only for 5397124106352075058.htm """
def _solution_sections(paragraphs):
    """Group paragraphs into runs that each begin at a paragraph containing "Sol".

    Paragraphs before the first marker are discarded.  The last run extends to
    the end of the paragraph list so the final solution is not lost.
    """
    paragraphs = list(paragraphs)
    starts = [i for i, para in enumerate(paragraphs) if "Sol" in para.text]
    # Each run ends where the next one starts; the last one ends at the end.
    ends = starts[1:] + [len(paragraphs)]
    return [paragraphs[begin:end] for begin, end in zip(starts, ends)]


def _question_and_option(bold_tags):
    """Read the question number and correct option from a header's <b> tags.

    Returns a ``(question_number, correct_option)`` tuple, or ``None`` when the
    first bold tag has no text and therefore no question number.
    """
    first_words = bold_tags[0].text.split()
    if not first_words:
        return None
    if len(bold_tags) >= 2:
        # The option sits in its own bold tag right after the question number.
        option = bold_tags[1].text.strip()
    elif len(first_words) >= 2:
        # Single bold tag: the option is its second whitespace-separated token.
        option = first_words[1]
    else:
        # No option could be found; keep the original " " placeholder.
        option = " "
    return first_words[0], option


def html2json(path, out_path="539.json"):
    """Extract solutions from an HTML file and save them as JSON.

    The file is parsed with BeautifulSoup/lxml.  All ``<p align="JUSTIFY">``
    paragraphs are split into sections, each starting at a paragraph whose
    text contains "Sol".  A section is kept only if its first paragraph has at
    least one ``<b>`` tag with text; the question number and correct option
    are read from those bold tags, the first bold tag is then removed, and the
    inner HTML of the section's paragraphs (joined by single spaces) becomes
    the solution.

    Args:
        path: Path of the HTML file to read.
        out_path: Path of the JSON file to write.  Defaults to "539.json" in
            the current working directory.

    Returns:
        The dict written to ``out_path``: ``{"results": [...]}`` where each
        entry has the keys "Question Number", "Correct Option" and "Solution".
    """
    with open(path) as f:
        content = f.read()
    soup = BeautifulSoup(content, "lxml")
    paragraphs = soup.find_all("p", {"align": "JUSTIFY"})

    results = []
    for section in _solution_sections(paragraphs):
        header = section[0]
        bold_tags = header.find_all("b")
        if not bold_tags:
            # Not a real solution header; skip the whole section.
            continue
        parsed = _question_and_option(bold_tags)
        if parsed is None:
            continue
        question_number, correct_option = parsed

        # Drop the leading bold tag so it does not reappear in the solution
        # text.  This must happen after the values above have been read.
        bold_tags[0].decompose()

        # Join inner HTML directly rather than string-munging str(list), so
        # commas and brackets inside the solution are preserved.
        solution = " ".join(para.decode_contents() for para in section)

        # Build each record in one place so the three fields stay aligned.
        results.append(
            {
                "Question Number": question_number,
                "Correct Option": correct_option,
                "Solution": solution,
            }
        )

    final = {"results": results}
    # Saves result as json (by default in the current working directory)
    with open(out_path, "w") as f:
        json.dump(final, f)
    return final
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement