Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- from bs4 import BeautifulSoup
- import json
- """You would need to pip install bs4 """
# Ordered (old, new) substitutions applied to the text of every solution.
# Order is significant: each pattern is matched against text that has already
# been rewritten by the entries above it (for example, commas have already
# become spaces, and the backslash-suffixed variants are removed before their
# plain counterparts).
_REPLACEMENTS = (
    ('p align="JUSTIFY"', "br"),
    (",", " "),
    ('<p align="CENTER"', ""),
    ('<td colspan="1" height="196">', ""),
    ('<td colspan="1" height="62">', ""),
    ('<td colspan="8" height="5" width="496">', ""),
    ('<td colspan="1" height="5">', ""),
    ("[", ""),
    ("]", ""),
    ("</p>", ""),
    ("</td>", ""),
    ("</tr>", ""),
    ('<td colspan="2" height="27" width="108">', ""),
    ('<td colspan="4" height="83" width="272">', ""),
    ('<td colspan="1">', ""),
    ('<tr align="LEFT" valign="TOP">', ""),
    ('<td colspan="1" height="27">', ""),
    ('<td colspan="4" height="28" rowspan="2" valign="TOP" width="369">', ""),
    ('<td colspan="1" height="7">', ""),
    ('<td colspan="6">', ""),
    ('394 50 - Rajeev Gandhi Nagar Kota Ph. No. : 93141-87482 0744-2209671', ""),
    ('>IVRS No : 0744-2439051 52 53 www. motioniitjee.com \\', ""),
    (" \\", ""),
    ('<td colspan="2">', ""),
    ('<td colspan="1" height="692">\\', ""),
    ('<td colspan="1" height="692">', ""),
)


def _split_solutions(lines):
    """Return the runs of lines that each begin at a line containing "Sol"."""
    starts = [i for i, line in enumerate(lines) if "Sol" in line]
    # Close the final run at the end of the input; pairing only consecutive
    # start indexes would silently lose the last solution.
    bounds = starts + [len(lines)]
    return [lines[start:end] for start, end in zip(bounds, bounds[1:])]


def _clean_solution(chunk):
    """Return the textual form of one run of lines with known markup removed."""
    # str() of a list yields its repr, which is why brackets and commas are
    # among the fragments stripped by _REPLACEMENTS.
    text = str(chunk)
    for old, new in _REPLACEMENTS:
        text = text.replace(old, new)
    return text


def read_html(path):
    """Print every solution found in the HTML file at *path*.

    The file is parsed with BeautifulSoup, all ``<p align="JUSTIFY">``
    elements are collected and rendered to text, and that text is split on
    newlines. Each line containing "Sol" starts a new solution that runs up
    to the next such line (or to the end of the text). Every solution is
    cleaned with the fixed substitutions in ``_REPLACEMENTS``, duplicates are
    dropped while keeping first-seen order, and each remaining solution is
    printed followed by a line of 50 dashes and a blank line.

    Args:
        path: Full path to the HTML file to read.

    Returns:
        The list of cleaned, de-duplicated solution strings, in the order
        they were printed.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # No encoding is passed, so the platform default text encoding is used.
    with open(path) as f:
        container = f.read()

    soup = BeautifulSoup(container, "html.parser")
    paras = soup.find_all("p", {"align": "JUSTIFY"})
    lines = str(paras).split("\n")

    cleaned = [_clean_solution(chunk) for chunk in _split_solutions(lines)]
    # dict.fromkeys removes duplicates but, unlike set(), keeps the order in
    # which the solutions appeared, so the output is stable between runs.
    solutions = list(dict.fromkeys(cleaned))

    for solution in solutions:
        print(solution)
        print(50 * "-")
        print()
    return solutions
# Run read_html on a hard-coded local file path.
html_path = "C:\\Users\\anele\\Desktop\\KUMAR\\5397124106352075058.htm"
read_html(html_path)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement