# Code so far

import os
from bs4 import BeautifulSoup
import json


"""You would need to pip install bs4 """


# Substrings deleted from every solution, in this exact order.  They are applied
# after the two substitutions in _clean_solution, so entries that originally
# contained commas are spelled here with spaces instead.
_NOISE_FRAGMENTS = (
    '<p align="CENTER"',
    '<td colspan="1" height="196">',
    '<td colspan="1" height="62">',
    '<td colspan="8" height="5" width="496">',
    '<td colspan="1" height="5">',
    "[",
    "]",
    "</p>",
    "</td>",
    "</tr>",
    '<td colspan="2" height="27" width="108">',
    '<td colspan="4" height="83" width="272">',
    '<td colspan="1">',
    '<tr align="LEFT" valign="TOP">',
    '<td colspan="1" height="27">',
    '<td colspan="4" height="28" rowspan="2" valign="TOP" width="369">',
    '<td colspan="1" height="7">',
    '<td colspan="6">',
    '394 50 - Rajeev Gandhi Nagar Kota  Ph. No. : 93141-87482  0744-2209671',
    """>IVRS No : 0744-2439051  52  53   www. motioniitjee.com   \\""",
    " \\",
    '<td colspan="2">',
    '<td colspan="1" height="692">\\',
    '<td colspan="1" height="692">',
)


def _clean_solution(lines):
    """Return ``str(lines)`` with the known layout noise removed."""
    # str() of a list is its printed form ("['...', '...']"), which is why
    # brackets and commas are among the things stripped below.
    text = str(lines)
    # Turn the opening paragraph tag into "<br>" and drop commas first; the
    # fragments in _NOISE_FRAGMENTS are written for the text after this step.
    text = text.replace('p align="JUSTIFY"', "br").replace(",", " ")
    for fragment in _NOISE_FRAGMENTS:
        text = text.replace(fragment, "")
    return text


def read_html(path):
    """Print every "Sol" section found in the justified paragraphs of a file.

    The file is parsed with BeautifulSoup, all ``<p align="JUSTIFY">`` tags
    are converted to one string and split on newlines.  Each line containing
    ``"Sol"`` starts a solution that runs up to the next such line; the last
    one runs to the end of the text.  Every solution is cleaned of a fixed
    list of tags and boilerplate strings, duplicates are removed (first
    occurrence wins, original order kept), and the results are printed,
    each followed by a dashed rule and a blank line.

    Args:
        path: Full path to the HTML file, for example on Windows
            ``C:\\Users\\anele\\Desktop\\KUMAR\\5397124106352075058.htm``.

    Returns:
        None.  The cleaned solutions are written to standard output.
    """
    qans = []
    qnums = []

    # NOTE(review): the file is decoded with the platform default encoding;
    # confirm that matches the input files.
    with open(path) as f:
        container = f.read()

    soup = BeautifulSoup(container, "html.parser")

    # All <p align="JUSTIFY"> tags, converted to text and split by line.
    paras = soup.find_all("p", {"align": "JUSTIFY"})
    target = str(paras).split("\n")

    # Line numbers at which a solution starts.
    indexes = [i for i, line in enumerate(target) if "Sol" in line]

    # Pair each start with the next start; the final solution ends at the end
    # of the text instead of being discarded.
    ends = indexes[1:] + [len(target)]
    tank = [
        _clean_solution(target[start:end])
        for start, end in zip(indexes, ends)
    ]

    # Remove duplicate solutions while keeping a stable, first-seen order.
    tank = list(dict.fromkeys(tank))

    # Work in progress: split out the question number and the correct option.
    # NOTE(review): qnums and qans are not used anywhere yet, and they can get
    # out of step because an unparsable item is skipped in qnums but recorded
    # as "" in qans.
    for item in tank:
        try:
            qnums.append(item.split("<br><b>")[1].split("</b>")[0].split(" ")[0])
        except IndexError:
            pass
        try:
            qans.append(item.split("<b>")[2].split("</b>")[0])
        except IndexError:
            qans.append("")

    # Print each solution followed by a dashed rule and a blank line.
    for item in tank:
        print(item)
        print(50 * "-")
        print()


# Run the extraction on the sample file (adjust the path for your machine).
read_html(path="C:\\Users\\anele\\Desktop\\KUMAR\\5397124106352075058.htm")