Advertisement
collinsanele

Code so far

Sep 1st, 2019
169
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.40 KB | None | 0 0
  1. import os
  2. from bs4 import BeautifulSoup
  3. import json
  4.  
  5.  
  6. """You would need to pip install bs4 """
  7.  
  8.  
  9.  
  10. def read_html(path):
  11. #Note path is the full path to the file
  12. #For example on windows:
  13. #C:\\Users\\anele\\Desktop\\KUMAR\\5397124106352075058.htm
  14. tank = []
  15. qans= []
  16. qnums = []
  17. indexes = []
  18. htmls = []
  19.  
  20. with open(path) as f:
  21. container = f.read()
  22.  
  23.  
  24. soup = BeautifulSoup(container, "html.parser")
  25.  
  26. #To find all p tags
  27. paras = soup.findAll("p", {"align":"JUSTIFY"})
  28.  
  29. #To convert soup object to string
  30. content = str(paras)
  31.  
  32. #split by new line
  33. target = content.split("\n")
  34.  
  35.  
  36. #To get the indexes of where the solutions are located in string text
  37. for i, item in enumerate(target):
  38. if "Sol" in item:
  39. indexes.append(int(i))
  40.  
  41. #To append all the solutions in an array/list called tank
  42. for i, item in enumerate(indexes):
  43. current = i
  44. next = i+1
  45. try:
  46. tank.append(target[indexes[current]:indexes[next]])
  47. except IndexError:
  48. pass
  49.  
  50.  
  51.  
  52. #To clean the solutions in tank list of unwanted tags
  53. tank = [str(item).replace('p align="JUSTIFY"', "br").replace(",", " ")
  54. .replace('<p align="CENTER"', "")
  55. .replace('<td colspan="1" height="196">', "")
  56. .replace('<td colspan="1" height="62">', "")
  57. .replace('<td colspan="8" height="5" width="496">', "")
  58. .replace('<td colspan="1" height="5">', "")
  59. .replace("[", "").replace("]", "")
  60. .replace("</p>", "").replace("</td>", "")
  61. .replace("</tr>", "")
  62. .replace('<td colspan="2" height="27" width="108">',"")
  63. .replace('<td colspan="4" height="83" width="272">', "")
  64. .replace('<td colspan="1">', "")
  65. .replace('<tr align="LEFT" valign="TOP">', "")
  66. .replace('<td colspan="1" height="27">', "")
  67. .replace('<td colspan="4" height="28" rowspan="2" valign="TOP" width="369">', "")
  68. .replace('<td colspan="1" height="7">', "")
  69. .replace('<td colspan="6">', "")
  70. .replace('394 50 - Rajeev Gandhi Nagar Kota Ph. No. : 93141-87482 0744-2209671', "")
  71. .replace(""">IVRS No : 0744-2439051 52 53 www. motioniitjee.com \\""","")
  72. .replace(" \\", "")
  73. .replace('<td colspan="2">', "")
  74. .replace('<td colspan="1" height="692">\\', "")
  75. .replace('<td colspan="1" height="692">\\', "")
  76. .replace('<td colspan="1" height="692">', "") for item in tank]
  77.  
  78. #To remove duplicates solution if any
  79. tank = list(set(tank))
  80.  
  81. #Here i am trying to seperate the solution, question number and correct option
  82. #Not working as expected though
  83. for item in tank[0:]:
  84. try:
  85. qnums.append(item.split("<br><b>")[1].split("</b>")[0].split(" ")[0])
  86. except IndexError:
  87. pass
  88. try:
  89. qans.append(item.split("<b>")[2].split("</b>")[0])
  90. except IndexError:
  91. qans.append("")
  92. pass
  93.  
  94.  
  95. #Loops through all solutions in tank array and prints each seperated
  96. #by broken lines
  97. for item in tank:
  98. print(item)
  99. print(50*"-")
  100. print()
  101.  
  102.  
  103. #Execution of function
  104. read_html("C:\\Users\\anele\\Desktop\\KUMAR\\5397124106352075058.htm")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement