Advertisement
Guest User

boletescrape

a guest
Jul 15th, 2019
175
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.04 KB | None | 0 0
  1. raw_data = ["bolete_raw1.txt", "bolete_raw2.txt", "bolete_raw3.txt"] # source code from 120 page view at wpa bolete filter
  2. boletes = {}
  3. filter_list= []
  4. boletes_list = []
  5. def raw_split(texts):
  6. boletes_split = []
  7. for text in texts:
  8. with open(text,"r") as file:
  9. data = file.read()
  10. data_split = data.split("<li class=\"product type-product")
  11. del data_split[0] # Removes Junk HTML, CSS Etc prior to first bolete
  12. for x in data_split:
  13. boletes_split = boletes_split + [x]
  14. return boletes_split
  15. def clean_data(dirty_data):
  16.  
  17. for x in dirty_data:
  18. name = ""
  19. unsorted_attributes = []
  20. attributes = []
  21. common_name = "N/A"
  22. #filter_list= [], MOVED OUT OF FUNCTION
  23. scientific_name = ""
  24. #Begin splitter
  25. temporary_name = x
  26. temporary_name = temporary_name.split("woocommerce-loop-product__title\">")
  27. temporary_name = temporary_name[1].split("</h2><div class=")
  28. name = temporary_name[0]
  29. #isolates and cleans description part of entry
  30. temporary_description = temporary_name[1].split("short-description\">")
  31. temp_desc_holder = temporary_description[1].split("p>")
  32. dirty_description = temp_desc_holder[1]
  33. description = dirty_description.replace(".</", ".")
  34. #turns attributes into legible strings in list
  35. temp_attributes = x
  36. temp_attributes = temp_attributes.split("instock ")
  37. del temp_attributes[0]
  38. temp_attributes = temp_attributes[0].split(" product-type-simple")
  39. del temp_attributes[1]
  40. temp_attributes = temp_attributes[0].split()
  41. temp_attributes.sort()
  42. if temp_attributes[0] == "has-post-thumbnail":
  43. del temp_attributes[0]
  44. for att in temp_attributes:
  45. phrase = ""
  46. temp_att = att
  47. temp_att = temp_att.split("-")
  48. del temp_att[0]
  49. if temp_att[0] == "filter":
  50. del temp_att
  51. else:
  52. #Not sure which of these got rid of empty/spaces only items. Maybe next time use a set?
  53. if " " in temp_att:
  54. del temp_att
  55. if temp_att == " ":
  56. del temp_att
  57. for i in range(1,9):
  58. if temp_att[0] == str(i):
  59. del temp_att[0]
  60. for word in temp_att:
  61. if " " in word:
  62. del word
  63. else:
  64. if phrase == "":
  65. phrase = word
  66. else:
  67. phrase = phrase + " " + word
  68. unsorted_attributes = unsorted_attributes + [phrase]
  69. unsorted_attributes.sort()
  70. for tester in unsorted_attributes:
  71. if len(tester) != 0:
  72. if ("filters" in tester) == False:
  73. attributes = attributes + [tester]
  74. #name cleaner, fixes formatting of common names
  75. if "(&#8220;" in name:
  76. name_split = name.split(" (&#8220;")
  77. scientific_name = name_split[0]
  78. del name_split[0]
  79. name_split = name_split[0].split("&#8221")
  80. del name_split[1]
  81. if "8217" in name_split[0]:
  82. common_name = (name_split[0].replace("&#8217;", "\'"))
  83. else:
  84. common_name = name_split[0]
  85. else:
  86. scientific_name = name
  87. #creates a dictionary for each bolete, stores in boletes dictionary
  88. boletes.update({scientific_name:{"SCIENTIFIC NAME":scientific_name, "COMMON NAME": common_name, "ATTRIBUTES": attributes, "DESCRIPTION": description}})
  89.  
  90. print("\n\n\nBOLETE SCRAPE RUNNING\n\n\n")
  91. bolete_items = (raw_split(raw_data))
  92. clean_data(bolete_items)
  93. #generates list of all attributes across boletes
  94. for x in boletes:
  95. for y in boletes[x]:
  96. if y == "ATTRIBUTES":
  97. for z in (boletes[x][y]):
  98. if (z in filter_list) == False:
  99. filter_list += [z]
  100. filter_list.sort()
  101. #writes a file for each bolete entry, a file for filterlist
  102. for x in boletes:
  103. file_name = x.replace(" ","_")
  104. file_name = file_name.replace("/","+")
  105. with open("Boletes_In_Filter/{}.txt".format(file_name), "w") as f:
  106. f.write("SCIENTIFIC NAME: {}\n".format(boletes[x]["SCIENTIFIC NAME"]))
  107. f.write("COMMON NAME: {}\n".format(boletes[x]["COMMON NAME"]))
  108. f.write("DESCRIPTION: {}\n".format(boletes[x]["DESCRIPTION"]))
  109. f.write("ATTRIBUTES:\n")
  110. for attribute in boletes[x]["ATTRIBUTES"]:
  111. f.write ("{}\n".format(attribute))
  112. with open("List_of_Filters.txt", "w") as f:
  113. for x in filter_list:
  114. f.write("{}\n".format(x))
  115. print("BOLETE SCRAPE COMPLETE!\n\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement