Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Saved page-source files to parse
# (source code from 120 page view at wpa bolete filter).
raw_data = ["bolete_raw1.txt", "bolete_raw2.txt", "bolete_raw3.txt"]

# Parsed bolete records, keyed by scientific name.
boletes = {}

# Every distinct attribute found across all bolete records.
filter_list = []

# NOTE(review): not referenced by any code visible in this file.
boletes_list = []
def raw_split(texts, encoding="utf-8"):
    """Split saved page-source files into one HTML chunk per product entry.

    Args:
        texts: Iterable of paths to text files containing page source.
        encoding: Text encoding used to read every file. Given explicitly
            so the result does not depend on the platform default; the
            downstream parsing looks for non-ASCII quote characters.

    Returns:
        A list of strings in file order. Each string is the text that
        follows one ``<li class="product type-product`` marker, up to the
        next marker or the end of that file. A file without the marker
        contributes nothing.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid in ``encoding``.
    """
    marker = '<li class="product type-product'
    boletes_split = []
    for path in texts:
        with open(path, "r", encoding=encoding) as file:
            data = file.read()
        # The piece before the first marker is page junk (HTML, CSS, etc.),
        # not an entry, so it is skipped.
        boletes_split.extend(data.split(marker)[1:])
    return boletes_split
def _parse_name_and_description(entry):
    """Return the raw title text and the cleaned short description of an entry."""
    title_and_rest = entry.split('woocommerce-loop-product__title">')[1]
    title_and_rest = title_and_rest.split("</h2><div class=")
    name = title_and_rest[0]
    # The description is the text between the first two "p>" occurrences
    # after the short-description marker, so it still ends with the "</" of
    # the closing tag.
    paragraph = title_and_rest[1].split('short-description">')[1].split("p>")[1]
    description = paragraph.replace(".</", ".")
    # Also drop the tag remnant when the text does not end with a period.
    if description.endswith("</"):
        description = description[:-2]
    return name, description


def _extract_attributes(entry):
    """Return the sorted, human-readable attribute phrases of an entry."""
    after_stock = entry.split("instock ")[1]
    class_text, marker, _ = after_stock.partition(" product-type-simple")
    if not marker:
        raise IndexError("entry has no ' product-type-simple' class marker")

    phrases = []
    for token in class_text.split():
        # Not an attribute; removed wherever it appears, not only when it
        # happens to sort first.
        if token == "has-post-thumbnail":
            continue
        # Drop the piece before the first hyphen. A token without a hyphen
        # leaves nothing and is skipped.
        words = token.split("-")[1:]
        if not words or words[0] == "filter":
            continue
        # Strip a leading numeric piece 1-8, tried in ascending order.
        # NOTE(review): presumably an ordering prefix; 9 is not covered.
        for digit in range(1, 9):
            if words and words[0] == str(digit):
                del words[0]
        # Leading empty pieces (from a doubled hyphen) are dropped.
        phrase = " ".join(words).lstrip(" ")
        if phrase and "filters" not in phrase:
            phrases.append(phrase)
    return sorted(phrases)


def _split_names(name):
    """Split a title into (scientific name, common name or "N/A")."""
    import html

    left_quote = "\u201c"   # left double quotation mark
    right_quote = "\u201d"  # right double quotation mark
    apostrophe = "\u2019"   # right single quotation mark

    # Decode entities such as &#8220; / &#8217; so both the raw-entity and
    # the already-decoded form of a title are handled the same way.
    name = html.unescape(name)
    scientific_name, opener, remainder = name.partition("(" + left_quote)
    if not opener:
        return name, "N/A"
    common_name = remainder.partition(right_quote)[0]
    return scientific_name.rstrip(), common_name.replace(apostrophe, "'")


def clean_data(dirty_data, target=None):
    """Parse raw product-entry chunks into bolete records.

    Args:
        dirty_data: Iterable of HTML chunks, one per entry, as produced by
            ``raw_split``.
        target: Dict that receives the records. Defaults to the module-level
            ``boletes`` dict.

    Returns:
        The dict that was filled in. Each record is stored under its
        scientific name and has the keys ``"SCIENTIFIC NAME"``,
        ``"COMMON NAME"`` (``"N/A"`` when the title has no quoted common
        name), ``"ATTRIBUTES"`` (sorted list of phrases) and
        ``"DESCRIPTION"``. A later entry with the same scientific name
        replaces the earlier one.

    Raises:
        IndexError: If an entry lacks the title, short-description,
            ``instock`` or ``product-type-simple`` markers.
    """
    if target is None:
        target = boletes
    for entry in dirty_data:
        name, description = _parse_name_and_description(entry)
        attributes = _extract_attributes(entry)
        scientific_name, common_name = _split_names(name)
        target[scientific_name] = {
            "SCIENTIFIC NAME": scientific_name,
            "COMMON NAME": common_name,
            "ATTRIBUTES": attributes,
            "DESCRIPTION": description,
        }
    return target
import os

print("\n\n\nBOLETE SCRAPE RUNNING\n\n\n")
bolete_items = raw_split(raw_data)
clean_data(bolete_items)

# Collect every distinct attribute across all boletes, sorted.
for record in boletes.values():
    for attribute in record["ATTRIBUTES"]:
        if attribute not in filter_list:
            filter_list.append(attribute)
filter_list.sort()

# Write one text file per bolete entry into the output directory, creating
# the directory first so the first write cannot fail on a missing folder.
output_dir = "Boletes_In_Filter"
os.makedirs(output_dir, exist_ok=True)
for scientific_name, record in boletes.items():
    # Spaces and slashes in the name are replaced to form the file name.
    file_name = scientific_name.replace(" ", "_").replace("/", "+")
    path = "{}/{}.txt".format(output_dir, file_name)
    with open(path, "w", encoding="utf-8") as f:
        f.write("SCIENTIFIC NAME: {}\n".format(record["SCIENTIFIC NAME"]))
        f.write("COMMON NAME: {}\n".format(record["COMMON NAME"]))
        f.write("DESCRIPTION: {}\n".format(record["DESCRIPTION"]))
        f.write("ATTRIBUTES:\n")
        for attribute in record["ATTRIBUTES"]:
            f.write("{}\n".format(attribute))

# Write the combined attribute list, one attribute per line.
with open("List_of_Filters.txt", "w", encoding="utf-8") as f:
    for attribute in filter_list:
        f.write("{}\n".format(attribute))

print("BOLETE SCRAPE COMPLETE!\n\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement