"""boletescrape: turn saved bolete filter page source into per-bolete text files."""

# Saved page source from the 120 page view at the wpa bolete filter.
raw_data = ["bolete_raw1.txt", "bolete_raw2.txt", "bolete_raw3.txt"]
# Parsed entries keyed by scientific name; filled in by clean_data().
boletes = {}
# Every distinct attribute seen across all boletes; filled in by the script body.
filter_list = []
# Not referenced by the visible script; kept so existing names stay defined.
boletes_list = []
def raw_split(texts):
	"""Split saved page source into one HTML chunk per product entry.

	Args:
		texts: iterable of paths to text files holding saved page source.

	Returns:
		A flat list of strings, in file order. Each string is the text that
		follows one '<li class="product type-product' marker, up to the next
		marker or the end of the file. A file without the marker contributes
		nothing.
	"""
	boletes_split = []
	for text in texts:
		with open(text, "r") as file:
			data = file.read()
		# The chunk before the first marker is junk HTML/CSS that precedes the
		# first bolete, so it is dropped. extend() avoids rebuilding the whole
		# result list once per entry.
		boletes_split.extend(data.split("<li class=\"product type-product")[1:])
	return boletes_split
def _split_title_and_description(entry):
	"""Return (raw title, cleaned description) sliced out of one entry.

	Raises IndexError when any of the title/description markers is missing.
	"""
	after_title = entry.split("woocommerce-loop-product__title\">")[1]
	title_and_rest = after_title.split("</h2><div class=")
	title = title_and_rest[0]
	# The description is the text between the first two "p>" occurrences
	# that follow the short-description marker.
	after_marker = title_and_rest[1].split("short-description\">")[1]
	description = after_marker.split("p>")[1].replace(".</", ".")
	# Splitting on "p>" leaves the "</" of the closing tag behind. The replace
	# above only removes it after a period, so strip it for other endings too.
	if description.endswith("</"):
		description = description[:-2]
	return title, description


def _extract_attributes(entry):
	"""Return the sorted, human-readable attribute phrases of one entry.

	Attributes are read from the class tokens found between "instock " and
	" product-type-simple". Raises IndexError when either marker is missing.
	"""
	class_region = entry.split("instock ")[1]
	class_text, marker, _ = class_region.partition(" product-type-simple")
	if not marker:
		# Without the end marker the rest of the entry would be read as tokens.
		raise IndexError("entry has no ' product-type-simple' class marker")
	phrases = []
	for token in class_text.split():
		if token == "has-post-thumbnail":
			continue
		# Drop the prefix before the first hyphen; a token with no hyphen has
		# nothing left and is skipped.
		parts = token.split("-")[1:]
		if not parts or parts[0] == "filter":
			continue
		# Strip leading single digits, tested in ascending order 1..8 exactly
		# as before. NOTE(review): 9 and multi-digit numbers are kept - confirm.
		for digit in "12345678":
			if parts and parts[0] == digit:
				del parts[0]
		phrase = ""
		for word in parts:
			phrase = word if phrase == "" else phrase + " " + word
		if phrase and "filters" not in phrase:
			phrases.append(phrase)
	phrases.sort()
	return phrases


def _split_names(title):
	"""Return (scientific name, common name) parsed from a raw title.

	The common name is the text between "(&#8220;" and "&#8221"; it is "N/A"
	when the title has no such quoted part.
	"""
	opener = "(&#8220;"
	if opener not in title:
		return title, "N/A"
	scientific, _, rest = title.partition(opener)
	common = rest.split("&#8221")[0]
	# The space before the parenthesis is optional, so trim it here instead of
	# requiring it in the separator.
	return scientific.rstrip(), common.replace("&#8217;", "\'")


def clean_data(dirty_data):
	"""Parse raw product entries and store them in the module-level `boletes`.

	Args:
		dirty_data: iterable of HTML chunks, one per entry, as returned by
			raw_split().

	Each entry is stored under its scientific name as a dict with the keys
	"SCIENTIFIC NAME", "COMMON NAME", "ATTRIBUTES" (sorted list of phrases)
	and "DESCRIPTION". A later entry with the same scientific name replaces
	the earlier one. Nothing is returned.

	Raises:
		IndexError: when an entry lacks one of the structural markers.
	"""
	for x in dirty_data:
		title, description = _split_title_and_description(x)
		attributes = _extract_attributes(x)
		scientific_name, common_name = _split_names(title)
		#creates a dictionary for each bolete, stores in boletes dictionary
		boletes[scientific_name] = {
			"SCIENTIFIC NAME": scientific_name,
			"COMMON NAME": common_name,
			"ATTRIBUTES": attributes,
			"DESCRIPTION": description,
		}

# Script body: parse the saved pages, then write one text file per bolete
# plus a single file listing every attribute seen.
import os

print("\n\n\nBOLETE SCRAPE RUNNING\n\n\n")
bolete_items = raw_split(raw_data)
clean_data(bolete_items)
#generates list of all attributes across boletes
for record in boletes.values():
	for attribute in record["ATTRIBUTES"]:
		if attribute not in filter_list:
			filter_list.append(attribute)
filter_list.sort()
#writes a file for each bolete entry, a file for filterlist
# open(..., "w") does not create missing directories, so make sure the
# output folder exists before the first write.
os.makedirs("Boletes_In_Filter", exist_ok=True)
for x in boletes:
	# Spaces and slashes in the scientific name are not kept in file names.
	file_name = x.replace(" ", "_").replace("/", "+")
	with open("Boletes_In_Filter/{}.txt".format(file_name), "w") as f:
		f.write("SCIENTIFIC NAME: {}\n".format(boletes[x]["SCIENTIFIC NAME"]))
		f.write("COMMON NAME: {}\n".format(boletes[x]["COMMON NAME"]))
		f.write("DESCRIPTION: {}\n".format(boletes[x]["DESCRIPTION"]))
		f.write("ATTRIBUTES:\n")
		for attribute in boletes[x]["ATTRIBUTES"]:
			f.write("{}\n".format(attribute))
with open("List_of_Filters.txt", "w") as f:
	for x in filter_list:
		f.write("{}\n".format(x))
print("BOLETE SCRAPE COMPLETE!\n\n")