Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pdb
- import time
- import urllib
- import urllib2
- import re
- import sys
- import os
- import shutil
- import codecs
- import unicodedata
- from BeautifulSoup import BeautifulSoup
- #debugger, just in case :S
- #pdb.set_trace()
- print("eBay allGrab Scraper by Ben Fishman, 2013n")
- print("n")
- #Create Input.csv
- file = open("input.csv", "w")
- file.close()
- #Set counter to 0
- i=0
- urlinn = raw_input('Number of search pages you want to enter:n')
- urlinnn=int(urlinn)
- for i in range(0, urlinnn):
- print "Please copy & paste search page URL #",i+1,":"
- urlin = raw_input('')
- soup = BeautifulSoup(urllib2.urlopen(urlin).read())
- for link in soup.findAll('a',{'itemprop':'name'}):
- url = link.get('href')
- file = open("Input.csv", "a")
- file.write(url)
- file.write(",")
- file.close()
- #Delete the last comma so it doesn't screw up the processing later
- with open("input.csv", 'rb+') as filehandle:
- filehandle.seek(-1, os.SEEK_END)
- filehandle.truncate()
- print "Done."
- dlcheck = raw_input('Do you want to download the product pictures? (y/n)n')
- print("n")
- #if the images folder doesn't exist, create a new one
- if dlcheck == "y":
- if not os.path.exists('images'):
- os.mkdir('images')
- print("Reading input file...")
- #open and read Input.csv, split at commas and create a list of URLs
- try:
- with open('input.csv') as f:
- content = f.read().split(',')
- print("Input file read.")
- print("Creating output file...")
- #create the Output.csv file and write a header
- file = open("Output.csv", "w")
- file.write('Name,Price,icIMG URLn')
- file.close()
- #get list lenght
- llen=content.__len__()
- #throw an error if the list is invalid
- if llen==1:
- print "Hey! The Input file is empty!"
- print "Please put some URLs in there."
- wait = input("Press Enter to exit and try again.")
- #state the number of detected valid URLs
- print "Detected",llen," valid URLs."
- print "n"
- print "n"
- #fatal error... Hope this doesn't happen :(
- except IOError:
- print 'Oh dear.'
- print 'Something went horribly wrong.n'
- print 'Did you place the Input.csv file into the dist folder nlike the readme told you to?'
- print 'Do you have the Input file open in some program?'
- print 'Did you give the Input file the correct name?n'
- wait = input("Press Enter to exit and try again.")
- #set counter "i" to 0 and error checker "checker" to None
- i=0
- errcheck = None
- #The main loop. Go through the loop until every list item has been processed
- try:
- for i in range(0, llen):
- print "Reading URL #",i+1,"/",llen
- url=content[i]
- doc = urllib2.urlopen(url).read()
- soup = BeautifulSoup(''.join(doc))
- #Find the title and write it to the Output file
- try:
- s = soup.find('h1').text
- print s
- file = codecs.open('Output.csv','a','windows-1252')
- file.write('"')
- file.write(s)
- file.write('"')
- file.write(',')
- file.close()
- except:
- print "Product title not readable! Writing as UTF-8 instead... "
- file = codecs.open('Output.csv','a','utf-8')
- file.write('"')
- file.write(s)
- file.write('"')
- file.write(',')
- file.close()
- errcheck = True
- #Find the price and write it to the Output file
- try:
- ppr = soup.find('span',{'class':'notranslate'}).text
- print "Found article price:"
- print ppr
- newpr = ppr.replace("EUR ", "")
- file = open("Output.csv", "a")
- file.write('"')
- file.write(newpr)
- file.write('"')
- file.write(",")
- file.close()
- except:
- print 'No product price found. Are you sure this URL leads to an eBay article?n'
- file = open("Output.csv", "a")
- file.write('"')
- file.write(' ')
- file.write('"')
- file.write(",")
- file.close()
- errcheck = True
- #Find the icIMG URL and write it to the Output file
- try:
- imgurl = soup.find('img',id="icImg")['src']
- print "Found picture URL:"
- print imgurl
- file = open("Output.csv", "a")
- file.write('"')
- file.write(imgurl)
- file.write('"')
- file.write(",n")
- file.close()
- i+=1
- if dlcheck == "y":
- print "Downloading image"
- urllib.urlretrieve(imgurl, os.path.join("images", str(i)+".jpg"))
- print "n"
- except:
- print 'No icImg ID found. Are you sure this URL leads to an eBay article?n'
- errcheck = True
- #Fatal error again :(
- except:
- print 'Oh dear.n'
- print 'Something went horribly wrong.n'
- print 'The Input file is corrupt!'
- print 'Did you check the Input.csv file for mistakes?'
- print 'Pay attention to double commas!'
- file = open("Output.csv", "w")
- file.write('')
- file.close()
- wait = input("Press Enter to exit and try again.")
- #Done :D
- print "n"
- print "n"
- print "Done! n"
- print "The data has been saved to Output.csv n"
- #If the program saved images, point the user to their direction
- if dlcheck == "y":
- print "The images have been saved to the /images/ folder. n"
- #Warn the user if there have been errors.
- if errcheck == True:
- print "WARNING:"
- print "There were encoding errors during processing."
- print "That means that some product titles may not be written correctlyn into the .csv file."
- print "I recommend checking the Output.csv file for mistakesn and fixing them manually."
- #The end!
- wait = input("Press Enter to continue.")
- def build_list_of_links(ebay_page_url):
- page = requests.get(page_url).text
- soup = BeautifulSoup(page)
- list_of_links = []
- for item in soup.find_all('a', {'itemprop':'name'}):
- list_of_links.append(item.get('href'))
- return(list_of_links)
- def write_links_file(output_links_file, links_list):
- with open(file_destination, "a") as output_file:
- link_writer = csv.writer(output_file)
- link_writer.writerow(links_list)
- with open(....)
- file = open("input.csv", "w")
- file.close()
- write_links_file("/some/dir/", build_list_of_links("some.page"))
- def create_img_dir():
- save_imgages = raw_input('Do you want to download the product pictures? (y/n)nn')
- if save_imgages == "y":
- if os.path.exists('images'):
- pass
- else:
- os.mkdir('images')
- else:
- ...
- if not os.path.exists
- if os.path.exists
- pass
- else
- os.mkdir
- content.__len__()
- len(content)
- for i in range(0, urlinnn)
- for i in range(urlinnn)
- print 'Oh dear.n'
- print 'Something went horribly wrong.n'
- print 'The Input file is corrupt!'
- print 'Did you check the Input.csv file for mistakes?'
- print 'Pay attention to double commas!'
- print("Oh dear.n
- Something went horribly wrong.n
- The Input file is corrupt!n
- Did you check the Input.csv file for mistakes?n
- Pay attention to double commas!")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement