import pdb
import time
import urllib
import urllib2
import re
import sys
import os
import shutil
import codecs
import unicodedata
from BeautifulSoup import BeautifulSoup

# Debugger, just in case :S
#pdb.set_trace()
print("eBay allGrab Scraper by Ben Fishman, 2013\n")
print("\n")

# Create input.csv
file = open("input.csv", "w")
file.close()
# Set counter to 0
i = 0

urlinn = raw_input('Number of search pages you want to enter:\n')
urlinnn = int(urlinn)
for i in range(0, urlinnn):
    print "Please copy & paste search page URL #", i + 1, ":"
    urlin = raw_input('')
    soup = BeautifulSoup(urllib2.urlopen(urlin).read())
    for link in soup.findAll('a', {'itemprop': 'name'}):
        url = link.get('href')
        file = open("input.csv", "a")
        file.write(url)
        file.write(",")
        file.close()
# Delete the last comma so it doesn't screw up the processing later
with open("input.csv", 'rb+') as filehandle:
    filehandle.seek(-1, os.SEEK_END)
    filehandle.truncate()
print "Done."
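# A sketch of an alternative (hypothetical, not in the original script):
# collect the hrefs in a list and join them, so no trailing comma ever
# needs trimming afterwards:
#
#   urls = []
#   for link in soup.findAll('a', {'itemprop': 'name'}):
#       urls.append(link.get('href'))
#   with open("input.csv", "a") as f:
#       f.write(",".join(urls))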


dlcheck = raw_input('Do you want to download the product pictures? (y/n)\n')
print("\n")
# If the images folder doesn't exist, create a new one
if dlcheck == "y":
    if not os.path.exists('images'):
        os.mkdir('images')


print("Reading input file...")
# Open and read input.csv, split at commas and create a list of URLs

try:
    with open('input.csv') as f:
        content = f.read().split(',')
    print("Input file read.")
    print("Creating output file...")
    # Create the Output.csv file and write a header
    file = open("Output.csv", "w")
    file.write('Name,Price,icIMG URL\n')
    file.close()
    # Get the list length
    llen = content.__len__()
    # Complain if the list is empty
    if llen == 1:
        print "Hey! The Input file is empty!"
        print "Please put some URLs in there."
        wait = raw_input("Press Enter to exit and try again.")
    # State the number of detected valid URLs
    print "Detected", llen, "valid URLs."
    print "\n"
    print "\n"

# Fatal error... Hope this doesn't happen :(
except IOError:
    print 'Oh dear.'
    print 'Something went horribly wrong.\n'
    print 'Did you place the Input.csv file into the dist folder\nlike the readme told you to?'
    print 'Do you have the Input file open in some program?'
    print 'Did you give the Input file the correct name?\n'
    wait = raw_input("Press Enter to exit and try again.")

# Set counter "i" to 0 and error checker "errcheck" to None
i = 0
errcheck = None

# The main loop. Go through it until every list item has been processed
try:
    for i in range(0, llen):
        print "Reading URL #", i + 1, "/", llen
        url = content[i]
        doc = urllib2.urlopen(url).read()
        soup = BeautifulSoup(''.join(doc))
        # Find the title and write it to the Output file; check encodability
        # first so nothing is half-written when windows-1252 can't encode it
        s = soup.find('h1').text

        print s

        try:
            s.encode('windows-1252')
            file = codecs.open('Output.csv', 'a', 'windows-1252')
        except UnicodeEncodeError:
            print "Product title not readable! Writing as UTF-8 instead... "
            file = codecs.open('Output.csv', 'a', 'utf-8')
            errcheck = True
        file.write('"')
        file.write(s)
        file.write('"')
        file.write(',')
        file.close()
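# Note: rows written as utf-8 and rows written as windows-1252 end up in the
# same Output.csv, so the file has no single consistent encoding. A
# hypothetical alternative (not in the original) is to strip the characters
# the narrow codepage can't represent and always write one encoding, e.g.:
#
#   s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
#
# (unicodedata is imported at the top but never used.)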
        # Find the price and write it to the Output file
        try:
            ppr = soup.find('span', {'class': 'notranslate'}).text
            print "Found article price:"
            print ppr
            newpr = ppr.replace("EUR ", "")
            file = open("Output.csv", "a")
            file.write('"')
            file.write(newpr)
            file.write('"')
            file.write(",")
            file.close()
        except AttributeError:
            print 'No product price found. Are you sure this URL leads to an eBay article?\n'
            file = open("Output.csv", "a")
            file.write('"')
            file.write(' ')
            file.write('"')
            file.write(",")
            file.close()
            errcheck = True
        # Find the icImg URL and write it to the Output file
        try:
            imgurl = soup.find('img', id="icImg")['src']
            print "Found picture URL:"
            print imgurl
            file = open("Output.csv", "a")
            file.write('"')
            file.write(imgurl)
            file.write('"')
            file.write(",\n")
            file.close()
            i += 1
            if dlcheck == "y":
                print "Downloading image"

                urllib.urlretrieve(imgurl, os.path.join("images", str(i) + ".jpg"))
            print "\n"
        except (AttributeError, TypeError):
            print 'No icImg ID found. Are you sure this URL leads to an eBay article?\n'
            errcheck = True
# Fatal error again :(
except:
    print 'Oh dear.\n'
    print 'Something went horribly wrong.\n'
    print 'The Input file is corrupt!'
    print 'Did you check the Input.csv file for mistakes?'
    print 'Pay attention to double commas!'
    file = open("Output.csv", "w")
    file.write('')
    file.close()
    wait = raw_input("Press Enter to exit and try again.")


# Done :D
print "\n"
print "\n"
print "Done! \n"
print "The data has been saved to Output.csv \n"
# If the program saved images, point the user to their folder
if dlcheck == "y":
    print "The images have been saved to the /images/ folder. \n"
# Warn the user if there have been errors
if errcheck == True:
    print "WARNING:"
    print "There were encoding errors during processing."
    print "That means that some product titles may not be written correctly\ninto the .csv file."
    print "I recommend checking the Output.csv file for mistakes\nand fixing them manually."
# The end!
wait = raw_input("Press Enter to continue.")
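The quoting in the script is hand-rolled, so a title that itself contains a double quote would corrupt its row. A minimal sketch of the same row-write using the stdlib csv module (hypothetical, not part of the original; s, newpr and imgurl are the values collected in the loop):

import csv

with open('Output.csv', 'ab') as f:
    writer = csv.writer(f)
    # csv.writer handles quoting and escaping; encode unicode first (Python 2)
    writer.writerow([s.encode('utf-8'), newpr.encode('utf-8'), imgurl])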

# Suggested refactorings, as sketches:

import csv
import requests
from bs4 import BeautifulSoup

def build_list_of_links(ebay_page_url):
    page = requests.get(ebay_page_url).text
    soup = BeautifulSoup(page)
    list_of_links = []
    for item in soup.find_all('a', {'itemprop': 'name'}):
        list_of_links.append(item.get('href'))
    return list_of_links

def write_links_file(output_links_file, links_list):
    with open(output_links_file, "a") as output_file:
        link_writer = csv.writer(output_file)
        link_writer.writerow(links_list)

# Prefer a context manager when creating/truncating the file:
with open("input.csv", "w"):
    pass

# instead of the bare open()/close() pair:
file = open("input.csv", "w")
file.close()

# Usage (placeholder arguments):
write_links_file("/some/dir/links.csv", build_list_of_links("some.page"))

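Because writerow emits one comma-separated row, the links file can then be read back with csv.reader instead of the manual read().split(',') used in the script above (a sketch under the same placeholder path):

with open("/some/dir/links.csv", "rb") as f:
    content = next(csv.reader(f))  # the single row becomes a list of URLs
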
def create_img_dir():
    save_images = raw_input('Do you want to download the product pictures? (y/n)\n\n')
    if save_images == "y":
        if os.path.exists('images'):
            pass
        else:
            os.mkdir('images')

# The exists/pass/else pattern in create_img_dir collapses to:
if not os.path.exists('images'):
    os.mkdir('images')

# which replaces:
if os.path.exists('images'):
    pass
else:
    os.mkdir('images')

# Calling the dunder directly:
content.__len__()

# is better written with the builtin:
len(content)

# And since range() starts at 0 by default, this:
for i in range(0, urlinnn):

# can be shortened to:
for i in range(urlinnn):

# A run of separate print statements:
print 'Oh dear.\n'
print 'Something went horribly wrong.\n'
print 'The Input file is corrupt!'
print 'Did you check the Input.csv file for mistakes?'
print 'Pay attention to double commas!'

# can be collapsed into one call; adjacent string literals concatenate:
print("Oh dear.\n"
      "Something went horribly wrong.\n"
      "The Input file is corrupt!\n"
      "Did you check the Input.csv file for mistakes?\n"
      "Pay attention to double commas!")
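
Putting the sketches together, a hypothetical top level might look like this (the links-file path and prompt are assumptions from the snippets above, not from the original script):

if __name__ == '__main__':
    search_url = raw_input('Please copy & paste a search page URL:\n')
    write_links_file("links.csv", build_list_of_links(search_url))
    create_img_dir()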