SHARE
TWEET

Mediafax_scrap

mdan Feb 11th, 2018 100 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  #!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Scrape the Mediafax search results for "autonomia tinutului secuiesc"
and store them in an Excel workbook (``Mediafax.xlsx``).

Created on Wed Feb  7 19:31:40 2018

@author: dan
"""
# Step 1: import what we need

import requests
from bs4 import BeautifulSoup as bs
import pandas as pd


# Step 2: request every page of search results

SEARCH_URL = 'http://www.mediafax.ro/cautare.html?q=autonomia%20tinutului%20secuiesc'
PAGE_COUNT = 13          # the search currently spans pages 1..13
REQUEST_TIMEOUT = 30     # seconds; avoids hanging forever on a stalled server

# The first page is the bare search URL; later pages add "&p=<number>".
urls = [SEARCH_URL] + [
    '{}&p={}'.format(SEARCH_URL, number) for number in range(2, PAGE_COUNT + 1)
]

pages = []
with requests.Session() as session:
    for url in urls:
        page = session.get(url, timeout=REQUEST_TIMEOUT)
        # Fail loudly on an HTTP error instead of parsing an error page.
        page.raise_for_status()
        pages.append(page)

# Step 3: make a soup out of each page

soups = [bs(page.content, 'html.parser') for page in pages]

# Step 4: pick the representative tag (<li>) and the positions we care about.
# Positions 21..37 of the page's <li> elements are the ones extracted;
# presumably these are the search hits -- verify if the site layout changes.

lista = list(range(21, 38))

# Collect the <li> elements once per page rather than on every lookup.
items_per_page = [soup.find_all('li') for soup in soups]

# Step 5: look at what we found

x = []   # first three words of each entry  -> 'Data Stire' column
y = []   # words 4..12 of each entry        -> 'Text Stire' column

# Order is position-major, page-minor: for every position, pages 1..13 in turn.
# An IndexError here means a page has fewer <li> elements than expected.
for i in lista:
    for items in items_per_page:
        words = items[i].get_text().split()
        x.append(words[0:3])
        y.append(words[3:12])

print(x)
print(y)

# To see things better, put everything into a proper Excel file

df1 = pd.DataFrame({'Data Stire': x})
df2 = pd.DataFrame({'Text Stire': y})

# The context manager saves and closes the workbook on exit
# (ExcelWriter.save() no longer exists in pandas >= 2.0).
with pd.ExcelWriter('Mediafax.xlsx', engine='xlsxwriter') as writer:
    df1.to_excel(writer, sheet_name='Mediafax', index=True)
    df2.to_excel(writer, sheet_name='Mediafax', startcol=2, index=False)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top