Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Created on Wed Feb 7 19:31:40 2018
- @author: dan
- """
- # Pasul 1: Importam cele de trebuintza
- import requests
- from bs4 import BeautifulSoup as bs
- import pandas as pd
# Step 2: fetch every page of search results and parse each into a soup.
# Page 1 of the Mediafax search has no "&p=" parameter; pages 2..13 do.
BASE_URL = 'http://www.mediafax.ro/cautare.html?q=autonomia%20tinutului%20secuiesc'
NUM_PAGES = 13  # number of result pages observed for this query

soups = []
for page_no in range(1, NUM_PAGES + 1):
    url = BASE_URL if page_no == 1 else f'{BASE_URL}&p={page_no}'
    response = requests.get(url)
    # Fail loudly on an HTTP error instead of silently parsing an error page.
    response.raise_for_status()
    soups.append(bs(response.content, 'html.parser'))

# Step 3: the <li> elements at these indices hold the search results
# (indices 21..37 were found by inspecting the page markup — TODO confirm
# they are stable across layout changes).
lista = list(range(21, 38))

# Step 4: for every result slot, collect the date (first three whitespace
# tokens) into x and the headline snippet (tokens 3..11) into y, from every
# page. Ordering is result-index-major, page-minor — identical to the
# original copy-pasted version.
x = []  # dates: tokens [0:3] of each result's text
y = []  # headline text: tokens [3:12] of each result's text
for i in lista:
    for soup in soups:
        tokens = soup.find_all('li')[i].get_text().split()
        x.append(tokens[0:3])
        y.append(tokens[3:12])

# Step 5: eyeball the extracted data.
print(x)
print(y)

# Dump everything into a spreadsheet for easier inspection.
df1 = pd.DataFrame({'Data Stire': x})
df2 = pd.DataFrame({'Text Stire': y})
# The context manager saves and closes the workbook on exit;
# ExcelWriter.save() was deprecated and removed in pandas 2.0.
with pd.ExcelWriter('Mediafax.xlsx', engine='xlsxwriter') as writer:
    df1.to_excel(writer, sheet_name='Mediafax', index=True)
    df2.to_excel(writer, sheet_name='Mediafax', startcol=2, index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement