# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup
import re
import sys, traceback
URLS = [
    'https://salery.livejournal.com/2006',
    'https://salery.livejournal.com/2007',
    # ... one entry per year, up to 2017
]
# txt files
pre = 'C:\\Users\\...\\Desktop\\pre.txt'
post = 'C:\\Users\\...\\Desktop\\post.txt'
links = 'C:\\Users\\...\\Desktop\\links.txt'
linksAndTitles = 'C:\\Users\\...\\Desktop\\linksAndT.txt'
end = 'C:\\Users\\...\\Desktop\\end.txt'
# get the LJ post links month by month:
# salery.livejournal.com/2006/11/ - the first LJ entry was in November 2006
# salery.livejournal.com/2006/12/
# etc.
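
# A minimal sketch of generating the monthly archive URLs instead of listing
# years by hand (assumes the archive runs from 2006-11 through 2017-12;
# monthly_urls is not part of the original script):
def monthly_urls(start=(2006, 11), stop=(2017, 12)):
    urls = []
    year, month = start
    while (year, month) <= stop:
        urls.append('https://salery.livejournal.com/%d/%02d/' % (year, month))
        month += 1
        if month > 12:
            year, month = year + 1, 1
    return urls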
def preLinks():
    f = open(pre, 'w')
    try:
        for url in URLS:
            r = requests.get(url)  # 'html.parser' is a BeautifulSoup argument, not a requests one
            soup = BeautifulSoup(r.content, 'html.parser')
            # each right-aligned caption cell in the yearly calendar links to a monthly archive
            for cell in soup.find_all('td', class_='caption', align='right'):
                link = cell.a.get('href')
                f.write(link + '\n')
                print(link)
    except Exception:
        print(traceback.format_exception(*sys.exc_info())[1])
        input()
    f.close()
def postLinks():
    f = open(pre, 'r')
    f1 = open(post, 'w')
    for line in f:
        url = line.strip()  # drop the trailing newline before requesting
        print(url)
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        # each <dt> in the monthly archive holds a link to one post
        for dt in soup.find_all('dt'):
            link = dt.a.get('href')
            print(link)
            f1.write(link + '\n')
    f.close()
    f1.close()
def linkz():
    f = open(post, 'r')
    f1 = open(links, 'w')
    for line in f:
        url = line.strip()
        print(url)
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        for dt in soup.find_all('dt'):
            link = dt.a.get('href')
            print(link)
            f1.write(link + '\n')
    f.close()
    f1.close()
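
# postLinks() and linkz() perform the same crawl-and-collect step; a shared
# helper could replace both (a sketch, not part of the original script):
def collect_links(src_path, dst_path):
    with open(src_path, 'r') as src, open(dst_path, 'w') as dst:
        for line in src:
            url = line.strip()
            r = requests.get(url)
            soup = BeautifulSoup(r.content, 'html.parser')
            for dt in soup.find_all('dt'):
                href = dt.a.get('href')
                print(href)
                dst.write(href + '\n')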
def main():
    f1 = open(end, 'w')
    f = open(post, 'r')
    try:
        for line in f:
            url = line.strip()
            r = requests.get(url)
            soup = BeautifulSoup(r.content, 'html.parser')
            # get the post date and write it (the page title carries it)
            title = soup.title.text
            print(title)
            f1.write(title + '\n')
            # get the post title
            for cell in soup.find_all('td', class_="caption post-title"):
                try:
                    name = cell.text
                    f1.write(name + '\n')
                    print(name)
                except ValueError:
                    print('No name')
                    print(traceback.format_exception(*sys.exc_info())[1])
                    continue
            # get the post link
            for cell in soup.find_all('td', align="left", class_="comments", valign="top"):
                link = cell.a.get('href') + '\n'
                f1.write(link)
                f1.write('\n')
                print(link, '\n')
    except Exception:
        print(traceback.format_exception(*sys.exc_info())[1])
        input()
    f.close()
    f1.close()
# GET THE POST TEXT
def text():
    f1 = open(end, 'w')
    f = open('C:\\Users\\....\\Desktop\\Volkov\\end.txt', 'r')
    for line in f:
        f1.write(line)
        # only the lines that are URLs get fetched; date/title lines are copied as-is
        if re.match('http', line):
            url = line.strip()
            print(url)
            r = requests.get(url)
            soup = BeautifulSoup(r.content, 'html.parser')
            t = soup.find('td', colspan='2').text
            try:
                print(t)
            except UnicodeError as err:
                print("Error: {0}".format(err))
                continue
            f1.write(t + '\n\n\n\n\n')
    f.close()
    f1.close()
    input()

text()
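
# A sketch of the full pipeline order, assuming the stages are meant to feed
# one another (the paste as published only calls text() directly):
# preLinks()   # yearly calendar pages   -> pre.txt   (monthly archive links)
# postLinks()  # monthly pages           -> post.txt  (per-post links)
# linkz()      # same pass over post.txt -> links.txt
# main()       # post pages              -> end.txt   (date, title, comment link)
# text()       # end.txt URLs            -> post bodies appended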