Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- import datetime
- import drudge_parser
- import pandas as pd
- import webbrowser
- from binaryornot.check import is_binary
# Pandas display tuning: show full column contents and all rows/columns,
# and let pandas auto-detect the terminal width when printing DataFrames.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
# pd.set_option('display.min_rows', 5000)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
def read_txt_file_basic(file_loc):
    """Read a text file and return its lines with surrounding whitespace stripped.

    Returns None when the file is binary or cannot be opened (callers must
    handle a None result, as the original contract implied).
    """
    # Skip binary files entirely; only text files are readable here.
    if is_binary(file_loc):
        return None
    try:
        # Context manager guarantees the handle is closed; the original
        # version leaked the file object on every call.
        with open(file_loc, 'r', encoding='utf-8') as fh:
            return [line.strip() for line in fh]
    except IOError as e:
        print("unable to read file ({})".format(e))
        return None
def write_txt_file(file_loc, lines):
    """Write each item of *lines* to *file_loc*, one item per line.

    Errors are reported to stdout rather than raised (best-effort write,
    matching the original contract).
    """
    try:
        # Context manager closes the file even if a write fails; the
        # original leaked the handle when writelines raised.
        with open(file_loc, 'w', encoding='utf-8') as fh:
            fh.writelines("%s\n" % place for place in lines)
    except IOError as e:
        # Fixed typo in the original message ("witting" -> "writing").
        print("writing file failed ({})".format(e))
def domain_name(url):
    """Extract a bare domain (last two dot-separated labels) from *url*.

    Strips an optional leading '/', the http(s) scheme, and a leading
    'www.', then keeps everything up to the first '/'. Hosts containing
    '.co.' or '.com.au' are returned whole so country-code second-level
    domains (e.g. 'bbc.co.uk') are not truncated to 'co.uk'.
    """
    black_list = ['.co.', '.com.au']
    # Strip a single leading slash (site-relative links) and any scheme.
    if url.startswith("/"):
        url = url[1:]
    if url.startswith("http://"):
        url = url[7:]
    elif url.startswith("https://"):
        url = url[8:]
    # Strip a leading www. if present.
    if url.startswith("www."):
        url = url[4:]
    # Host is everything up to the first forward slash (replaces the
    # original quadratic char-by-char string build).
    stripped_url = url.split("/", 1)[0]
    # Reduce e.g. 'news.example.com' to 'example.com' unless blacklisted.
    if not any(marker in stripped_url for marker in black_list):
        if '.' in stripped_url:
            stripped_url = '.'.join(stripped_url.split('.')[-2:])
    return stripped_url
def crawl_drudge():
    """Scrape Drudge Report articles, skipping titles in drudge_ignore.txt.

    Returns four parallel lists: (dates, titles, links, domains), one entry
    per non-ignored article.
    """
    titles = []
    links = []
    dates = []
    domains = []
    # drudge_ignore.txt lives next to the script; it is kept sorted on disk.
    ignore_list_loc = os.getcwd() + os.sep + 'drudge_ignore.txt'
    # read_txt_file_basic returns None when the file is missing or binary;
    # fall back to an empty ignore list instead of crashing on +=.
    ignore_list = read_txt_file_basic(ignore_list_loc) or []
    check_list = list(ignore_list)
    ignore_list.sort()
    if ignore_list != check_list:
        # File on disk was not sorted; rewrite it in sorted order.
        write_txt_file(ignore_list_loc, ignore_list)
    # NOTE(review): the parser requires two local patches (per the original
    # comments):
    #   drudge_parser.py -> source = source.decode('latin-1')
    #   drudge_parser.py -> handle_data(self, data): wrap title access in try/except
    articles = drudge_parser.scrape_page()
    for section in articles:
        for article in section['articles']:
            # Some parsed entries lack 'title' or 'href'; skip those.
            try:
                title = article['title']
                link = article['href']
            except KeyError:
                continue
            if title in ignore_list:
                continue
            dates.append(datetime.datetime.now())
            titles.append(title)
            links.append(link)
            domains.append(domain_name(link))
    return dates, titles, links, domains
def generate_drudge_csv():
    """Crawl Drudge and maintain two CSVs in the current working directory.

    drudge.csv      - cumulative, de-duplicated article list.
    drudge_diff.csv - only the articles not seen in the previous run
                      (opened in the browser when non-empty).
    """
    diff_list = []
    dates, titles, links, domains = crawl_drudge()
    csv_loc = os.getcwd() + os.sep + 'drudge.csv'
    csv_loc_diff = os.getcwd() + os.sep + 'drudge_diff.csv'
    # Current crawl as rows of (date, title, link, domain).
    zipped_list_curr = list(zip(dates, titles, links, domains))
    if os.path.isfile(csv_loc):
        # Previous run exists: merge, diff, and rewrite.
        df_start = pd.read_csv(csv_loc, index_col=0)
        # Dates were stored as '%A %d %B %Y' strings; parse back to datetimes
        # so sorting and the equals() comparison below work on real dates.
        df_start['date'] = pd.to_datetime(df_start['date'], format='%A %d %B %Y')
        date_list_prev = df_start['date'].to_list()
        title_list_prev = df_start['title'].to_list()
        link_list_prev = df_start['link'].to_list()
        domain_list_prev = df_start['domain'].to_list()
        # generate diff list: any link not present in the previous CSV.
        index = -1
        for link in links:
            index += 1
            if link not in link_list_prev:
                diff_list += [(dates[index], titles[index], links[index], domains[index])]
        # end generate diff list
        zipped_list_prev = list(zip(date_list_prev, title_list_prev, link_list_prev, domain_list_prev))
        # Previous rows first so drop_duplicates(keep='first') keeps the
        # earliest sighting of each link.
        write_list = zipped_list_prev + zipped_list_curr
        df = pd.DataFrame(write_list, columns=['date', 'title', 'link', 'domain'])
        df.drop_duplicates(subset='link', keep='first', inplace=True)
        df = df.sort_values(by=['date', 'domain', 'title'], inplace=False, ascending=(False, True, True))
        df.reset_index(drop=True, inplace=True)
        if df_start.equals(df):
            print('no new posts')
        else:
            # Serialize dates back to the display format before writing.
            df['date'] = df['date'].dt.strftime('%A %d %B %Y')
            df.rename_axis('id', inplace=True)
            df.to_csv(csv_loc, encoding='utf-8', header=True, date_format='%A %d %B %Y')
            print('updating main csv')
        if len(diff_list) == 0:
            print('no new posts')
        # generate diff csv (only when there are genuinely new links).
        if len(diff_list) > 0:
            df_diff = pd.DataFrame(diff_list, columns=['date', 'title', 'link', 'domain'])
            df_diff.drop_duplicates(subset='link', keep='first', inplace=True)
            df_diff = df_diff.sort_values(by=['domain', 'date', 'title'], inplace=False, ascending=(True, False, True))
            df_diff.reset_index(drop=True, inplace=True)
            df_diff['date'] = df_diff['date'].dt.strftime('%A %d %B %Y')
            df_diff.rename_axis('id', inplace=True)
            df_diff.to_csv(csv_loc_diff, encoding='utf-8', header=True, date_format='%A %d %B %Y')
            webbrowser.open_new_tab(csv_loc_diff)
    else:
        # First run: no previous CSV, write everything and open it.
        df = pd.DataFrame(zipped_list_curr, columns=['date', 'title', 'link', 'domain'])
        df.index.name = 'id'
        df.drop_duplicates(subset='link', keep='first', inplace=True)
        df = df.sort_values(by=['date', 'domain', 'title'], inplace=False, ascending=(False, True, True))
        df.reset_index(drop=True, inplace=True)
        df['date'] = df['date'].dt.strftime('%A %d %B %Y')
        df.rename_axis('id', inplace=True)
        df.to_csv(csv_loc, encoding='utf-8', header=True, date_format='%A %d %B %Y')
        webbrowser.open_new_tab(csv_loc)
# Script entry point: crawl Drudge and refresh the CSV outputs.
if __name__ == '__main__':
    generate_drudge_csv()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement