drudge report crawler

import os
import datetime
import drudge_parser
import pandas as pd
import webbrowser
from binaryornot.check import is_binary

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
# pd.set_option('display.min_rows', 5000)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

def read_txt_file_basic(file_loc):
    # Read a text file into a list of stripped lines. Returns an empty
    # list for binary or unreadable files so callers can safely iterate
    # or concatenate the result.
    if is_binary(file_loc):
        return []
    try:
        with open(file_loc, 'r', encoding='utf-8') as fh:
            return [line.strip() for line in fh]
    except IOError as e:
        print("unable to read file ({})".format(e))
        return []

def write_txt_file(file_loc, lines):
    # Write one entry per line; the context manager closes the file
    # even if writelines raises.
    try:
        with open(file_loc, 'w', encoding='utf-8') as fh:
            fh.writelines("%s\n" % place for place in lines)
    except IOError as e:
        print("writing file failed ({})".format(e))

def domain_name(url):
    # Reduce a url to its bare domain, e.g. news.example.com -> example.com.
    black_list = ['.co.', '.com.au']
    # Strip a leading slash (relative links), then http:// or https://
    # if the url begins with it.
    if url[:1] == "/":
        url = url[1:]
    if url[:7] == "http://":
        url = url[7:]
    elif url[:8] == "https://":
        url = url[8:]
    # Strip www. if the url starts with it.
    if url[:4] == "www.":
        url = url[4:]
    # Keep the chars up until the first forward slash (drop the path).
    stripped_url = url.split("/", 1)[0]
    # Reduce to the last two dot-separated labels, unless the domain
    # matches a black-listed two-part TLD pattern.
    if not any(x in stripped_url for x in black_list):
        if '.' in stripped_url:
            stripped_url = '.'.join(stripped_url.split('.')[-2:])
    # Return the stripped url.
    return stripped_url

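# Example behaviour of domain_name (illustrative URLs, not real links):
#   domain_name('https://www.news.example.com/story.html') -> 'example.com'
#   domain_name('http://site.co.uk/page') -> 'site.co.uk'  (black-listed two-part TLD kept whole)
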
def crawl_drudge():
    titles = []
    links = []
    dates = []
    domains = []
    # Set up the ignore-title list: drudge_ignore.txt (one headline per
    # line) is read from the current working directory, so run the script
    # from its own folder. Keep the file sorted, rewriting it only when
    # the order on disk changes.
    ignore_list_loc = os.path.join(os.getcwd(), 'drudge_ignore.txt')
    ignore_list = read_txt_file_basic(ignore_list_loc)
    check_list = list(ignore_list)
    ignore_list.sort()
    if ignore_list != check_list:
        write_txt_file(ignore_list_loc, ignore_list)
    # end setup ignore title list
    # The default parser (drudge_parser.py) was changed locally:
    #   source = source.decode('latin-1')
    #   def handle_data(self, data): wrap the title handling in try/except
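    # A sketch of what those local patches might look like (hypothetical;
    # the actual drudge_parser internals may differ):
    #
    #     source = source.decode('latin-1')   # instead of a utf-8 decode
    #
    #     def handle_data(self, data):
    #         try:
    #             ...  # original title-handling body
    #         except Exception:
    #             pass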
    articles = drudge_parser.scrape_page()
    for dic in articles:
        for sub_dic in dic['articles']:
            try:
                curr_date = datetime.datetime.now()
                title = sub_dic['title']
                link = sub_dic['href']
                domain = domain_name(link)
                if title not in ignore_list:
                    dates.append(curr_date)
                    titles.append(title)
                    links.append(link)
                    domains.append(domain)
            except KeyError:
                # Entries missing 'title' or 'href' are skipped.
                pass
    return dates, titles, links, domains

def generate_drudge_csv():
    diff_list = []
    dates, titles, links, domains = crawl_drudge()
    csv_loc = os.path.join(os.getcwd(), 'drudge.csv')
    csv_loc_diff = os.path.join(os.getcwd(), 'drudge_diff.csv')
    zipped_list_curr = list(zip(dates, titles, links, domains))
    if os.path.isfile(csv_loc):
        # Merge the current crawl into the existing csv, keeping the
        # earlier row whenever a link has already been seen.
        df_start = pd.read_csv(csv_loc, index_col=0)
        df_start['date'] = pd.to_datetime(df_start['date'], format='%A %d %B %Y')
        date_list_prev = df_start['date'].to_list()
        title_list_prev = df_start['title'].to_list()
        link_list_prev = df_start['link'].to_list()
        domain_list_prev = df_start['domain'].to_list()
        # generate diff list: links not present in the previous csv
        for index, link in enumerate(links):
            if link not in link_list_prev:
                diff_list.append((dates[index], titles[index], links[index], domains[index]))
        # end generate diff list
        zipped_list_prev = list(zip(date_list_prev, title_list_prev, link_list_prev, domain_list_prev))
        write_list = zipped_list_prev + zipped_list_curr
        df = pd.DataFrame(write_list, columns=['date', 'title', 'link', 'domain'])
        df.drop_duplicates(subset='link', keep='first', inplace=True)
        df = df.sort_values(by=['date', 'domain', 'title'], ascending=[False, True, True])
        df.reset_index(drop=True, inplace=True)
        if df_start.equals(df):
            print('no new posts')
        else:
            df['date'] = df['date'].dt.strftime('%A %d %B %Y')
            df.rename_axis('id', inplace=True)
            df.to_csv(csv_loc, encoding='utf-8', header=True, date_format='%A %d %B %Y')
            print('updating main csv')
            if len(diff_list) == 0:
                # csv changed but no previously-unseen links were found
                print('no new posts')
        # generate diff csv and open it in the browser
        if len(diff_list) > 0:
            df_diff = pd.DataFrame(diff_list, columns=['date', 'title', 'link', 'domain'])
            df_diff.drop_duplicates(subset='link', keep='first', inplace=True)
            df_diff = df_diff.sort_values(by=['domain', 'date', 'title'], ascending=[True, False, True])
            df_diff.reset_index(drop=True, inplace=True)
            df_diff['date'] = df_diff['date'].dt.strftime('%A %d %B %Y')
            df_diff.rename_axis('id', inplace=True)
            df_diff.to_csv(csv_loc_diff, encoding='utf-8', header=True, date_format='%A %d %B %Y')
            webbrowser.open_new_tab(csv_loc_diff)
    else:
        # First run: write the full csv and open it.
        df = pd.DataFrame(zipped_list_curr, columns=['date', 'title', 'link', 'domain'])
        df.drop_duplicates(subset='link', keep='first', inplace=True)
        df = df.sort_values(by=['date', 'domain', 'title'], ascending=[False, True, True])
        df.reset_index(drop=True, inplace=True)
        df['date'] = df['date'].dt.strftime('%A %d %B %Y')
        df.rename_axis('id', inplace=True)
        df.to_csv(csv_loc, encoding='utf-8', header=True, date_format='%A %d %B %Y')
        webbrowser.open_new_tab(csv_loc)

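# Illustrative drudge.csv layout; the column names and the '%A %d %B %Y'
# date format come from generate_drudge_csv, the row values are made up:
#
#   id,date,title,link,domain
#   0,Thursday 23 June 2022,EXAMPLE HEADLINE,https://www.example.com/story,example.com
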
if __name__ == '__main__':
    generate_drudge_csv()
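
# How the script's two inputs are shaped (inferred from the code above):
# - drudge_ignore.txt: plain text, one headline per line; a scraped
#   article whose title exactly matches a line is skipped.
# - drudge_parser.scrape_page(): a list of section dicts, each carrying
#   an 'articles' list of {'title': ..., 'href': ...} dicts; entries may
#   lack a key, hence the KeyError guard in crawl_drudge.
#
# A minimal stand-in for offline testing (hypothetical data):
#
# def fake_scrape_page():
#     return [{'articles': [
#         {'title': 'EXAMPLE HEADLINE', 'href': 'https://www.example.com/story'},
#         {'href': 'https://example.org/untitled'},  # no title -> skipped
#     ]}]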