Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """Processing Vanilla forum bookmark page link extractor
- 1. Manually save HTML pages of bookmarks from the forum before it is decommissioned
- 2. Run this to extract link data
- """
- import fnmatch
- import os
- from bs4 import BeautifulSoup
def scrape_bookmarks(filename):
    """Extract personal bookmark data from a saved Processing Vanilla forum page.

    Every ``div`` element with class ``Title`` is treated as one bookmark:
    its text is the title and the ``href`` of its first ``<a>`` element is
    the link.

    Args:
        filename: Path to a saved bookmark HTML page.

    Returns:
        A list of ``(title, url)`` tuples in document order, where ``title``
        is the whitespace-stripped text of the title div.  Title divs that
        contain no anchor, or whose first anchor has no ``href``, are skipped.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    # Read raw bytes so BeautifulSoup detects the page's encoding itself
    # instead of depending on the platform's default text encoding.
    with open(filename, 'rb') as inputfile:
        soup = BeautifulSoup(inputfile.read(), 'html.parser')

    bookmark_list = []
    for title_div in soup.select('div.Title'):
        anchor = title_div.a
        href = anchor.get('href') if anchor is not None else None
        if href is None:
            # Without a link there is nothing to bookmark; indexing a missing
            # anchor/attribute would otherwise abort the whole page.
            continue
        bookmark_list.append((title_div.get_text().strip(), href))
    return bookmark_list
def fpath_to_fnamelist(fpath, fnpattern):
    """Collect the paths of all files under a directory that match a pattern.

    The tree rooted at ``fpath`` is walked with ``os.walk``; in each
    directory visited, the file names are filtered through
    ``fnmatch.filter`` using ``fnpattern``, which follows Unix shell-style
    wildcard rules (``*``, ``?``, ``[abc]``, ``[!abc]``).

    Returns a list of paths, each built by joining the containing directory
    with a matching file name.
    """
    matched_paths = []
    for folder, _subfolders, names in os.walk(fpath):
        for name in fnmatch.filter(names, fnpattern):
            matched_paths.append(os.path.join(folder, name))
    return matched_paths
def save_bookmarks(list_, filename):
    """Save bookmark data as tab-separated lines in a UTF-8 text file.

    Args:
        list_: A non-empty list of bookmark lists; each inner list holds
            ``(title, url)`` pairs.  One ``title<TAB>url`` line is written
            per pair.
        filename: Path of the text file to write (opened in ``'w'`` mode,
            so existing content is replaced).

    Raises:
        ValueError: If ``list_`` is empty or ``filename`` is empty.

    Failures to open or write the file are reported with a printed message
    instead of being raised.
    """
    # Local import: io.open accepts an encoding on both Python 2 and 3.
    import io

    if not list_:
        raise ValueError('No data to write.')
    if not filename:
        raise ValueError('No filename given.')
    try:
        with io.open(filename, 'w', encoding='utf-8') as outputfile:
            for item in list_:
                for title, url in item:
                    # Format as text and let the file object do the UTF-8
                    # encoding; encoding the title by hand writes a bytes
                    # repr on Python 3 and breaks on non-ASCII URLs on 2.
                    outputfile.write(u"{}\t{}\n".format(title, url))
    except (IOError, OSError):
        # On Python 2 open() raises IOError, which is not an OSError there.
        print("File not written.")
if __name__ == '__main__':
    # Scrape every *.html file found under the current directory and write
    # all bookmarks to one text file.  ``results`` holds one list of
    # (title, url) pairs per page, which is the nesting save_bookmarks
    # iterates over.
    fname_list = fpath_to_fnamelist('./', '*.html')
    # Named html_path rather than ``file`` to avoid shadowing the builtin.
    results = [scrape_bookmarks(html_path) for html_path in fname_list]
    save_bookmarks(results, 'bookmarks.txt')
Add Comment
Please, Sign In to add comment