# Untitled

# -*- coding: utf-8 -*-
"""Processing Vanilla forum bookmark page link extractor

1. Manually save HTML pages of bookmarks from the forum before it is decommissioned
2. Run this to extract link data

"""

import fnmatch
import os
from bs4 import BeautifulSoup

def scrape_bookmarks(filename):
    """Extract personal bookmark data from a saved Processing Vanilla forum page.

    Parses the HTML file at `filename` and collects one entry for every
    ``div.Title`` element that contains a link.

    Args:
        filename: Path to a saved bookmark HTML page.

    Returns:
        A list of ``(title, url)`` tuples in document order, where `title`
        is the element's whitespace-stripped text and `url` is the ``href``
        of its first ``<a>`` tag.
    """
    # Read raw bytes so BeautifulSoup detects the page's encoding itself
    # instead of depending on the platform's default text encoding.
    with open(filename, 'rb') as inputfile:
        soup = BeautifulSoup(inputfile.read(), 'html.parser')

    bookmark_list = []
    for psg in soup.select('div.Title'):
        anchor = psg.a
        plink = anchor.get('href') if anchor is not None else None
        # A Title div without a link is not a bookmark; skip it rather than
        # aborting the whole page with a TypeError/KeyError.
        if plink is None:
            continue
        bookmark_list.append((psg.get_text().strip(), plink))
    return bookmark_list

def fpath_to_fnamelist(fpath, fnpattern):
    """
    Filepath to filename list:
    Walk the directory tree rooted at fpath and return the paths of all
    files whose names match fnpattern.

    fnpattern is a Unix shell-style wildcard pattern (*, ?, [abc], [!abc])
    applied to each file name with fnmatch.filter. Returned paths are the
    walked directory joined with the matching file name.
    """
    matched_paths = []
    for root, _subdirs, names in os.walk(fpath):
        matched_paths.extend(
            os.path.join(root, name)
            for name in fnmatch.filter(names, fnpattern)
        )
    return matched_paths

def save_bookmarks(list_, filename):
    """Save scraped bookmarks to a tab-separated, UTF-8 encoded text file.

    Each bookmark is written as one ``title<TAB>url`` line.

    Args:
        list_: A non-empty list of per-page bookmark lists, each holding
            ``(title, url)`` tuples.
        filename: Path of the text file to create or overwrite.

    Raises:
        ValueError: If `list_` is empty or `filename` is empty.

    A failure to open or write the file is reported on stdout and
    otherwise ignored (best effort).
    """
    # Local import: io.open accepts an explicit encoding on both
    # Python 2 and Python 3, unlike the Python 2 builtin open().
    import io

    if not list_:
        raise ValueError('No data to write.')
    if not filename:
        raise ValueError('No filename given.')
    try:
        with io.open(filename, 'w', encoding='utf-8') as outputfile:
            for item in list_:
                for title, url in item:
                    # Format as text and let the file object do the UTF-8
                    # encoding; pre-encoding the title would write a
                    # b'...' repr on Python 3.
                    outputfile.write(u"{}\t{}\n".format(title, url))
    except (IOError, OSError):
        # On Python 2, open() failures raise IOError, which is not an
        # OSError subclass there, so both must be listed.
        print("File not written.")

if __name__ == '__main__':
    # Scrape every saved .html page found under the current directory
    # (subdirectories included) into one list of (title, url) tuples per page.
    fname_list = fpath_to_fnamelist('./', '*.html')
    results = [scrape_bookmarks(fname) for fname in fname_list]
    # Write the output once, after all pages are scraped, instead of
    # rewriting the whole file after each page. save_bookmarks raises
    # ValueError on an empty list, so skip it when no pages were found.
    if results:
        save_bookmarks(results, 'bookmarks.txt')