Guest User

Untitled

a guest
May 26th, 2018
141
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.78 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. """Processing Vanilla forum bookmark page link extractor
  3.  
  4. 1. Manually save the HTML pages of your bookmarks from the forum before it is decommissioned
  5. 2. Run this to extract link data
  6.  
  7. """
  8.  
  9. import fnmatch
  10. import os
  11. from bs4 import BeautifulSoup
  12.  
  13. def scrape_bookmarks(filename):
  14. """Extract perosnal bookmark data from Processing Vanilla forum pages"""
  15. bookmark_list = []
  16. with open(filename, 'r') as inputfile:
  17. filestring=inputfile.read()
  18. soup = BeautifulSoup(filestring, 'html.parser')
  19. passages = soup.select('div.Title')
  20. for psg in passages:
  21. plink = psg.a['href']
  22. ptitle = psg.get_text()
  23. bookmark_list.append((ptitle.strip(), plink))
  24. return bookmark_list
  25.  
  26. def fpath_to_fnamelist(fpath, fnpattern):
  27. """
  28. Filepath to filename list:
  29. Take a directory and pattern, return a list of file paths.
  30.  
  31. fnpattern filters results use Unix shell-style wildcards: (*, ?, [abc], [!abc])
  32. Uses fnmatch.filter.
  33. """
  34. return [os.path.join(dirpath, f)
  35. for dirpath, _dirnames, files in os.walk(fpath)
  36. for f in fnmatch.filter(files, fnpattern)]
  37.  
  38. def save_bookmarks(list_, filename):
  39. """Save list_ of lines into text file."""
  40. if not list_: raise ValueError('No data to write.')
  41. if not filename: raise ValueError('No filename given.')
  42. try:
  43. with open(filename, 'w') as outputfile:
  44. for item in list_:
  45. for title, url in item:
  46. outputfile.write("{}\t{}\n".format(title.encode('utf-8'), url))
  47. except OSError:
  48. print "File not written."
  49.  
  50. if __name__ == '__main__':
  51. results = []
  52. fname_list = fpath_to_fnamelist('./', '*.html')
  53. for file in fname_list:
  54. results.append(scrape_bookmarks(file))
  55. save_bookmarks(results, 'bookmarks.txt')
Add Comment
Please, Sign In to add comment