Advertisement
darkmist

lecture_parser.py

Dec 31st, 2016
92
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.94 KB | None | 0 0
  1. import os
  2. from itertools import groupby
  3. from collections import OrderedDict
  4.  
  5.  
  6. class LectureParser():
  7.     """ Creates a single ~/Desktop/output.txt file in the following format:
  8.            Lecture1
  9.                Lesson1
  10.                ------------
  11.                Lesson2
  12.                ------------
  13.            =======================
  14.        Assumptions:
  15.            The list of lecture directories in Downloads folder
  16.            The user executing this script has admin privilages.
  17.                
  18.    """
  19.     def __init__(self):
  20.         """ Args:
  21.                self.home_dir: a user's home directory independent of the os systems used (Unix-based os is assumed).
  22.                self.output_file: a path to the user's ~/Desktop where the output file is generated.
  23.        """
  24.         self.home_dir = os.path.expanduser('~')
  25.         self.output_file = os.path.join(self.get_desktop_dir(), 'output.txt')
  26.        
  27.     def get_downloads_dir(self):
  28.         """ Returns:
  29.                downloads_dir: the user's ~/Downloads directory.
  30.        """
  31.         downloads_dir = os.path.join(self.home_dir, 'Downloads')
  32.         return downloads_dir
  33.  
  34.     def get_desktop_dir(self):
  35.         """ Returns:
  36.                desktop_dir: the user's ~/Desktop directory.
  37.        """
  38.         desktop_dir = os.path.join(self.home_dir, 'Desktop')
  39.         return desktop_dir
  40.  
  41.     def get_lectures_dirs(self):
  42.         """ Generates an ordered dictionary based on the available list of lectures.
  43.            Returns:
  44.                lectures: an ordered dictioanry of format:
  45.                            {'lecture_name_1': {}, 'lecture_name_2: {}, ... }
  46.        """
  47.         dirs = ['1 - Introduction Subtitles', '2 - Architecture & Principles Subtitles',
  48.                 '3 - Switching Subtitles', '4 - Routing Subtitles',
  49.                 '5 - Naming, Addressing & Forwarding Subtitles', '5. 1 - Router Design Basics Subtitles',
  50.                 '5.2 - DNS Subtitles', '6 - Congestion control & streaming Subtitles',
  51.                 '7 - Rate limiting and traffic shaping Subtitles', '8 - Content distribution Subtitles',
  52.                 '9 - Software Defined Networking Subtitles', '9.1 - Programming SDNs Subtitles',
  53.                 '10 - Traffic Engineering Subtitles', '11 - Network Security Subtitles',
  54.                 '11.1 - Internet Worms Subtitles', '11.2 - Spam Subtitles', '11.3 - Denial of Service Attacks Subtitles',]
  55.        
  56.         # initiate ordered dictionary (imported from collections)
  57.         lectures = OrderedDict()
  58.         # loop through the list of lectures and set the lecture names as the key in dictioanry;
  59.         #+ setdefault method allows for the key to point to a data structure (dictionary in our case)
  60.         #+ without values
  61.         for dir in dirs:
  62.             lectures.setdefault(dir, {})
  63.         return lectures
  64.    
  65.     def get_lectures(self):
  66.         """ Continues to build ordered dictionary by adding full path and file names.
  67.            Returns:
  68.                lectures_all: an ordered dictionary of format:
  69.                            {'lecture_name_1': {'full/path/': [lesson_file_name_1, lesson_file_name_2, ...]}, ...}
  70.        """
  71.         # get the ordered dictionary with keys and empty values
  72.         lectures_all = self.get_lectures_dirs()
  73.         # get the keys from the dictionary to avoid other files in the ~/Downloads folder
  74.         valid_dirs = lectures_all.keys()
  75.         # walk via ~/Downloads folder:
  76.             # dirname - full path to each file in the ~/Downloads directory
  77.             # dirnames - only the names of other directories inside ~Downloads directory
  78.             # filenames - all file names in the ~/Downloads directory and its children
  79.         for dirname, dirnames, filenames in os.walk(self.get_downloads_dir()):
  80.             # while looping via all directories, check if the directory basename (last part of the path)
  81.             #+ is within the desired list
  82.             dir_basename = os.path.basename(dirname)
  83.             # if it is, add the full path as a value to our ordered dictionary
  84.             # use .setdefault to generate a dictioanry of dictionary of lists:
  85.                 # lectures_all -> {'2 - Architecture & Principles Subtitles':
  86.                 #                       {
  87.                 #                           '/Users/user_name/Downloads/2 - Architecture & Principles Subtitles': [],
  88.                 #                            ...
  89.                 #                       },
  90.                 #                       ...
  91.                 #               }
  92.             if dir_basename in valid_dirs:
  93.                 lectures_all[dir_basename].setdefault(dirname, [])
  94.                 for filename in filenames:
  95.                     # finally, complete the ordered dictionary
  96.                         # lectures_all -> {'2 - Architecture & Principles Subtitles':
  97.                         #                       {
  98.                         #                           '/Users/user_name/Downloads/2 - Architecture & Principles Subtitles':
  99.                         #                            ['01 - Lesson 2 Intro.srt',
  100.                         #                             '02 - A Brief History of the Internet.srt',
  101.                         #                              ...
  102.                         #                             ],
  103.                         #                            ...
  104.                         #                       },
  105.                         #                       ...
  106.                         #               }
  107.                     lectures_all[dir_basename][dirname].append(filename)
  108.         return lectures_all
  109.    
  110.     def build_transcript(self):
  111.         """ Here the transcripts are parsed and written to the output file.
  112.        """
  113.         with open(self.output_file, 'w') as output_file:
  114.             # loop through our ordered dictionary
  115.             for lecture, filenames in self.get_lectures().iteritems():
  116.                 # write the name of the lecture
  117.                 output_file.writelines(['\n\n', 'Lecture ', lecture, '\n\n'])
  118.                 # loop through our ordered dictionary of dictionaries
  119.                 for file_dir, file_name_list in filenames.iteritems():
  120.                     # loop through our ordered dictionary of dictionaries of lists
  121.                     for file_name in file_name_list:
  122.                         # write the name of the lesson
  123.                         output_file.writelines([file_name, '\n'])
  124.                         # open the Udacity provided .srt file
  125.                         with open(os.path.join(file_dir, file_name) , 'r') as f:
  126.                             # the following list comprehention of code:
  127.                                 # goes through each line in the file
  128.                                 # groups sets of lines separated by an empty line
  129.                                 # converts the groupped lines into lists of lists of lines
  130.                                 # the actual text is the 2nd element (0-based) of each list
  131.                             all_lines = [list(g) for b,g in groupby(f, lambda x: bool(x.strip())) if b]
  132.                             # if the list is not empty
  133.                             if all_lines:
  134.                                 for line in all_lines:
  135.                                     # if the list inside the list contains 3 items, that is the valid list
  136.                                     if len(line) == 3:
  137.                                         # write out the text
  138.                                         output_file.write(line[2].rstrip() + ' ')
  139.                                 output_file.writelines(['\n\n', '---'*10, '\n'])
  140.                             else:
  141.                                 output_file.writelines(['Lecture without words :)','\n\n', '---'*10, '\n'])
  142.                     output_file.write('==='*20)
  143.                     output_file.write('\n\n')
  144.  
  145. # Initiate the class
  146. parser = LectureParser()
  147. # run the build method
  148. parser.build_transcript()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement