Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- from itertools import groupby
- from collections import OrderedDict
class LectureParser():
    """Merge downloaded subtitle (.srt) files into one plain-text transcript.

    Creates a single ~/Desktop/output.txt file in the following format:

        Lecture <lecture directory name>

        <lesson file name>
        <lesson text on one line>

        ------------------------------
        <lesson file name>
        <lesson text on one line>

        ------------------------------
        ============================================================

    Assumptions:
        The lecture directories named in LECTURE_DIRS are located somewhere
        under the user's ~/Downloads folder.
        The user executing this script has admin privileges.
    """

    # Lecture directory names, in the order they are written to the output.
    # A directory is picked up only when its basename equals one of these
    # strings exactly, so the names (including the space in '5. 1') must not
    # be normalized.
    LECTURE_DIRS = (
        '1 - Introduction Subtitles',
        '2 - Architecture & Principles Subtitles',
        '3 - Switching Subtitles',
        '4 - Routing Subtitles',
        '5 - Naming, Addressing & Forwarding Subtitles',
        '5. 1 - Router Design Basics Subtitles',
        '5.2 - DNS Subtitles',
        '6 - Congestion control & streaming Subtitles',
        '7 - Rate limiting and traffic shaping Subtitles',
        '8 - Content distribution Subtitles',
        '9 - Software Defined Networking Subtitles',
        '9.1 - Programming SDNs Subtitles',
        '10 - Traffic Engineering Subtitles',
        '11 - Network Security Subtitles',
        '11.1 - Internet Worms Subtitles',
        '11.2 - Spam Subtitles',
        '11.3 - Denial of Service Attacks Subtitles',
    )

    def __init__(self):
        """Resolve the home directory and the location of the output file.

        Attributes:
            home_dir: the user's home directory, from os.path.expanduser('~').
            output_file: path of the generated file, <home>/Desktop/output.txt.
        """
        self.home_dir = os.path.expanduser('~')
        self.output_file = os.path.join(self.get_desktop_dir(), 'output.txt')

    def get_downloads_dir(self):
        """Return the user's ~/Downloads directory."""
        return os.path.join(self.home_dir, 'Downloads')

    def get_desktop_dir(self):
        """Return the user's ~/Desktop directory."""
        return os.path.join(self.home_dir, 'Desktop')

    def get_lectures_dirs(self):
        """Build the skeleton of the lecture mapping.

        Returns:
            An ordered dictionary with one empty dict per lecture, in
            LECTURE_DIRS order:
                {'lecture_name_1': {}, 'lecture_name_2': {}, ...}
        """
        lectures = OrderedDict()
        for lecture_dir in self.LECTURE_DIRS:
            lectures.setdefault(lecture_dir, {})
        return lectures

    def get_lectures(self):
        """Fill the lecture mapping with directory paths and lesson file names.

        Walks ~/Downloads recursively; any directory whose basename is a
        known lecture name contributes its full path and its files.

        Returns:
            An ordered dictionary of the form:
                {'lecture_name_1': {'/full/path': ['lesson_1', 'lesson_2', ...]}, ...}
            Lectures with no matching directory keep an empty dict.
        """
        lectures_all = self.get_lectures_dirs()
        for dirname, dirnames, filenames in os.walk(self.get_downloads_dir()):
            # Sorting in place makes os.walk descend in a stable order.
            dirnames.sort()
            dir_basename = os.path.basename(dirname)
            # Skip everything in ~/Downloads that is not a lecture directory.
            if dir_basename not in lectures_all:
                continue
            # os.walk returns names in arbitrary order; sort them so the
            # numbered lessons ('01 - ...', '02 - ...') come out in sequence.
            lessons = lectures_all[dir_basename].setdefault(dirname, [])
            lessons.extend(sorted(filenames))
        return lectures_all

    def build_transcript(self):
        """Parse every lesson file and write the transcript to output_file.

        The output file is overwritten on each run.
        """
        with open(self.output_file, 'w') as output_file:
            # .items() (not .iteritems()) so this runs on Python 2 and 3.
            for lecture, lesson_dirs in self.get_lectures().items():
                # write the name of the lecture
                output_file.writelines(['\n\n', 'Lecture ', lecture, '\n\n'])
                for file_dir, file_names in lesson_dirs.items():
                    for file_name in file_names:
                        # write the name of the lesson, then its text
                        output_file.writelines([file_name, '\n'])
                        self._write_lesson(output_file,
                                           os.path.join(file_dir, file_name))
                # close the lecture section
                output_file.write('===' * 20)
                output_file.write('\n\n')

    @staticmethod
    def _read_cues(path):
        """Return the cues of a subtitle file as a list of lists of lines.

        Consecutive non-blank lines form one cue; blank lines separate cues.
        """
        with open(path, 'r') as srt_file:
            return [list(group)
                    for has_text, group
                    in groupby(srt_file, lambda line: bool(line.strip()))
                    if has_text]

    def _write_lesson(self, output_file, path):
        """Write the spoken text of one subtitle file plus a separator."""
        cues = self._read_cues(path)
        if not cues:
            output_file.writelines(
                ['Lecture without words :)', '\n\n', '---' * 10, '\n'])
            return
        for cue in cues:
            # A cue is: index line, timestamp line, then one or more text
            # lines. Shorter groups carry no text and are skipped.
            if len(cue) >= 3:
                for text_line in cue[2:]:
                    output_file.write(text_line.rstrip() + ' ')
        output_file.writelines(['\n\n', '---' * 10, '\n'])
# Script entry: create the parser (resolves the home/Desktop paths) ...
parser = LectureParser()
# ... then walk ~/Downloads and write the combined transcript file.
parser.build_transcript()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement