Advertisement
Guest User

Khan Academy API Crawler

a guest
Nov 18th, 2019
247
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.13 KB | None | 0 0
  1. import argparse
  2. import os
  3. import requests
  4. import sys
  5.  
  6. class KhanAcademyVideoDownloader:
  7.  
  8.     def __init__(self):
  9.         self._kindString = "kind"
  10.         self._topicString = "Topic"
  11.         self._videoString = "Video"
  12.  
  13.         self._topicUrl = "http://www.khanacademy.org/api/v1/topic/"
  14.         self._videoUrl = "http://www.khanacademy.org/api/v1/videos/"
  15.  
  16.         self._fileExtension = "mp4"
  17.         self._filePath = []
  18.  
  19.     def parseCommandLineArguments(self):
  20.         parser = argparse.ArgumentParser()
  21.         parser.add_argument("course_name", type=str, help="The name of the course you want to download the videos of.")
  22.         parser.add_argument("-e", type=str, help="The extension of the videos to be downloaded", metavar="video_extension")
  23.         args = parser.parse_args()
  24.         if args.e is not None:
  25.             if len(args.e) > 1:
  26.                 self._fileExtension = args.e
  27.             else:
  28.                 print("The provided extension is too short, by default using mp4.")
  29.         return args.course_name
  30.  
  31.     def visitCourse(self, topic):
  32.         self._filePath.append(topic)
  33.         self.visitElement(requests.get(self._topicUrl + topic).json())
  34.  
  35.     def visitElement(self, element):
  36.         elementKind = element[self._kindString]
  37.         if elementKind == self._topicString:
  38.             self.visitTopic(element)
  39.         elif elementKind == self._videoString:
  40.             self.visitVideo(element)
  41.  
  42.     def visitTopic(self, topicElement):
  43.         self._childrenString = "children"
  44.         self._idString = "id"
  45.  
  46.         childCounter = 0
  47.         for child in topicElement[self._childrenString]:
  48.             childId = child[self._idString]
  49.             childKind = child[self._kindString]
  50.  
  51.             if(childKind == self._topicString or childKind == self._videoString):
  52.                 if childKind == self._topicString:
  53.                     url =  self._topicUrl + childId
  54.                 elif childKind == self._videoString:
  55.                     url = self._videoUrl + childId
  56.                 print(url)
  57.                 childCounter += 1
  58.                 self._filePath.append(str(childCounter) + "-" + child["title"])
  59.                 self.visitElement(requests.get(url).json())
  60.                 self._filePath.pop()
  61.  
  62.     def visitVideo(self, videoElement):
  63.         downloadUrls = videoElement["download_urls"]
  64.         if self._fileExtension in downloadUrls:
  65.             fileRequest = requests.get(downloadUrls[self._fileExtension], stream=True)
  66.             fileName = "/".join(self._filePath) + "." + self._fileExtension
  67.             print("Downloading " + fileName)
  68.             self.saveFile(fileRequest, fileName)
  69.  
  70.     def saveFile(self, fileRequest, fileName):
  71.         if not os.path.exists(os.path.dirname(fileName)):
  72.             os.makedirs(os.path.dirname(fileName))
  73.  
  74.         chunkSize = 2 ** 16
  75.         with open(fileName, 'wb') as fileDescriptor:
  76.             for chunk in fileRequest.iter_content(chunkSize):
  77.                 fileDescriptor.write(chunk)
  78.  
  79. def main():
  80.     k = KhanAcademyVideoDownloader()
  81.     k.visitCourse(k.parseCommandLineArguments())
  82.  
  83. if __name__ == '__main__':
  84.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement