Guest User

Untitled

a guest
Jun 18th, 2018
85
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.69 KB | None | 0 0
  1. ##################################################
  2. #
  3. # The main walker
  4. #
  5. # 1) called with arguments: inputDir, outputDir, errorLog
  6. # 2) Walks through entire inputDir and outputs to outputDir
  7. #
  8. ##################################################
  9.  
  10.  
  11. import os
  12. from UniqueNameGenerator import UniqueNameGenerator
  13. import HtmlParser
  14. import traceback
  15. from HtmlParser import logToOutput
  16.  
  17.  
  18. def walk(inputDir, outputDir, failDir):
  19. """Initialization function for walker"""
  20.  
  21. # Some initialziation
  22. nameGenerator = UniqueNameGenerator()
  23. inputDir = os.path.abspath( inputDir)
  24. outputDir = os.path.abspath( outputDir)
  25. failDir = os.path.abspath( failDir)
  26. g = open(failDir,'w+')
  27.  
  28. # Now walk
  29. for root, dirs, files in os.walk(inputDir):
  30.  
  31. # Recurse on each file
  32. for file in files:
  33.  
  34. #print os.getcwd()
  35.  
  36. # Note that the root contains the root
  37. file = os.path.join( os.path.abspath(root), file)
  38. outputFDir = getOutputFDir(outputDir, inputDir, file)
  39.  
  40. if getFileExtension( file) == ".html":
  41. try:
  42. parseFunction( file, nameGenerator.getID(), outputFDir)
  43.  
  44. except Exception, e:
  45. #For the output
  46. traceback.print_exc()
  47.  
  48. g.write("==================================================\n")
  49. g.write("Error at file: " + file + "\n")
  50. g.write( str(e) )
  51. g.write( "\n")
  52. traceback.print_exc(100, g)
  53.  
  54.  
  55.  
  56.  
  57.  
  58.  
  59. def getFileExtension( fileName):
  60. """Raises Exception if none. """
  61.  
  62. ext = os.path.splitext(fileName)[1]
  63. if ext == '':
  64. return "NO_FILE_EXTENSION"
  65.  
  66. else:
  67. return ext
  68.  
  69.  
  70.  
  71.  
  72. def getOutputFDir(outputDir, baseString, currentFName):
  73. """1) Finds the "difference between file and inputDir, then adds that to outputDir,
  74. then output just to the folder level.
  75.  
  76. e.g. (/output/dir, /input/dir, /input/dir/shit/fuck) => /output/dir/shit"""
  77.  
  78. # Check if outputDir, baseString or currentFName all ends with /
  79. outputDir = fixSlashes( outputDir)
  80. baseString = fixSlashes( baseString)
  81. currentFName = fixSlashes( currentFName )
  82.  
  83. baseList = baseString.split(os.path.sep)
  84. currentList = currentFName.split( os.path.sep)
  85. differenceList = currentList[len(baseList):]
  86.  
  87. ########################################
  88. # Important here: we should only return up to the last directory
  89. # so /home/disappearedng/python.py you should just return /home/disappearedng
  90. # because in htmlParser the python.py will be changed to python_py.TREC
  91.  
  92. if len(differenceList) == 0:
  93. raise Exception("No difference")
  94.  
  95. elif len(differenceList) > 1:
  96. suffix = os.path.sep.join( differenceList)
  97.  
  98. else:
  99. suffix = differenceList[0]
  100.  
  101. # Modfication: return up to the last folder
  102. ret = os.path.sep.join( [outputDir, suffix])
  103. ret = os.path.split(ret)
  104.  
  105. return ret[0]
  106.  
  107.  
  108.  
  109.  
  110. def fixSlashes(string):
  111. """Removes trailing / and ensures that string begins with /"""
  112.  
  113. stringList = string.split(os.path.sep)
  114.  
  115. # Now remove every '' except the first one
  116. stringList = [x for x in stringList if x]
  117. stringList.insert(0, '')
  118.  
  119. return os.path.sep.join( stringList)
  120.  
  121.  
  122. if __name__ == "__main__":
  123. import sys
  124.  
  125. if len(sys.argv) != 4:
  126. print "Usage: python main_walker.py inputDirectory outputDirectory errorLog"
  127.  
  128. else:
  129. parseFunction = HtmlParser.parseFile
  130. walk( sys.argv[1], sys.argv[2], sys.argv[3] )
Add Comment
Please, Sign In to add comment