Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ##################################################
- #
- # The main walker
- #
- # 1) called with arguments: inputDir, outputDir, errorLog
- # 2) Walks through entire inputDir and outputs to outputDir
- #
- ##################################################
- import os
- from UniqueNameGenerator import UniqueNameGenerator
- import HtmlParser
- import traceback
- from HtmlParser import logToOutput
def walk(inputDir, outputDir, failDir):
    """Walk inputDir recursively and parse every ".html" file found.

    Each ".html" file is handed to the module-level ``parseFunction``
    together with an ID obtained from ``UniqueNameGenerator.getID()`` and
    the directory under outputDir that mirrors the file's location under
    inputDir (see getOutputFDir).

    Arguments:
        inputDir  -- root directory to walk.
        outputDir -- root directory the mirrored output paths are built on.
        failDir   -- despite its name, the path of the error log *file*;
                     it is truncated at the start of every run.

    An exception raised while parsing one file is printed to stderr and
    recorded in the error log; the walk then continues with the next file.

    NOTE: ``parseFunction`` is not defined inside this function. It must be
    bound as a module global before walk() is called (the ``__main__``
    block binds it to HtmlParser.parseFile).
    """
    nameGenerator = UniqueNameGenerator()
    inputDir = os.path.abspath(inputDir)
    outputDir = os.path.abspath(outputDir)
    failDir = os.path.abspath(failDir)

    # The context manager guarantees the error log is flushed and closed,
    # even if the walk itself is interrupted by an unexpected exception.
    with open(failDir, "w") as errorLog:
        for root, dirs, files in os.walk(inputDir):
            root = os.path.abspath(root)
            for name in files:
                path = os.path.join(root, name)
                if getFileExtension(path) != ".html":
                    continue

                # Only computed for files that are actually parsed.
                outputFDir = getOutputFDir(outputDir, inputDir, path)
                try:
                    parseFunction(path, nameGenerator.getID(), outputFDir)
                except Exception as e:
                    # Report on the console ...
                    traceback.print_exc()
                    # ... and keep a persistent record in the error log.
                    errorLog.write("==================================================\n")
                    errorLog.write("Error at file: " + path + "\n")
                    errorLog.write(str(e))
                    errorLog.write("\n")
                    traceback.print_exc(limit=100, file=errorLog)
def getFileExtension(fileName):
    """Return the extension of fileName, including the leading dot.

    When fileName has no extension (as determined by os.path.splitext) the
    sentinel string "NO_FILE_EXTENSION" is returned instead; no exception
    is raised for that case.
    """
    # splitext yields '' for a missing extension, which is falsy.
    return os.path.splitext(fileName)[1] or "NO_FILE_EXTENSION"
def getOutputFDir(outputDir, baseString, currentFName):
    """Return the directory under outputDir that mirrors currentFName's folder.

    The path components of currentFName that lie beyond the components of
    baseString are appended to outputDir, and the final component (the file
    name itself) is then dropped, so only the folder level is returned.

    e.g. ("/output/dir", "/input/dir", "/input/dir/sub/page.html")
         => "/output/dir/sub"

    All three paths are normalised with fixSlashes first.

    Raises:
        Exception("No difference") when currentFName has no more path
        components than baseString.
    """
    outputDir = fixSlashes(outputDir)
    baseList = fixSlashes(baseString).split(os.path.sep)
    currentList = fixSlashes(currentFName).split(os.path.sep)

    # Purely positional: the leading components of currentFName are assumed
    # (not verified) to be those of baseString.
    differenceList = currentList[len(baseList):]
    if not differenceList:
        raise Exception("No difference")

    # Only return up to the last directory, e.g. for
    # /home/user/python.py return /home/user; the file name itself is
    # handled by the caller's parser.
    ret = os.path.sep.join([outputDir] + differenceList)
    return os.path.split(ret)[0]
def fixSlashes(string):
    """Normalise the separators of a path string.

    Empty components are discarded, which removes trailing separators and
    collapses repeated ones, and a single leading separator is then
    prepended. A string with no non-empty components (for example "" or a
    lone separator) yields "".
    """
    parts = [part for part in string.split(os.path.sep) if part]
    # The leading '' makes the joined result start with the separator.
    return os.path.sep.join([''] + parts)
# Script entry point: expects exactly three command-line arguments
# (input directory, output directory, error log path).
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 4:
        # Parenthesised single-argument form is valid on Python 2 and 3.
        print("Usage: python main_walker.py inputDirectory outputDirectory errorLog")
    else:
        # walk() looks this name up as a module global.
        parseFunction = HtmlParser.parseFile
        walk(sys.argv[1], sys.argv[2], sys.argv[3])
Add Comment
Please, Sign In to add comment