Advertisement
Guest User

Untitled

a guest
Sep 5th, 2014
178
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #Dependencies: catdoc
  2. #on Ubuntu, you can install it with this: sudo apt-get install catdoc
  3.  
  4. import subprocess, shlex, glob, os, zipfile, re, html.parser;
  5.  
  6. def files_with_word(path, word, delete=False):
  7.     files_with_word=[];
  8.     for x in glob.glob(os.path.join(path, "*.doc")):
  9.         command=shlex.split('catdoc -w "' + x + '"');
  10.         file_text, err=subprocess.Popen(command, stdout=subprocess.PIPE).communicate();
  11.         if word in str(file_text):
  12.             files_with_word.append(x);
  13.     for x in glob.glob(os.path.join(path, "*.docx")):
  14.         z=zipfile.ZipFile(x);
  15.         file_text=z.read("word/document.xml").decode();
  16.         file_text=re.sub(r"<[^>]*>", r"", file_text)[1:];
  17.         file_text=html.parser.HTMLParser().unescape(file_text);
  18.         if word in file_text:
  19.             files_with_word.append(x);
  20.     print("\n".join(files_with_word));
  21.     if delete==True:
  22.         for x in files_with_word:
  23.             print("Deleting " + x);
  24.             os.remove(x);
  25.  
  26. files_with_word("my/directory/path", "word or phrase", delete=True);
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement