Advertisement
Guest User

Find phrase in word processor documents (doc, docx, odt)

a guest
Sep 5th, 2014
258
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python3
  2.  
  3. #Dependencies: catdoc, Python 3.x
  4. #on Ubuntu, you can install it with this: sudo apt-get install catdoc
  5.  
  6. import subprocess, shlex, glob, os, zipfile, re, html.parser, sys;
  7.  
  def _doc_text(filename):
      """Return the text that ``catdoc -w`` extracts from a legacy .doc file."""
      # Pass the arguments as a list: no quoting step, so file names that
      # contain quotes or spaces reach catdoc unchanged.
      result = subprocess.run(
          ["catdoc", "-w", filename],
          stdout=subprocess.PIPE,
          # Decode the output to real text (locale encoding) instead of
          # searching the repr of a bytes object; undecodable bytes are
          # replaced rather than aborting the scan.
          universal_newlines=True,
          errors="replace",
      )
      return result.stdout


  def _zipped_document_text(filename):
      """Return the tag-stripped, entity-decoded text of a .docx or .odt file."""
      # ODT keeps its body in content.xml; everything else is treated as DOCX.
      if filename.lower().endswith(".odt"):
          member = "content.xml"
      else:
          member = "word/document.xml"
      with zipfile.ZipFile(filename) as archive:
          markup = archive.read(member).decode()
      # Drop every XML tag, then turn entities such as &amp; back into
      # characters. html.unescape replaces HTMLParser.unescape, which was
      # removed in Python 3.9.
      return html.unescape(re.sub(r"<[^>]*>", r"", markup))


  def files_with_word(word, path, delete=None):
      """Print, and optionally delete, the documents in *path* containing *word*.

      Only the ``*.doc``, ``*.docx`` and ``*.odt`` files directly inside
      *path* are examined. ``.doc`` files are read through the external
      ``catdoc`` program; ``.docx`` and ``.odt`` files are read as zip
      archives. Matching is a plain, case-sensitive substring test, so an
      empty *word* matches every readable document.

      Args:
          word: Text to look for.
          path: Directory to scan.
          delete: ``None`` (default) to only list matches, or the string
              ``"delete"`` to also remove every matching file.

      Returns:
          The list of matching file paths, in the order they were printed.

      Raises:
          ValueError: If *delete* is neither ``None`` nor ``"delete"``.
      """
      # Validate before doing any work so a typo in the third argument fails
      # immediately instead of after a full scan.
      if delete is not None and delete != "delete":
          raise ValueError("The third argument must either be the word, ‘delete’, or it must be omitted.")

      matches = []
      for filename in glob.glob(os.path.join(path, "*.doc")):
          if word in _doc_text(filename):
              matches.append(filename)

      zipped = glob.glob(os.path.join(path, "*.docx")) + glob.glob(os.path.join(path, "*.odt"))
      for filename in zipped:
          try:
              text = _zipped_document_text(filename)
          except (zipfile.BadZipFile, KeyError, UnicodeDecodeError) as err:
              # A corrupt or unexpected archive should not abort the scan;
              # it is reported and left untouched.
              print("Skipping {}: {}".format(filename, err), file=sys.stderr)
              continue
          if word in text:
              matches.append(filename)

      print("\n".join(matches))
      if delete == "delete":
          for filename in matches:
              print("Deleting " + filename)
              os.remove(filename)
      return matches
  35.  
  if __name__ == "__main__":
      # Command line: WORD DIRECTORY [delete]
      if len(sys.argv) < 3:
          # sys.exit with a string prints it to stderr and exits with status 1.
          sys.exit("Usage: {} WORD DIRECTORY [delete]".format(sys.argv[0]))
      # Forward the two required arguments plus the optional third one.
      # Slicing avoids wrapping the call in `except IndexError`, which would
      # also swallow an IndexError raised inside files_with_word and rerun
      # the scan.
      files_with_word(*sys.argv[1:4])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement