Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Dependencies: catdoc (used to extract text from legacy .doc files).
# On Ubuntu, install it with: sudo apt-get install catdoc
- import subprocess, shlex, glob, os, zipfile, re, html.parser;
def _doc_text(doc_path):
    """Return the text of a legacy .doc file as extracted by ``catdoc -w``."""
    # An argument list (rather than a quoted string fed through shlex.split)
    # keeps file names containing quotes or backslashes intact.
    result = subprocess.run(
        ["catdoc", "-w", doc_path],
        stdout=subprocess.PIPE,
        # Decode the output to str so the search runs on real text instead of
        # the repr of a bytes object; undecodable bytes are replaced, not fatal.
        text=True,
        errors="replace",
    )
    return result.stdout


def _docx_text(docx_path):
    """Return the tag-stripped text of a .docx file, or None if unreadable."""
    import sys  # local import: only needed for the skip message below

    try:
        with zipfile.ZipFile(docx_path) as archive:
            xml = archive.read("word/document.xml").decode("utf-8", errors="replace")
    except (zipfile.BadZipFile, KeyError, OSError) as exc:
        # Not every *.docx is a valid Word archive (e.g. "~$name.docx" lock
        # files); skip those instead of aborting the whole scan.
        print("Skipping " + docx_path + ": " + str(exc), file=sys.stderr)
        return None
    # Drop every XML tag, then turn entities such as &amp; back into characters.
    return html.unescape(re.sub(r"<[^>]*>", "", xml))


def files_with_word(path, word, delete=False):
    """Find the Word documents directly inside *path* whose text contains *word*.

    ``*.doc`` files are converted to text with the external ``catdoc`` tool;
    ``*.docx`` files are read as zip archives and the markup of
    ``word/document.xml`` is stripped. The search is a plain, case-sensitive
    substring test. The matching paths are printed one per line.

    Args:
        path: Directory to scan (not recursive).
        word: Word or phrase to look for.
        delete: When exactly ``True``, every matching file is removed after
            the list has been printed.

    Returns:
        The list of matching file paths, .doc matches first.

    Raises:
        FileNotFoundError: If a .doc file is found but ``catdoc`` is not
            installed.
    """
    matches = []

    for doc_path in glob.glob(os.path.join(path, "*.doc")):
        if word in _doc_text(doc_path):
            matches.append(doc_path)

    for docx_path in glob.glob(os.path.join(path, "*.docx")):
        text = _docx_text(docx_path)
        if text is not None and word in text:
            matches.append(docx_path)

    print("\n".join(matches))

    # Deliberately an equality test, as before: deletion is destructive, so a
    # merely truthy value such as a non-empty string must not trigger it.
    if delete == True:  # noqa: E712
        for match in matches:
            print("Deleting " + match)
            os.remove(match)

    return matches
# Script entry: scan the placeholder directory for the placeholder phrase.
# WARNING: delete=True removes every matching file after listing it.
files_with_word(
    path="my/directory/path",
    word="word or phrase",
    delete=True,
)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement