Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- #Dependencies: catdoc, Python 3.x
- #on Ubuntu, you can install it with this: sudo apt-get install catdoc
- import subprocess, shlex, glob, os, zipfile, re, html.parser, sys;
def files_with_word(word, path, delete=None):
    """Find the documents directly inside *path* whose text contains *word*.

    ``*.doc`` files are converted to text by the external ``catdoc``
    program; ``*.docx`` and ``*.odt`` files are opened as zip archives and
    their main XML part is stripped of tags and entity-unescaped before the
    search. The match is a plain, case-sensitive substring test.

    The matching paths are printed one per line. When *delete* is the
    string ``"delete"``, each matching file is then removed from disk.

    Args:
        word: Text to look for.
        path: Directory to scan (not recursive).
        delete: ``None`` to only list matches, or ``"delete"`` to also
            remove every matching file.

    Returns:
        The list of matching file paths (``.doc`` matches first, then
        ``.docx``, then ``.odt``).

    Raises:
        ValueError: If *delete* is neither ``None`` nor ``"delete"``.
    """
    # Validate before doing any work so a typo in the third argument fails
    # fast instead of after the whole folder has been scanned.
    if delete is not None and delete != "delete":
        raise ValueError(
            'The third argument must either be the word, "delete", '
            "or it must be omitted."
        )

    files_with_word_list = []

    for doc_path in glob.glob(os.path.join(path, "*.doc")):
        # Pass an argument list (no shell-style quoting) so file names that
        # contain quotes or spaces are handed to catdoc unchanged, and ask
        # for UTF-8 output so the bytes can be decoded reliably.
        result = subprocess.run(
            ["catdoc", "-w", "-d", "utf-8", doc_path],
            stdout=subprocess.PIPE,
        )
        # Decode instead of str(bytes): the latter searches the b'...' repr,
        # in which non-ASCII characters appear as escape sequences.
        doc_text = result.stdout.decode("utf-8", errors="replace")
        if word in doc_text:
            files_with_word_list.append(doc_path)

    tag_pattern = re.compile(r"<[^>]*>")
    archive_paths = (
        glob.glob(os.path.join(path, "*.docx"))
        + glob.glob(os.path.join(path, "*.odt"))
    )
    for archive_path in archive_paths:
        # ODF keeps the body in content.xml, OOXML in word/document.xml.
        if archive_path.endswith(".odt"):
            member = "content.xml"
        else:
            member = "word/document.xml"
        with zipfile.ZipFile(archive_path) as archive:
            xml_text = archive.read(member).decode()
        # Drop every tag (including the XML declaration), then turn
        # entities such as &amp; back into the characters they stand for.
        plain_text = html.unescape(tag_pattern.sub("", xml_text))
        if word in plain_text:
            files_with_word_list.append(archive_path)

    print("\n".join(files_with_word_list))

    if delete == "delete":
        for match in files_with_word_list:
            print("Deleting " + match)
            os.remove(match)

    return files_with_word_list
# Command-line entry point: <word> <folder> [delete]
if __name__ == "__main__":
    # Check the argument count explicitly rather than catching IndexError
    # around the call: that handler would also swallow an IndexError raised
    # inside the search and silently run the whole search a second time.
    if len(sys.argv) < 3:
        sys.exit("Usage: " + sys.argv[0] + " WORD FOLDER [delete]")
    # Forward the word, the folder and the optional third argument; any
    # further arguments are ignored.
    files_with_word(*sys.argv[1:4])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement