Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- #Dependencies: catdoc, Python 3.x
- #On Ubuntu, catdoc can be installed with: sudo apt-get install catdoc
- import subprocess, shlex, glob, os, zipfile, re, html.parser, sys, shutil;
def _catdoc_text(path):
    """Return the text that ``catdoc -w`` extracts from the .doc file *path*."""
    # The command is passed as an argument list, so a file name containing
    # quotes or spaces reaches catdoc untouched.  Giving ``errors`` puts the
    # pipe in text mode: the output is decoded to ``str`` (undecodable bytes
    # are replaced) instead of being searched through the repr of a bytes
    # object, which hides non-ASCII words and words next to line breaks.
    finished = subprocess.run(
        ["catdoc", "-w", path],
        stdout=subprocess.PIPE,
        errors="replace",
    )
    return finished.stdout


def _zipped_xml_text(path):
    """Return the tag-stripped, entity-decoded text of a .docx or .odt file."""
    # OpenDocument keeps its body in content.xml, Word in word/document.xml.
    # The suffix is compared in lower case because glob may match ".ODT" on
    # case-insensitive filesystems.
    if path.lower().endswith(".odt"):
        member = "content.xml"
    else:
        member = "word/document.xml"
    # The context manager closes the archive even if the member is missing.
    with zipfile.ZipFile(path) as archive:
        markup = archive.read(member).decode("utf-8")
    # Tags are removed without inserting a separator, so a word that the
    # editor split across several runs is still found as one word.
    without_tags = re.sub(r"<[^>]*>", "", markup)
    # html.unescape replaces HTMLParser().unescape, removed in Python 3.9.
    return html.unescape(without_tags)


def _confirm_create_dir(path):
    """Ask on stdin whether *path* should be created; return True for yes."""
    answer = ""
    while answer not in {"y", "yes", "n", "no"}:
        reply = input("Your destination path, " + path + ", does not exist. Create? (y/n) ")
        # Normalize once so " Y " is accepted instead of prompting again.
        answer = reply.strip().lower()
    return answer in {"y", "yes"}


def files_with_word(word, from_dir, to_dir=None):
    """Find the documents in *from_dir* that contain *word*; list or move them.

    Only files directly inside *from_dir* are examined (no recursion).
    ``*.doc`` files are read through the external ``catdoc`` program;
    ``*.docx`` and ``*.odt`` files are read as zip archives.  The search is
    a plain, case-sensitive substring test.

    Args:
        word: Text to look for.
        from_dir: Directory to scan.  An empty string means the current
            working directory.
        to_dir: Optional directory that matching files are moved into.  If
            it does not exist the user is asked on stdin whether to create
            it; answering no turns the run into a listing-only run.  When
            ``None``, matching paths are printed and nothing is moved.

    Returns:
        None.  Results are reported on standard output.

    Raises:
        FileNotFoundError: ``catdoc`` is not installed and a .doc file is
            present.
        zipfile.BadZipFile, KeyError: a .docx/.odt file is not a valid
            archive or lacks the expected XML member.
    """
    from_dir = os.path.abspath(from_dir)
    if not os.path.exists(from_dir):
        print("Your source path does not exist.")
        return

    if to_dir is not None:
        to_dir = os.path.abspath(to_dir)
        if not os.path.exists(to_dir):
            if _confirm_create_dir(to_dir):
                os.makedirs(to_dir)
            else:
                print("Not created (and not used). No files will be moved.")
                to_dir = None

    # Escape the directory part so that "[", "*" or "?" in its name are taken
    # literally instead of being interpreted as glob wildcards.
    pattern_root = glob.escape(from_dir)
    matches = []

    for path in glob.glob(os.path.join(pattern_root, "*.doc")):
        if word in _catdoc_text(path):
            matches.append(path)

    zipped_documents = (
        glob.glob(os.path.join(pattern_root, "*.docx"))
        + glob.glob(os.path.join(pattern_root, "*.odt"))
    )
    for path in zipped_documents:
        if word in _zipped_xml_text(path):
            matches.append(path)

    if not matches:
        print("No matches found.")
        return

    if to_dir is None:
        print("\n".join(matches).strip())
        return

    for path in matches:
        target = os.path.join(to_dir, os.path.basename(path))
        print("Found " + path + " moving to " + target)
        shutil.move(path, target)
# Script run: look for "test" in the documents of the current working
# directory (an empty from_dir is made absolute against it) and move every
# match into test/backup.
files_with_word("test", "", "test/backup")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement