Advertisement
Guest User

Move search matches for word processing formats

a guest
Sep 5th, 2014
275
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python3
  2.  
  3. #Dependencies: catdoc, Python 3.x
  4. #on Ubuntu, you can install it with this: sudo apt-get install catdoc
  5.  
  6. import subprocess, shlex, glob, os, zipfile, re, html.parser, sys, shutil;
  7.  
  8. def files_with_word(word, from_dir, to_dir=None):
  9.     from_dir=os.path.abspath(from_dir);
  10.     if to_dir!=None:
  11.         to_dir=os.path.abspath(to_dir);
  12.     if not os.path.exists(from_dir):
  13.         print("Your source path does not exist.");
  14.         return;
  15.     if to_dir!=None:
  16.         if not os.path.exists(to_dir):
  17.             yn="";
  18.             while yn.lower() not in {"y", "yes", "n", "no"}:
  19.                 yn=input("Your destination path, " +to_dir+ ", does not exist. Create? (y/n) ");
  20.             if yn.lower().strip() in {"y", "yes"}:
  21.                 os.makedirs(to_dir);
  22.             else:
  23.                 print("Not created (and not used). No files will be moved.");
  24.                 to_dir=None;
  25.     files_with_word_list=[];
  26.     for x in glob.glob(os.path.join(from_dir, "*.doc")):
  27.         command=shlex.split('catdoc -w "' + x + '"');
  28.         file_text, err=subprocess.Popen(command, stdout=subprocess.PIPE).communicate();
  29.         if word in str(file_text):
  30.             files_with_word_list.append(x);
  31.     for x in glob.glob(os.path.join(from_dir, "*.docx"))+glob.glob(os.path.join(from_dir, "*.odt")):
  32.         z=zipfile.ZipFile(x);
  33.         zippath="word/document.xml";
  34.         if x.endswith("odt")==True:
  35.             zippath="content.xml";
  36.         file_text=z.read(zippath).decode();
  37.         file_text=re.sub(r"<[^>]*>", r"", file_text)[1:];
  38.         file_text=html.parser.HTMLParser().unescape(file_text);
  39.         #print(file_text); #This is for debugging purposes.
  40.         if word in file_text:
  41.             files_with_word_list.append(x);
  42.     if len(files_with_word_list)==0:
  43.         print("No matches found.");
  44.     else:
  45.         if to_dir==None:
  46.             print("\n".join(files_with_word_list).strip());
  47.     if to_dir!=None:
  48.         for x in files_with_word_list:
  49.             #os.remove(x);
  50.             xfilename=x.split(os.sep)[-1];
  51.             to_x=os.path.join(to_dir, xfilename);
  52.             print("Found " + x + " moving to " + to_x);
  53.             shutil.move(x, to_x);
  54.  
  55. files_with_word(word="test", from_dir="", to_dir="test/backup");
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement