Untitled

# -*- coding: utf-8 -*-

import re
import os
import csv
import glob
import string
from Tkinter import Tk
import unicodedata as udata
from cStringIO import StringIO
from tkFileDialog import askdirectory
from pyth.plugins.rtf15.reader import Rtf15Reader


def decode_cell(cell):
    '''The cell matched so lets handle it'''

    # variable that will hold the converted text
    temp_cell = []

    # pyth checks for the rtf syntax before processing, so 'unicode_escape' escapes the '\' so pyth doesn't complain
    cell_encode = udata.normalize('NFKD', cell.decode('unicode_escape')).encode('ascii', 'ignore')
    cell_encode = filter(lambda x: x in string.printable, cell_encode)
    cell_rtf = Rtf15Reader.read(StringIO(cell_encode))

    # turn the pyth object into readable text
    cell_txt = [x.content for x in cell_rtf.content]

    # iterate and extract the pyth object text into temp_cell
    for line in cell_txt:
        for l in line:
            temp_cell.append(l.content)


    # combine and join the extracted text into one string (for one cell)
    combined = [i for sub in temp_cell for i in sub]
    new_cell =  ' '.join(combined)

    # the non-ascii characters in your file were followed by _ so i removed them for cleanliness
    # uncomment to keep the _
    new_cell = re.sub('_', '', new_cell)

    # remove extra whitespace and return the converted cell
    # remove L at end of string
    return ' '.join(new_cell[:-1].split())


def find_rtf(row):
    '''Start looking for rtf syntax'''

    # variable that will return the row to writer
    temp_row = []

    # loop and index each cell in row
    for n, cell in enumerate(row):

        # your csv is shitty
        if type(cell) == str:
            cell = unicode(cell, "utf-8", errors="ignore")
        else:
            cell = unicode(cell)

        # if the cell text starts with {\\rtf we need to know
        if re.match(r'^{\\\\rtf', cell):

            # holder
            combined = []

            # collect all cells following matched cell
            for item in row[n:]:
                combined.append(item)

            # combine the rest of the row
            cell = ' '.join(combined)

            # send off to convert rtf
            cell_matched = decode_cell(cell)

            # add the cell, with converted rtf, back to the row
            temp_row.append(cell_matched.encode('ascii', 'ignore'))

            # we donâ€™t want to process further cells because they're now combined
            # break the loop to start at next row
            break

        else:
            # if the cell didn't have rtf just add it back to the row
            temp_row.append(cell.encode('ascii', 'ignore'))

    return temp_row


def open_csv(f_ori, f_new):
    '''Open original file, process, and save to new file'''

    # 'rU' = read 'r' and open with 'U' so the newlines inside the cell are respected
    # 'wb' = write 'w' in binary 'b' mode
    # 'with open' automatically closes the file
    with open(f_ori, 'rU') as file1, open(f_new, 'wb') as file2:

        reader = csv.reader(file1)
        writer = csv.writer(file2)

        # loop through rows in the opened csv
        for row in reader:

            # send to fx to look for rtf syntax
            new_row = find_rtf(row)

            # write the row to new file
            writer.writerow(new_row)


def add_suffix(f_ori):
    '''Append suffix to original file name'''

    suffix = '-processed'

    # explode full path into path, name, ext
    path, name = os.path.split(f_ori)
    name, ext = os.path.splitext(name)

    # function to append suffix
    mk_suffix = lambda i: os.path.join(path, '%s%s%s' % (name, i, ext))

    # process and return
    return mk_suffix(suffix)


def iterate_dir(path):
    '''Iterate files in selected dir and filter out .csv'''

    extension = '/*.csv'
    select = path + extension

    for i in glob.iglob(select):
        # create unique name for new file
        # send to opener
        open_csv(i, add_suffix(i))


def main():
    '''Initiate script and select directory to process'''

    ini_path = os.path.expanduser('~/Desktop')

    OPEN_OPTIONS = dict(
                        # specify root folder for ui
                        # uncomment initialdir entirely to remember last dir
                        #initialdir='/Users',
                        initialdir=ini_path,
                        title='Select Directory'
                        )

    Tk().withdraw()
    ask_path = askdirectory(**OPEN_OPTIONS)

    # move to fx
    iterate_dir(ask_path)


if __name__ == "__main__":
    main()