Advertisement
Guest User

Untitled

a guest
May 30th, 2015
221
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.72 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2.  
  3. import re
  4. import os
  5. import csv
  6. import glob
  7. import string
  8. from Tkinter import Tk
  9. import unicodedata as udata
  10. from cStringIO import StringIO
  11. from tkFileDialog import askdirectory
  12. from pyth.plugins.rtf15.reader import Rtf15Reader
  13.  
  14.  
  15.  
  16. def decode_cell(cell):
  17. '''The cell matched so lets handle it'''
  18.  
  19. # variable that will hold the converted text
  20. temp_cell = []
  21.  
  22. # pyth checks for the rtf syntax before processing, so 'unicode_escape' escapes the '\' so pyth doesn't complain
  23. cell_encode = udata.normalize('NFKD', cell.decode('unicode_escape')).encode('ascii', 'ignore')
  24. cell_encode = filter(lambda x: x in string.printable, cell_encode)
  25. cell_rtf = Rtf15Reader.read(StringIO(cell_encode))
  26.  
  27. # turn the pyth object into readable text
  28. cell_txt = [x.content for x in cell_rtf.content]
  29.  
  30. # iterate and extract the pyth object text into temp_cell
  31. for line in cell_txt:
  32. for l in line:
  33. temp_cell.append(l.content)
  34.  
  35.  
  36. # combine and join the extracted text into one string (for one cell)
  37. combined = [i for sub in temp_cell for i in sub]
  38. new_cell = ' '.join(combined)
  39.  
  40. # the non-ascii characters in your file were followed by _ so i removed them for cleanliness
  41. # uncomment to keep the _
  42. new_cell = re.sub('_', '', new_cell)
  43.  
  44. # remove extra whitespace and return the converted cell
  45. # remove L at end of string
  46. return ' '.join(new_cell[:-1].split())
  47.  
  48.  
  49.  
  50. def find_rtf(row):
  51. '''Start looking for rtf syntax'''
  52.  
  53. # variable that will return the row to writer
  54. temp_row = []
  55.  
  56. # loop and index each cell in row
  57. for n, cell in enumerate(row):
  58.  
  59. # your csv is shitty
  60. if type(cell) == str:
  61. cell = unicode(cell, "utf-8", errors="ignore")
  62. else:
  63. cell = unicode(cell)
  64.  
  65. # if the cell text starts with {\\rtf we need to know
  66. if re.match(r'^{\\\\rtf', cell):
  67.  
  68. # holder
  69. combined = []
  70.  
  71. # collect all cells following matched cell
  72. for item in row[n:]:
  73. combined.append(item)
  74.  
  75. # combine the rest of the row
  76. cell = ' '.join(combined)
  77.  
  78. # send off to convert rtf
  79. cell_matched = decode_cell(cell)
  80.  
  81. # add the cell, with converted rtf, back to the row
  82. temp_row.append(cell_matched.encode('ascii', 'ignore'))
  83.  
  84. # we don’t want to process further cells because they're now combined
  85. # break the loop to start at next row
  86. break
  87.  
  88. else:
  89. # if the cell didn't have rtf just add it back to the row
  90. temp_row.append(cell.encode('ascii', 'ignore'))
  91.  
  92. return temp_row
  93.  
  94.  
  95.  
  96. def open_csv(f_ori, f_new):
  97. '''Open original file, process, and save to new file'''
  98.  
  99. # 'rU' = read 'r' and open with 'U' so the newlines inside the cell are respected
  100. # 'wb' = write 'w' in binary 'b' mode
  101. # 'with open' automatically closes the file
  102. with open(f_ori, 'rU') as file1, open(f_new, 'wb') as file2:
  103.  
  104. reader = csv.reader(file1)
  105. writer = csv.writer(file2)
  106.  
  107. # loop through rows in the opened csv
  108. for row in reader:
  109.  
  110. # send to fx to look for rtf syntax
  111. new_row = find_rtf(row)
  112.  
  113. # write the row to new file
  114. writer.writerow(new_row)
  115.  
  116.  
  117.  
  118. def add_suffix(f_ori):
  119. '''Append suffix to original file name'''
  120.  
  121. suffix = '-processed'
  122.  
  123. # explode full path into path, name, ext
  124. path, name = os.path.split(f_ori)
  125. name, ext = os.path.splitext(name)
  126.  
  127. # function to append suffix
  128. mk_suffix = lambda i: os.path.join(path, '%s%s%s' % (name, i, ext))
  129.  
  130. # process and return
  131. return mk_suffix(suffix)
  132.  
  133.  
  134.  
  135. def iterate_dir(path):
  136. '''Iterate files in selected dir and filter out .csv'''
  137.  
  138. extension = '/*.csv'
  139. select = path + extension
  140.  
  141. for i in glob.iglob(select):
  142. # create unique name for new file
  143. # send to opener
  144. open_csv(i, add_suffix(i))
  145.  
  146.  
  147.  
  148. def main():
  149. '''Initiate script and select directory to process'''
  150.  
  151. ini_path = os.path.expanduser('~/Desktop')
  152.  
  153. OPEN_OPTIONS = dict(
  154. # specify root folder for ui
  155. # uncomment initialdir entirely to remember last dir
  156. #initialdir='/Users',
  157. initialdir=ini_path,
  158. title='Select Directory'
  159. )
  160.  
  161. Tk().withdraw()
  162. ask_path = askdirectory(**OPEN_OPTIONS)
  163.  
  164. # move to fx
  165. iterate_dir(ask_path)
  166.  
  167.  
  168.  
# Start the directory prompt only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement